napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
napistu/identifiers.py
ADDED
@@ -0,0 +1,805 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import itertools
|
4
|
+
import logging
|
5
|
+
import re
|
6
|
+
import sys
|
7
|
+
from typing import Optional
|
8
|
+
from urllib.parse import urlparse
|
9
|
+
|
10
|
+
import libsbml
|
11
|
+
import pandas as pd
|
12
|
+
from napistu import utils
|
13
|
+
from pydantic import BaseModel
|
14
|
+
|
15
|
+
from napistu.constants import IDENTIFIERS
|
16
|
+
from napistu.constants import BIOLOGICAL_QUALIFIER_CODES
|
17
|
+
from napistu.constants import ENSEMBL_MOLECULE_TYPES_TO_ONTOLOGY
|
18
|
+
from napistu.constants import ENSEMBL_MOLECULE_TYPES_FROM_ONTOLOGY
|
19
|
+
from napistu.constants import ENSEMBL_SPECIES_FROM_CODE
|
20
|
+
from napistu.constants import ENSEMBL_SPECIES_TO_CODE
|
21
|
+
|
22
|
+
logger = logging.getLogger(__name__)
|
23
|
+
|
24
|
+
|
25
|
+
class Identifiers:
    """
    Identifiers for a single entity or relationship.

    Attributes
    ----------
    ids : list
        a list of identifiers which are each a dict containing an ontology and identifier

    Methods
    -------
    print
        Print a table of identifiers
    filter(ontologies, summarize)
        Returns a bool of whether 1+ of the ontologies was represented
    hoist(ontology)
        Returns value(s) from an ontology

    """

    def __init__(self, id_list: list, verbose: bool = False) -> None:
        """
        Tracks a set of identifiers and the ontologies they belong to.

        Parameters
        ----------
        id_list : list
            a list of identifier dictionaries containing ontology, identifier, and optionally url
        verbose : bool
            extra reporting, defaults to False

        Returns
        -------
        None.

        """

        # validate the raw entries up-front (format checking via pydantic)
        validated = _IdentifiersValidator(id_list=id_list).model_dump()["id_list"]

        if not id_list and verbose:
            logger.debug('zero identifiers in "id_list"')

        if id_list:
            # drop duplicate {ontology, identifier} pairs, keeping the first
            # occurrence so the original ordering is preserved
            seen_keys = set()
            deduped = []
            for entry in validated:
                key = entry[IDENTIFIERS.ONTOLOGY] + "_" + entry[IDENTIFIERS.IDENTIFIER]
                if key not in seen_keys:
                    seen_keys.add(key)
                    deduped.append(entry)
            validated = deduped

        self.ids = validated

    def print(self):
        """Print a table of identifiers"""

        utils.style_df(pd.DataFrame(self.ids), hide_index=True)

    def filter(self, ontologies, summarize=True):
        """Returns a bool of whether 1+ of the ontologies was represented"""

        if isinstance(ontologies, str):
            ontologies = [ontologies]

        # one flag per stored identifier: does it belong to any requested ontology?
        identifier_matches = [
            entry[IDENTIFIERS.ONTOLOGY] in ontologies for entry in self.ids
        ]

        return any(identifier_matches) if summarize else identifier_matches

    def hoist(self, ontology: str, squeeze: bool = True) -> str | list[str] | None:
        """Returns value(s) from an ontology

        Args:
            ontology (str): the ontology of interest
            squeeze (bool): if True, return a single value if possible

        Returns:
            str or list: the value(s) of an ontology of interest

        """

        if not isinstance(ontology, str):
            raise TypeError(f"{ontology} must be a str")

        # collect the identifier values of every entry matching the ontology
        match_flags = self.filter(ontology, summarize=False)
        ontology_ids = [
            entry[IDENTIFIERS.IDENTIFIER]
            for entry, matched in zip(self.ids, match_flags)
            if matched
        ]

        if squeeze:
            if not ontology_ids:
                return None
            if len(ontology_ids) == 1:
                return ontology_ids[0]
        return ontology_ids
|
141
|
+
|
142
|
+
|
143
|
+
def merge_identifiers(identifier_series: pd.Series) -> Identifiers:
    """
    Aggregate Identifiers

    Merge a pd.Series of Identifiers objects into a single Identifiers object

    Args:
        identifier_series: pd.Series
            A pd.Series of of identifiers.Identifiers objects


    Returns:
        An identifiers.Identifiers object

    """

    # a singleton series needs no merging
    if len(identifier_series) == 1:
        return identifier_series.iloc[0]

    # pool every entry's id dicts; the Identifiers constructor removes any
    # {ontology, identifier} redundancy introduced by the concatenation
    pooled_ids = [
        id_dict for ids_obj in identifier_series for id_dict in ids_obj.ids
    ]
    return Identifiers(pooled_ids)
|
169
|
+
|
170
|
+
|
171
|
+
def format_uri(uri: str, biological_qualifier_type: str | None = None) -> Identifiers:
    """
    Convert a RDF URI into an Identifier object
    """

    parsed = format_uri_url(uri)
    if parsed is None:
        raise NotImplementedError(f"{uri} is not a valid way of specifying a uri")

    # attach the (validated) biological qualifier to the parsed identifier
    _validate_bqb(biological_qualifier_type)
    parsed[IDENTIFIERS.BQB] = biological_qualifier_type

    return parsed
|
185
|
+
|
186
|
+
|
187
|
+
def _validate_bqb(bqb):
|
188
|
+
if bqb is None:
|
189
|
+
logger.warning(
|
190
|
+
'"biological_qualifier_type" is None; consider adding a valid '
|
191
|
+
'BQB code. For a list of BQB codes see "BQB" in constants.py'
|
192
|
+
)
|
193
|
+
else:
|
194
|
+
if not isinstance(bqb, str):
|
195
|
+
raise TypeError(
|
196
|
+
f"biological_qualifier_type was a {type(bqb)} and must be a str or None"
|
197
|
+
)
|
198
|
+
|
199
|
+
if not bqb.startswith("BQB"):
|
200
|
+
raise ValueError(
|
201
|
+
f"The provided BQB code was {bqb} and all BQB codes start with "
|
202
|
+
'start with "BQB". Please either use a valid BQB code (see '
|
203
|
+
'"BQB" in constansts.py) or use None'
|
204
|
+
)
|
205
|
+
|
206
|
+
|
207
|
+
def format_uri_url(uri: str) -> dict:
    """
    Parse a URL-style URI into an identifier dict.

    Parameters:
        uri (str): a URL pointing at an entry in a biological database

    Returns:
        dict with keys "ontology", "identifier" and "url", or None when the
        uri is not a parseable URL (missing scheme, host, or path)

    Raises:
        NotImplementedError: when the URL's host is not associated with a
            known ontology
    """
    # check whether the uri is specified using a url
    result = urlparse(uri)
    if not all([result.scheme, result.netloc, result.path]):
        return None

    # valid url

    netloc = result.netloc
    split_path = result.path.split("/")

    # dispatch on the host (and sometimes the path/query shape) to decide
    # which ontology the URL belongs to and where its identifier lives
    try:
        if netloc == "identifiers.org":
            ontology, identifier = format_uri_url_identifiers_dot_org(split_path)
        elif netloc == "reactome.org":
            ontology = "reactome"
            identifier = split_path[-1]
        # genes and gene products
        elif netloc == "www.ensembl.org" and split_path[-1] == "geneview":
            ontology = "ensembl_gene"
            identifier, id_ontology, _ = parse_ensembl_id(result.query)  # type: ignore
            assert ontology == id_ontology
        elif netloc == "www.ensembl.org" and split_path[-1] in [
            "transview",
            "Transcript",
        ]:
            ontology = "ensembl_transcript"
            identifier, id_ontology, _ = parse_ensembl_id(result.query)  # type: ignore
            assert ontology == id_ontology
        elif netloc == "www.ensembl.org" and split_path[-1] == "ProteinSummary":
            ontology = "ensembl_protein"
            identifier, id_ontology, _ = parse_ensembl_id(result.query)  # type: ignore
            assert ontology == id_ontology
        elif netloc == "www.ensembl.org" and (
            re.search("ENS[GTP]", split_path[-1])
            or re.search("ENS[A-Z]{3}[GTP]", split_path[-1])
        ):
            # format ensembl IDs which lack gene/transview
            identifier, ontology, _ = parse_ensembl_id(split_path[-1])
        elif netloc == "www.mirbase.org" or netloc == "mirbase.org":
            ontology = "mirbase"
            # hairpin (MI...) and mature (MIMAT...) ids may sit in either the
            # path or the query string
            if re.search("MI[0-9]+", split_path[-1]):
                identifier = utils.extract_regex_search("MI[0-9]+", split_path[-1])
            elif re.search("MIMAT[0-9]+", split_path[-1]):
                identifier = utils.extract_regex_search("MIMAT[0-9]+", split_path[-1])
            elif re.search("MI[0-9]+", result.query):
                identifier = utils.extract_regex_search("MI[0-9]+", result.query)
            elif re.search("MIMAT[0-9]+", result.query):
                identifier = utils.extract_regex_search("MIMAT[0-9]+", result.query)
            else:
                raise TypeError(
                    f"{result.query} does not appear to match MiRBase identifiers"
                )
        elif netloc == "purl.uniprot.org":
            ontology = "uniprot"
            identifier = split_path[-1]
        elif netloc == "rnacentral.org":
            ontology = "rnacentral"
            identifier = split_path[-1]
        # chemicals
        elif split_path[1] == "chebi":
            ontology = "chebi"
            identifier = utils.extract_regex_search("[0-9]+$", result.query)
        elif netloc == "pubchem.ncbi.nlm.nih.gov":
            ontology = "pubchem"
            if result.query != "":
                identifier = utils.extract_regex_search("[0-9]+$", result.query)
            else:
                identifier = utils.extract_regex_search("[0-9]+$", split_path[-1])
        elif netloc == "www.genome.ad.jp":
            ontology = "genome_net"
            identifier = utils.extract_regex_search("[A-Za-z]+:[0-9]+$", uri)
        elif (
            netloc == "www.guidetopharmacology.org"
            and split_path[-1] == "LigandDisplayForward"
        ):
            ontology = "grac"
            identifier = utils.extract_regex_search("[0-9]+$", result.query)
        elif netloc == "www.chemspider.com" or netloc == "chemspider.com":
            ontology = "chemspider"
            identifier = split_path[-1]
        # reactions
        elif split_path[1] == "ec-code":
            ontology = "ec-code"
            identifier = split_path[-1]
        elif netloc == "www.rhea-db.org":
            ontology = "rhea"
            identifier = utils.extract_regex_search("[0-9]+$", result.query)
        # misc
        elif split_path[1] == "ols":
            ontology = "ols"
            identifier = split_path[-1]
        elif split_path[1] == "QuickGO":
            ontology = "go"
            identifier = split_path[-1]
        elif split_path[1] == "pubmed":
            ontology = "pubmed"
            identifier = split_path[-1]
        # DNA sequences
        elif netloc == "www.ncbi.nlm.nih.gov" and split_path[1] == "nuccore":
            ontology = "ncbi_refseq"
            identifier = split_path[-1]
        elif netloc == "www.ncbi.nlm.nih.gov" and split_path[1] == "sites":
            # the sub-database (e.g. "pccompound") is encoded in the query's
            # db= field and becomes part of the ontology name
            ontology = "ncbi_entrez_" + utils.extract_regex_search(
                "db=([A-Za-z0-9]+)\\&", result.query, 1
            )
            identifier = utils.extract_regex_search(
                r"term=([A-Za-z0-9\-]+)$", result.query, 1
            )
        elif netloc == "www.ebi.ac.uk" and split_path[1] == "ena":
            ontology = "ebi_refseq"
            identifier = split_path[-1]
        elif netloc == "www.thesgc.org" and split_path[1] == "structures":
            ontology = "sgc"
            identifier = split_path[-2]
        elif netloc == "www.mdpi.com":
            ontology = "mdpi"
            identifier = "/".join([i for i in split_path[1:] if i != ""])
        elif netloc == "dx.doi.org":
            ontology = "dx_doi"
            identifier = "/".join(split_path[1:])
        elif netloc == "doi.org":
            ontology = "doi"
            identifier = "/".join(split_path[1:])
        elif netloc == "www.ncbi.nlm.nih.gov" and split_path[1] == "books":
            ontology = "ncbi_books"
            identifier = split_path[2]
        elif netloc == "www.ncbi.nlm.nih.gov" and split_path[1] == "gene":
            ontology = "ncbi_gene"
            identifier = split_path[2]
        elif netloc == "www.phosphosite.org":
            ontology = "phosphosite"
            identifier = utils.extract_regex_match(".*id=([0-9]+).*", uri)
        elif netloc == "ncithesaurus.nci.nih.gov":
            ontology = "NCI_Thesaurus"
            identifier = utils.extract_regex_match(".*code=([0-9A-Z]+).*", uri)
        elif netloc == "matrixdb.ibcp.fr":
            # matrixdb encodes the molecule class in the query; it is folded
            # into the ontology name (e.g. matrixdb_biomolecule)
            molecule_class = utils.extract_regex_match(
                ".*class=([a-zA-Z]+).*", uri
            ).lower()
            ontology = f"matrixdb_{molecule_class}"
            identifier = utils.extract_regex_match(".*name=([0-9A-Za-z]+).*", uri)
        elif netloc == "matrixdb.univ-lyon1.fr":
            molecule_class = utils.extract_regex_match(
                ".*type=([a-zA-Z]+).*", uri
            ).lower()
            ontology = f"matrixdb_{molecule_class}"
            identifier = utils.extract_regex_match(".*value=([0-9A-Za-z]+).*", uri)
        else:
            raise NotImplementedError(
                f"{netloc} in the {uri} url has not been associated with a known ontology"
            )
    except TypeError:
        # NOTE(review): this presumably catches utils.extract_regex_search /
        # extract_regex_match failing to find a match — confirm against utils
        logger.warning(
            f"An identifier could not be found using the specified regex for {uri} based on the {ontology} ontology"
        )
        logger.warning(result)
        logger.warning("ERROR")
        # NOTE(review): sys.exit(1) aborts the host process from library code;
        # consider raising an exception instead
        sys.exit(1)

    # rename some entries

    if ontology == "ncbi_gene":
        ontology = "ncbi_entrez_gene"

    id_dict = {"ontology": ontology, "identifier": identifier, "url": uri}

    return id_dict
|
375
|
+
|
376
|
+
|
377
|
+
def parse_ensembl_id(input_str: str) -> tuple[str, str, str]:
    """
    Parse Ensembl ID

    Extract the molecule type and species name from a string containing an ensembl identifier.

    Args:
        input_str (str):
            A string containing an ensembl gene, transcript, or protein identifier

    Returns:
        identifier (str):
            The substring matching the full identifier
        molecule_type (str):
            The ontology the identifier belongs to:
            - G -> ensembl_gene
            - T -> ensembl_transcript
            - P -> ensembl_protein
        species (str):
            The species name the identifier belongs to

    Raises:
        ValueError: if input_str does not contain a recognizable ensembl
            identifier, or its species/molecule-type code is unknown
    """

    # validate that input is an ensembl ID
    if not re.search("ENS[GTP][0-9]+", input_str) and not re.search(
        "ENS[A-Z]{3}[GTP][0-9]+", input_str
    ):
        # BUGFIX: this ValueError was previously constructed but never raised,
        # allowing invalid input to fall through to confusing downstream errors
        raise ValueError(
            f"{input_str} did not match the expected formats of an ensembl identifier: "
            "ENS[GTP][0-9]+ or ENS[A-Z]{3}[GTP][0-9]+"
        )

    # extract the species code (three letters after ENS if non-human)
    species_code_search = re.compile("ENS([A-Z]{3})?[GTP]").search(input_str)

    if species_code_search.group(1) is None:  # type:ignore
        # no species code -> human
        species = "Homo sapiens"
        molecule_type_regex = "ENS([GTP])"
        id_regex = "ENS[GTP][0-9]+"
    else:
        species_code = species_code_search.group(1)  # type:ignore

        if species_code not in ENSEMBL_SPECIES_FROM_CODE.keys():
            raise ValueError(
                f"The species code for {input_str}: {species_code} did not "
                "match any of the entries in ENSEMBL_SPECIES_CODE_LOOKUPS."
            )

        species = ENSEMBL_SPECIES_FROM_CODE[species_code]
        molecule_type_regex = "ENS[A-Z]{3}([GTP])"
        id_regex = "ENS[A-Z]{3}[GTP][0-9]+"

    # extract the molecule type (genes, transcripts or proteins)
    molecule_type_code_search = re.compile(molecule_type_regex).search(input_str)
    if not molecule_type_code_search:
        raise ValueError(
            # BUGFIX: this message previously lacked the f-prefix, so the
            # offending input was never interpolated
            f"The ensembl molecule code (i.e., G, T or P) could not be extracted from {input_str}"
        )
    else:
        molecule_type_code = molecule_type_code_search.group(1)  # type: str

    if molecule_type_code not in ENSEMBL_MOLECULE_TYPES_TO_ONTOLOGY.keys():
        raise ValueError(
            f"The molecule type code for {input_str}: {molecule_type_code} did not "
            "match ensembl genes (G), transcripts (T), or proteins (P)."
        )

    molecule_type = ENSEMBL_MOLECULE_TYPES_TO_ONTOLOGY[molecule_type_code]  # type: str

    identifier = utils.extract_regex_search(id_regex, input_str)  # type: str

    return identifier, molecule_type, species
|
449
|
+
|
450
|
+
|
451
|
+
def format_uri_url_identifiers_dot_org(split_path: list[str]):
    """Parse identifiers.org identifiers

    The identifiers.org identifier have two different formats:
    1. http://identifiers.org/<ontology>/<id>
    2. http://identifiers.org/<ontology>:<id>

    Currently we are identifying the newer format 2. by
    looking for the `:` in the second element of the split path.

    Also the ontology is converted to lower case letters.

    Args:
        split_path (list[str]): split url path

    Returns:
        tuple[str, str]: ontology, identifier

    Raises:
        ValueError: if a format-2 path contains more than one ":"
    """

    # identify old versions without `:`
    V2_SEPARATOR = ":"
    if V2_SEPARATOR in split_path[1]:
        # identifiers.org switched to format <ontology>:<id>
        path = "/".join(split_path[1:])
        if path.count(V2_SEPARATOR) != 1:
            raise ValueError(
                "The assumption is that there is only one ':'"
                f"in an identifiers.org url. Found more in: {path}"
            )
        ontology, identifier = path.split(":")
        ontology = ontology.lower()
        if ontology == "chebi":
            # chebi identifiers are normalized to their numeric portion
            identifier = utils.extract_regex_search("[0-9]+$", split_path[-1])
        # BUGFIX: return here so the legacy-format logic below cannot clobber
        # the identifier parsed from <ontology>:<id> (previously a non-chebi
        # v2 path returned "" as the identifier)
        return ontology, identifier

    # legacy format <ontology>/<id>
    ontology = split_path[1]
    if ontology == "chebi":
        identifier = utils.extract_regex_search("[0-9]+$", split_path[-1])
    elif len(split_path) != 3:
        # ids with internal slashes (e.g. DOIs) span multiple path segments
        identifier = "/".join(split_path[2:])
    else:
        identifier = split_path[-1]

    return ontology, identifier
|
497
|
+
|
498
|
+
|
499
|
+
def cv_to_Identifiers(entity):
    """
    Convert an SBML controlled vocabulary element into a cpr Identifiers object.

    Parameters:
        entity: libsbml.Species
            An entity (species, reaction, compartment, ...) with attached CV terms

    Returns:
        An Identifiers object built from the entity's biological annotations

    """

    # TO DO: add qualifier type http://sbml.org/Software/libSBML/5.18.0/docs/python-api/classlibsbml_1_1_c_v_term.html#a6a613cc17c6f853cf1c68da59286b373

    collected = []
    for term in entity.getCVTerms():
        if term.getQualifierType() != libsbml.BIOLOGICAL_QUALIFIER:
            # only care about biological annotations
            continue

        qualifier_code = BIOLOGICAL_QUALIFIER_CODES[term.getBiologicalQualifierType()]
        for resource_idx in range(term.getNumResources()):
            try:
                collected.append(
                    format_uri(term.getResourceURI(resource_idx), qualifier_code)
                )
            except NotImplementedError:
                logger.warning("Not all identifiers resolved: ", exc_info=True)

    return Identifiers(collected)
|
534
|
+
|
535
|
+
|
536
|
+
def create_uri_url(ontology: str, identifier: str, strict: bool = True) -> str:
    """
    Create URI URL

    Convert from an identifier and ontology to a URL reference for the identifier

    Parameters:
        ontology (str): An ontology for organizing genes, metabolites, etc.
        identifier (str): A systematic identifier from the "ontology" ontology.
        strict (bool): if strict then throw errors for invalid IDs otherwise return None

    Returns:
        url (str): A url representing a unique identifier

    Raises:
        TypeError: if arguments have the wrong types, or (when strict) the
            identifier fails the ontology's validation regex
        NotImplementedError: if no identifier -> url logic exists for the ontology
    """

    # check input types
    if not isinstance(ontology, str):
        raise TypeError(f"ontology was an {type(ontology).__name__} and must be a str")
    if not isinstance(identifier, str):
        raise TypeError(
            f"identifier was an {type(identifier).__name__} and must be a str"
        )
    if not isinstance(strict, bool):
        raise TypeError(f"strict was an {type(strict).__name__} and must be a bool")

    # default to no id_regex
    id_regex = None

    if ontology in ["ensembl_gene", "ensembl_transcript", "ensembl_protein"]:
        id_regex, url = ensembl_id_to_url_regex(identifier, ontology)
    elif ontology == "bigg.metabolite":
        url = f"http://identifiers.org/bigg.metabolite/{identifier}"
    elif ontology == "chebi":
        id_regex = "^[0-9]+$"
        url = f"http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:{identifier}"
    elif ontology == "ec-code":
        id_regex = "^[0-9]+\\.[0-9]+\\.[0-9]+(\\.[0-9]+)?$"
        url = f"https://identifiers.org/ec-code/{identifier}"
    elif ontology == "envipath":
        url = f"http://identifiers.org/envipath/{identifier}"
    elif ontology == "go":
        id_regex = "^GO:[0-9]{7}$"
        url = f"https://www.ebi.ac.uk/QuickGO/term/{identifier}"
    elif ontology == "ncbi_entrez_gene":
        # BUGFIX: a second, unreachable "ncbi_entrez_gene" branch carried this
        # validation regex; the duplicate was removed and its regex kept here
        id_regex = "^[0-9]+$"
        url = f"https://www.ncbi.nlm.nih.gov/gene/{identifier}"
    elif ontology == "ncbi_entrez_pccompound":
        id_regex = "^[A-Z]{14}\\-[A-Z]{10}\\-[A-Z]{1}$"
        url = f"http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=search&db=pccompound&term={identifier}"
    elif ontology == "pubchem":
        id_regex = "^[0-9]+$"
        url = f"http://pubchem.ncbi.nlm.nih.gov/compound/{identifier}"
    elif ontology == "pubmed":
        id_regex = "^[0-9]+$"
        url = f"http://www.ncbi.nlm.nih.gov/pubmed/{identifier}"
    elif ontology == "reactome":
        id_regex = "^R\\-[A-Z]{3}\\-[0-9]{7}$"
        url = f"https://reactome.org/content/detail/{identifier}"
    elif ontology == "uniprot":
        id_regex = "^[A-Z0-9]+$"
        url = f"https://purl.uniprot.org/uniprot/{identifier}"
    elif ontology == "sgc":
        id_regex = "^[0-9A-Z]+$"
        url = f"https://www.thesgc.org/structures/structure_description/{identifier}/"
    elif ontology == "mdpi":
        id_regex = None
        url = f"https://www.mdpi.com/{identifier}"
    elif ontology == "mirbase":
        id_regex = None
        # mature mirnas (MIMAT...) and hairpins (MI...) live on different pages
        if re.match("MIMAT[0-9]", identifier):
            url = f"https://www.mirbase.org/mature/{identifier}"
        elif re.match("MI[0-9]", identifier):
            url = f"https://www.mirbase.org/hairpin/{identifier}"
        else:
            raise NotImplementedError(f"url not defined for this MiRBase {identifier}")
    elif ontology == "rnacentral":
        id_regex = None
        url = f"https://rnacentral.org/rna/{identifier}"
    elif ontology == "chemspider":
        id_regex = "^[0-9]+$"
        url = f"https://www.chemspider.com/{identifier}"
    elif ontology == "dx_doi":
        id_regex = r"^[0-9]+\.[0-9]+$"
        url = f"https://dx.doi.org/{identifier}"
    elif ontology == "doi":
        id_regex = None
        url = f"https://doi.org/{identifier}"
    elif ontology == "ncbi_books":
        id_regex = "^[0-9A-Z]+$"
        url = f"http://www.ncbi.nlm.nih.gov/books/{identifier}/"
    elif ontology == "phosphosite":
        id_regex = "^[0-9]+$"
        url = f"https://www.phosphosite.org/siteAction.action?id={identifier}"
    elif ontology == "NCI_Thesaurus":
        id_regex = "^[A-Z][0-9]+$"
        url = f"https://ncithesaurus.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code={identifier}"
    elif ontology == "matrixdb_biomolecule":
        id_regex = "^[0-9A-Za-z]+$"
        url = f"http://matrixdb.univ-lyon1.fr/cgi-bin/current/newPort?type=biomolecule&value={identifier}"
    else:
        raise NotImplementedError(
            f"No identifier -> url logic exists for the {ontology} ontology in create_uri_url()"
        )

    # validate identifier with regex if one exists
    if id_regex is not None:
        if re.search(id_regex, identifier) is None:
            failure_msg = f"{identifier} is not a valid {ontology} id, it did not match the regex: {id_regex}"
            if strict:
                raise TypeError(failure_msg)
            else:
                # use the module logger rather than printing to stdout
                logger.warning(failure_msg + " returning None")
                return None

    return url
|
657
|
+
|
658
|
+
|
659
|
+
def ensembl_id_to_url_regex(identifier: str, ontology: str) -> tuple[str, str]:
    """
    Ensembl ID to URL and Regex

    Map an ensembl ID to a validation regex and its canonical url on ensembl

    Args:
        identifier: str
            A standard identifier from ensembl genes, transcripts, or proteins
        ontology: str
            The standard ontology (ensembl_gene, ensembl_transcript, or ensembl_protein)

    Returns:
        id_regex: a regex which should match a valid entry in this ontology
        url: the id's url on ensembl

    Raises:
        ValueError: if ontology is not one of the three ensembl ontologies
    """

    # extract the species name from the 3 letter species code in the id
    # (these letters are not present for humans)
    identifier, implied_ontology, species = parse_ensembl_id(identifier)  # type: ignore
    assert implied_ontology == ontology

    # create an appropriate regex for validating input
    # this provides testing for other identifiers even if it is redundant with other
    # validation of ensembl ids

    if species == "Homo sapiens":
        species_code = ""
    else:
        species_code = ENSEMBL_SPECIES_TO_CODE[species]
    molecule_type_code = ENSEMBL_MOLECULE_TYPES_FROM_ONTOLOGY[ontology]

    id_regex = "ENS" + species_code + molecule_type_code + "[0-9]{11}"

    # convert to species format in ensembl urls
    species_url_field = re.sub(" ", "_", species)

    if ontology == "ensembl_gene":
        url = f"http://www.ensembl.org/{species_url_field}/geneview?gene={identifier}"
    elif ontology == "ensembl_transcript":
        url = f"http://www.ensembl.org/{species_url_field}/Transcript?t={identifier}"
    elif ontology == "ensembl_protein":
        url = f"https://www.ensembl.org/{species_url_field}/Transcript/ProteinSummary?t={identifier}"
    else:
        # BUGFIX: this ValueError was previously constructed but never raised,
        # which let an unbound `url` escape as a NameError at the return below
        raise ValueError(f"{ontology} not defined")

    return id_regex, url
|
706
|
+
|
707
|
+
|
708
|
+
def check_reactome_identifier_compatibility(
    reactome_series_a: pd.Series,
    reactome_series_b: pd.Series,
) -> None:
    """
    Check Reactome Identifier Compatibility

    Determine whether two sets of Reactome identifiers are from the same species.

    Args:
        reactome_series_a: pd.Series
            a Series containing Reactome identifiers
        reactome_series_b: pd.Series
            a Series containing Reactome identifiers

    Returns:
        None

    """

    species_a, count_a = _infer_primary_reactome_species(reactome_series_a)
    species_b, count_b = _infer_primary_reactome_species(reactome_series_b)

    # matching species -> nothing to report
    if species_a == species_b:
        return None

    name_a = "unnamed" if reactome_series_a.name is None else reactome_series_a.name
    name_b = "unnamed" if reactome_series_b.name is None else reactome_series_b.name

    raise ValueError(
        "The two provided pd.Series containing Reactome identifiers appear to be from different species. "
        f"The pd.Series named {name_a} appears to be {species_a} with {count_a} examples of this code. "
        f"The pd.Series named {name_b} appears to be {species_b} with {count_b} examples of this code."
    )
|
747
|
+
|
748
|
+
|
749
|
+
def _infer_primary_reactome_species(reactome_series: pd.Series) -> tuple[str, int]:
    """Infer the best supported species based on a set of Reactome identifiers"""

    species_counts = _count_reactome_species(reactome_series)

    # "ALL" is a cross-species tag and should not vote for a primary species
    if "ALL" in species_counts.index:
        species_counts = species_counts.drop("ALL", axis=0)

    top_species = species_counts.index[0]
    top_count = species_counts.iloc[0]
    return top_species, top_count
|
758
|
+
|
759
|
+
|
760
|
+
def _count_reactome_species(reactome_series: pd.Series) -> pd.Series:
    """Count the number of species tags in a set of reactome IDs"""

    # deduplicate first so each distinct ID contributes a single species vote
    unique_ids = reactome_series.drop_duplicates()
    return unique_ids.transform(_reactome_id_species).value_counts()
|
766
|
+
|
767
|
+
|
768
|
+
def _reactome_id_species(reactome_id: str) -> str:
|
769
|
+
"""Extract the species code from a Reactome ID"""
|
770
|
+
|
771
|
+
reactome_match = re.match("^R\\-([A-Z]{3})\\-[0-9]+", reactome_id)
|
772
|
+
if reactome_match:
|
773
|
+
try:
|
774
|
+
value = reactome_match[1]
|
775
|
+
except ValueError:
|
776
|
+
raise ValueError(f"{reactome_id} is not a valid reactome ID")
|
777
|
+
else:
|
778
|
+
raise ValueError(f"{reactome_id} is not a valid reactome ID")
|
779
|
+
|
780
|
+
return value
|
781
|
+
|
782
|
+
|
783
|
+
def _format_Identifiers_pubmed(pubmed_id: str) -> Identifiers:
    """
    Format Identifiers for a single PubMed ID.

    These will generally be used in an r_Identifiers field.
    """

    # build the canonical pubmed url (this also validates the pubmed id)
    pubmed_url = create_uri_url(ontology="pubmed", identifier=pubmed_id, strict=False)
    entry = format_uri(uri=pubmed_url, biological_qualifier_type="BQB_IS_DESCRIBED_BY")

    return Identifiers([entry])
|
795
|
+
|
796
|
+
|
797
|
+
class _IdentifierValidator(BaseModel):
    """Pydantic schema for a single identifier entry in an Identifiers id_list."""

    # ontology and identifier are required; url and bqb are optional metadata
    ontology: str
    identifier: str
    url: Optional[str] = None
    bqb: Optional[str] = None
|
802
|
+
|
803
|
+
|
804
|
+
class _IdentifiersValidator(BaseModel):
    """Pydantic schema for the full id_list passed to Identifiers.__init__."""

    id_list: list[_IdentifierValidator]
|