napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
napistu/identifiers.py ADDED
@@ -0,0 +1,805 @@
+ from __future__ import annotations
+
+ import itertools
+ import logging
+ import re
+ import sys
+ from typing import Optional
+ from urllib.parse import urlparse
+
+ import libsbml
+ import pandas as pd
+ from napistu import utils
+ from pydantic import BaseModel
+
+ from napistu.constants import IDENTIFIERS
+ from napistu.constants import BIOLOGICAL_QUALIFIER_CODES
+ from napistu.constants import ENSEMBL_MOLECULE_TYPES_TO_ONTOLOGY
+ from napistu.constants import ENSEMBL_MOLECULE_TYPES_FROM_ONTOLOGY
+ from napistu.constants import ENSEMBL_SPECIES_FROM_CODE
+ from napistu.constants import ENSEMBL_SPECIES_TO_CODE
+
+ logger = logging.getLogger(__name__)
+
+
+ class Identifiers:
+     """
+     Identifiers for a single entity or relationship.
+
+     Attributes
+     ----------
+     ids : list
+         a list of identifiers which are each a dict containing an ontology and identifier
+     verbose : bool
+         extra reporting, defaults to False
+
+     Methods
+     -------
+     print
+         Print a table of identifiers
+     filter(ontologies, summarize)
+         Returns a bool of whether 1+ of the ontologies was represented
+     hoist(ontology)
+         Returns value(s) from an ontology
+
+     """
+
+     def __init__(self, id_list: list, verbose: bool = False) -> None:
+         """
+         Tracks a set of identifiers and the ontologies they belong to.
+
+         Parameters
+         ----------
+         id_list : list
+             a list of identifier dictionaries containing ontology, identifier, and optionally url
+
+         Returns
+         -------
+         None.
+
+         """
+
+         # read list and validate format
+         validated_id_list = _IdentifiersValidator(id_list=id_list).model_dump()[
+             "id_list"
+         ]
+
+         if (len(id_list) == 0) and verbose:
+             logger.debug('zero identifiers in "id_list"')
+
+         if len(id_list) != 0:
+             # de-duplicate {identifier, ontology} tuples
+
+             coded_ids = [
+                 x[IDENTIFIERS.ONTOLOGY] + "_" + x[IDENTIFIERS.IDENTIFIER]
+                 for x in validated_id_list
+             ]
+             unique_cids = []
+             unique_cid_indices = []
+             i = 0
+             for cid in coded_ids:
+                 if cid not in unique_cids:
+                     unique_cids.append(cid)
+                     unique_cid_indices.append(i)
+                 i += 1
+             validated_id_list = [validated_id_list[i] for i in unique_cid_indices]
+
+         self.ids = validated_id_list
+
+     def print(self):
+         """Print a table of identifiers"""
+
+         utils.style_df(pd.DataFrame(self.ids), hide_index=True)
+
+     def filter(self, ontologies, summarize=True):
+         """Returns a bool of whether 1+ of the ontologies was represented"""
+
+         if isinstance(ontologies, str):
+             ontologies = [ontologies]
+
+         # filter based on whether any ontology of interest is present
+         # identifier_matches = [x['ontology'] == y for x in self.ids for y in ontologies]
+
+         identifier_matches = []
+         for an_id in self.ids:
+             identifier_matches.append(
+                 any([an_id[IDENTIFIERS.ONTOLOGY] == y for y in ontologies])
+             )
+
+         if summarize:
+             return any(identifier_matches)
+         else:
+             return identifier_matches
+
+     def hoist(self, ontology: str, squeeze: bool = True) -> str | list[str] | None:
+         """Returns value(s) from an ontology
+
+         Args:
+             ontology (str): the ontology of interest
+             squeeze (bool): if True, return a single value if possible
+
+         Returns:
+             str or list: the value(s) of an ontology of interest
+
+         """
+
+         if not isinstance(ontology, str):
+             raise TypeError(f"{ontology} must be a str")
+
+         # return the value(s) of an ontology of interest
+         ontology_matches = [
+             x for x, y in zip(self.ids, self.filter(ontology, summarize=False)) if y
+         ]
+         ontology_ids = [x[IDENTIFIERS.IDENTIFIER] for x in ontology_matches]
+
+         if squeeze:
+             if len(ontology_ids) == 0:
+                 return None
+             elif len(ontology_ids) == 1:
+                 return ontology_ids[0]
+         return ontology_ids
+
+
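As a quick orientation to the class above, here is a minimal usage sketch. It assumes the IDENTIFIERS constants resolve to the literal keys "ontology", "identifier", "url", and "bqb" (consistent with the id_dict built in format_uri_url below); the accession numbers are purely illustrative.

    from napistu.identifiers import Identifiers

    # two hypothetical annotations for one molecular species
    ids = Identifiers(
        [
            {"ontology": "uniprot", "identifier": "P04637", "bqb": "BQB_IS"},
            {"ontology": "chebi", "identifier": "15377", "bqb": "BQB_IS"},
        ]
    )

    ids.filter("uniprot")      # True - at least one uniprot entry is present
    ids.hoist("chebi")         # "15377" (squeezed to a single value)
    ids.hoist("ensembl_gene")  # None - no identifiers from that ontology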
+ def merge_identifiers(identifier_series: pd.Series) -> Identifiers:
+     """
+     Aggregate Identifiers
+
+     Merge a pd.Series of Identifiers objects into a single Identifiers object
+
+     Args:
+         identifier_series: pd.Series
+             A pd.Series of identifiers.Identifiers objects
+
+     Returns:
+         An identifiers.Identifiers object
+
+     """
+
+     if len(identifier_series) == 1:
+         # if there is only a single entry then just return it because no merge is needed
+         return identifier_series.iloc[0]
+     else:
+         # merge a list of Identifiers objects into a single Identifiers object
+         # Identifiers will remove redundancy
+         merged_ids = list(
+             itertools.chain.from_iterable(identifier_series.map(lambda x: x.ids))
+         )
+         return Identifiers(merged_ids)
+
+
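A small sketch of how merge_identifiers collapses a pd.Series of Identifiers objects; the entries are illustrative, and de-duplication follows the {ontology, identifier} logic in Identifiers.__init__.

    import pandas as pd

    from napistu.identifiers import Identifiers, merge_identifiers

    a = Identifiers([{"ontology": "chebi", "identifier": "15377"}])
    b = Identifiers(
        [
            {"ontology": "chebi", "identifier": "15377"},
            {"ontology": "pubchem", "identifier": "962"},
        ]
    )

    merged = merge_identifiers(pd.Series([a, b]))
    len(merged.ids)  # 2 - the shared chebi entry appears only once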
+ def format_uri(uri: str, biological_qualifier_type: str | None = None) -> dict:
+     """
+     Convert an RDF URI into an identifier dict (ontology, identifier, url, bqb).
+     """
+
+     identifier = format_uri_url(uri)
+
+     if identifier is None:
+         raise NotImplementedError(f"{uri} is not a valid way of specifying a uri")
+
+     _validate_bqb(biological_qualifier_type)
+     identifier[IDENTIFIERS.BQB] = biological_qualifier_type
+
+     return identifier
+
+
+ def _validate_bqb(bqb):
+     if bqb is None:
+         logger.warning(
+             '"biological_qualifier_type" is None; consider adding a valid '
+             'BQB code. For a list of BQB codes see "BQB" in constants.py'
+         )
+     else:
+         if not isinstance(bqb, str):
+             raise TypeError(
+                 f"biological_qualifier_type was a {type(bqb)} and must be a str or None"
+             )
+
+         if not bqb.startswith("BQB"):
+             raise ValueError(
+                 f"The provided BQB code was {bqb} and all BQB codes start "
+                 'with "BQB". Please either use a valid BQB code (see '
+                 '"BQB" in constants.py) or use None'
+             )
+
+
+ def format_uri_url(uri: str) -> dict | None:
+     # check whether the uri is specified using a url
+     result = urlparse(uri)
+     if not all([result.scheme, result.netloc, result.path]):
+         return None
+
+     # valid url
+
+     netloc = result.netloc
+     split_path = result.path.split("/")
+
+     try:
+         if netloc == "identifiers.org":
+             ontology, identifier = format_uri_url_identifiers_dot_org(split_path)
+         elif netloc == "reactome.org":
+             ontology = "reactome"
+             identifier = split_path[-1]
+         # genes and gene products
+         elif netloc == "www.ensembl.org" and split_path[-1] == "geneview":
+             ontology = "ensembl_gene"
+             identifier, id_ontology, _ = parse_ensembl_id(result.query)  # type: ignore
+             assert ontology == id_ontology
+         elif netloc == "www.ensembl.org" and split_path[-1] in [
+             "transview",
+             "Transcript",
+         ]:
+             ontology = "ensembl_transcript"
+             identifier, id_ontology, _ = parse_ensembl_id(result.query)  # type: ignore
+             assert ontology == id_ontology
+         elif netloc == "www.ensembl.org" and split_path[-1] == "ProteinSummary":
+             ontology = "ensembl_protein"
+             identifier, id_ontology, _ = parse_ensembl_id(result.query)  # type: ignore
+             assert ontology == id_ontology
+         elif netloc == "www.ensembl.org" and (
+             re.search("ENS[GTP]", split_path[-1])
+             or re.search("ENS[A-Z]{3}[GTP]", split_path[-1])
+         ):
+             # format ensembl IDs which lack gene/transview
+             identifier, ontology, _ = parse_ensembl_id(split_path[-1])
+         elif netloc == "www.mirbase.org" or netloc == "mirbase.org":
+             ontology = "mirbase"
+             if re.search("MI[0-9]+", split_path[-1]):
+                 identifier = utils.extract_regex_search("MI[0-9]+", split_path[-1])
+             elif re.search("MIMAT[0-9]+", split_path[-1]):
+                 identifier = utils.extract_regex_search("MIMAT[0-9]+", split_path[-1])
+             elif re.search("MI[0-9]+", result.query):
+                 identifier = utils.extract_regex_search("MI[0-9]+", result.query)
+             elif re.search("MIMAT[0-9]+", result.query):
+                 identifier = utils.extract_regex_search("MIMAT[0-9]+", result.query)
+             else:
+                 raise TypeError(
+                     f"{result.query} does not appear to match MiRBase identifiers"
+                 )
+         elif netloc == "purl.uniprot.org":
+             ontology = "uniprot"
+             identifier = split_path[-1]
+         elif netloc == "rnacentral.org":
+             ontology = "rnacentral"
+             identifier = split_path[-1]
+         # chemicals
+         elif split_path[1] == "chebi":
+             ontology = "chebi"
+             identifier = utils.extract_regex_search("[0-9]+$", result.query)
+         elif netloc == "pubchem.ncbi.nlm.nih.gov":
+             ontology = "pubchem"
+             if result.query != "":
+                 identifier = utils.extract_regex_search("[0-9]+$", result.query)
+             else:
+                 identifier = utils.extract_regex_search("[0-9]+$", split_path[-1])
+         elif netloc == "www.genome.ad.jp":
+             ontology = "genome_net"
+             identifier = utils.extract_regex_search("[A-Za-z]+:[0-9]+$", uri)
+         elif (
+             netloc == "www.guidetopharmacology.org"
+             and split_path[-1] == "LigandDisplayForward"
+         ):
+             ontology = "grac"
+             identifier = utils.extract_regex_search("[0-9]+$", result.query)
+         elif netloc == "www.chemspider.com" or netloc == "chemspider.com":
+             ontology = "chemspider"
+             identifier = split_path[-1]
+         # reactions
+         elif split_path[1] == "ec-code":
+             ontology = "ec-code"
+             identifier = split_path[-1]
+         elif netloc == "www.rhea-db.org":
+             ontology = "rhea"
+             identifier = utils.extract_regex_search("[0-9]+$", result.query)
+         # misc
+         elif split_path[1] == "ols":
+             ontology = "ols"
+             identifier = split_path[-1]
+         elif split_path[1] == "QuickGO":
+             ontology = "go"
+             identifier = split_path[-1]
+         elif split_path[1] == "pubmed":
+             ontology = "pubmed"
+             identifier = split_path[-1]
+         # DNA sequences
+         elif netloc == "www.ncbi.nlm.nih.gov" and split_path[1] == "nuccore":
+             ontology = "ncbi_refseq"
+             identifier = split_path[-1]
+         elif netloc == "www.ncbi.nlm.nih.gov" and split_path[1] == "sites":
+             ontology = "ncbi_entrez_" + utils.extract_regex_search(
+                 "db=([A-Za-z0-9]+)\\&", result.query, 1
+             )
+             identifier = utils.extract_regex_search(
+                 r"term=([A-Za-z0-9\-]+)$", result.query, 1
+             )
+         elif netloc == "www.ebi.ac.uk" and split_path[1] == "ena":
+             ontology = "ebi_refseq"
+             identifier = split_path[-1]
+         elif netloc == "www.thesgc.org" and split_path[1] == "structures":
+             ontology = "sgc"
+             identifier = split_path[-2]
+         elif netloc == "www.mdpi.com":
+             ontology = "mdpi"
+             identifier = "/".join([i for i in split_path[1:] if i != ""])
+         elif netloc == "dx.doi.org":
+             ontology = "dx_doi"
+             identifier = "/".join(split_path[1:])
+         elif netloc == "doi.org":
+             ontology = "doi"
+             identifier = "/".join(split_path[1:])
+         elif netloc == "www.ncbi.nlm.nih.gov" and split_path[1] == "books":
+             ontology = "ncbi_books"
+             identifier = split_path[2]
+         elif netloc == "www.ncbi.nlm.nih.gov" and split_path[1] == "gene":
+             ontology = "ncbi_gene"
+             identifier = split_path[2]
+         elif netloc == "www.phosphosite.org":
+             ontology = "phosphosite"
+             identifier = utils.extract_regex_match(".*id=([0-9]+).*", uri)
+         elif netloc == "ncithesaurus.nci.nih.gov":
+             ontology = "NCI_Thesaurus"
+             identifier = utils.extract_regex_match(".*code=([0-9A-Z]+).*", uri)
+         elif netloc == "matrixdb.ibcp.fr":
+             molecule_class = utils.extract_regex_match(
+                 ".*class=([a-zA-Z]+).*", uri
+             ).lower()
+             ontology = f"matrixdb_{molecule_class}"
+             identifier = utils.extract_regex_match(".*name=([0-9A-Za-z]+).*", uri)
+         elif netloc == "matrixdb.univ-lyon1.fr":
+             molecule_class = utils.extract_regex_match(
+                 ".*type=([a-zA-Z]+).*", uri
+             ).lower()
+             ontology = f"matrixdb_{molecule_class}"
+             identifier = utils.extract_regex_match(".*value=([0-9A-Za-z]+).*", uri)
+         else:
+             raise NotImplementedError(
+                 f"{netloc} in the {uri} url has not been associated with a known ontology"
+             )
+     except TypeError:
+         logger.warning(
+             f"An identifier could not be found using the specified regex for {uri} based on the {ontology} ontology"
+         )
+         logger.warning(result)
+         logger.warning("ERROR")
+         sys.exit(1)
+
+     # rename some entries
+
+     if ontology == "ncbi_gene":
+         ontology = "ncbi_entrez_gene"
+
+     id_dict = {"ontology": ontology, "identifier": identifier, "url": uri}
+
+     return id_dict
+
+
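For orientation, a sketch of the expected behavior of format_uri / format_uri_url on an identifiers.org URL. It assumes utils.extract_regex_search returns the matched substring and that IDENTIFIERS.BQB is the literal key "bqb"; the ChEBI accession is only an example.

    from napistu.identifiers import format_uri

    format_uri(
        "http://identifiers.org/chebi/CHEBI:15377",
        biological_qualifier_type="BQB_IS",
    )
    # expected shape:
    # {"ontology": "chebi", "identifier": "15377",
    #  "url": "http://identifiers.org/chebi/CHEBI:15377", "bqb": "BQB_IS"}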
+ def parse_ensembl_id(input_str: str) -> tuple[str, str, str]:
+     """
+     Parse Ensembl ID
+
+     Extract the molecule type and species name from a string containing an ensembl identifier.
+
+     Args:
+         input_str (str):
+             A string containing an ensembl gene, transcript, or protein identifier
+
+     Returns:
+         identifier (str):
+             The substring matching the full identifier
+         molecule_type (str):
+             The ontology the identifier belongs to:
+             - G -> ensembl_gene
+             - T -> ensembl_transcript
+             - P -> ensembl_protein
+         species (str):
+             The species name the identifier belongs to
+
+     """
+
+     # validate that input is an ensembl ID
+     if not re.search("ENS[GTP][0-9]+", input_str) and not re.search(
+         "ENS[A-Z]{3}[GTP][0-9]+", input_str
+     ):
+         raise ValueError(
+             f"{input_str} did not match the expected formats of an ensembl identifier: "
+             "ENS[GTP][0-9]+ or ENS[A-Z]{3}[GTP][0-9]+"
+         )
+
+     # extract the species code (three letters after ENS if non-human)
+     species_code_search = re.compile("ENS([A-Z]{3})?[GTP]").search(input_str)
+
+     if species_code_search.group(1) is None:  # type:ignore
+         species = "Homo sapiens"
+         molecule_type_regex = "ENS([GTP])"
+         id_regex = "ENS[GTP][0-9]+"
+     else:
+         species_code = species_code_search.group(1)  # type:ignore
+
+         if species_code not in ENSEMBL_SPECIES_FROM_CODE.keys():
+             raise ValueError(
+                 f"The species code for {input_str}: {species_code} did not "
+                 "match any of the entries in ENSEMBL_SPECIES_FROM_CODE."
+             )
+
+         species = ENSEMBL_SPECIES_FROM_CODE[species_code]
+         molecule_type_regex = "ENS[A-Z]{3}([GTP])"
+         id_regex = "ENS[A-Z]{3}[GTP][0-9]+"
+
+     # extract the molecule type (genes, transcripts or proteins)
+     molecule_type_code_search = re.compile(molecule_type_regex).search(input_str)
+     if not molecule_type_code_search:
+         raise ValueError(
+             f"The ensembl molecule code (i.e., G, T or P) could not be extracted from {input_str}"
+         )
+     else:
+         molecule_type_code = molecule_type_code_search.group(1)  # type: str
+
+     if molecule_type_code not in ENSEMBL_MOLECULE_TYPES_TO_ONTOLOGY.keys():
+         raise ValueError(
+             f"The molecule type code for {input_str}: {molecule_type_code} did not "
+             "match ensembl genes (G), transcripts (T), or proteins (P)."
+         )
+
+     molecule_type = ENSEMBL_MOLECULE_TYPES_TO_ONTOLOGY[molecule_type_code]  # type: str
+
+     identifier = utils.extract_regex_search(id_regex, input_str)  # type: str
+
+     return identifier, molecule_type, species
+
+
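A sketch of parse_ensembl_id on a human and a non-human ID. The mouse case assumes "MUS" is a key of ENSEMBL_SPECIES_FROM_CODE mapping to "Mus musculus"; the accessions themselves are only illustrative.

    from napistu.identifiers import parse_ensembl_id

    parse_ensembl_id("ENSG00000139618")
    # -> ("ENSG00000139618", "ensembl_gene", "Homo sapiens")

    parse_ensembl_id("ENSMUST00000000001")
    # -> ("ENSMUST00000000001", "ensembl_transcript", "Mus musculus")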
+ def format_uri_url_identifiers_dot_org(split_path: list[str]):
+     """Parse identifiers.org identifiers
+
+     identifiers.org identifiers have two different formats:
+     1. http://identifiers.org/<ontology>/<id>
+     2. http://identifiers.org/<ontology>:<id>
+
+     The newer format (2) is identified by looking for a ":" in the
+     second element of the split path.
+
+     Also the ontology is converted to lower case letters.
+
+     Args:
+         split_path (list[str]): split url path
+
+     Returns:
+         tuple[str, str]: ontology, identifier
+     """
+
+     # formatting for the identifiers.org meta ontology
+
+     # identify the newer format which uses a ":" separator
+     V2_SEPARATOR = ":"
+     if V2_SEPARATOR in split_path[1]:
+         # identifiers.org switched to format <ontology>:<id>
+         path = "/".join(split_path[1:])
+         if path.count(V2_SEPARATOR) != 1:
+             raise ValueError(
+                 "The assumption is that there is only one ':' "
+                 f"in an identifiers.org url. Found more in: {path}"
+             )
+         ontology, identifier = path.split(":")
+         ontology = ontology.lower()
+     else:
+         ontology = split_path[1]
+
+         if ontology in ["chebi"]:
+             identifier = utils.extract_regex_search("[0-9]+$", split_path[-1])
+         elif len(split_path) != 3:
+             identifier = "/".join(split_path[2:])
+         else:
+             identifier = split_path[-1]
+
+     return ontology, identifier
+
+
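To illustrate the two identifiers.org layouts handled above, a short sketch (the UniProt accession is arbitrary):

    from urllib.parse import urlparse

    from napistu.identifiers import format_uri_url_identifiers_dot_org

    old_style = urlparse("http://identifiers.org/uniprot/P12345").path.split("/")
    new_style = urlparse("http://identifiers.org/uniprot:P12345").path.split("/")

    format_uri_url_identifiers_dot_org(old_style)  # ("uniprot", "P12345")
    format_uri_url_identifiers_dot_org(new_style)  # ("uniprot", "P12345")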
+ def cv_to_Identifiers(entity):
+     """
+     Convert an SBML controlled vocabulary element into a cpr Identifiers object.
+
+     Parameters:
+         entity: libsbml.Species
+             An entity (species, reaction, compartment, ...) with attached CV terms
+
+     Returns:
+         An Identifiers object containing the entity's biological annotations
+
+     """
+
+     # TO DO: add qualifier type http://sbml.org/Software/libSBML/5.18.0/docs/python-api/classlibsbml_1_1_c_v_term.html#a6a613cc17c6f853cf1c68da59286b373
+
+     cv_list = list()
+     for cv in entity.getCVTerms():
+         if cv.getQualifierType() != libsbml.BIOLOGICAL_QUALIFIER:
+             # only care about biological annotations
+             continue
+
+         biological_qualifier_type = BIOLOGICAL_QUALIFIER_CODES[
+             cv.getBiologicalQualifierType()
+         ]
+         out_list = list()
+         for i in range(cv.getNumResources()):
+             try:
+                 out_list.append(
+                     format_uri(cv.getResourceURI(i), biological_qualifier_type)
+                 )
+             except NotImplementedError:
+                 logger.warning("Not all identifiers resolved: ", exc_info=True)
+
+         cv_list.extend(out_list)
+     return Identifiers(cv_list)
+
+
+ def create_uri_url(ontology: str, identifier: str, strict: bool = True) -> str | None:
+     """
+     Create URI URL
+
+     Convert from an identifier and ontology to a URL reference for the identifier
+
+     Parameters:
+         ontology (str): An ontology for organizing genes, metabolites, etc.
+         identifier (str): A systematic identifier from the "ontology" ontology.
+         strict (bool): if True, raise errors for invalid IDs; otherwise return None
+
+     Returns:
+         url (str): A url representing a unique identifier
+
+     """
+
+     # check input types
+     if not isinstance(ontology, str):
+         raise TypeError(f"ontology was an {type(ontology).__name__} and must be a str")
+     if not isinstance(identifier, str):
+         raise TypeError(
+             f"identifier was an {type(identifier).__name__} and must be a str"
+         )
+     if not isinstance(strict, bool):
+         raise TypeError(f"strict was an {type(strict).__name__} and must be a bool")
+
+     # default to no id_regex
+     id_regex = None
+
+     if ontology in ["ensembl_gene", "ensembl_transcript", "ensembl_protein"]:
+         id_regex, url = ensembl_id_to_url_regex(identifier, ontology)
+     elif ontology == "bigg.metabolite":
+         url = f"http://identifiers.org/bigg.metabolite/{identifier}"
+     elif ontology == "chebi":
+         id_regex = "^[0-9]+$"
+         url = f"http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:{identifier}"
+     elif ontology == "ec-code":
+         id_regex = "^[0-9]+\\.[0-9]+\\.[0-9]+(\\.[0-9]+)?$"
+         url = f"https://identifiers.org/ec-code/{identifier}"
+     elif ontology == "envipath":
+         url = f"http://identifiers.org/envipath/{identifier}"
+     elif ontology == "go":
+         id_regex = "^GO:[0-9]{7}$"
+         url = f"https://www.ebi.ac.uk/QuickGO/term/{identifier}"
+     elif ontology == "ncbi_entrez_gene":
+         id_regex = "^[0-9]+$"
+         url = f"https://www.ncbi.nlm.nih.gov/gene/{identifier}"
+     elif ontology == "ncbi_entrez_pccompound":
+         id_regex = "^[A-Z]{14}\\-[A-Z]{10}\\-[A-Z]{1}$"
+         url = f"http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=search&db=pccompound&term={identifier}"
+     elif ontology == "pubchem":
+         id_regex = "^[0-9]+$"
+         url = f"http://pubchem.ncbi.nlm.nih.gov/compound/{identifier}"
+     elif ontology == "pubmed":
+         id_regex = "^[0-9]+$"
+         url = f"http://www.ncbi.nlm.nih.gov/pubmed/{identifier}"
+     elif ontology == "reactome":
+         id_regex = "^R\\-[A-Z]{3}\\-[0-9]{7}$"
+         url = f"https://reactome.org/content/detail/{identifier}"
+     elif ontology == "uniprot":
+         id_regex = "^[A-Z0-9]+$"
+         url = f"https://purl.uniprot.org/uniprot/{identifier}"
+     elif ontology == "sgc":
+         id_regex = "^[0-9A-Z]+$"
+         url = f"https://www.thesgc.org/structures/structure_description/{identifier}/"
+     elif ontology == "mdpi":
+         id_regex = None
+         url = f"https://www.mdpi.com/{identifier}"
+     elif ontology == "mirbase":
+         id_regex = None
+         if re.match("MIMAT[0-9]", identifier):
+             url = f"https://www.mirbase.org/mature/{identifier}"
+         elif re.match("MI[0-9]", identifier):
+             url = f"https://www.mirbase.org/hairpin/{identifier}"
+         else:
+             raise NotImplementedError(f"url not defined for this MiRBase {identifier}")
+     elif ontology == "rnacentral":
+         id_regex = None
+         url = f"https://rnacentral.org/rna/{identifier}"
+     elif ontology == "chemspider":
+         id_regex = "^[0-9]+$"
+         url = f"https://www.chemspider.com/{identifier}"
+     elif ontology == "dx_doi":
+         id_regex = r"^[0-9]+\.[0-9]+$"
+         url = f"https://dx.doi.org/{identifier}"
+     elif ontology == "doi":
+         id_regex = None
+         url = f"https://doi.org/{identifier}"
+     elif ontology == "ncbi_books":
+         id_regex = "^[0-9A-Z]+$"
+         url = f"http://www.ncbi.nlm.nih.gov/books/{identifier}/"
+     elif ontology == "phosphosite":
+         id_regex = "^[0-9]+$"
+         url = f"https://www.phosphosite.org/siteAction.action?id={identifier}"
+     elif ontology == "NCI_Thesaurus":
+         id_regex = "^[A-Z][0-9]+$"
+         url = f"https://ncithesaurus.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code={identifier}"
+     elif ontology == "matrixdb_biomolecule":
+         id_regex = "^[0-9A-Za-z]+$"
+         url = f"http://matrixdb.univ-lyon1.fr/cgi-bin/current/newPort?type=biomolecule&value={identifier}"
+     else:
+         raise NotImplementedError(
+             f"No identifier -> url logic exists for the {ontology} ontology in create_uri_url()"
+         )
+
+     # validate identifier with regex if one exists
+     if id_regex is not None:
+         if re.search(id_regex, identifier) is None:
+             failure_msg = f"{identifier} is not a valid {ontology} id, it did not match the regex: {id_regex}"
+             if strict:
+                 raise TypeError(failure_msg)
+             else:
+                 logger.warning(failure_msg + "; returning None")
+                 return None
+
+     return url
+
+
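A round-trip sketch pairing create_uri_url with format_uri; the Reactome accession is illustrative, and the strict=False call shows the None fallback for an ID that fails the regex check.

    from napistu.identifiers import create_uri_url, format_uri

    url = create_uri_url(ontology="reactome", identifier="R-HSA-1640170")
    # -> "https://reactome.org/content/detail/R-HSA-1640170"

    format_uri(url, biological_qualifier_type="BQB_IS")["identifier"]
    # -> "R-HSA-1640170"

    create_uri_url(ontology="reactome", identifier="not-an-id", strict=False)
    # -> None (a warning is logged)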
+ def ensembl_id_to_url_regex(identifier: str, ontology: str) -> tuple[str, str]:
+     """
+     Ensembl ID to URL and Regex
+
+     Map an ensembl ID to a validation regex and its canonical url on ensembl
+
+     Args:
+         identifier: str
+             A standard identifier from ensembl genes, transcripts, or proteins
+         ontology: str
+             The standard ontology (ensembl_gene, ensembl_transcript, or ensembl_protein)
+
+     Returns:
+         id_regex: a regex which should match a valid entry in this ontology
+         url: the id's url on ensembl
+     """
+
+     # extract the species name from the 3 letter species code in the id
+     # (these letters are not present for humans)
+     identifier, implied_ontology, species = parse_ensembl_id(identifier)  # type: ignore
+     assert implied_ontology == ontology
+
+     # create an appropriate regex for validating input
+     # this provides validation for other identifiers even if it is redundant
+     # with other validation of ensembl ids
+
+     if species == "Homo sapiens":
+         species_code = ""
+     else:
+         species_code = ENSEMBL_SPECIES_TO_CODE[species]
+     molecule_type_code = ENSEMBL_MOLECULE_TYPES_FROM_ONTOLOGY[ontology]
+
+     id_regex = "ENS" + species_code + molecule_type_code + "[0-9]{11}"
+
+     # convert to species format in ensembl urls
+     species_url_field = re.sub(" ", "_", species)
+
+     if ontology == "ensembl_gene":
+         url = f"http://www.ensembl.org/{species_url_field}/geneview?gene={identifier}"
+     elif ontology == "ensembl_transcript":
+         url = f"http://www.ensembl.org/{species_url_field}/Transcript?t={identifier}"
+     elif ontology == "ensembl_protein":
+         url = f"https://www.ensembl.org/{species_url_field}/Transcript/ProteinSummary?t={identifier}"
+     else:
+         raise ValueError(f"{ontology} not defined")
+
+     return id_regex, url
+
+
+ def check_reactome_identifier_compatibility(
+     reactome_series_a: pd.Series,
+     reactome_series_b: pd.Series,
+ ) -> None:
+     """
+     Check Reactome Identifier Compatibility
+
+     Determine whether two sets of Reactome identifiers are from the same species.
+
+     Args:
+         reactome_series_a: pd.Series
+             a Series containing Reactome identifiers
+         reactome_series_b: pd.Series
+             a Series containing Reactome identifiers
+
+     Returns:
+         None
+
+     """
+
+     a_species, a_species_counts = _infer_primary_reactome_species(reactome_series_a)
+     b_species, b_species_counts = _infer_primary_reactome_species(reactome_series_b)
+
+     if a_species != b_species:
+         a_name = reactome_series_a.name
+         if a_name is None:
+             a_name = "unnamed"
+
+         b_name = reactome_series_b.name
+         if b_name is None:
+             b_name = "unnamed"
+
+         raise ValueError(
+             "The two provided pd.Series containing Reactome identifiers appear to be from different species. "
+             f"The pd.Series named {a_name} appears to be {a_species} with {a_species_counts} examples of this code. "
+             f"The pd.Series named {b_name} appears to be {b_species} with {b_species_counts} examples of this code."
+         )
+
+     return None
+
+
+ def _infer_primary_reactome_species(reactome_series: pd.Series) -> tuple[str, int]:
+     """Infer the best supported species based on a set of Reactome identifiers"""
+
+     series_counts = _count_reactome_species(reactome_series)
+
+     if "ALL" in series_counts.index:
+         series_counts = series_counts.drop("ALL", axis=0)
+
+     return series_counts.index[0], series_counts.iloc[0]
+
+
+ def _count_reactome_species(reactome_series: pd.Series) -> pd.Series:
+     """Count the number of species tags in a set of reactome IDs"""
+
+     return (
+         reactome_series.drop_duplicates().transform(_reactome_id_species).value_counts()
+     )
+
+
+ def _reactome_id_species(reactome_id: str) -> str:
+     """Extract the species code from a Reactome ID"""
+
+     reactome_match = re.match("^R\\-([A-Z]{3})\\-[0-9]+", reactome_id)
+     if reactome_match:
+         try:
+             value = reactome_match[1]
+         except ValueError:
+             raise ValueError(f"{reactome_id} is not a valid reactome ID")
+     else:
+         raise ValueError(f"{reactome_id} is not a valid reactome ID")
+
+     return value
+
+
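A sketch of the species check above using illustrative human (R-HSA-...) identifiers; mixing mouse (R-MMU-...) IDs into one of the Series would raise the ValueError instead.

    import pandas as pd

    from napistu.identifiers import check_reactome_identifier_compatibility

    human_a = pd.Series(["R-HSA-1640170", "R-HSA-1430728"], name="pathways_a")
    human_b = pd.Series(["R-HSA-109582"], name="pathways_b")

    check_reactome_identifier_compatibility(human_a, human_b)  # returns None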
+ def _format_Identifiers_pubmed(pubmed_id: str) -> Identifiers:
+     """
+     Format Identifiers for a single PubMed ID.
+
+     These will generally be used in an r_Identifiers field.
+     """
+
+     # create a url for lookup and validate the pubmed id
+     url = create_uri_url(ontology="pubmed", identifier=pubmed_id, strict=False)
+     id_entry = format_uri(uri=url, biological_qualifier_type="BQB_IS_DESCRIBED_BY")
+
+     return Identifiers([id_entry])
+
+
+ class _IdentifierValidator(BaseModel):
+     ontology: str
+     identifier: str
+     url: Optional[str] = None
+     bqb: Optional[str] = None
+
+
+ class _IdentifiersValidator(BaseModel):
+     id_list: list[_IdentifierValidator]