pyobo 0.11.0__py3-none-any.whl → 0.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. pyobo/constants.py +1 -0
  2. pyobo/gilda_utils.py +14 -11
  3. pyobo/obographs.py +5 -2
  4. pyobo/resources/so.py +55 -0
  5. pyobo/resources/so.tsv +2604 -0
  6. pyobo/sources/complexportal.py +54 -15
  7. pyobo/sources/dictybase_gene.py +14 -9
  8. pyobo/sources/drugcentral.py +4 -1
  9. pyobo/sources/expasy.py +22 -4
  10. pyobo/sources/flybase.py +3 -2
  11. pyobo/sources/hgnc.py +24 -19
  12. pyobo/sources/hgncgenefamily.py +7 -7
  13. pyobo/sources/kegg/genome.py +18 -6
  14. pyobo/sources/mirbase.py +9 -3
  15. pyobo/sources/npass.py +1 -1
  16. pyobo/sources/pathbank.py +32 -23
  17. pyobo/sources/pombase.py +6 -3
  18. pyobo/sources/reactome.py +28 -7
  19. pyobo/sources/rgd.py +1 -1
  20. pyobo/sources/slm.py +28 -14
  21. pyobo/sources/uniprot/uniprot.py +7 -6
  22. pyobo/sources/zfin.py +18 -6
  23. pyobo/struct/reference.py +9 -8
  24. pyobo/struct/struct.py +30 -20
  25. pyobo/struct/typedef.py +5 -0
  26. pyobo/version.py +1 -1
  27. {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/METADATA +50 -62
  28. {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/RECORD +31 -45
  29. {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/WHEEL +1 -1
  30. pyobo/apps/__init__.py +0 -3
  31. pyobo/apps/cli.py +0 -24
  32. pyobo/apps/gilda/__init__.py +0 -3
  33. pyobo/apps/gilda/__main__.py +0 -8
  34. pyobo/apps/gilda/app.py +0 -48
  35. pyobo/apps/gilda/cli.py +0 -36
  36. pyobo/apps/gilda/templates/base.html +0 -33
  37. pyobo/apps/gilda/templates/home.html +0 -11
  38. pyobo/apps/gilda/templates/matches.html +0 -32
  39. pyobo/apps/mapper/__init__.py +0 -3
  40. pyobo/apps/mapper/__main__.py +0 -11
  41. pyobo/apps/mapper/cli.py +0 -37
  42. pyobo/apps/mapper/mapper.py +0 -187
  43. pyobo/apps/mapper/templates/base.html +0 -35
  44. pyobo/apps/mapper/templates/mapper_home.html +0 -64
  45. pyobo-0.11.0.dist-info/LICENSE +0 -21
  46. {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/entry_points.txt +0 -0
  47. {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/top_level.txt +0 -0
pyobo/sources/complexportal.py CHANGED
@@ -58,11 +58,31 @@ def _parse_members(s) -> list[tuple[Reference, str]]:
     for member in s.split("|"):
         entity_id, count = member.split("(")
         count = count.rstrip(")")
-        if ":" in entity_id:
+        if entity_id.startswith("URS"):
+            prefix, identifier = "rnacentral", entity_id
+        elif entity_id.startswith("CPX"):
+            # TODO why self xref?
+            prefix, identifier = "complexportal", entity_id
+        elif entity_id.startswith("["):
+            continue  # this is a list of uniprot IDs, not sure what to do with this
+        elif entity_id.startswith("EBI-"):
+            continue
+        elif ":" not in entity_id:
+            if "PRO_" in entity_id:
+                prefix = "uniprot.chain"
+                identifier = entity_id.split("-")[1]
+            elif "-" in entity_id:
+                prefix, identifier = "uniprot.isoform", entity_id
+            else:
+                prefix, identifier = "uniprot", entity_id
+        else:
             prefix, identifier = entity_id.split(":", 1)
+        try:
+            reference = Reference(prefix=prefix, identifier=identifier)
+        except ValueError:
+            tqdm.write(f"failed to validate reference: {entity_id}")
         else:
-            prefix, identifier = "uniprot", entity_id
-        rv.append((Reference(prefix=prefix, identifier=identifier), count))
+            rv.append((reference, count))
     return rv
 
 
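Note: the rewritten `_parse_members` replaces the old two-way CURIE/UniProt split with explicit handling for each identifier shape that appears in the members column. A minimal standalone sketch of the same classification logic (the sample identifiers below are illustrative, not taken from the file):

    def classify_member(entity_id):
        """Return a (prefix, identifier) pair, or None for unparseable members."""
        if entity_id.startswith("URS"):  # RNAcentral sequence
            return "rnacentral", entity_id
        if entity_id.startswith("CPX"):  # reference to another complex
            return "complexportal", entity_id
        if entity_id.startswith(("[", "EBI-")):  # member sets / internal IDs, skipped
            return None
        if ":" not in entity_id:
            if "PRO_" in entity_id:  # e.g. "P19838-PRO_0000030311" -> chain
                return "uniprot.chain", entity_id.split("-")[1]
            if "-" in entity_id:  # e.g. "P12345-2" -> isoform
                return "uniprot.isoform", entity_id
            return "uniprot", entity_id  # bare accession
        prefix, identifier = entity_id.split(":", 1)  # already a CURIE
        return prefix, identifier

    assert classify_member("P12345-2") == ("uniprot.isoform", "P12345-2")
    assert classify_member("CHEBI:16240") == ("CHEBI", "16240")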
@@ -74,27 +94,40 @@ def _parse_xrefs(s) -> list[tuple[Reference, str]]:
     for xref in s.split("|"):
         xref = xref.replace("protein ontology:PR:", "PR:")
         xref = xref.replace("protein ontology:PR_", "PR:")
+        xref = xref.replace("rhea:rhea ", "rhea:")
+        xref = xref.replace("rhea:Rhea ", "rhea:")
+        xref = xref.replace("rhea:RHEA:rhea", "rhea:")
+        xref = xref.replace("rhea:RHEA: ", "rhea:")
+        xref = xref.replace("rhea:RHEA:rhea ", "rhea:")
+        xref = xref.replace("intenz:RHEA:", "rhea:")
+        xref = xref.replace("eccode::", "eccode:")
+        xref = xref.replace("eccode:EC:", "eccode:")
+        xref = xref.replace("intenz:EC:", "eccode:")
+        xref = xref.replace("eccode:RHEA:", "rhea:")
+        xref = xref.replace("efo:MONDO:", "MONDO:")
+        xref = xref.replace("omim:MIM:", "omim:")
+        xref = xref.replace("efo:HP:", "HP:")
+        xref = xref.replace("efo:Orphanet:", "Orphanet:")
+        xref = xref.replace("orphanet:ORDO:", "Orphanet:")
+        xref = xref.replace("biorxiv:doi.org/", "doi:")
+        xref = xref.replace("emdb:EMDB-", "emdb:EMD-")
+        xref = xref.replace("wwpdb:EMD-", "emdb:EMD-")
+        xref = xref.replace("signor:CPX-", "complexportal:CPX-")
+
         try:
             xref_curie, note = xref.split("(")
         except ValueError:
             logger.warning("xref missing (: %s", xref)
             continue
         note = note.rstrip(")")
-        note.replace("rhea:rhea ", "rhea:")
-        note.replace("rhea:Rhea ", "rhea:")
-        note.replace("eccode::", "eccode:")
-        note.replace("eccode:EC:", "eccode:")
-        note.replace("eccode:RHEA:", "rhea:")
-        if note.lower().startswith("rhea "):
-            note = note[len("Rhea ") :]
-        if note.lower().startswith("rhea:rhea "):
-            note = note[len("rhea:rhea ") :]
-        if note.lower().startswith("EC:"):
-            note = note[len("EC:") :]
+
+        if xref_curie.startswith("intenz:"):
+            xref_curie = _clean_intenz(xref_curie)
+
         try:
             reference = Reference.from_curie(xref_curie)
         except ValueError:
-            logger.warning("can not parse CURIE: %s", xref)
+            logger.warning("can not parse CURIE: %s", xref_curie)
             continue
         if reference is None:
             logger.warning("reference is None after parsing: %s", xref)
@@ -103,6 +136,12 @@ def _parse_xrefs(s) -> list[tuple[Reference, str]]:
     return rv
 
 
+def _clean_intenz(s: str) -> str:
+    for _ in range(3):
+        s = s.rstrip("-").rstrip(".")
+    return s
+
+
 class ComplexPortalGetter(Obo):
     """An ontology representation of the Complex Portal."""
 
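Note: `_clean_intenz` repeatedly strips trailing `.` and `-` so that partial EC numbers written with wildcard suffixes collapse to a clean prefix; three passes are enough for the alternating suffixes that occur, e.g.:

    assert _clean_intenz("intenz:1.1.1.-") == "intenz:1.1.1"
    assert _clean_intenz("intenz:1.1.-.-") == "intenz:1.1"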
pyobo/sources/dictybase_gene.py CHANGED
@@ -9,8 +9,7 @@ from collections.abc import Iterable
 import pandas as pd
 from tqdm.auto import tqdm
 
-from pyobo.struct import Obo, Reference, Synonym, Term, from_species, has_gene_product
-from pyobo.utils.io import multisetdict
+from pyobo.struct import Obo, Synonym, Term, from_species, has_gene_product
 from pyobo.utils.path import ensure_df
 
 __all__ = [
@@ -49,10 +48,11 @@ def get_obo(force: bool = False) -> Obo:
 
 def get_terms(force: bool = False) -> Iterable[Term]:
     """Get terms."""
+    # TODO the mappings file has actually no uniprot at all, and requires text mining
     # DDB ID  DDB_G ID  Name  UniProt ID
-    uniprot_mappings = multisetdict(
-        ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
-    )
+    # uniprot_mappings = multisetdict(
+    #     ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
+    # )
 
     terms = ensure_df(PREFIX, url=URL, force=force, name="gene_info.tsv")
     # GENE ID (DDB_G ID)  Gene Name  Synonyms  Gene products
@@ -68,10 +68,15 @@ def get_terms(force: bool = False) -> Iterable[Term]:
         if synonyms and pd.notna(synonyms):
             for synonym in synonyms.split(","):
                 term.append_synonym(Synonym(synonym.strip()))
-        for uniprot_id in uniprot_mappings.get(identifier, []):
-            if not uniprot_id or pd.isna(uniprot_id) or uniprot_id not in {"unknown", "pseudogene"}:
-                continue
-            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))
+        # for uniprot_id in uniprot_mappings.get(identifier, []):
+        #     if not uniprot_id or pd.isna(uniprot_id) or uniprot_id in {"unknown", "pseudogene"}:
+        #         continue
+        #     try:
+        #         uniprot_ref = Reference(prefix="uniprot", identifier=uniprot_id)
+        #     except ValueError:
+        #         tqdm.write(f"[dictybase.gene] invalid uniprot ref: {uniprot_id}")
+        #     else:
+        #         term.append_relationship(has_gene_product, uniprot_ref)
 
         term.set_species(identifier="44689", name="Dictyostelium discoideum")
         yield term
pyobo/sources/drugcentral.py CHANGED
@@ -68,6 +68,9 @@ def iter_terms() -> Iterable[Term]:
         if xref_prefix_norm is None:
             tqdm.write(f"did not normalize {prefix}:{identifier}")
             continue
+        if xref_prefix_norm == "pdb.ligand":
+            # there is a weird invalid escaped \W appearing in pdb ligand ids
+            identifier = identifier.strip()
         identifier = bioregistry.standardize_identifier(xref_prefix_norm, identifier)
         xrefs[str(drugcentral_id)].append(
             Reference(prefix=xref_prefix_norm, identifier=identifier)
@@ -98,4 +101,4 @@ def iter_terms() -> Iterable[Term]:
 
 
 if __name__ == "__main__":
-    get_obo().write_default(write_obo=True)
+    DrugCentralGetter.cli()
pyobo/sources/expasy.py CHANGED
@@ -1,6 +1,7 @@
 """Convert ExPASy to OBO."""
 
 import logging
+import re
 from collections import defaultdict
 from collections.abc import Iterable, Mapping
 from typing import Any, Optional
@@ -42,7 +43,7 @@ class ExpasyGetter(Obo):
     """A getter for ExPASy Enzyme Classes."""
 
     bioversions_key = ontology = PREFIX
-    typedefs = [has_member, enables]
+    typedefs = [has_member, enables, term_replaced_by]
     root_terms = [
         Reference(prefix="eccode", identifier="1"),
         Reference(prefix="eccode", identifier="2"),
@@ -145,7 +146,9 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
         for domain in data.get("domains", []):
             term.append_relationship(
                 has_member,
-                Reference(prefix=domain["namespace"], identifier=domain["identifier"]),
+                Reference.model_validate(
+                    {"prefix": domain["namespace"], "identifier": domain["identifier"]},
+                ),
             )
         for protein in data.get("proteins", []):
             term.append_relationship(
@@ -248,8 +251,10 @@ def get_database(lines: Iterable[str]) -> Mapping:
         elif descriptor == DE and value == "Deleted entry.":
             ec_data_entry["deleted"] = True
         elif descriptor == DE and value.startswith("Transferred entry: "):
-            value = value[len("Transferred entry: ") :].rstrip().rstrip(".")
-            ec_data_entry["transfer_id"] = value.split(" and ")
+            # TODO There's a situation where there are enough transfers that it goes on to a second line
+            # the following line just gives up on this one. or maybe I don't understand
+            value = value.strip().removesuffix("and").rstrip(",").strip()
+            ec_data_entry["transfer_id"] = _parse_transfer(value)
         elif descriptor == DE:
             ec_data_entry["concept"]["name"] = value.rstrip(".")  # type:ignore
         elif descriptor == AN:
@@ -279,6 +284,19 @@ def get_database(lines: Iterable[str]) -> Mapping:
     return rv
 
 
+TRANSFER_SPLIT_RE = re.compile(r",\s*|\s+and\s+")
+
+
+def _parse_transfer(value: str) -> list[str]:
+    """Parse transferred entry string.
+
+    >>> _parse_transfer("Transferred entry: 1.1.1.198, 1.1.1.227 and 1.1.1.228.")
+    ['1.1.1.198', '1.1.1.227', '1.1.1.228']
+    """
+    value = value[len("Transferred entry: ") :].rstrip().rstrip(".")
+    return sorted(x.strip().removeprefix("and").strip() for x in TRANSFER_SPLIT_RE.split(value))
+
+
 def _group_by_id(lines):
     """Group lines by identifier."""
     groups = []
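Note: splitting on `TRANSFER_SPLIT_RE` handles both comma- and "and"-separated transfer lists, and the extra `removeprefix("and")` covers the Oxford-comma case, where splitting on the comma leaves a leading "and" on the final token:

    import re

    TRANSFER_SPLIT_RE = re.compile(r",\s*|\s+and\s+")

    value = "1.1.1.198, 1.1.1.227, and 1.1.1.228"
    tokens = sorted(x.strip().removeprefix("and").strip() for x in TRANSFER_SPLIT_RE.split(value))
    assert tokens == ["1.1.1.198", "1.1.1.227", "1.1.1.228"]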
pyobo/sources/flybase.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
 from tqdm.auto import tqdm
 
 from pyobo import Reference
+from pyobo.resources.so import get_so_name
 from pyobo.struct import Obo, Term, from_species, orthologous
 from pyobo.utils.io import multisetdict
 from pyobo.utils.path import ensure_df
@@ -133,7 +134,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
                 "FlyBase gene type is missing mapping to Sequence Ontology (SO): %s", gtype
             )
         else:
-            so[gtype] = Reference.auto("SO", so_id)
+            so[gtype] = Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id))
 
     for _, reference in sorted(so.items()):
         yield Term(reference=reference)
@@ -153,7 +154,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
         for hgnc_curie in human_orthologs.get(identifier, []):
             if not hgnc_curie or pd.isna(hgnc_curie):
                 continue
-            hgnc_ortholog = Reference.from_curie(hgnc_curie, auto=True)
+            hgnc_ortholog = Reference.from_curie(hgnc_curie)
             if hgnc_ortholog is None:
                 tqdm.write(f"[{PREFIX}] {identifier} had invalid ortholog: {hgnc_curie}")
             else:
pyobo/sources/hgnc.py CHANGED
@@ -13,6 +13,7 @@ from tabulate import tabulate
 from tqdm.auto import tqdm
 
 from pyobo.api.utils import get_version
+from pyobo.resources.so import get_so_name
 from pyobo.struct import (
     Obo,
     Reference,
@@ -37,8 +38,8 @@ logger = logging.getLogger(__name__)
 
 PREFIX = "hgnc"
 DEFINITIONS_URL_FMT = (
-    "http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/"
-    "archive/monthly/json/hgnc_complete_set_{version}.json"
+    "https://storage.googleapis.com/public-download-files/hgnc/archive/archive/monthly/json/"
+    "hgnc_complete_set_{version}.json"
 )
 
 previous_symbol_type = SynonymTypeDef.from_text("previous_symbol")
@@ -222,7 +223,7 @@ class HGNCGetter(Obo):
         alias_symbol_type,
     ]
     root_terms = [
-        Reference(prefix="so", identifier=so_id)
+        Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id))
         for so_id in sorted(set(LOCUS_TYPE_TO_SO.values()))
        if so_id
     ]
@@ -256,7 +257,7 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
     yield Term.from_triple("NCBITaxon", "9606", "Homo sapiens")
     yield from sorted(
         {
-            Term(reference=Reference.auto("SO", so_id))
+            Term(reference=Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id)))
             for so_id in sorted(LOCUS_TYPE_TO_SO.values())
             if so_id
         },
@@ -363,23 +364,25 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
             xref_identifiers = entry.pop(key, None)
             if xref_identifiers is None:
                 continue
-
             if isinstance(xref_identifiers, (str, int)):
+                xref_identifiers = [str(xref_identifiers)]
+
+            if xref_prefix == "merops.entry":
+                continue
+                # e.g., XM02-001 should be rewritten as XM02.001
+                xref_identifiers = [i.replace("-", ".") for i in xref_identifiers]
+
+            if xref_prefix == "refseq":
+                # e.g., strip off dots without substantiated record versions like in NM_021728.
+                xref_identifiers = [i.strip(".") for i in xref_identifiers]
+
+            if len(xref_identifiers) == 1:
                 term.append_exact_match(
-                    Reference(prefix=xref_prefix, identifier=str(xref_identifiers))
+                    Reference(prefix=xref_prefix, identifier=str(xref_identifiers[0]))
                 )
-            elif isinstance(xref_identifiers, list):
-                if len(xref_identifiers) == 1:
-                    term.append_exact_match(
-                        Reference(prefix=xref_prefix, identifier=str(xref_identifiers[0]))
-                    )
-                else:
-                    for xref_identifier in xref_identifiers:
-                        term.append_xref(
-                            Reference(prefix=xref_prefix, identifier=str(xref_identifier))
-                        )
             else:
-                raise TypeError
+                for xref_identifier in xref_identifiers:
+                    term.append_xref(Reference(prefix=xref_prefix, identifier=str(xref_identifier)))
 
         for pubmed_id in entry.pop("pubmed_id", []):
             term.append_provenance(Reference(prefix="pubmed", identifier=str(pubmed_id)))
@@ -416,9 +419,11 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
         locus_group = entry.pop("locus_group")
         so_id = LOCUS_TYPE_TO_SO.get(locus_type)
         if so_id:
-            term.append_parent(Reference.auto("SO", so_id))
+            term.append_parent(Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id)))
         else:
-            term.append_parent(Reference.auto("SO", "0000704"))  # gene
+            term.append_parent(
+                Reference(prefix="SO", identifier="0000704", name=get_so_name("0000704"))
+            )  # gene
         unhandle_locus_types[locus_type][identifier] = term
         term.append_property("locus_type", locus_type)
         term.append_property("locus_group", locus_group)
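Note: the `get_so_name` calls introduced here (and in flybase.py, pombase.py, and hgncgenefamily.py) resolve Sequence Ontology labels from the newly vendored `pyobo/resources/so.py` and `so.tsv` listed in the file summary above. A plausible sketch of such a lookup, assuming a two-column identifier/name TSV; the actual module may differ:

    import csv
    from functools import lru_cache
    from pathlib import Path

    SO_PATH = Path(__file__).parent / "so.tsv"  # hypothetical location of the bundled table

    @lru_cache(maxsize=1)
    def _load_so():
        # read the whole table once, then serve lookups from memory
        with SO_PATH.open() as file:
            return dict(csv.reader(file, delimiter="\t"))

    def get_so_name(so_id):
        """Look up the label for an SO local identifier, e.g. "0000704" -> "gene"."""
        return _load_so().get(so_id)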
pyobo/sources/hgncgenefamily.py CHANGED
@@ -21,13 +21,13 @@ __all__ = [
 ]
 
 PREFIX = "hgnc.genegroup"
-FAMILIES_URL = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/csv/genefamily_db_tables/family.csv"
+FAMILIES_URL = "https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/family.csv"
 # TODO use family_alias.csv
-HIERARCHY_URL = (
-    "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/csv/genefamily_db_tables/hierarchy.csv"
-)
+HIERARCHY_URL = "https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/hierarchy.csv"
 
-symbol_type = SynonymTypeDef.from_text("symbol")
+symbol_type = SynonymTypeDef(
+    reference=Reference(prefix="OMO", identifier="0004000", name="has symbol")
+)
 
 
 class HGNCGroupGetter(Obo):
@@ -78,7 +78,7 @@ def get_terms(force: bool = False) -> Iterable[Term]:
                 name=parent.name,
             )
         )
-    gene_group = Reference.auto("SO", "0005855")
+    gene_group = Reference(prefix="SO", identifier="0005855", name="gene group")
     yield Term(reference=gene_group)
     for term in terms:
         if not term.parents:
@@ -98,7 +98,7 @@ def _get_terms_helper(force: bool = False) -> Iterable[Term]:
             definition=definition,
         )
         if pubmed_ids and pd.notna(pubmed_ids):
-            for s in pubmed_ids.split(","):
+            for s in pubmed_ids.replace(" ", ",").split(","):
                 term.append_provenance(Reference(prefix="pubmed", identifier=s.strip()))
         if desc_go and pd.notna(desc_go):
             go_id = desc_go[len("http://purl.uniprot.org/go/") :]
pyobo/sources/kegg/genome.py CHANGED
@@ -3,6 +3,8 @@
 Run with ``python -m pyobo.sources.kegg.genome``
 """
 
+from __future__ import annotations
+
 import logging
 from collections.abc import Iterable
 
@@ -46,8 +48,11 @@ def get_obo() -> Obo:
     return KEGGGenomeGetter()
 
 
-def parse_genome_line(line: str) -> KEGGGenome:
+def parse_genome_line(line: str) -> KEGGGenome | None:
     """Parse a line from the KEGG Genome database."""
+    if not line.startswith("T"):
+        # This is for an NCBI Taxonomy
+        return None
     line = line.strip()
     identifier, rest = _s(line, "\t")
     identifier = identifier[len("gn:") :]
@@ -94,6 +99,8 @@ def iter_kegg_genomes(version: str, desc: str) -> Iterable[KEGGGenome]:
     it = tqdm(lines, desc=desc, unit_scale=True, unit="genome")
     for line in it:
         yv = parse_genome_line(line)
+        if yv is None:
+            continue
         it.set_postfix({"id": yv.identifier, "name": yv.name})
         yield yv
 
@@ -105,11 +112,16 @@ def iter_terms(version: str) -> Iterable[Term]:
     for kegg_genome in iter_kegg_genomes(version=version, desc="KEGG Genomes"):
         if kegg_genome.identifier in SKIP:
             continue
-        term = Term.from_triple(
-            prefix=KEGG_GENOME_PREFIX,
-            identifier=kegg_genome.identifier,
-            name=kegg_genome.name,
-        )
+
+        try:
+            reference = Reference(
+                prefix=KEGG_GENOME_PREFIX, identifier=kegg_genome.identifier, name=kegg_genome.name
+            )
+        except ValueError:
+            tqdm.write(f"[{KEGG_GENOME_PREFIX}] invalid identifier: {kegg_genome}")
+            continue
+
+        term = Term(reference=reference)
         if kegg_genome.taxonomy_id is not None:
             taxonomy_name = get_ncbitaxon_name(kegg_genome.taxonomy_id)
             if taxonomy_name is None:
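Note: the added `from __future__ import annotations` makes the new `KEGGGenome | None` return annotation safe on Python versions below 3.10: annotations are stored as strings rather than evaluated, so the PEP 604 `X | Y` union syntax does not raise at import time. A minimal illustration with a stand-in class:

    from __future__ import annotations  # annotations become lazy strings

    class KEGGGenome: ...  # stand-in for the real record type

    def parse_genome_line(line: str) -> KEGGGenome | None:  # valid on Python 3.8+
        return None

Callers such as `iter_kegg_genomes` must then check for `None` before using the result, as the hunk above does.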
pyobo/sources/mirbase.py CHANGED
@@ -136,9 +136,15 @@ def _process_definitions_lines(
         xref_prefix = xref_mapping.get(xref_prefix, xref_prefix)
         if xref_prefix == "pictar":
             continue
-        xrefs.append(
-            Reference(prefix=xref_prefix, identifier=xref_identifier, name=xref_label or None)
-        )
+
+        try:
+            xref = Reference(
+                prefix=xref_prefix, identifier=xref_identifier, name=xref_label or None
+            )
+        except ValueError:
+            tqdm.write(f"invalid xref: {xref_prefix}:{xref_identifier}")
+        else:
+            xrefs.append(xref)
 
     # TODO add pubmed references
 
pyobo/sources/npass.py CHANGED
@@ -39,7 +39,7 @@ def get_obo(force: bool = False) -> Obo:
 
 def get_df(version: str, force: bool = False) -> pd.DataFrame:
     """Get the NPASS chemical nomenclature."""
-    base_url = f"http://bidd.group/NPASS/downloadFiles/NPASSv{version}_download"
+    base_url = f"https://bidd.group/NPASS/downloadFiles/NPASSv{version}_download"
     url = f"{base_url}_naturalProducts_generalInfo.txt"
     return ensure_df(
         PREFIX,
pyobo/sources/pathbank.py CHANGED
@@ -1,5 +1,7 @@
 """Converter for PathBank."""
 
+from __future__ import annotations
+
 import logging
 from collections import defaultdict
 from collections.abc import Iterable, Mapping
@@ -8,7 +10,7 @@ import pandas as pd
 from tqdm.auto import tqdm
 
 from ..struct import Obo, Reference, Term
-from ..struct.typedef import has_participant
+from ..struct.typedef import has_category, has_participant
 from ..utils.path import ensure_df
 
 __all__ = [
@@ -68,7 +70,7 @@ class PathBankGetter(Obo):
     """An ontology representation of PathBank's pathway nomenclature."""
 
     ontology = bioversions_key = PREFIX
-    typedefs = [has_participant]
+    typedefs = [has_participant, has_category]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
@@ -103,21 +105,30 @@ def get_protein_mapping(version: str, force: bool = False) -> Mapping[str, set[R
     for pathway_id, protein_id in tqdm(
         proteins_df.values, desc=f"[{PREFIX}] mapping proteins", unit_scale=True
     ):
-        # TODO get protein names
-        smpdb_id_to_proteins[pathway_id].add(Reference(prefix="uniprot", identifier=protein_id))
+        try:
+            if "-" in protein_id:
+                reference = Reference(prefix="uniprot.isoform", identifier=protein_id)
+            else:
+                reference = Reference(prefix="uniprot", identifier=protein_id)
+        except ValueError:
+            tqdm.write(f"[pathbank] invalid uniprot identifier: {protein_id}")
+        else:
+            smpdb_id_to_proteins[pathway_id].add(reference)
     return smpdb_id_to_proteins
 
 
 def get_metabolite_df(version: str, force: bool = False) -> pd.DataFrame:
     """Get the metabolites dataframe."""
-    return ensure_df(
+    df = ensure_df(
         PREFIX,
         url=METABOLITE_URL,
         sep=",",
-        usecols=["PathBank ID", "Metabolite ID", "Metabolite Name"],
+        usecols=["PathBank ID", "ChEBI ID"],
         force=force,
         version=version,
     )
+    df = df[df["ChEBI ID"].notna()]
+    return df
 
 
 def get_metabolite_mapping(version: str, force: bool = False) -> Mapping[str, set[Reference]]:
@@ -125,17 +136,20 @@ def get_metabolite_mapping(version: str, force: bool = False) -> Mapping[str, se
     metabolites_df = get_metabolite_df(version=version, force=force)
     smpdb_id_to_metabolites = defaultdict(set)
     it = tqdm(metabolites_df.values, desc=f"[{PREFIX}] mapping metabolites", unit_scale=True)
-    for pathway_id, metabolite_id, metabolite_name in it:
-        smpdb_id_to_metabolites[pathway_id].add(
-            Reference(
-                prefix=PREFIX,
-                identifier=metabolite_id,
-                name=metabolite_name,
-            )
-        )
+    for pathway_id, metabolite_id in it:
+        reference = Reference(prefix="chebi", identifier=metabolite_id.strip())
+        smpdb_id_to_metabolites[pathway_id].add(reference)
    return smpdb_id_to_metabolites
 
 
+def _clean_description(description: str) -> str | None:
+    """Clean the description."""
+    if pd.isna(description) or not description:
+        return None
+    parts = [part.strip() for part in description.strip().splitlines()]
+    return " ".join(parts)
+
+
 def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     """Get PathBank's terms."""
     smpdb_id_to_proteins = get_protein_mapping(version=version, force=force)
@@ -147,16 +161,11 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
         reference = Reference(prefix=PREFIX, identifier=pathbank_id, name=name)
         term = Term(
             reference=reference,
-            # definition=description.replace('\n', ' '),
-            xrefs=[Reference(prefix="smpdb", identifier=smpdb_id)],
-        )
-        term.append_parent(
-            Reference(
-                prefix=PREFIX,
-                identifier=subject.lower().replace(" ", "_"),
-                name=subject,
-            )
+            # TODO use _clean_description(description) to add a description,
+            # but there are weird parser errors
         )
+        term.append_exact_match(Reference(prefix="smpdb", identifier=smpdb_id))
+        term.append_property(has_category, subject.lower().replace(" ", "_"))
         term.extend_relationship(has_participant, smpdb_id_to_proteins[smpdb_id])
         term.extend_relationship(has_participant, smpdb_id_to_metabolites[smpdb_id])
         yield term
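Note: PathBank (like the Reactome changes below) now distinguishes dashed UniProt accessions, which denote isoforms (e.g. "P12345-2") and do not match the plain `uniprot` identifier pattern, routing them to `uniprot.isoform` instead. The heuristic in isolation:

    def uniprot_reference(protein_id):
        """Choose a prefix for a UniProt accession; a dash marks an isoform."""
        if "-" in protein_id:
            return "uniprot.isoform", protein_id
        return "uniprot", protein_id

    assert uniprot_reference("P12345-2") == ("uniprot.isoform", "P12345-2")
    assert uniprot_reference("P12345") == ("uniprot", "P12345")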
pyobo/sources/pombase.py CHANGED
@@ -9,6 +9,7 @@ from tqdm.auto import tqdm
 
 import pyobo
 from pyobo import Reference
+from pyobo.resources.so import get_so_name
 from pyobo.struct import Obo, Term, from_species, has_gene_product, orthologous
 from pyobo.utils.path import ensure_df
 
@@ -19,7 +20,7 @@ __all__ = [
 logger = logging.getLogger(__name__)
 
 PREFIX = "pombase"
-URL = "https://www.pombase.org/data/names_and_identifiers/gene_IDs_names_products.tsv"
+GENE_NAMES_URL = "https://www.pombase.org/data/names_and_identifiers/gene_IDs_names_products.tsv"
 ORTHOLOGS_URL = "https://www.pombase.org/data/orthologs/human-orthologs.txt.gz"
 
 
@@ -68,9 +69,11 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
         if hgnc_id is not None:
             identifier_to_hgnc_ids[identifier].add(hgnc_id)
 
-    df = ensure_df(PREFIX, url=URL, force=force, header=None, version=version)
+    df = ensure_df(PREFIX, url=GENE_NAMES_URL, force=force, version=version)
     so = {
-        gtype: Reference.auto("SO", POMBASE_TO_SO[gtype])
+        gtype: Reference(
+            prefix="SO", identifier=POMBASE_TO_SO[gtype], name=get_so_name(POMBASE_TO_SO[gtype])
+        )
         for gtype in sorted(df[df.columns[6]].unique())
     }
     for _, reference in sorted(so.items()):
pyobo/sources/reactome.py CHANGED
@@ -70,7 +70,9 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     df["taxonomy_id"] = df["species"].map(get_ncbitaxon_id)
 
     terms = {}
-    it = tqdm(df.values, total=len(df.index), desc=f"mapping {PREFIX}")
+    it = tqdm(
+        df.values, total=len(df.index), desc=f"mapping {PREFIX}", unit_scale=True, unit="pathway"
+    )
     for reactome_id, name, species_name, taxonomy_id in it:
         terms[reactome_id] = term = Term(
             reference=Reference(prefix=PREFIX, identifier=reactome_id, name=name),
@@ -92,10 +94,21 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
         terms[child_id].append_parent(terms[parent_id])
 
     uniprot_pathway_df = ensure_participant_df(version=version, force=force)
-    for uniprot_id, reactome_id in tqdm(uniprot_pathway_df.values, total=len(uniprot_pathway_df)):
-        terms[reactome_id].append_relationship(
-            has_participant, Reference(prefix="uniprot", identifier=uniprot_id)
-        )
+    for uniprot_id, reactome_id in tqdm(
+        uniprot_pathway_df.values,
+        total=len(uniprot_pathway_df),
+        unit_scale=True,
+        unit="pathway-protein",
+    ):
+        if reactome_id not in terms:
+            tqdm.write(f"{reactome_id} appears in uniprot participants file but not pathways file")
+            continue
+
+        if "-" in uniprot_id:
+            reference = Reference(prefix="uniprot.isoform", identifier=uniprot_id)
+        else:
+            reference = Reference(prefix="uniprot", identifier=uniprot_id)
+        terms[reactome_id].append_relationship(has_participant, reference)
 
     chebi_pathway_url = f"https://reactome.org/download/{version}/ChEBI2Reactome_All_Levels.txt"
     chebi_pathway_df = ensure_df(
@@ -106,7 +119,15 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
         version=version,
         force=force,
     )
-    for chebi_id, reactome_id in tqdm(chebi_pathway_df.values, total=len(chebi_pathway_df)):
+    for chebi_id, reactome_id in tqdm(
+        chebi_pathway_df.values,
+        total=len(chebi_pathway_df),
+        unit_scale=True,
+        unit="pathway-chemical",
+    ):
+        if reactome_id not in terms:
+            tqdm.write(f"{reactome_id} appears in chebi participants file but not pathways file")
+            continue
         terms[reactome_id].append_relationship(
             has_participant, Reference(prefix="chebi", identifier=chebi_id)
         )
@@ -133,4 +154,4 @@ def get_protein_to_pathways() -> Mapping[str, set[str]]:
 
 
 if __name__ == "__main__":
-    get_obo().write_default()
+    ReactomeGetter.cli()
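Note: a pattern running through this release: inside tqdm-wrapped loops, bad records are reported with `tqdm.write` (which prints above an active progress bar without corrupting it) and then skipped, instead of raising or silently appending invalid references. The pattern in isolation, with made-up rows:

    from tqdm.auto import tqdm

    rows = ["R-HSA-1", "bogus", "R-HSA-2"]
    for row in tqdm(rows, unit_scale=True, unit="row"):
        if row == "bogus":  # stand-in for a failed Reference(...) validation
            tqdm.write(f"skipping invalid row: {row}")
            continue
        # ... process the row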