pyobo 0.10.4__py3-none-any.whl → 0.10.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,229 @@
1
+ """Get terms from geonames."""
2
+
3
+ import logging
4
+ from typing import Collection, Iterable, Mapping
5
+
6
+ import pandas as pd
7
+ from pystow.utils import read_zipfile_csv
8
+ from tqdm import tqdm
9
+
10
+ from pyobo import Obo, Term
11
+ from pyobo.struct import Reference, part_of
12
+ from pyobo.utils.path import ensure_df, ensure_path
13
+
14
+ __all__ = ["GeonamesGetter"]
15
+
16
logger = logging.getLogger(__name__)

#: Prefix used for every GeoNames term and for the download cache of this source
PREFIX = "geonames"

# All four source files are served from the same export directory
_DUMP_BASE = "https://download.geonames.org/export/dump"
COUNTRIES_URL = f"{_DUMP_BASE}/countryInfo.txt"
ADMIN1_URL = f"{_DUMP_BASE}/admin1CodesASCII.txt"
ADMIN2_URL = f"{_DUMP_BASE}/admin2Codes.txt"
CITIES_URL = f"{_DUMP_BASE}/cities15000.zip"
23
+
24
+
25
class GeonamesGetter(Obo):
    """Expose GeoNames countries, administrative divisions, and cities as an ontology."""

    ontology = PREFIX
    # No fixed release is pinned for this source
    dynamic_version = True
    # ``part_of`` is the only relation the generated terms use
    typedefs = [part_of]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Return the GeoNames terms.

        :param force: Forwarded unchanged to :func:`get_terms`.
        :returns: The terms produced by :func:`get_terms`.
        """
        terms = get_terms(force=force)
        return terms
35
+
36
+
37
def get_terms(*, force: bool = False) -> Collection[Term]:
    """Build all GeoNames terms.

    The lookups are built from the top of the hierarchy down (countries, then
    first-level divisions, then second-level divisions) because each level
    needs the previous one to attach its ``part_of`` relationship.

    :param force: Forwarded unchanged to every loader.
    :returns: The values view of the identifier-to-term mapping from :func:`get_cities`.
    """
    countries = get_code_to_country(force=force)
    first_level = get_code_to_admin1(countries, force=force)
    second_level = get_code_to_admin2(first_level, force=force)
    terms_by_id = get_cities(
        code_to_country=countries,
        code_to_admin1=first_level,
        code_to_admin2=second_level,
        force=force,
    )
    return terms_by_id.values()
49
+
50
+
51
def get_code_to_country(*, force: bool = False) -> Mapping[str, Term]:
    """Get a mapping from country code to country term.

    Every row of the country table becomes an instance term. Its synonyms are
    the ISO code, the name without a leading "The " (when the name has one),
    the FIPS code, and the ISO3 code; the ISO code is also stored as the
    ``code`` property.

    :param force: Forwarded unchanged to :func:`ensure_df`.
    :returns: A dictionary from the value of the ``#ISO`` column to the term
        built for that row. Rows without a code are left out.
    """
    countries_df = ensure_df(
        PREFIX,
        url=COUNTRIES_URL,
        force=force,
        # The line after the skipped ones supplies the column names used below
        skiprows=49,
        # "NA" is a country code, so the default NA markers are switched off.
        # Empty cells are still declared missing; otherwise they would come
        # back as "" and slip through the isna/notna checks below.
        keep_default_na=False,
        na_values=[""],
        dtype=str,
    )
    logger.info(f"got {len(countries_df.index):,} countries")
    code_to_country = {}
    cols = ["geonameid", "Country", "#ISO", "fips", "ISO3"]
    for identifier, name, code, fips, iso3 in countries_df[cols].values:
        if pd.isna(code):
            continue
        label = name if pd.notna(name) else None
        term = Term.from_triple(PREFIX, identifier, label, type="Instance")
        term.append_synonym(code)
        # Guarded like the label itself: a missing name has no prefix to strip
        if label is not None and label.startswith("The "):
            term.append_synonym(label.removeprefix("The "))
        if pd.notna(fips):
            term.append_synonym(fips)
        if pd.notna(iso3):
            term.append_synonym(iso3)
        term.append_property("code", code)
        code_to_country[code] = term
    logger.info(f"got {len(code_to_country):,} country records")
    return code_to_country
83
+
84
+
85
def get_code_to_admin1(
    code_to_country: Mapping[str, Term], *, force: bool = False
) -> Mapping[str, Term]:
    """Get a mapping from admin1 code to term.

    :param code_to_country: Country terms keyed by country code. The part of
        each admin1 code before the first ``.`` is looked up here.
    :param force: Forwarded unchanged to :func:`ensure_df`.
    :returns: A dictionary from the full admin1 code to its term. Each term is
        ``part_of`` its country when that country is known.
    """
    admin1_df = ensure_df(
        PREFIX,
        url=ADMIN1_URL,
        header=None,
        names=["code", "name", "asciiname", "geonames_id"],
        dtype=str,
        force=force,
    )
    code_to_admin1 = {}
    for code, name, asciiname, identifier in admin1_df.values:
        if pd.isna(identifier) or pd.isna(code):
            tqdm.write(f"Missing info for {name} / {asciiname} / {code=} / {identifier=}")
            continue

        term = Term.from_triple(
            PREFIX, identifier, name if pd.notna(name) else None, type="Instance"
        )
        term.append_property("code", code)
        code_to_admin1[code] = term

        country_code = code.split(".")[0]
        country_term = code_to_country.get(country_code)
        if country_term is None:
            # Keep the term but skip the relationship, so one unmatched row
            # does not abort the whole build with a KeyError
            logger.info(f"could not find country {country_code} for admin1 {code}")
            continue
        term.append_relationship(part_of, country_term)
    return code_to_admin1
113
+
114
+
115
def get_code_to_admin2(
    code_to_admin1: Mapping[str, Term], *, force: bool = False
) -> Mapping[str, Term]:
    """Get a mapping from admin2 code to term.

    :param code_to_admin1: Admin1 terms keyed by full admin1 code. The admin2
        code with its last ``.``-separated segment removed is looked up here.
    :param force: Forwarded unchanged to :func:`ensure_df`.
    :returns: A dictionary from the full admin2 code to its term. Each term is
        ``part_of`` its admin1 division when that division is known. Rows
        without an identifier or code are left out.
    """
    admin2_df = ensure_df(
        PREFIX,
        url=ADMIN2_URL,
        header=None,
        names=["code", "name", "asciiname", "geonames_id"],
        dtype=str,
        force=force,
    )
    code_to_admin2 = {}
    for identifier, name, code in admin2_df[["geonames_id", "name", "code"]].values:
        if pd.isna(identifier) or pd.isna(code):
            continue
        term = Term.from_triple(
            PREFIX, identifier, name if pd.notna(name) else None, type="Instance"
        )
        term.append_property("code", code)
        code_to_admin2[code] = term

        admin1_code = code.rsplit(".", 1)[0]
        admin1_term = code_to_admin1.get(admin1_code)
        if admin1_term is None:
            # Keep the term but skip the relationship, so one unmatched row
            # does not abort the whole build with a KeyError
            logger.info(f"could not find admin1 {admin1_code} for admin2 {code}")
            continue
        term.append_relationship(part_of, admin1_term)
    return code_to_admin2
140
+
141
+
142
def get_cities(
    code_to_country: Mapping[str, Term],
    code_to_admin1: Mapping[str, Term],
    code_to_admin2: Mapping[str, Term],
    *,
    minimum_population: int = 100_000,
    force: bool = False,
) -> Mapping[str, Term]:
    """Get a mapping from GeoNames identifier to term for cities and their containers.

    The result holds every country term, one term per retained city, and the
    admin1/admin2 terms that at least one retained city resolves to.

    :param code_to_country: Country terms keyed by country code.
    :param code_to_admin1: Admin1 terms keyed by ``<country>.<admin1>``.
    :param code_to_admin2: Admin2 terms keyed by ``<country>.<admin1>.<admin2>``.
    :param minimum_population: Only cities with a population strictly greater
        than this value are kept.
    :param force: Forwarded unchanged to :func:`ensure_path`.
    :returns: A dictionary from term identifier to term.
    """
    columns = [
        "geonames_id",
        "name",
        "asciiname",
        "synonyms",
        "latitude",
        "longitude",
        "feature_class",
        "feature_code",
        "country_code",
        "cc2",
        "admin1",
        "admin2",
        "admin3",
        "admin4",
        "population",
        "elevation",
        "dem",
        "timezone",
        "date_modified",
    ]
    path = ensure_path(PREFIX, url=CITIES_URL, force=force)
    cities_df = read_zipfile_csv(
        path=path,
        inner_path="cities15000.txt",
        header=None,
        names=columns,
        dtype=str,
        # "NA" is a country code; with the default NA markers it would be
        # parsed as missing and the admin lookups below would never match.
        # Only empty cells count as missing.
        keep_default_na=False,
        na_values=[""],
    )

    # Copy the filtered rows so the column assignment below writes to an
    # independent frame instead of a slice of the original
    cities_df = cities_df[cities_df.population.astype(int) > minimum_population].copy()
    cities_df["synonyms"] = cities_df["synonyms"].str.split(",")

    # Countries are always part of the output, whether or not a city uses them
    terms = {country_term.identifier: country_term for country_term in code_to_country.values()}

    cols = ["geonames_id", "name", "synonyms", "country_code", "admin1", "admin2", "feature_code"]
    for identifier, name, synonyms, country, admin1, admin2, feature_code in cities_df[cols].values:
        terms[identifier] = term = Term.from_triple(
            PREFIX, identifier, name if pd.notna(name) else None, type="Instance"
        )
        term.append_parent(Reference(prefix="geonames.feature", identifier=feature_code))

        # A missing synonyms cell stays NaN after the split, so only lists are walked
        if isinstance(synonyms, list):
            for synonym in synonyms:
                if synonym:  # consecutive commas leave empty strings behind
                    term.append_synonym(synonym)

        if pd.isna(admin1):
            tqdm.write(f"[geonames:{identifier}] missing admin 1 code for {name} ({country})")
            continue

        admin1_full = f"{country}.{admin1}"
        admin1_term = code_to_admin1.get(admin1_full)
        if admin1_term is None:
            logger.info(f"could not find admin1 {admin1_full}")
            continue

        terms[admin1_term.identifier] = admin1_term

        admin2_term = None
        if pd.notna(admin2):
            admin2_term = code_to_admin2.get(f"{country}.{admin1}.{admin2}")

        if admin2_term is not None:
            term.append_relationship(part_of, admin2_term)
            terms[admin2_term.identifier] = admin2_term
        else:
            # No admin2 code, or one that cannot be resolved: attach the city
            # directly to its admin1 division so it is never left without a parent
            term.append_relationship(part_of, admin1_term)

    return terms
226
+
227
+
228
if __name__ == "__main__":
    # Script entry point: write the default outputs, including the OBO file
    getter = GeonamesGetter()
    getter.write_default(write_obo=True, force=True)
pyobo/sources/hgnc.py CHANGED
@@ -27,6 +27,7 @@ from pyobo.struct import (
27
27
  orthologous,
28
28
  transcribes_to,
29
29
  )
30
+ from pyobo.struct.typedef import exact_match
30
31
  from pyobo.utils.path import ensure_path, prefix_directory_join
31
32
 
32
33
  __all__ = [
@@ -108,6 +109,28 @@ ENCODINGS = {
108
109
  "unknown": "GRP",
109
110
  }
110
111
 
112
+ SKIP_KEYS = {
113
+ "date_approved_reserved",
114
+ "_version_",
115
+ "uuid",
116
+ "date_modified",
117
+ "date_name_changed",
118
+ "date_symbol_changed",
119
+ "symbol_report_tag",
120
+ "location_sortable",
121
+ "curator_notes",
122
+ "agr", # repeat of HGNC ID
123
+ "gencc", # repeat of HGNC ID
124
+ "bioparadigms_slc", # repeat of symbol
125
+ "lncrnadb", # repeat of symbol
126
+ "gtrnadb", # repeat of symbol
127
+ "horde_id", # repeat of symbol
128
+ "imgt", # repeat of symbol
129
+ "cd", # symbol
130
+ "homeodb", # TODO add to bioregistry, though this is defunct
131
+ "mamit-trnadb", # TODO add to bioregistry, though this is defunct
132
+ }
133
+
111
134
  #: A mapping from HGNC's locus_type annotations to sequence ontology identifiers
112
135
  LOCUS_TYPE_TO_SO = {
113
136
  # protein-coding gene
@@ -190,6 +213,7 @@ class HGNCGetter(Obo):
190
213
  transcribes_to,
191
214
  orthologous,
192
215
  member_of,
216
+ exact_match,
193
217
  ]
194
218
  idspaces = IDSPACES
195
219
  synonym_typedefs = [
@@ -330,6 +354,12 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
330
354
  else:
331
355
  tqdm.write(f"unhandled IUPHAR: {iuphar}")
332
356
 
357
+ for lrg_info in entry.pop("lsdb", []):
358
+ if lrg_info.startswith("LRG_"):
359
+ lrg_curie = lrg_info.split("|")[0]
360
+ _, lrg_id = lrg_curie.split("_")
361
+ term.append_xref(Reference(prefix="lrg", identifier=lrg_id))
362
+
333
363
  for xref_prefix, key in gene_xrefs:
334
364
  xref_identifiers = entry.pop(key, None)
335
365
  if xref_identifiers is None:
@@ -397,7 +427,8 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
397
427
  term.set_species(identifier="9606", name="Homo sapiens")
398
428
 
399
429
  for key in entry:
400
- unhandled_entry_keys[key] += 1
430
+ if key not in SKIP_KEYS:
431
+ unhandled_entry_keys[key] += 1
401
432
  yield term
402
433
 
403
434
  with open(prefix_directory_join(PREFIX, name="unhandled.json"), "w") as file:
@@ -36,7 +36,7 @@ class HGNCGroupGetter(Obo):
36
36
  """An ontology representation of HGNC's gene group nomenclature."""
37
37
 
38
38
  ontology = PREFIX
39
- dynamic_version = True
39
+ bioversions_key = "hgnc"
40
40
  synonym_typedefs = [symbol_type]
41
41
  typedefs = [from_species, enables]
42
42
 
pyobo/sources/mgi.py CHANGED
@@ -9,6 +9,8 @@ from typing import Iterable
9
9
  import pandas as pd
10
10
  from tqdm.auto import tqdm
11
11
 
12
+ from pyobo.struct.typedef import exact_match
13
+
12
14
  from ..struct import (
13
15
  Obo,
14
16
  Reference,
@@ -37,7 +39,7 @@ class MGIGetter(Obo):
37
39
 
38
40
  ontology = PREFIX
39
41
  dynamic_version = True
40
- typedefs = [from_species, has_gene_product, transcribes_to]
42
+ typedefs = [from_species, has_gene_product, transcribes_to, exact_match]
41
43
 
42
44
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
43
45
  """Iterate over terms in the ontology."""
@@ -161,7 +163,9 @@ def get_terms(force: bool = False) -> Iterable[Term]:
161
163
  for synonym in mgi_to_synonyms[identifier]:
162
164
  term.append_synonym(Synonym(name=synonym))
163
165
  if identifier in mgi_to_entrez_id:
164
- term.append_xref(Reference(prefix="ncbigene", identifier=mgi_to_entrez_id[identifier]))
166
+ term.append_exact_match(
167
+ Reference(prefix="ncbigene", identifier=mgi_to_entrez_id[identifier])
168
+ )
165
169
  for ensembl_id in mgi_to_ensemble_accession_ids[identifier]:
166
170
  term.append_xref(Reference(prefix="ensembl", identifier=ensembl_id))
167
171
  for ensembl_id in mgi_to_ensemble_transcript_ids[identifier]:
pyobo/sources/mirbase.py CHANGED
@@ -136,6 +136,8 @@ def _process_definitions_lines(
136
136
  xref_prefix, xref_identifier, xref_label = map(str.strip, line.split(";"))
137
137
  xref_prefix = xref_prefix.lower()
138
138
  xref_prefix = xref_mapping.get(xref_prefix, xref_prefix)
139
+ if xref_prefix == "pictar":
140
+ continue
139
141
  xrefs.append(
140
142
  Reference(prefix=xref_prefix, identifier=xref_identifier, name=xref_label or None)
141
143
  )
@@ -26,6 +26,7 @@ class MiRBaseFamilyGetter(Obo):
26
26
 
27
27
  ontology = PREFIX
28
28
  bioversions_key = "mirbase"
29
+ typedefs = [has_member]
29
30
 
30
31
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
31
32
  """Iterate over terms in the ontology."""
@@ -40,7 +41,9 @@ def get_obo(force: bool = False) -> Obo:
40
41
  def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
41
42
  """Get miRBase family terms."""
42
43
  df = get_df(version, force=force)
43
- for family_id, name, mirna_id, mirna_name in tqdm(df.values, total=len(df.index)):
44
+ for family_id, name, mirna_id, mirna_name in tqdm(
45
+ df.values, total=len(df.index), unit_scale=True, desc="miRBase Family"
46
+ ):
44
47
  term = Term(
45
48
  reference=Reference(prefix=PREFIX, identifier=family_id, name=name),
46
49
  )
@@ -65,4 +68,4 @@ def get_df(version: str, force: bool = False) -> pd.DataFrame:
65
68
 
66
69
 
67
70
  if __name__ == "__main__":
68
- get_obo().write_default(use_tqdm=True)
71
+ get_obo().write_default(use_tqdm=True, write_obo=True, force=True)
@@ -39,15 +39,16 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
39
39
  for _, name, previous_name, mirbase_mature_id in tqdm(
40
40
  df.values, total=len(df.index), unit_scale=True
41
41
  ):
42
+ synonyms = []
43
+ if pd.notna(previous_name):
44
+ synonyms.append(Synonym(name=previous_name))
42
45
  yield Term(
43
46
  reference=Reference(
44
47
  prefix=PREFIX, identifier=mirbase_mature_id, name=name if pd.notna(name) else None
45
48
  ),
46
- synonyms=[
47
- Synonym(name=previous_name),
48
- ],
49
+ synonyms=synonyms,
49
50
  )
50
51
 
51
52
 
52
53
  if __name__ == "__main__":
53
- MiRBaseMatureGetter.cli()
54
+ get_obo().write_default(write_obo=True, write_obograph=True, use_tqdm=True)
pyobo/sources/npass.py CHANGED
@@ -77,7 +77,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
77
77
  logger.debug("multiple cids for %s: %s", identifier, pubchem_compound_ids)
78
78
  for pubchem_compound_id in pubchem_compound_ids:
79
79
  term.append_xref(
80
- Reference(prefix="pubchem.compound", identifier=pubchem_compound_id)
80
+ Reference(prefix="pubchem.compound", identifier=pubchem_compound_id.strip())
81
81
  )
82
82
 
83
83
  for synonym in [iupac]:
pyobo/sources/pombase.py CHANGED
@@ -29,7 +29,7 @@ class PomBaseGetter(Obo):
29
29
  """An ontology representation of PomBase's fission yeast gene nomenclature."""
30
30
 
31
31
  ontology = bioversions_key = PREFIX
32
- typedefs = [from_species, has_gene_product]
32
+ typedefs = [from_species, has_gene_product, orthologous]
33
33
 
34
34
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
35
35
  """Iterate over terms in the ontology."""
pyobo/sources/ror.py ADDED
@@ -0,0 +1,163 @@
1
+ """Convert the Research Organization Registry (ROR) into an ontology."""
2
+
3
+ import json
4
+ import zipfile
5
+ from typing import Iterable
6
+
7
+ import bioregistry
8
+ import zenodo_client
9
+ from tqdm.auto import tqdm
10
+
11
+ from pyobo.struct import Obo, Reference, Term, TypeDef
12
+ from pyobo.struct.struct import acronym
13
+
14
PREFIX = "ror"
#: Zenodo record whose latest version is resolved in ``_get_info``
ROR_ZENODO_RECORD_ID = "10086202"

# Class that every ROR term is given as its parent
ORG_CLASS = Reference(prefix="OBI", identifier="0000245")

# Relations used for ROR relationships and for address locations
LOCATED_IN = Reference(prefix="RO", identifier="0001025")
PART_OF = Reference(prefix="BFO", identifier="0000050")
HAS_PART = Reference(prefix="BFO", identifier="0000051")
SUCCESSOR = Reference(prefix="BFO", identifier="0000063")
PREDECESSOR = Reference(prefix="BFO", identifier="0000062")

#: Keyed by the ``type`` of a ROR relationship; "Located in" is used for addresses
RMAP = {
    "Related": TypeDef.from_triple("rdfs", "seeAlso"),
    "Child": TypeDef(HAS_PART),
    "Parent": TypeDef(PART_OF),
    "Predecessor": TypeDef(PREDECESSOR),
    "Successor": TypeDef(SUCCESSOR),
    "Located in": TypeDef(LOCATED_IN),
}

#: Replacement names, applied to a record's name before its term is built
NAME_REMAPPING = {
    # Names that start with an apostrophe
    "'s-Hertogenbosch": "Den Bosch",
    "'s Heeren Loo": "s Heeren Loo",
    "'s-Heerenberg": "s-Heerenberg",
    # Names that contain a backslash
    "Institut Virion\\Serion": "Institut Virion/Serion",
    "Hematology\\Oncology Clinic": "Hematology/Oncology Clinic",
}
40
+
41
+
42
class RORGetter(Obo):
    """Expose the Research Organization Registry (ROR) as an ontology."""

    ontology = bioregistry_key = PREFIX
    typedefs = [*RMAP.values()]
    synonym_typedefs = [acronym]
    # URI prefixes for every prefix declared for this ontology
    idspaces = dict(
        ror="https://ror.org/",
        geonames="https://www.geonames.org/",
        envo="http://purl.obolibrary.org/obo/ENVO_",
        bfo="http://purl.obolibrary.org/obo/BFO_",
        ro="http://purl.obolibrary.org/obo/RO_",
        obi="http://purl.obolibrary.org/obo/OBI_",
        omo="http://purl.obolibrary.org/obo/OMO_",
        rdfs="http://www.w3.org/2000/01/rdf-schema#",
    )

    def __post_init__(self):  # noqa: D105
        # The version has to be set before the parent hook runs; the URL and
        # path also returned by ``_get_info`` are not needed here
        info = _get_info()
        self.data_version = info[0]
        super().__post_init__()

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Return the ROR terms.

        :param force: Forwarded unchanged to :func:`iterate_ror_terms`.
        :returns: The iterable produced by :func:`iterate_ror_terms`.
        """
        terms = iterate_ror_terms(force=force)
        return terms
66
+
67
+
68
def iterate_ror_terms(*, force: bool = False) -> Iterable[Term]:
    """Iterate over terms in ROR.

    Each record becomes one instance term carrying its relationships, the
    GeoNames cities of its addresses, its labels, aliases and acronyms as
    synonyms, and its external identifiers as cross-references.

    :param force: Forwarded unchanged to :func:`get_latest`.
    :yields: One term per record, in the order of the records.
    """
    _version, _source_uri, records = get_latest(force=force)
    # Each unknown external-id prefix is reported only once
    unhandled_xref_prefixes = set()
    for record in tqdm(records, unit_scale=True, unit="record", desc=PREFIX):
        identifier = record["id"].removeprefix("https://ror.org/")
        name = record["name"]
        name = NAME_REMAPPING.get(name, name)

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=identifier, name=name), type="Instance"
        )
        term.append_parent(ORG_CLASS)

        if name.startswith("The "):
            term.append_synonym(name.removeprefix("The "))

        for relationship in record.get("relationships", []):
            target_id = relationship["id"].removeprefix("https://ror.org/")
            term.append_relationship(
                RMAP[relationship["type"]], Reference(prefix=PREFIX, identifier=target_id)
            )

        # Any status other than "active" marks the term obsolete
        term.is_obsolete = record.get("status") != "active"

        for address in record.get("addresses", []):
            city = address.get("geonames_city")
            if not city:
                continue
            term.append_relationship(
                RMAP["Located in"], Reference(prefix="geonames", identifier=str(city["id"]))
            )

        for label_record in record.get("labels", []):
            label = label_record["label"]  # there's a language available in this dict too
            term.append_synonym(label)
            if label.startswith("The "):
                term.append_synonym(label.removeprefix("The "))

        for synonym in record.get("aliases", []):
            term.append_synonym(synonym)
            if synonym.startswith("The "):
                term.append_synonym(synonym.removeprefix("The "))

        for acronym_synonym in record.get("acronyms", []):
            term.append_synonym(acronym_synonym, type=acronym)

        for prefix, xref_data in record.get("external_ids", {}).items():
            if prefix == "OrgRef":
                # OrgRef refers to wikipedia page id, see
                # https://stackoverflow.com/questions/6168020/what-is-wikipedia-pageid-how-to-change-it-into-real-page-url
                continue

            # Normalise a lone string to a list first, so both branches below
            # iterate identifiers rather than the characters of a string
            identifiers = xref_data["all"]
            if isinstance(identifiers, str):
                identifiers = [identifiers]

            norm_prefix = bioregistry.normalize_prefix(prefix)
            if norm_prefix is None:
                if prefix not in unhandled_xref_prefixes:
                    tqdm.write(f"Unhandled prefix: {prefix} in {name} ({term.curie}). Values:")
                    for xref_id in identifiers:
                        tqdm.write(f"- {xref_id}")
                    unhandled_xref_prefixes.add(prefix)
                continue

            for xref_id in identifiers:
                term.append_xref(Reference(prefix=norm_prefix, identifier=xref_id.replace(" ", "")))

        yield term
136
+
137
+
138
def _get_info(*, force: bool = False):
    """Resolve the latest ROR record on Zenodo and download its first file.

    :param force: Forwarded unchanged to the Zenodo client's ``download``.
    :returns: A triple of the version with leading ``v`` characters stripped,
        the file's ``self`` link, and the value returned by ``download``.
    """
    zenodo = zenodo_client.Zenodo()
    record_id = zenodo.get_latest_record(ROR_ZENODO_RECORD_ID)
    payload = zenodo.get_record(record_id).json()
    version = payload["metadata"]["version"].lstrip("v")
    # Only the first file listed on the record is used
    first_file = payload["files"][0]
    file_name, file_url = first_file["key"], first_file["links"]["self"]
    local_path = zenodo.download(record_id, name=file_name, force=force)
    return version, file_url, local_path
149
+
150
+
151
def get_latest(*, force: bool = False):
    """Get the latest ROR metadata and records.

    :param force: Forwarded unchanged to :func:`_get_info`.
    :returns: A triple of version, source URL, and the parsed content of the
        first archive member whose name ends in ``.json``.
    :raises FileNotFoundError: If the archive has no ``.json`` member.
    """
    version, url, path = _get_info(force=force)
    with zipfile.ZipFile(path) as zf:
        for zip_info in zf.infolist():
            if zip_info.filename.endswith(".json"):
                with zf.open(zip_info) as file:
                    return version, url, json.load(file)
    # Name the archive so a changed upstream layout is easy to diagnose
    raise FileNotFoundError(f"no .json member found in ROR archive: {path}")
160
+
161
+
162
if __name__ == "__main__":
    # Script entry point: write the default outputs, including the OBO file
    getter = RORGetter()
    getter.write_default(write_obo=True, force=True)
pyobo/sources/sgd.py CHANGED
@@ -5,7 +5,7 @@
5
5
  from typing import Iterable
6
6
  from urllib.parse import unquote_plus
7
7
 
8
- from ..struct import Obo, Reference, Synonym, SynonymTypeDef, Term, from_species
8
+ from ..struct import Obo, Reference, Synonym, Term, from_species
9
9
  from ..utils.path import ensure_tar_df
10
10
 
11
11
  __all__ = [
@@ -21,15 +21,12 @@ URL = (
21
21
  )
22
22
  INNER_PATH = "S288C_reference_genome_R64-2-1_20150113/saccharomyces_cerevisiae_R64-2-1_20150113.gff"
23
23
 
24
- alias_type = SynonymTypeDef.from_text("alias")
25
-
26
24
 
27
25
  class SGDGetter(Obo):
28
26
  """An ontology representation of SGD's yeast gene nomenclature."""
29
27
 
30
28
  bioversions_key = ontology = PREFIX
31
29
  typedefs = [from_species]
32
- synonym_typedefs = [alias_type]
33
30
 
34
31
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
35
32
  """Iterate over terms for SGD."""
@@ -68,7 +65,7 @@ def get_terms(ontology: Obo, force: bool = False) -> Iterable[Term]:
68
65
  aliases = d.get("Alias")
69
66
  if aliases:
70
67
  for alias in aliases.split(","):
71
- synonyms.append(Synonym(name=unquote_plus(alias), type=alias_type))
68
+ synonyms.append(Synonym(name=unquote_plus(alias)))
72
69
 
73
70
  term = Term(
74
71
  reference=Reference(prefix=PREFIX, identifier=identifier, name=name),
pyobo/sources/slm.py CHANGED
@@ -7,7 +7,9 @@ from typing import Iterable
7
7
  import pandas as pd
8
8
  from tqdm.auto import tqdm
9
9
 
10
- from pyobo import Obo, SynonymTypeDef, Term
10
+ from pyobo import Obo, Reference, Term
11
+ from pyobo.struct.struct import abbreviation as abbreviation_typedef
12
+ from pyobo.struct.typedef import exact_match, has_inchi, has_smiles
11
13
  from pyobo.utils.path import ensure_df
12
14
 
13
15
  __all__ = [
@@ -37,14 +39,13 @@ COLUMNS = [
37
39
  "PMID",
38
40
  ]
39
41
 
40
- abreviation_type = SynonymTypeDef.from_text("abbreviation")
41
-
42
42
 
43
43
  class SLMGetter(Obo):
44
44
  """An ontology representation of SwissLipid's lipid nomenclature."""
45
45
 
46
46
  ontology = bioversions_key = PREFIX
47
- synonym_typedefs = [abreviation_type]
47
+ typedefs = [exact_match]
48
+ synonym_typedefs = [abbreviation_typedef]
48
49
 
49
50
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
50
51
  """Iterate over terms in the ontology."""
@@ -90,28 +91,29 @@ def iter_terms(version: str, force: bool = False):
90
91
  else:
91
92
  raise ValueError(identifier)
92
93
  term = Term.from_triple(PREFIX, identifier, name)
93
- term.append_property("level", level)
94
+ if pd.notna(level):
95
+ term.append_property("level", level)
94
96
  if pd.notna(abbreviation):
95
- term.append_synonym(abbreviation, type=abreviation_type)
97
+ term.append_synonym(abbreviation, type=abbreviation_typedef)
96
98
  if pd.notna(synonyms):
97
99
  for synonym in synonyms.split("|"):
98
100
  term.append_synonym(synonym.strip())
99
101
  if pd.notna(smiles):
100
- term.append_property("smiles", smiles)
102
+ term.append_property(has_smiles, smiles)
101
103
  if pd.notna(inchi) and inchi != "InChI=none":
102
104
  if inchi.startswith("InChI="):
103
105
  inchi = inchi[len("InChI=") :]
104
- term.append_property("inchi", inchi)
106
+ term.append_property(has_inchi, inchi)
105
107
  if pd.notna(inchikey):
106
108
  if inchikey.startswith("InChIKey="):
107
109
  inchikey = inchikey[len("InChIKey=") :]
108
- term.append_property("inchikey", inchikey)
110
+ term.append_exact_match(Reference(prefix="inchikey", identifier=inchikey))
109
111
  if pd.notna(chebi_id):
110
- term.append_xref(("chebi", chebi_id))
112
+ term.append_exact_match(("chebi", chebi_id))
111
113
  if pd.notna(lipidmaps_id):
112
- term.append_xref(("lipidmaps", lipidmaps_id))
114
+ term.append_exact_match(("lipidmaps", lipidmaps_id))
113
115
  if pd.notna(hmdb_id):
114
- term.append_xref(("hmdb", hmdb_id))
116
+ term.append_exact_match(("hmdb", hmdb_id))
115
117
  if pd.notna(pmids):
116
118
  for pmid in pmids.split("|"):
117
119
  term.append_provenance(("pubmed", pmid))