pymetadata 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pymetadata might be problematic. Click here for more details.

pymetadata/__init__.py CHANGED
@@ -3,7 +3,7 @@
3
3
  from pathlib import Path
4
4
 
5
5
  __author__ = "Matthias Koenig"
6
- __version__ = "0.5.4"
6
+ __version__ = "0.5.5"
7
7
 
8
8
 
9
9
  program_name: str = "pymetadata"
@@ -5,12 +5,14 @@ Core data structure to store annotations.
5
5
 
6
6
  import re
7
7
  import urllib
8
+ from enum import Enum
8
9
  from pprint import pprint
9
10
  from typing import Any, Dict, Final, List, Optional, Tuple, Union
10
11
 
11
12
  import requests
12
13
 
13
14
  from pymetadata import log
15
+ from pymetadata.console import console
14
16
  from pymetadata.core.xref import CrossReference, is_url
15
17
  from pymetadata.identifiers.miriam import BQB, BQM
16
18
  from pymetadata.identifiers.registry import REGISTRY
@@ -19,19 +21,31 @@ from pymetadata.ontologies.ols import ONTOLOGIES, OLSQuery
19
21
 
20
22
  OLS_QUERY = OLSQuery(ontologies=ONTOLOGIES)
21
23
 
22
- IDENTIFIERS_ORG_PREFIX: Final = "http://identifiers.org"
24
+ IDENTIFIERS_ORG_PREFIX: Final = "https://identifiers.org"
23
25
  IDENTIFIERS_ORG_PATTERN1: Final = re.compile(r"^https?://identifiers.org/(.+?)/(.+)")
24
26
  IDENTIFIERS_ORG_PATTERN2: Final = re.compile(r"^https?://identifiers.org/(.+)")
27
+
28
+ BIOREGISTRY_PREFIX: Final = "https://bioregistry.io"
29
+ BIOREGISTRY_PATTERN: Final = re.compile(r"^https?://bioregistry.io/(.+)")
30
+
25
31
  MIRIAM_URN_PATTERN: Final = re.compile(r"^urn:miriam:(.+)")
26
32
 
27
33
  logger = log.get_logger(__name__)
28
34
 
29
35
 
36
+ class ProviderType(str, Enum):
37
+ """Provider type."""
38
+
39
+ IDENTIFIERS_ORG = "identifiers.org"
40
+ BIOREGISTRY_IO = "bioregistry.io"
41
+ NONE = "none"
42
+
43
+
30
44
  class RDFAnnotation:
31
45
  """RDFAnnotation class.
32
46
 
33
47
  Basic storage of annotation information. This consists of the relation
34
- and the the resource.
48
+ and the resource.
35
49
  The annotations can be attached to other objects thereby forming
36
50
  triples which can be converted to RDF.
37
51
 
@@ -40,6 +54,7 @@ class RDFAnnotation:
40
54
  - `collection/term`, i.e., the combination of collection and term
41
55
  - `http(s)://arbitrary.url`, an arbitrary URL
42
56
  - urn:miriam:uniprot:P03023
57
+ - https://bioregistry.io/chebi:15996 urls via the bioregistry provider
43
58
  """
44
59
 
45
60
  replaced_collections: Dict[str, str] = {
@@ -53,6 +68,7 @@ class RDFAnnotation:
53
68
  self.collection: Optional[str] = None
54
69
  self.term: Optional[str] = None
55
70
  self.resource: str = resource
71
+ self.provider: ProviderType = ProviderType.IDENTIFIERS_ORG
56
72
 
57
73
  if not qualifier:
58
74
  raise ValueError(
@@ -75,15 +91,19 @@ class RDFAnnotation:
75
91
  if match1:
76
92
  # handle identifiers.org pattern
77
93
  self.collection, self.term = match1.group(1), match1.group(2)
94
+ self.provider = ProviderType.IDENTIFIERS_ORG
78
95
 
79
96
  if not self.collection:
80
- # tests new short pattern
97
+ # tests new compact patterns
81
98
  match2 = IDENTIFIERS_ORG_PATTERN2.match(resource)
82
99
  if match2:
83
100
  tokens = match2.group(1).split(":")
84
101
  if len(tokens) == 2:
85
102
  self.collection = tokens[0].lower()
103
+
104
+ # check if the namespace is embedded
86
105
  self.term = match2.group(1)
106
+ self.provider = ProviderType.IDENTIFIERS_ORG
87
107
  else:
88
108
  logger.warning(
89
109
  f"Identifiers.org URL does not conform to new"
@@ -94,10 +114,16 @@ class RDFAnnotation:
94
114
  # other urls are directly stored as resources without collection
95
115
  self.collection = None
96
116
  self.term = resource
97
- logger.debug(
98
- f"{resource} does not conform to "
99
- f"http(s)://identifiers.org/collection/id or http(s)://identifiers.org/id",
100
- )
117
+ if BIOREGISTRY_PATTERN.match(resource):
118
+ self.provider = ProviderType.BIOREGISTRY_IO
119
+ console.print(self.provider)
120
+ else:
121
+ self.provider = ProviderType.NONE
122
+ logger.warning(
123
+ f"{resource} does not conform to "
124
+ f"http(s)://identifiers.org/collection/id or http(s)://identifiers.org/id or "
125
+ f"https://bioregistry.io/id .",
126
+ )
101
127
 
102
128
  # handle urns
103
129
  elif resource.startswith("urn:miriam:"):
@@ -106,6 +132,7 @@ class RDFAnnotation:
106
132
  tokens = match3.group(1).split(":")
107
133
  self.collection = tokens[0]
108
134
  self.term = ":".join(tokens[1:]).replace("%3A", ":")
135
+ self.provider = ProviderType.IDENTIFIERS_ORG
109
136
 
110
137
  logger.warning(
111
138
  f"Deprecated urn pattern `{resource}` updated: "
@@ -118,9 +145,11 @@ class RDFAnnotation:
118
145
  if len(tokens) > 1:
119
146
  self.collection = tokens[0]
120
147
  self.term = "/".join(tokens[1:])
148
+ self.provider = ProviderType.IDENTIFIERS_ORG
121
149
  elif len(tokens) == 1 and ":" in tokens[0]:
122
150
  self.collection = tokens[0].split(":")[0].lower()
123
151
  self.term = tokens[0]
152
+ self.provider = ProviderType.IDENTIFIERS_ORG
124
153
 
125
154
  # validation
126
155
  if len(tokens) < 2 and not self.collection:
@@ -132,6 +161,13 @@ class RDFAnnotation:
132
161
  )
133
162
  self.collection = None
134
163
  self.term = resource
164
+ self.provider = ProviderType.NONE
165
+
166
+ # shorten compact terms
167
+ if self.term and self.collection:
168
+ self.term = self.shorten_compact_term(
169
+ term=self.term, collection=self.collection
170
+ )
135
171
 
136
172
  # clean legacy collections
137
173
  if self.collection in self.replaced_collections:
@@ -139,6 +175,21 @@ class RDFAnnotation:
139
175
 
140
176
  self.validate()
141
177
 
178
+ @staticmethod
179
+ def shorten_compact_term(term: str, collection: str) -> str:
180
+ """Shorten the compact terms and return term.
181
+
182
+ If the namespace is not embedded in the term, return the shortened term.
183
+ """
184
+ namespace = REGISTRY.ns_dict.get(collection, None)
185
+ if namespace and not namespace.namespaceEmbeddedInLui:
186
+ # shorter term
187
+ if term.lower().startswith(collection):
188
+ tokens = term.split(":")
189
+ term = ":".join(tokens[1:])
190
+
191
+ return term
192
+
142
193
  @staticmethod
143
194
  def from_tuple(t: Tuple[Union[BQB, BQM], str]) -> "RDFAnnotation":
144
195
  """Construct from tuple."""
@@ -164,12 +215,12 @@ class RDFAnnotation:
164
215
 
165
216
  def __repr__(self) -> str:
166
217
  """Get representation string."""
167
- return f"RDFAnnotation({self.qualifier}|{self.collection}|{self.term})"
218
+ return f"RDFAnnotation({self.qualifier}|{self.collection}|{self.term}|{self.provider.value})"
168
219
 
169
220
  def to_dict(self) -> Dict:
170
221
  """Convert to dict."""
171
222
  return {
172
- "qualifier": self.qualifier.value, # FIXME use enums!
223
+ "qualifier": self.qualifier.value,
173
224
  "collection": self.collection,
174
225
  "term": self.term,
175
226
  }
@@ -346,52 +397,59 @@ class RDFAnnotationData(RDFAnnotation):
346
397
 
347
398
  if __name__ == "__main__":
348
399
  for annotation in [
349
- # FIXME: support this
350
- RDFAnnotation(
351
- qualifier=BQB.IS_VERSION_OF,
352
- resource="NCIT:C75913",
353
- ),
354
- RDFAnnotation(
355
- qualifier=BQB.IS_VERSION_OF,
356
- resource="taxonomy/562",
357
- ),
358
- RDFAnnotation(
359
- qualifier=BQB.IS_VERSION_OF,
360
- resource="http://identifiers.org/taxonomy/9606",
361
- ),
362
400
  RDFAnnotation(
363
401
  qualifier=BQB.IS_VERSION_OF,
364
- resource="http://identifiers.org/biomodels.sbo/SBO:0000247",
365
- ),
366
- RDFAnnotation(
367
- qualifier=BQB.IS_VERSION_OF, resource="urn:miriam:obo.go:GO%3A0005623"
368
- ),
369
- RDFAnnotation(
370
- qualifier=BQB.IS_VERSION_OF, resource="urn:miriam:chebi:CHEBI%3A33699"
371
- ),
372
- RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="chebi/CHEBI:456215"),
373
- RDFAnnotation(
374
- qualifier=BQB.IS, resource="https://en.wikipedia.org/wiki/Cytosol"
375
- ),
376
- RDFAnnotation(
377
- qualifier=BQB.IS_VERSION_OF, resource="urn:miriam:uniprot:P03023"
378
- ),
379
- RDFAnnotation(
380
- qualifier=BQB.IS_VERSION_OF,
381
- resource="http://identifiers.org/go/GO:0005829",
382
- ),
383
- RDFAnnotation(
384
- qualifier=BQB.IS_VERSION_OF, resource="http://identifiers.org/go/GO:0005829"
385
- ),
386
- RDFAnnotation(
387
- qualifier=BQB.IS_VERSION_OF, resource="http://identifiers.org/GO:0005829"
388
- ),
389
- RDFAnnotation(
390
- qualifier=BQB.IS_VERSION_OF, resource="http://identifiers.org/GO:0005829"
402
+ resource="https://bioregistry.io/chebi:15996",
391
403
  ),
392
- RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="bto/BTO:0000089"),
393
- RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="BTO:0000089"),
394
- RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="chebi/CHEBI:000012"),
404
+ # RDFAnnotation(
405
+ # qualifier=BQB.IS_VERSION_OF,
406
+ # resource="NCIT:C75913",
407
+ # ),
408
+ # RDFAnnotation(
409
+ # qualifier=BQB.IS_VERSION_OF,
410
+ # resource="ncit:C75913",
411
+ # ),
412
+ # RDFAnnotation(
413
+ # qualifier=BQB.IS_VERSION_OF,
414
+ # resource="taxonomy/562",
415
+ # ),
416
+ # RDFAnnotation(
417
+ # qualifier=BQB.IS_VERSION_OF,
418
+ # resource="http://identifiers.org/taxonomy/9606",
419
+ # ),
420
+ # RDFAnnotation(
421
+ # qualifier=BQB.IS_VERSION_OF,
422
+ # resource="http://identifiers.org/biomodels.sbo/SBO:0000247",
423
+ # ),
424
+ # RDFAnnotation(
425
+ # qualifier=BQB.IS_VERSION_OF, resource="urn:miriam:obo.go:GO%3A0005623"
426
+ # ),
427
+ # RDFAnnotation(
428
+ # qualifier=BQB.IS_VERSION_OF, resource="urn:miriam:chebi:CHEBI%3A33699"
429
+ # ),
430
+ # RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="chebi/CHEBI:456215"),
431
+ # RDFAnnotation(
432
+ # qualifier=BQB.IS, resource="https://en.wikipedia.org/wiki/Cytosol"
433
+ # ),
434
+ # RDFAnnotation(
435
+ # qualifier=BQB.IS_VERSION_OF, resource="urn:miriam:uniprot:P03023"
436
+ # ),
437
+ # RDFAnnotation(
438
+ # qualifier=BQB.IS_VERSION_OF,
439
+ # resource="http://identifiers.org/go/GO:0005829",
440
+ # ),
441
+ # RDFAnnotation(
442
+ # qualifier=BQB.IS_VERSION_OF, resource="http://identifiers.org/go/GO:0005829"
443
+ # ),
444
+ # RDFAnnotation(
445
+ # qualifier=BQB.IS_VERSION_OF, resource="http://identifiers.org/GO:0005829"
446
+ # ),
447
+ # RDFAnnotation(
448
+ # qualifier=BQB.IS_VERSION_OF, resource="http://identifiers.org/GO:0005829"
449
+ # ),
450
+ # RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="bto/BTO:0000089"),
451
+ # RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="BTO:0000089"),
452
+ # RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="chebi/CHEBI:000012"),
395
453
  ]:
396
454
  print("-" * 80)
397
455
  data = RDFAnnotationData(annotation)
@@ -88,195 +88,6 @@ class Namespace:
88
88
  self.resources = list()
89
89
 
90
90
 
91
- def ols_namespaces() -> Dict[str, Namespace]:
92
- """Define Ontologies available from OLS but not in identifiers.org."""
93
- ols_info: Dict = {
94
- "deprecated": False,
95
- "deprecationDate": None,
96
- "institution": {
97
- "description": "At EMBL-EBI, we make the "
98
- "world’s public biological data "
99
- "freely available to the "
100
- "scientific community via a "
101
- "range of services and tools, "
102
- "perform basic research and "
103
- "provide professional training "
104
- "in bioinformatics. \n"
105
- "We are part of the European "
106
- "Molecular Biology Laboratory "
107
- "(EMBL), an international, "
108
- "innovative and "
109
- "interdisciplinary research "
110
- "organisation funded by 26 "
111
- "member states and two "
112
- "associate member states.",
113
- "homeUrl": "https://www.ebi.ac.uk",
114
- "id": 2,
115
- "location": {"countryCode": "GB", "countryName": "United Kingdom"},
116
- "name": "European Bioinformatics Institute",
117
- "rorId": "https://ror.org/02catss52",
118
- },
119
- "location": {"countryCode": "GB", "countryName": "United Kingdom"},
120
- "official": False,
121
- "providerCode": "ols",
122
- }
123
-
124
- # Custom namespaces for OLS ontology, for simple support
125
- namespaces = [
126
- Namespace(
127
- id=None,
128
- prefix="omim",
129
- pattern=r"^MI:\d+$",
130
- name="OMIM",
131
- description="Molecular Interactions Controlled Vocabulary",
132
- namespaceEmbeddedInLui=True,
133
- ),
134
- Namespace(
135
- id=None,
136
- prefix="dron",
137
- pattern=r"^DRON:\d+$",
138
- name="DRON",
139
- description="The drug ontology",
140
- namespaceEmbeddedInLui=True,
141
- ),
142
- Namespace(
143
- id=None,
144
- prefix="cmo",
145
- pattern=r"^CMO:\d+$",
146
- name="Chemical methods ontology",
147
- description="Morphological and physiological measurement records "
148
- "generated from clinical and model organism research and health programs.",
149
- namespaceEmbeddedInLui=True,
150
- ),
151
- Namespace(
152
- id=None,
153
- prefix="chmo",
154
- pattern=r"^CHMO:\d+$",
155
- name="Chemical methods ontology",
156
- description="CHMO, the chemical methods ontology",
157
- namespaceEmbeddedInLui=True,
158
- ),
159
- Namespace(
160
- id=None,
161
- prefix="vto",
162
- pattern=r"^VTO:\d+$",
163
- name="Vertebrate Taxonomy Ontology",
164
- description="VTO Vertebrate Taxonomy Ontology",
165
- namespaceEmbeddedInLui=True,
166
- ),
167
- Namespace(
168
- id=None,
169
- prefix="opmi",
170
- pattern=r"^OPMI:\d+$",
171
- name="Ontology of Precision Medicine and Investigation",
172
- description="OPMI: Ontology of Precision Medicine and Investigation",
173
- namespaceEmbeddedInLui=True,
174
- ),
175
- Namespace(
176
- id=None,
177
- prefix="atol",
178
- pattern=r"^ATOL:\d+$",
179
- name="ATOL",
180
- description="Animal Trait Ontology for Livestock",
181
- namespaceEmbeddedInLui=True,
182
- ),
183
- Namespace(
184
- id=None,
185
- prefix="nbo",
186
- pattern=r"^NBO:\d+$",
187
- name="NBO",
188
- description="Neuro Behavior Ontology",
189
- namespaceEmbeddedInLui=True,
190
- ),
191
- Namespace(
192
- id=None,
193
- prefix="scdo",
194
- pattern=r"^SCDO:\d+$",
195
- name="Sickle Cell Disease Ontology",
196
- description="Sickle Cell Disease Ontology",
197
- namespaceEmbeddedInLui=True,
198
- ),
199
- Namespace(
200
- id=None,
201
- prefix="fix",
202
- pattern=r"^FIX:\d+$",
203
- name="Physico-chemical methods and properties Ontology",
204
- description="Physico-chemical methods and properties Ontology",
205
- namespaceEmbeddedInLui=True,
206
- ),
207
- Namespace(
208
- id=None,
209
- prefix="oba",
210
- pattern=r"^OBA:\d+$",
211
- name="Ontology of Biological Attributes",
212
- description="PubChem is an open chemistry database at the National "
213
- "Institutes of Health (NIH).",
214
- namespaceEmbeddedInLui=True,
215
- ),
216
- Namespace(
217
- id=None,
218
- prefix="mmo",
219
- pattern=r"^MMO:\d+$",
220
- name="Measurement method ontology",
221
- description="Measurement method ontology",
222
- namespaceEmbeddedInLui=True,
223
- ),
224
- Namespace(
225
- id=None,
226
- prefix="symp",
227
- pattern=r"^SYMP:\d+$",
228
- name="Symptom ontology",
229
- description="The Symptom Ontology has been developed as a standardized ontology for symptoms of human diseases.",
230
- namespaceEmbeddedInLui=True,
231
- ),
232
- ]
233
-
234
- for ns in namespaces:
235
- if not ns.resources:
236
- ns.resources = []
237
- if not ns.prefix:
238
- continue
239
- ns.resources.append(
240
- Resource(
241
- id=None,
242
- name=f"{ns.prefix} through OLS",
243
- description=f"{ns.prefix} through OLS",
244
- mirId=None,
245
- sampleId=None,
246
- resourceHomeUrl=None,
247
- urlPattern=f"https://www.ebi.ac.uk/ols4/ontologies/{ns.prefix}/terms?obo_id={ns.prefix.upper()}"
248
- + ":{$id}",
249
- **ols_info,
250
- )
251
- )
252
-
253
- return {ns.prefix: ns for ns in namespaces} # type: ignore
254
-
255
-
256
- def misc_namespaces() -> Dict[str, Namespace]:
257
- """Define misc namespaces."""
258
- namespaces = [
259
- Namespace(
260
- id="brenda.ligand",
261
- pattern=r"^\d+$",
262
- name="BRENDA Ligand",
263
- prefix=None,
264
- description="BRENDA Ligand Information",
265
- namespaceEmbeddedInLui=False,
266
- ),
267
- Namespace(
268
- id="metabolights.compound",
269
- pattern=r"^MTBLC\d+$",
270
- name="Metabolights compound",
271
- prefix=None,
272
- description="metabolights compound",
273
- namespaceEmbeddedInLui=False,
274
- ),
275
- ]
276
-
277
- return {ns.id: ns for ns in namespaces} # type: ignore
278
-
279
-
280
91
  class Registry:
281
92
  """Managing the available annotation information.
282
93
 
@@ -284,10 +95,6 @@ class Registry:
284
95
  """
285
96
 
286
97
  URL = "https://registry.api.identifiers.org/resolutionApi/getResolverDataset"
287
- CUSTOM_NAMESPACES = {
288
- **ols_namespaces(),
289
- **misc_namespaces(),
290
- }
291
98
 
292
99
  def __init__(
293
100
  self,
@@ -325,7 +132,6 @@ class Registry:
325
132
 
326
133
  @staticmethod
327
134
  def update_registry(
328
- custom_namespaces: Dict[str, Namespace] = CUSTOM_NAMESPACES,
329
135
  registry_path: Optional[Path] = None,
330
136
  ) -> Dict[str, Namespace]:
331
137
  """Update registry from identifiers.org webservice."""
@@ -338,14 +144,6 @@ class Registry:
338
144
  ns = Namespace.from_dict(data)
339
145
  ns_dict[ns.prefix] = ns
340
146
 
341
- if custom_namespaces is not None:
342
- for key, ns in custom_namespaces.items():
343
- if key in ns_dict:
344
- logger.error(
345
- f"Namespace with key '{key}' exists in MIRIAM. Overwrite namespace!"
346
- )
347
- ns_dict[key] = ns
348
-
349
147
  if registry_path is not None:
350
148
  write_json_cache(
351
149
  data=ns_dict,