pyobo-0.12.10-py3-none-any.whl → pyobo-0.12.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. pyobo/__init__.py +6 -0
  2. pyobo/api/__init__.py +11 -1
  3. pyobo/api/alts.py +18 -4
  4. pyobo/api/embedding.py +108 -9
  5. pyobo/api/names.py +28 -6
  6. pyobo/api/xrefs.py +21 -1
  7. pyobo/cli/cli.py +9 -3
  8. pyobo/cli/database.py +63 -22
  9. pyobo/cli/lookup.py +39 -24
  10. pyobo/cli/utils.py +6 -2
  11. pyobo/constants.py +66 -7
  12. pyobo/getters.py +8 -3
  13. pyobo/ner/api.py +17 -10
  14. pyobo/ner/scispacy_utils.py +2 -0
  15. pyobo/plugins.py +3 -1
  16. pyobo/sources/__init__.py +2 -0
  17. pyobo/sources/antibodyregistry.py +3 -3
  18. pyobo/sources/bigg/bigg_compartment.py +1 -1
  19. pyobo/sources/complexportal.py +3 -3
  20. pyobo/sources/conso.py +3 -3
  21. pyobo/sources/famplex.py +3 -3
  22. pyobo/sources/goldbook.py +86 -0
  23. pyobo/sources/hgnc/hgnc.py +157 -96
  24. pyobo/sources/hgnc/hgncgenefamily.py +14 -13
  25. pyobo/sources/msigdb.py +3 -3
  26. pyobo/sources/omim_ps.py +8 -2
  27. pyobo/sources/reactome.py +3 -3
  28. pyobo/sources/rgd.py +7 -11
  29. pyobo/sources/slm.py +3 -3
  30. pyobo/sources/uniprot/uniprot.py +3 -3
  31. pyobo/sources/wikipathways.py +7 -2
  32. pyobo/struct/__init__.py +2 -2
  33. pyobo/struct/functional/macros.py +1 -1
  34. pyobo/struct/functional/obo_to_functional.py +7 -3
  35. pyobo/struct/obo/reader.py +4 -4
  36. pyobo/struct/struct.py +48 -18
  37. pyobo/struct/struct_utils.py +19 -5
  38. pyobo/struct/typedef.py +19 -3
  39. pyobo/struct/vocabulary.py +6 -3
  40. pyobo/utils/path.py +5 -4
  41. pyobo/version.py +1 -1
  42. {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/METADATA +45 -23
  43. {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/RECORD +46 -45
  44. {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/WHEEL +1 -1
  45. {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/entry_points.txt +0 -0
  46. {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/licenses/LICENSE +0 -0
pyobo/__init__.py CHANGED
@@ -14,6 +14,7 @@ from .api import (
14
14
  get_filtered_relations_df,
15
15
  get_filtered_xrefs,
16
16
  get_graph,
17
+ get_graph_embeddings_df,
17
18
  get_hierarchy,
18
19
  get_id_definition_mapping,
19
20
  get_id_multirelations_mapping,
@@ -37,6 +38,7 @@ from .api import (
37
38
  get_obsolete,
38
39
  get_primary_curie,
39
40
  get_primary_identifier,
41
+ get_primary_reference,
40
42
  get_properties,
41
43
  get_properties_df,
42
44
  get_property,
@@ -44,6 +46,7 @@ from .api import (
44
46
  get_relation,
45
47
  get_relation_mapping,
46
48
  get_relations_df,
49
+ get_semantic_mappings,
47
50
  get_species,
48
51
  get_sssom_df,
49
52
  get_subhierarchy,
@@ -114,6 +117,7 @@ __all__ = [
114
117
  "get_filtered_relations_df",
115
118
  "get_filtered_xrefs",
116
119
  "get_graph",
120
+ "get_graph_embeddings_df",
117
121
  "get_grounder",
118
122
  "get_hierarchy",
119
123
  "get_id_definition_mapping",
@@ -139,6 +143,7 @@ __all__ = [
139
143
  "get_ontology",
140
144
  "get_primary_curie",
141
145
  "get_primary_identifier",
146
+ "get_primary_reference",
142
147
  "get_properties",
143
148
  "get_properties_df",
144
149
  "get_property",
@@ -149,6 +154,7 @@ __all__ = [
149
154
  "get_scispacy_entities",
150
155
  "get_scispacy_entity_linker",
151
156
  "get_scispacy_knowledgebase",
157
+ "get_semantic_mappings",
152
158
  "get_species",
153
159
  "get_sssom_df",
154
160
  "get_subhierarchy",
pyobo/api/__init__.py CHANGED
@@ -5,10 +5,16 @@ from .alts import (
5
5
  get_id_to_alts,
6
6
  get_primary_curie,
7
7
  get_primary_identifier,
8
+ get_primary_reference,
8
9
  )
9
10
  from .combine import get_literal_mappings_subset
10
11
  from .edges import get_edges, get_edges_df, get_graph
11
- from .embedding import get_text_embedding, get_text_embedding_similarity, get_text_embeddings_df
12
+ from .embedding import (
13
+ get_graph_embeddings_df,
14
+ get_text_embedding,
15
+ get_text_embedding_similarity,
16
+ get_text_embeddings_df,
17
+ )
12
18
  from .hierarchy import (
13
19
  get_ancestors,
14
20
  get_children,
@@ -59,6 +65,7 @@ from .typedefs import get_typedef_df
59
65
  from .xrefs import (
60
66
  get_filtered_xrefs,
61
67
  get_mappings_df,
68
+ get_semantic_mappings,
62
69
  get_sssom_df,
63
70
  get_xref,
64
71
  get_xrefs,
@@ -80,6 +87,7 @@ __all__ = [
80
87
  "get_filtered_relations_df",
81
88
  "get_filtered_xrefs",
82
89
  "get_graph",
90
+ "get_graph_embeddings_df",
83
91
  "get_hierarchy",
84
92
  "get_id_definition_mapping",
85
93
  "get_id_multirelations_mapping",
@@ -105,6 +113,7 @@ __all__ = [
105
113
  "get_ontology",
106
114
  "get_primary_curie",
107
115
  "get_primary_identifier",
116
+ "get_primary_reference",
108
117
  "get_priority_curie",
109
118
  "get_properties",
110
119
  "get_properties_df",
@@ -113,6 +122,7 @@ __all__ = [
113
122
  "get_relation",
114
123
  "get_relation_mapping",
115
124
  "get_relations_df",
125
+ "get_semantic_mappings",
116
126
  "get_species",
117
127
  "get_sssom_df",
118
128
  "get_subhierarchy",
pyobo/api/alts.py CHANGED
@@ -20,6 +20,7 @@ __all__ = [
20
20
  "get_id_to_alts",
21
21
  "get_primary_curie",
22
22
  "get_primary_identifier",
23
+ "get_primary_reference",
23
24
  ]
24
25
 
25
26
  logger = logging.getLogger(__name__)
@@ -61,13 +62,13 @@ def get_alts_to_id(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> Mapping[
61
62
  }
62
63
 
63
64
 
64
- def get_primary_curie(
65
+ def get_primary_reference(
65
66
  prefix: str | curies.Reference | curies.ReferenceTuple,
66
67
  identifier: str | None = None,
67
68
  /,
68
69
  **kwargs: Unpack[GetOntologyKwargs],
69
- ) -> str | None:
70
- """Get the primary curie for an entity."""
70
+ ) -> curies.ReferenceTuple | None:
71
+ """Get the primary reference for an entity."""
71
72
  reference = _get_pi(prefix, identifier)
72
73
  try:
73
74
  primary_identifier = get_primary_identifier(reference, **kwargs)
@@ -76,7 +77,20 @@ def get_primary_curie(
76
77
  raise
77
78
  # this happens on invalid prefix. maybe revise?
78
79
  return None
79
- return f"{reference.prefix}:{primary_identifier}"
80
+ return curies.ReferenceTuple(reference.prefix, primary_identifier)
81
+
82
+
83
+ def get_primary_curie(
84
+ prefix: str | curies.Reference | curies.ReferenceTuple,
85
+ identifier: str | None = None,
86
+ /,
87
+ **kwargs: Unpack[GetOntologyKwargs],
88
+ ) -> str | None:
89
+ """Get the primary curie for an entity."""
90
+ reference = get_primary_reference(prefix, identifier, **kwargs)
91
+ if reference is None:
92
+ return None
93
+ return reference.curie
80
94
 
81
95
 
82
96
  def get_primary_identifier(
pyobo/api/embedding.py CHANGED
@@ -2,18 +2,29 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import TYPE_CHECKING
5
+ import tempfile
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Literal
6
8
 
9
+ import bioregistry
7
10
  import curies
8
11
  import numpy as np
9
12
  import pandas as pd
13
+ from tqdm import tqdm
14
+ from typing_extensions import Unpack
10
15
 
11
- from pyobo.api.names import get_definition, get_name, get_references
16
+ from pyobo.api.edges import get_edges_df
17
+ from pyobo.api.names import get_definition, get_id_name_mapping, get_name
18
+ from pyobo.api.utils import get_version_from_kwargs
19
+ from pyobo.constants import GetOntologyKwargs, check_should_force
20
+ from pyobo.identifier_utils import wrap_norm_prefix
21
+ from pyobo.utils.path import CacheArtifact, get_cache_path
12
22
 
13
23
  if TYPE_CHECKING:
14
24
  import sentence_transformers
15
25
 
16
26
  __all__ = [
27
+ "get_graph_embeddings_df",
17
28
  "get_text_embedding",
18
29
  "get_text_embedding_model",
19
30
  "get_text_embedding_similarity",
@@ -31,38 +42,126 @@ def get_text_embedding_model() -> sentence_transformers.SentenceTransformer:
31
42
 
32
43
  def _get_text(
33
44
  reference: str | curies.Reference | curies.ReferenceTuple,
45
+ /,
46
+ *,
47
+ name: str | None = None,
48
+ **kwargs: Unpack[GetOntologyKwargs],
34
49
  ) -> str | None:
35
- name = get_name(reference)
50
+ if name is None:
51
+ name = get_name(reference, **kwargs)
36
52
  if name is None:
37
53
  return None
38
- description = get_definition(reference)
54
+ description = get_definition(reference, **kwargs)
39
55
  if description:
40
56
  name += " " + description
41
57
  return name
42
58
 
43
59
 
60
+ def get_graph_embeddings_df(
61
+ prefix: str,
62
+ *,
63
+ method: Literal["pykeen", "grape"] | None = None,
64
+ epochs: int = 30,
65
+ dimension: int = 32,
66
+ **kwargs: Unpack[GetOntologyKwargs],
67
+ ) -> pd.DataFrame:
68
+ """Get graph machine learning embeddings."""
69
+ if method == "pykeen" or method is None:
70
+ from pykeen.models import PairRE
71
+ from pykeen.training import SLCWATrainingLoop
72
+ from pykeen.triples import TriplesFactory
73
+ from torch.optim import Adam
74
+
75
+ triples_df = get_edges_df(prefix, **kwargs)
76
+ training = TriplesFactory.from_labeled_triples(triples_df.values)
77
+ model = PairRE(triples_factory=training, embedding_dim=dimension)
78
+ optimizer = Adam(params=model.get_grad_params())
79
+ training_loop = SLCWATrainingLoop(
80
+ model=model, triples_factory=training, optimizer=optimizer
81
+ )
82
+ # can also set batch size here
83
+ training_loop.train(triples_factory=training, num_epochs=epochs)
84
+ embeddings = model.entity_representations[0]()
85
+ df = pd.DataFrame(
86
+ embeddings.detach().numpy(),
87
+ index=[training.entity_id_to_label[i] for i in range(embeddings.shape[0])],
88
+ )
89
+
90
+ elif method == "grape":
91
+ from ensmallen import Graph
92
+
93
+ edges_df = get_edges_df(prefix, **kwargs)
94
+ with tempfile.TemporaryDirectory() as d:
95
+ path = Path(d).joinpath("test.tsv")
96
+ edges_df[[":START_ID", ":END_ID"]].to_csv(path, header=None, sep="\t", index=False)
97
+ graph = Graph.from_csv(
98
+ edge_path=str(path),
99
+ edge_list_separator="\t",
100
+ sources_column_number=0,
101
+ destinations_column_number=1,
102
+ edge_list_numeric_node_ids=False,
103
+ directed=True,
104
+ name=bioregistry.get_name(prefix, strict=True),
105
+ verbose=True,
106
+ )
107
+ graph = graph.remove_disconnected_nodes()
108
+
109
+ from embiggen.embedders.ensmallen_embedders.second_order_line import (
110
+ SecondOrderLINEEnsmallen,
111
+ )
112
+
113
+ embedding = SecondOrderLINEEnsmallen(embedding_size=dimension, epochs=epochs).fit_transform(
114
+ graph
115
+ )
116
+ df = embedding.get_all_node_embedding()[0].sort_index()
117
+ # df.columns = [str(c) for c in df.columns]
118
+ else:
119
+ raise ValueError(f"invalid graph machine learning method: {method}")
120
+
121
+ df.index.name = "curie"
122
+ return df
123
+
124
+
125
+ @wrap_norm_prefix
44
126
  def get_text_embeddings_df(
45
127
  prefix: str,
46
128
  *,
47
129
  model: sentence_transformers.SentenceTransformer | None = None,
130
+ **kwargs: Unpack[GetOntologyKwargs],
48
131
  ) -> pd.DataFrame:
49
132
  """Get embeddings for all entities in the resource.
50
133
 
51
134
  :param prefix: A reference, either as a string or Reference object
52
135
  :param model: A sentence transformer model. Defaults to ``all-MiniLM-L6-v2`` if not
53
136
  given.
137
+ :param kwargs: The keyword arguments to forward to ontology getter functions for
138
+ names, definitions, and version
139
+
140
+ :returns: A pandas dataframe with an index representing local unique identifiers and
141
+ columns for the values of the model returned vectors
54
142
  """
143
+ path = get_cache_path(
144
+ prefix, CacheArtifact.embeddings, version=get_version_from_kwargs(prefix, kwargs)
145
+ )
146
+ if path.is_file() and not check_should_force(kwargs):
147
+ df = pd.read_csv(path, sep="\t").set_index(0)
148
+ return df
149
+
150
+ id_to_name = get_id_name_mapping(prefix, **kwargs)
151
+
55
152
  luids, texts = [], []
56
- for reference in get_references(prefix):
57
- text = _get_text(reference)
153
+ for identifier, name in tqdm(id_to_name.items(), desc=f"[{prefix}] constructing text"):
154
+ text = _get_text(curies.ReferenceTuple(prefix, identifier), name=name, **kwargs)
58
155
  if text is None:
59
156
  continue
60
- luids.append(reference.identifier)
157
+ luids.append(identifier)
61
158
  texts.append(text)
62
159
  if model is None:
63
160
  model = get_text_embedding_model()
64
- res = model.encode(texts)
65
- return pd.DataFrame(res, index=luids)
161
+ res = model.encode(texts, show_progress_bar=True)
162
+ df = pd.DataFrame(res, index=luids)
163
+ df.to_csv(path, sep="\t") # index is important here!
164
+ return df
66
165
 
67
166
 
68
167
  def get_text_embedding(
pyobo/api/names.py CHANGED
@@ -6,7 +6,7 @@ import logging
6
6
  import subprocess
7
7
  from collections.abc import Callable, Mapping
8
8
  from functools import lru_cache
9
- from typing import Any, TypeVar
9
+ from typing import TypeVar
10
10
 
11
11
  import curies
12
12
  import pandas as pd
@@ -49,9 +49,15 @@ __all__ = [
49
49
  logger = logging.getLogger(__name__)
50
50
 
51
51
 
52
- def get_name_by_curie(curie: str, **kwargs: Any) -> str | None:
52
+ def get_name_by_curie(
53
+ curie: str,
54
+ /,
55
+ *,
56
+ upgrade_identifier: bool | None = None,
57
+ **kwargs: Unpack[GetOntologyKwargs],
58
+ ) -> str | None:
53
59
  """Get the name for a CURIE, if possible."""
54
- return get_name(curie, **kwargs)
60
+ return get_name(curie, upgrade_identifier=upgrade_identifier, **kwargs)
55
61
 
56
62
 
57
63
  X = TypeVar("X")
@@ -63,6 +69,8 @@ NO_BUILD_LOGGED: set = set()
63
69
  def _help_get(
64
70
  f: Callable[[str, Unpack[GetOntologyKwargs]], Mapping[str, X]],
65
71
  reference: Reference,
72
+ *,
73
+ upgrade_identifier: bool | None = None,
66
74
  **kwargs: Unpack[GetOntologyKwargs],
67
75
  ) -> X | None:
68
76
  """Get the result for an entity based on a mapping maker function ``f``."""
@@ -87,19 +95,32 @@ def _help_get(
87
95
  NO_BUILD_PREFIXES.add(reference.prefix)
88
96
  return None
89
97
 
90
- primary_id = get_primary_identifier(reference, **kwargs)
91
- return mapping.get(primary_id)
98
+ if upgrade_identifier is None:
99
+ if reference.identifier in mapping:
100
+ return mapping[reference.identifier]
101
+ else:
102
+ primary_id = get_primary_identifier(reference, **kwargs)
103
+ return mapping.get(primary_id)
104
+ elif upgrade_identifier is True:
105
+ primary_id = get_primary_identifier(reference, **kwargs)
106
+ return mapping.get(primary_id)
107
+ else:
108
+ return mapping.get(reference.identifier)
92
109
 
93
110
 
94
111
  def get_name(
95
112
  prefix: str | curies.Reference | curies.ReferenceTuple,
96
113
  identifier: str | None = None,
97
114
  /,
115
+ *,
116
+ upgrade_identifier: bool | None = None,
98
117
  **kwargs: Unpack[GetOntologyKwargs],
99
118
  ) -> str | None:
100
119
  """Get the name for an entity."""
101
120
  reference = _get_pi(prefix, identifier)
102
- return _help_get(get_id_name_mapping, reference, **kwargs)
121
+ return _help_get(
122
+ get_id_name_mapping, reference, upgrade_identifier=upgrade_identifier, **kwargs
123
+ )
103
124
 
104
125
 
105
126
  @lru_cache
@@ -325,6 +346,7 @@ def get_literal_mappings(
325
346
  return rv
326
347
 
327
348
 
349
+ @wrap_norm_prefix
328
350
  def get_literal_mappings_df(
329
351
  prefix: str,
330
352
  **kwargs: Unpack[GetOntologyKwargs],
pyobo/api/xrefs.py CHANGED
@@ -5,8 +5,11 @@ import warnings
5
5
  from collections.abc import Mapping
6
6
  from functools import lru_cache
7
7
 
8
+ import curies
8
9
  import pandas as pd
9
10
  from curies import ReferenceTuple
11
+ from sssom_pydantic import SemanticMapping
12
+ from sssom_pydantic.io import row_to_semantic_mapping
10
13
  from typing_extensions import Unpack
11
14
 
12
15
  from .utils import get_version_from_kwargs
@@ -19,7 +22,7 @@ from ..constants import (
19
22
  check_should_use_tqdm,
20
23
  )
21
24
  from ..getters import get_ontology
22
- from ..identifier_utils import wrap_norm_prefix
25
+ from ..identifier_utils import get_converter, wrap_norm_prefix
23
26
  from ..struct import Obo
24
27
  from ..utils.cache import cached_df
25
28
  from ..utils.path import CacheArtifact, get_cache_path
@@ -27,6 +30,7 @@ from ..utils.path import CacheArtifact, get_cache_path
27
30
  __all__ = [
28
31
  "get_filtered_xrefs",
29
32
  "get_mappings_df",
33
+ "get_semantic_mappings",
30
34
  "get_sssom_df",
31
35
  "get_xref",
32
36
  "get_xrefs",
@@ -107,6 +111,22 @@ def get_sssom_df(
107
111
  return get_mappings_df(prefix=prefix, names=names, **kwargs)
108
112
 
109
113
 
114
+ def get_semantic_mappings(
115
+ prefix: str,
116
+ converter: curies.Converter | None = None,
117
+ names: bool = True,
118
+ include_mapping_source_column: bool = False,
119
+ **kwargs: Unpack[GetOntologyKwargs],
120
+ ) -> list[SemanticMapping]:
121
+ """Get semantic mapping objects."""
122
+ df = get_mappings_df(
123
+ prefix, names=names, include_mapping_source_column=include_mapping_source_column, **kwargs
124
+ )
125
+ if converter is None:
126
+ converter = get_converter()
127
+ return [row_to_semantic_mapping(row, converter=converter) for _, row in df.iterrows()]
128
+
129
+
110
130
  def get_mappings_df(
111
131
  prefix: str | Obo,
112
132
  *,
pyobo/cli/cli.py CHANGED
@@ -6,10 +6,7 @@ from collections.abc import Iterable
6
6
  from functools import lru_cache
7
7
  from operator import itemgetter
8
8
 
9
- import bioregistry
10
9
  import click
11
- import humanize
12
- from tabulate import tabulate
13
10
 
14
11
  from .database import main as database_main
15
12
  from .lookup import lookup
@@ -59,6 +56,9 @@ def clean(remove_obo: bool):
59
56
  @main.command()
60
57
  def ls():
61
58
  """List how big all of the OBO files are."""
59
+ import humanize
60
+ from tabulate import tabulate
61
+
62
62
  entries = [(prefix, os.path.getsize(path)) for prefix, path in _iter_cached_obo()]
63
63
  entries = [
64
64
  (prefix, humanize.naturalsize(size), "✅" if not has_nomenclature_plugin(prefix) else "❌")
@@ -69,6 +69,8 @@ def ls():
69
69
 
70
70
  def _iter_cached_obo() -> Iterable[tuple[str, str]]:
71
71
  """Iterate over cached OBO paths."""
72
+ import bioregistry
73
+
72
74
  for prefix in os.listdir(RAW_DIRECTORY):
73
75
  if prefix in GLOBAL_SKIP or _has_no_download(prefix) or bioregistry.is_deprecated(prefix):
74
76
  continue
@@ -83,6 +85,8 @@ def _iter_cached_obo() -> Iterable[tuple[str, str]]:
83
85
 
84
86
  def _has_no_download(prefix: str) -> bool:
85
87
  """Return if the prefix is not available."""
88
+ import bioregistry
89
+
86
90
  prefix_norm = bioregistry.normalize_prefix(prefix)
87
91
  return prefix_norm is not None and prefix_norm in _no_download()
88
92
 
@@ -90,6 +94,8 @@ def _has_no_download(prefix: str) -> bool:
90
94
  @lru_cache(maxsize=1)
91
95
  def _no_download() -> set[str]:
92
96
  """Get the list of prefixes not available as OBO."""
97
+ import bioregistry
98
+
93
99
  return {resource.prefix for resource in bioregistry.resources() if not resource.has_download()}
94
100
 
95
101