robokop-genetics 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. robokop_genetics-0.5.0/LICENSE +21 -0
  2. robokop_genetics-0.5.0/PKG-INFO +40 -0
  3. robokop_genetics-0.5.0/README.md +21 -0
  4. robokop_genetics-0.5.0/pyproject.toml +3 -0
  5. robokop_genetics-0.5.0/robokop_genetics/__init__.py +0 -0
  6. robokop_genetics-0.5.0/robokop_genetics/genetics_cache.py +138 -0
  7. robokop_genetics-0.5.0/robokop_genetics/genetics_normalization.py +163 -0
  8. robokop_genetics-0.5.0/robokop_genetics/genetics_services.py +125 -0
  9. robokop_genetics-0.5.0/robokop_genetics/node_types.py +8 -0
  10. robokop_genetics-0.5.0/robokop_genetics/services/__init__.py +0 -0
  11. robokop_genetics-0.5.0/robokop_genetics/services/clingen.py +277 -0
  12. robokop_genetics-0.5.0/robokop_genetics/services/ensembl.py +256 -0
  13. robokop_genetics-0.5.0/robokop_genetics/services/hgnc.py +62 -0
  14. robokop_genetics-0.5.0/robokop_genetics/services/myvariant.py +158 -0
  15. robokop_genetics-0.5.0/robokop_genetics/simple_graph_components.py +34 -0
  16. robokop_genetics-0.5.0/robokop_genetics/util.py +124 -0
  17. robokop_genetics-0.5.0/robokop_genetics.egg-info/PKG-INFO +40 -0
  18. robokop_genetics-0.5.0/robokop_genetics.egg-info/SOURCES.txt +24 -0
  19. robokop_genetics-0.5.0/robokop_genetics.egg-info/dependency_links.txt +1 -0
  20. robokop_genetics-0.5.0/robokop_genetics.egg-info/requires.txt +2 -0
  21. robokop_genetics-0.5.0/robokop_genetics.egg-info/top_level.txt +1 -0
  22. robokop_genetics-0.5.0/setup.cfg +4 -0
  23. robokop_genetics-0.5.0/setup.py +29 -0
  24. robokop_genetics-0.5.0/tests/test_cache.py +147 -0
  25. robokop_genetics-0.5.0/tests/test_normalization.py +201 -0
  26. robokop_genetics-0.5.0/tests/test_services.py +194 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 ObesityHub
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,40 @@
1
+ Metadata-Version: 2.1
2
+ Name: robokop-genetics
3
+ Version: 0.5.0
4
+ Summary: A package for Robokop genetics tools and services.
5
+ Home-page: https://github.com/RobokopU24/robokop-genetics
6
+ Author: Evan Morris
7
+ Author-email: evandietzmorris@gmail.com
8
+ Maintainer: Evan Morris
9
+ Maintainer-email: evandietzmorris@gmail.com
10
+ License: MIT
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Requires-Python: >=3.9
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Requires-Dist: requests>=2.32.3
18
+ Requires-Dist: redis>=5.0.4
19
+
20
+ # robokop-genetics
21
+ Tools and service wrappers for building Robokop graphs.
22
+
23
+ #### Caching
24
+ To use a Redis cache, set the following environment variables to match your own Redis instance:
25
+ ```
26
+ ROBO_GENETICS_CACHE_HOST=localhost
27
+ ROBO_GENETICS_CACHE_PORT=6379
28
+ ROBO_GENETICS_CACHE_DB=0
29
+ ROBO_GENETICS_CACHE_PASSWORD=yourpassword
30
+ ```
31
+
32
+ #### Logging and Temporary Files
33
+ robokop-genetics depends on a local directory with write permissions for temporary files and logging.
34
+
35
+ When used in conjunction with robo-commons or robokop-rags, the default robokop home directory will be used.
36
+
37
+ For testing or other purposes, set the following environment variable to specify a valid location.
38
+ ```
39
+ ROBO_GENETICS_HOME=/home/example_directory
40
+ ```
@@ -0,0 +1,21 @@
1
+ # robokop-genetics
2
+ Tools and service wrappers for building Robokop graphs.
3
+
4
+ #### Caching
5
+ To use a Redis cache, set the following environment variables to match your own Redis instance:
6
+ ```
7
+ ROBO_GENETICS_CACHE_HOST=localhost
8
+ ROBO_GENETICS_CACHE_PORT=6379
9
+ ROBO_GENETICS_CACHE_DB=0
10
+ ROBO_GENETICS_CACHE_PASSWORD=yourpassword
11
+ ```
12
+
13
+ #### Logging and Temporary Files
14
+ robokop-genetics depends on a local directory with write permissions for temporary files and logging.
15
+
16
+ When used in conjunction with robo-commons or robokop-rags, the default robokop home directory will be used.
17
+
18
+ For testing or other purposes, set the following environment variable to specify a valid location.
19
+ ```
20
+ ROBO_GENETICS_HOME=/home/example_directory
21
+ ```
@@ -0,0 +1,3 @@
1
+ [build-system]
2
+ requires = ["setuptools"]
3
+ build-backend = "setuptools.build_meta"
File without changes
@@ -0,0 +1,138 @@
1
+ import os
2
+ import json
3
+ import redis
4
+ import logging
5
+ from robokop_genetics.util import LoggingUtil
6
+ from robokop_genetics.simple_graph_components import SimpleEdge, SimpleNode
7
+
8
+
9
class GeneticsCache:
    """Redis-backed cache for variant normalization and service results.

    Values are stored as JSON strings. Normalization entries are keyed by
    ``<prefix>normalize-<node_id>`` and service results by
    ``<service_key>-<node_id>``.
    """

    logger = LoggingUtil.init_logging(__name__,
                                      logging.INFO,
                                      log_file_path=LoggingUtil.get_logging_path())

    def __init__(self,
                 use_default_credentials: bool = True,
                 redis_host: str = "localhost",
                 redis_port: int = 6379,
                 redis_db: int = 0,
                 redis_password: str = "",
                 prefix: str = ""):
        """Connect to redis and verify the connection with a test read.

        :param use_default_credentials: when True, the connection settings are
            read from the ROBO_GENETICS_CACHE_HOST / _PORT / _DB / _PASSWORD
            environment variables and the other connection arguments are ignored
        :param redis_host: redis host name (used when not using env credentials)
        :param redis_port: redis port (used when not using env credentials)
        :param redis_db: redis database index (used when not using env credentials)
        :param redis_password: redis password, empty for no authentication
        :param prefix: string prepended to every normalization key
        :raises Exception: if the environment variables are requested but not
            all set, or if the connection to redis fails
        """
        self.NORMALIZATION_KEY_PREFIX = f'{prefix}normalize-'

        if use_default_credentials:
            try:
                redis_host = os.environ['ROBO_GENETICS_CACHE_HOST']
                redis_port = os.environ['ROBO_GENETICS_CACHE_PORT']
                redis_db = os.environ['ROBO_GENETICS_CACHE_DB']
                redis_password = os.environ['ROBO_GENETICS_CACHE_PASSWORD']
            except KeyError as e:
                self.logger.warning('ROBO_GENETICS_CACHE environment variables not set. No cache activated.')
                raise Exception("Cache requested but ROBO_GENETICS_CACHE environment variables not set!") from e

        try:
            # an empty password means "no authentication", which redis expects as None
            self.redis = redis.Redis(host=redis_host,
                                     port=int(redis_port),
                                     db=int(redis_db),
                                     password=redis_password or None)
            # the client connects lazily, so issue a throwaway read to fail fast
            self.redis.get('x')
            self.logger.info(f"Genetics cache connected to redis at {redis_host}:{redis_port}/{redis_db}")
        except Exception:
            self.logger.error(f"Genetics cache failed to connect to redis at {redis_host}:{redis_port}/{redis_db}.")
            raise

    def set_batch_normalization(self, normalization_map: dict):
        """Store normalization results in a single pipelined round trip.

        :param normalization_map: node id -> JSON-serializable normalization info
        """
        pipeline = self.redis.pipeline()
        for node_id, normalization in normalization_map.items():
            normalization_key = f'{self.NORMALIZATION_KEY_PREFIX}{node_id}'
            pipeline.set(normalization_key, json.dumps(normalization))
        pipeline.execute()

    def get_batch_normalization(self, node_ids: list):
        """Look up cached normalization results for a list of node ids.

        :param node_ids: node ids to look up
        :return: dictionary of node id -> normalization info, containing only
            the ids that were found in the cache
        """
        pipeline = self.redis.pipeline()
        for node_id in node_ids:
            normalization_key = f'{self.NORMALIZATION_KEY_PREFIX}{node_id}'
            pipeline.get(normalization_key)
        results = pipeline.execute()

        # pipeline results come back in the same order the commands were queued
        return {node_id: json.loads(result)
                for node_id, result in zip(node_ids, results)
                if result is not None}

    def set_service_results(self, service_key: str, results_dict: dict):
        """Store service results in a single pipelined round trip.

        :param service_key: key prefix identifying the service / operation
        :param results_dict: node id -> list of (edge, node) tuples
        """
        pipeline = self.redis.pipeline()
        for node_id, results in results_dict.items():
            redis_key = f'{service_key}-{node_id}'
            pipeline.set(redis_key, self.__encode_service_results(results))
        pipeline.execute()

    def __encode_service_results(self, service_results: list):
        """Serialize a list of (edge, node) tuples to a JSON string."""
        encoded_results = []
        for (edge, node) in service_results:
            json_node = {"id": node.id, "type": node.type, "name": node.name}
            json_edge = {"source_id": edge.source_id,
                         "target_id": edge.target_id,
                         "provided_by": edge.provided_by,
                         "input_id": edge.input_id,
                         "predicate_id": edge.predicate_id,
                         "predicate_label": edge.predicate_label,
                         "ctime": edge.ctime,
                         "properties": edge.properties}
            encoded_results.append({"edge": json_edge, "node": json_node})
        return json.dumps(encoded_results)

    def get_service_results(self, service_key: str, node_ids: list):
        """Look up cached service results for a list of node ids.

        :param service_key: key prefix identifying the service / operation
        :param node_ids: node ids to look up
        :return: a list aligned with node_ids; each entry is a list of
            (SimpleEdge, SimpleNode) tuples, or None when nothing is cached
        """
        pipeline = self.redis.pipeline()
        for node_id in node_ids:
            pipeline.get(f'{service_key}-{node_id}')
        redis_results = pipeline.execute()
        return [self.__decode_service_results(result) if result else None
                for result in redis_results]

    def __decode_service_results(self, redis_results):
        """Rebuild the list of (SimpleEdge, SimpleNode) tuples from a cached JSON string."""
        decoded_results = []
        for result in json.loads(redis_results):
            edge_json = result["edge"]
            edge_object = SimpleEdge(source_id=edge_json['source_id'],
                                     target_id=edge_json['target_id'],
                                     provided_by=edge_json['provided_by'],
                                     input_id=edge_json['input_id'],
                                     predicate_id=edge_json['predicate_id'],
                                     predicate_label=edge_json['predicate_label'],
                                     ctime=edge_json['ctime'],
                                     properties=edge_json['properties'])
            # note that right now we're not caching properties or synonyms for service nodes,
            # properties aren't used yet, synonyms will come from normalization after the fact
            node_json = result["node"]
            node_object = SimpleNode(id=node_json["id"],
                                     type=node_json["type"],
                                     name=node_json["name"])
            decoded_results.append((edge_object, node_object))
        return decoded_results

    def delete_all_keys_with_prefix(self, prefix: str, batch_size: int = 1000):
        """Delete every key that starts with the given prefix.

        The keyspace is walked incrementally with SCAN instead of KEYS, so a
        large database does not block the redis server while matching, and the
        keys are deleted in chunks rather than in one huge DEL command.

        :param prefix: key prefix to match (used as a redis glob pattern ``<prefix>*``)
        :param batch_size: SCAN count hint and maximum number of keys per DEL
        """
        pending_keys = []
        for key in self.redis.scan_iter(match=f'{prefix}*', count=batch_size):
            pending_keys.append(key)
            if len(pending_keys) >= batch_size:
                self.redis.delete(*pending_keys)
                pending_keys = []
        if pending_keys:
            self.redis.delete(*pending_keys)
@@ -0,0 +1,163 @@
1
+ from robokop_genetics.services.clingen import ClinGenService, ClinGenSynonymizationResult, batchable_variant_curie_prefixes
2
+ from robokop_genetics.genetics_cache import GeneticsCache
3
+ import robokop_genetics.node_types as node_types
4
+ from robokop_genetics.util import LoggingUtil, Text
5
+ import logging
6
+ import requests
7
+ import os
8
+
9
+
10
class GeneticsNormalizer(object):
    """Normalizes sequence variant curies using ClinGen synonymization.

    Results can optionally be stored in and read from a GeneticsCache.
    """

    DEFAULT_EDGE_NORM_ENDPOINT = 'https://biolink-lookup.transltr.io/'
    # seconds to wait for the biolink lookup before falling back to default node types
    BL_LOOKUP_TIMEOUT_SECONDS = 30

    logger = LoggingUtil.init_logging(__name__,
                                      logging.INFO,
                                      log_file_path=LoggingUtil.get_logging_path())

    def __init__(self, use_cache: bool = False, bl_version: str = "latest"):
        """
        :param use_cache: when True, a GeneticsCache is created and used for normalization results
        :param bl_version: biolink model version passed to the biolink lookup service
        """
        if use_cache:
            self.cache = GeneticsCache()
            self.logger.info('Robokop Genetics Normalizer initialized with cache activated.')
        else:
            self.cache = None
            self.logger.info('Robokop Genetics Normalizer initialized with no cache activated.')

        if 'EDGE_NORMALIZATION_ENDPOINT' in os.environ:
            self.edge_norm_endpoint = os.environ['EDGE_NORMALIZATION_ENDPOINT']
            self.logger.info(f'Using EDGE_NORMALIZATION_ENDPOINT from env var: {self.edge_norm_endpoint}')
        else:
            self.edge_norm_endpoint = self.DEFAULT_EDGE_NORM_ENDPOINT
            self.logger.info(f'Using default EDGE_NORMALIZATION_ENDPOINT: {self.edge_norm_endpoint}')

        self.bl_version = bl_version
        self.sequence_variant_node_types = self.fetch_sequence_variant_node_types()
        self.clingen = ClinGenService()

    def get_sequence_variant_node_types(self):
        """
        Returns a list of all normalized node types for sequence variant nodes
        :return: the list fetched (or defaulted) when the normalizer was created
        """
        return self.sequence_variant_node_types

    def fetch_sequence_variant_node_types(self):
        """
        Ask the biolink lookup service for the ancestors of the sequence variant type.

        If the service cannot be reached or does not answer with HTTP 200, the error is
        logged and a minimal default list (named thing + sequence variant) is returned.
        :return: a list of node type strings including the sequence variant type itself
        """
        fallback_node_types = [node_types.NAMED_THING, node_types.SEQUENCE_VARIANT]

        # tolerate an endpoint configured without its trailing slash
        base_url = self.edge_norm_endpoint
        if not base_url.endswith('/'):
            base_url = f'{base_url}/'
        bl_url = f"{base_url}bl/{node_types.SEQUENCE_VARIANT}/ancestors?version={self.bl_version}"

        try:
            with requests.session() as client:
                response = client.get(bl_url, timeout=self.BL_LOOKUP_TIMEOUT_SECONDS)
        except requests.RequestException as e:
            # treat an unreachable service the same way as a non-200 response
            self.logger.error(
                f'Failed bl-lookup for {node_types.SEQUENCE_VARIANT} ancestor types: ({e})')
            return fallback_node_types

        if response.status_code == 200:
            return list(set(response.json() + [node_types.SEQUENCE_VARIANT]))

        self.logger.error(
            f'Failed bl-lookup for {node_types.SEQUENCE_VARIANT} ancestor types: (response code: {response.status_code})')
        return fallback_node_types

    def normalize_variants(self, variant_ids):
        """
        Normalize a list of variants in the most efficient way ie. check the cache, then process in batches if possible.
        :param variant_ids: a list of variant curie identifiers
        :return: a dictionary of normalization information, with the provided curie list as keys
        """
        # materialize once so the ids can be traversed several times below
        variant_ids = list(variant_ids)

        # if there is a cache active, check it for existing results and grab them
        if self.cache:
            all_normalization_results = self.cache.get_batch_normalization(variant_ids)
            variants_that_need_normalizing = [variant_id for variant_id in variant_ids
                                              if variant_id not in all_normalization_results]
            self.logger.info(f'Batch normalizing found {len(all_normalization_results)}/{len(variant_ids)} results in the cache.')
        else:
            all_normalization_results = {}
            variants_that_need_normalizing = variant_ids

        # drop repeated ids (keeping first-seen order) so each variant is only looked up once
        variants_that_need_normalizing = list(dict.fromkeys(variants_that_need_normalizing))

        # normalize batches of variants with the same curie prefix because that's how clingen accepts them
        for curie_prefix in batchable_variant_curie_prefixes:
            batchable_variant_curies = [v_curie for v_curie in variants_that_need_normalizing
                                        if v_curie.startswith(curie_prefix)]
            if not batchable_variant_curies:
                # nothing with this prefix - skip the service call entirely
                continue
            batched_normalizations = self.get_batch_sequence_variant_normalization(batchable_variant_curies)
            all_normalization_results.update(batched_normalizations)
            if self.cache:
                # cache the results if possible
                self.cache.set_batch_normalization(batched_normalizations)

        # for remaining variants batching is not possible - try to find results one at a time
        unbatchable_norm_result_map = {
            v_curie: self.get_sequence_variant_normalization(v_curie)
            for v_curie in variants_that_need_normalizing
            if v_curie not in all_normalization_results
        }
        all_normalization_results.update(unbatchable_norm_result_map)
        if self.cache and unbatchable_norm_result_map:
            # cache the results if possible
            self.cache.set_batch_normalization(unbatchable_norm_result_map)
        return all_normalization_results

    def _build_normalization_dict(self, synonymization_result):
        """Convert one synonymization result into a normalization dict (or an error dict on failure)."""
        if synonymization_result.success:
            normalized_id, normalized_name = self.get_id_and_name_from_synonyms(synonymization_result.synonyms)
            return {
                "id": normalized_id,
                "name": normalized_name,
                "equivalent_identifiers": list(synonymization_result.synonyms),
                "type": self.sequence_variant_node_types
            }
        return {
            "error_type": synonymization_result.error_type,
            "error_message": synonymization_result.error_message,
        }

    def get_sequence_variant_normalization(self, variant_curie: str):
        """
        Normalize a single variant.
        :param variant_curie: the id of the variant that needs normalizing
        :return: a list of normalization dictionaries, one per synonymization result
        """
        # Note that clingen.get_synonyms_by_other_id supports variants which may return multiple synonymization results.
        # So here we may create more than one normalized node for each provided variant curie.
        synonymization_results = self.clingen.get_synonyms_by_other_id(variant_curie)
        return [self._build_normalization_dict(synonymization_result)
                for synonymization_result in synonymization_results]

    def get_batch_sequence_variant_normalization(self, curies: list):
        """
        Given a list of batchable curies with the same prefix, return a map of corresponding normalization information.
        :param curies: variant curies that all share one batchable prefix
        :return: a dictionary of curie -> single-element list holding its normalization dictionary
        """
        normalization_map = {}
        # Note that for batch normalization clingen only supports variant types which return a single set of synonyms,
        # as opposed to potentially returning multiple sets such as when calling get_synonyms_by_other_id.
        # Here we always only create one normalized node per provided ID.
        synonymization_results = self.clingen.get_batch_of_synonyms(curies)
        for i, synonymization_result in enumerate(synonymization_results):
            normalization_map[curies[i]] = [self._build_normalization_dict(synonymization_result)]
        return normalization_map

    def get_id_and_name_from_synonyms(self, synonyms: set):
        """
        Extract the preferred curie and name from the synonym set.
        :param synonyms: a set of synonym curies for one variant
        :return: a tuple of (normalized_id, normalized_name)
        """
        # find the best ID available - prefer CAID over HGVS over anything else
        caid_curies = Text.get_curies_by_prefix('CAID', synonyms)
        if caid_curies:
            normalized_id = caid_curies.pop()
        else:
            hgvs_curies = Text.get_curies_by_prefix('HGVS', synonyms)
            if hgvs_curies:
                normalized_id = hgvs_curies.pop()
            else:
                # we didn't find a CAID or HGVS, just take the first one as an arbitrary id
                normalized_id = next(iter(synonyms))

        # prefer a DBSNP identifier for the name, otherwise reuse the chosen id
        rsid_curies = Text.get_curies_by_prefix('DBSNP', synonyms)
        if rsid_curies:
            normalized_name = Text.un_curie(rsid_curies.pop())
        else:
            normalized_name = Text.un_curie(normalized_id)

        return normalized_id, normalized_name
@@ -0,0 +1,125 @@
1
+ from robokop_genetics.services.myvariant import MyVariantService
2
+ from robokop_genetics.services.ensembl import EnsemblService
3
+ from robokop_genetics.services.hgnc import HGNCService
4
+ from robokop_genetics.util import LoggingUtil
5
+ from robokop_genetics.genetics_cache import GeneticsCache
6
+ from collections import defaultdict
7
+ import logging
8
+
9
# names of the supported variant-to-gene services
MYVARIANT = "MyVariant"
ENSEMBL = "Ensembl"

# every service that can be asked for variant-to-gene relationships
ALL_VARIANT_TO_GENE_SERVICES = [MYVARIANT, ENSEMBL]
# the subset of services that accept batches of variants
BATCHABLE_VARIANT_TO_GENE_SERVICES = [MYVARIANT]
# misspelled original name, kept as an alias for backward compatibility
BATCHABLE_VARIANT_TO_GENE_SERVES = BATCHABLE_VARIANT_TO_GENE_SERVICES
14
+
15
+
16
class GeneticsServices(object):
    """Entry point for querying variant-to-gene services (MyVariant, Ensembl) with optional caching."""

    logger = LoggingUtil.init_logging(__name__,
                                      logging.INFO,
                                      log_file_path=LoggingUtil.get_logging_path())

    # number of nodes processed between cache writes
    # this batch size is pretty arbitrary
    # myvariant really sends batches of 1000
    # but we can probably cache more at a time
    VARIANT_TO_GENE_BATCH_SIZE = 10000

    def __init__(self, use_cache: bool=True):
        """
        :param use_cache: when True, a GeneticsCache is created and used for service results
        """
        if use_cache:
            self.cache = GeneticsCache()
            self.logger.info('Robokop Genetics Services initialized with cache activated.')
        else:
            self.cache = None
            self.logger.info('Robokop Genetics Services initialized with no cache activated.')

        self.hgnc = HGNCService()
        self.myvariant = MyVariantService(hgnc_service=self.hgnc)
        self.ensembl = EnsemblService(temp_dir=LoggingUtil.get_logging_path())

    @staticmethod
    def _in_batches(items, batch_size: int):
        """Yield successive lists of at most batch_size items from an iterable."""
        batch = []
        for item in items:
            batch.append(item)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch

    def get_variant_to_gene(self, services: list, variant_nodes: list):
        """
        Find gene relationships for a list of variant nodes using the requested services.

        When a cache is active, cached results are used first and only the remaining
        nodes are sent to the service; new results are written back to the cache.
        :param services: service names (from ALL_VARIANT_TO_GENE_SERVICES)
        :param variant_nodes: variant node objects providing id and synonyms
        :return: a defaultdict of variant id -> list of (edge, gene node) tuples
        """
        self.logger.info(f'Get variant to gene called on {len(variant_nodes)} nodes.')
        all_results = defaultdict(list)
        batch_size = self.VARIANT_TO_GENE_BATCH_SIZE
        for service in services:
            cache_key = f'{service}_sequence_variant_to_gene'
            if self.cache:
                cached_results = self.cache.get_service_results(cache_key, [node.id for node in variant_nodes])

                # cached_results is aligned with variant_nodes, None marks a cache miss
                nodes_that_need_results = []
                for node, cached_result in zip(variant_nodes, cached_results):
                    if cached_result is not None:
                        all_results[node.id].extend(cached_result)
                    else:
                        nodes_that_need_results.append(node)
                self.logger.info(f'{service} variant to gene found results for {len(variant_nodes) - len(nodes_that_need_results)} nodes in the cache.')
            else:
                nodes_that_need_results = variant_nodes

            if service == MYVARIANT:
                # send batches to myvariant
                for node_batch in self._in_batches(nodes_that_need_results, batch_size):
                    myvariant_syn_dict = {node.id: node.synonyms for node in node_batch}
                    new_myvariant_results = self.batch_query_variant_to_gene(MYVARIANT, myvariant_syn_dict)
                    for node_id, results in new_myvariant_results.items():
                        all_results[node_id].extend(results)
                    if self.cache:
                        self.cache.set_service_results(cache_key, new_myvariant_results)

            elif service == ENSEMBL:
                # ensembl is queried one variant at a time, results are cached a batch at a time
                for node_batch in self._in_batches(nodes_that_need_results, batch_size):
                    new_ensembl_results = {}
                    for node in node_batch:
                        variant_id = node.id
                        variant_syns = node.get_synonyms_by_prefix('ROBO_VARIANT')
                        new_ensembl_results[variant_id] = self.ensembl.sequence_variant_to_gene(variant_id, variant_syns)
                        all_results[variant_id].extend(new_ensembl_results[variant_id])
                    if self.cache:
                        self.cache.set_service_results(cache_key, new_ensembl_results)

            else:
                # don't silently skip a misspelled or unsupported service name
                self.logger.warning(f'Service ({service}) not found! Variant to gene failed.')

        return all_results

    def query_variant_to_gene(self, service: str, variant_id: str, variant_synonyms: set):
        """
        Specify the service and provide variant information to find gene relationships.
        :param service: the service to query (from ALL_VARIANT_TO_GENE_SERVICES)
        :param variant_id: plain curie string
        :param variant_synonyms: a set of synonym curies
        :return: a list of tuples (edge: SimpleEdge, gene_node: SimpleNode),
            or None (after logging a warning) if the service is not recognized
        """
        if service == MYVARIANT:
            return self.myvariant.sequence_variant_to_gene(variant_id, variant_synonyms)
        elif service == ENSEMBL:
            return self.ensembl.sequence_variant_to_gene(variant_id, variant_synonyms)
        else:
            self.logger.warning(f'Service ({service}) not found! Variant to gene failed.')

    def batch_query_variant_to_gene(self, service: str, variant_dict: dict):
        """
        Query a batch-able service for many variants at once.
        :param service: the service to query, only MyVariant supports batches
        :param variant_dict: a dictionary of variant_id (curie) to variant_synonyms (set of curies)
        :return: a dictionary with the variant id curie as keys and lists of result tuples as values,
            or None (after logging a warning) if the service is not batch-able
        """
        if service == MYVARIANT:
            return self.myvariant.batch_sequence_variant_to_gene(variant_dict)
        else:
            self.logger.warning(f'Service ({service}) not batch-able! Variant to gene failed.')

    def get_gene_id_from_symbol(self, gene_symbol: str):
        """
        Given a plain string gene_symbol return a valid curie gene ID
        eg. BRCA1 -> HGNC:1100
        """
        return self.hgnc.get_gene_id_from_symbol(gene_symbol)
@@ -0,0 +1,8 @@
1
# Biolink model node type identifiers used throughout robokop_genetics.
NAMED_THING = "biolink:NamedThing"
BIOLOGICAL_ENTITY = "biolink:BiologicalEntity"
GENE = "biolink:Gene"
SEQUENCE_VARIANT = "biolink:SequenceVariant"


# The root of all biolink_model entities
ROOT_ENTITY = NAMED_THING