PyPI - robokop-genetics - Versions diffs - 0.5.0__tar.gz - Mend

robokop-genetics 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

robokop_genetics-0.5.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2020 ObesityHub
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

robokop_genetics-0.5.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,40 @@
+Metadata-Version: 2.1
+Name: robokop-genetics
+Version: 0.5.0
+Summary: A package for Robokop genetics tools and services.
+Home-page: https://github.com/RobokopU24/robokop-genetics
+Author: Evan Morris
+Author-email: evandietzmorris@gmail.com
+Maintainer: Evan Morris
+Maintainer-email: evandietzmorris@gmail.com
+License: CC-0
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: requests>=2.32.3
+Requires-Dist: redis>=5.0.4
+# robokop-genetics
+Tools and service wrappers for building Robokop graphs.
+#### Caching
+To use a Redis cache, set the following environment variables to match your own Redis instance:
+```
+ROBO_GENETICS_CACHE_HOST=localhost
+ROBO_GENETICS_CACHE_PORT=6379
+ROBO_GENETICS_CACHE_DB=0
+ROBO_GENETICS_CACHE_PASSWORD=yourpassword
+```
+#### Logging and Temporary Files
+robokop-genetics depends on a local directory with write permissions for temporary files and logging.
+When used in conjunction with robo-commons or robokop-rags, the default robokop home directory will be used.
+For testing or other purposes, set the following environment variable to specify a valid location.
+```
+ROBO_GENETICS_HOME=/home/example_directory
+```

robokop_genetics-0.5.0/README.md ADDED Viewed

@@ -0,0 +1,21 @@
+# robokop-genetics
+Tools and service wrappers for building Robokop graphs.
+#### Caching
+To use a Redis cache, set the following environment variables to match your own Redis instance:
+```
+ROBO_GENETICS_CACHE_HOST=localhost
+ROBO_GENETICS_CACHE_PORT=6379
+ROBO_GENETICS_CACHE_DB=0
+ROBO_GENETICS_CACHE_PASSWORD=yourpassword
+```
+#### Logging and Temporary Files
+robokop-genetics depends on a local directory with write permissions for temporary files and logging.
+When used in conjunction with robo-commons or robokop-rags, the default robokop home directory will be used.
+For testing or other purposes, set the following environment variable to specify a valid location.
+```
+ROBO_GENETICS_HOME=/home/example_directory
+```

robokop_genetics-0.5.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"

robokop_genetics-0.5.0/robokop_genetics/__init__.py ADDED Viewed

File without changes

robokop_genetics-0.5.0/robokop_genetics/genetics_cache.py ADDED Viewed

@@ -0,0 +1,138 @@
+import os
+import json
+import redis
+import logging
+from robokop_genetics.util import LoggingUtil
+from robokop_genetics.simple_graph_components import SimpleEdge, SimpleNode
+class GeneticsCache:
+    logger = LoggingUtil.init_logging(__name__,
+                                      logging.INFO,
+                                      log_file_path=LoggingUtil.get_logging_path())
+    def __init__(self,
+                 use_default_credentials: bool = True,
+                 redis_host: str = "localhost",
+                 redis_port: int = 6379,
+                 redis_db: int = 0,
+                 redis_password: str = "",
+                 prefix: str = ""):
+        self.NORMALIZATION_KEY_PREFIX = f'{prefix}normalize-'
+        if use_default_credentials:
+            try:
+                redis_host = os.environ['ROBO_GENETICS_CACHE_HOST']
+                redis_port = os.environ['ROBO_GENETICS_CACHE_PORT']
+                redis_db = os.environ['ROBO_GENETICS_CACHE_DB']
+                redis_password = os.environ['ROBO_GENETICS_CACHE_PASSWORD']
+            except KeyError:
+                self.logger.warning('ROBO_GENETICS_CACHE environment variables not set. No cache activated.')
+                raise Exception("Cache requested but ROBO_GENETICS_CACHE environment variables not set!")
+        try:
+            if redis_password:
+                self.redis = redis.Redis(host=redis_host,
+                                         port=int(redis_port),
+                                         db=int(redis_db),
+                                         password=redis_password)
+            else:
+                self.redis = redis.Redis(host=redis_host,
+                                         port=int(redis_port),
+                                         db=int(redis_db))
+            self.redis.get('x')
+            self.logger.info(f"Genetics cache connected to redis at {redis_host}:{redis_port}/{redis_db}")
+        except Exception as e:
+            self.logger.error(f"Genetics cache failed to connect to redis at {redis_host}:{redis_port}/{redis_db}.")
+            raise e
+    #def set_normalization(self, node_id: str, normalization: tuple):
+    #    normalization_key = f'{self.NORMALIZATION_KEY_PREFIX}{node_id}'
+    #    self.redis.set(normalization_key, json.dumps(normalization))
+    def set_batch_normalization(self, normalization_map: dict):
+        pipeline = self.redis.pipeline()
+        for node_id, normalization in normalization_map.items():
+            normalization_key = f'{self.NORMALIZATION_KEY_PREFIX}{node_id}'
+            pipeline.set(normalization_key, json.dumps(normalization))
+        pipeline.execute()
+    #def get_normalization(self, node_id: str):
+    #    normalization_key = f'{self.NORMALIZATION_KEY_PREFIX}{node_id}'
+    #    result = self.redis.get(normalization_key)
+    #    normalization = json.loads(result) if result is not None else None
+    #    return normalization
+    def get_batch_normalization(self, node_ids: list):
+        pipeline = self.redis.pipeline()
+        for node_id in node_ids:
+            normalization_key = f'{self.NORMALIZATION_KEY_PREFIX}{node_id}'
+            pipeline.get(normalization_key)
+        results = pipeline.execute()
+        normalization_map = {}
+        for i, result in enumerate(results):
+            if result is not None:
+                normalization_map[node_ids[i]] = json.loads(result)
+        return normalization_map
+    def set_service_results(self, service_key: str, results_dict: dict):
+        pipeline = self.redis.pipeline()
+        for node_id, results in results_dict.items():
+            redis_key = f'{service_key}-{node_id}'
+            pipeline.set(redis_key, self.__encode_service_results(results))
+        pipeline.execute()
+    def __encode_service_results(self, service_results: list):
+        encoded_results = []
+        for (edge, node) in service_results:
+            json_node = {"id": node.id, "type": node.type, "name": node.name}
+            json_edge = {"source_id": edge.source_id,
+                         "target_id": edge.target_id,
+                         "provided_by": edge.provided_by,
+                         "input_id": edge.input_id,
+                         "predicate_id": edge.predicate_id,
+                         "predicate_label": edge.predicate_label,
+                         "ctime": edge.ctime,
+                         "properties": edge.properties}
+            encoded_result = {"edge": json_edge, "node": json_node}
+            encoded_results.append(encoded_result)
+        return json.dumps(encoded_results)
+    def get_service_results(self, service_key: str, node_ids: list):
+        pipeline = self.redis.pipeline()
+        for node_id in node_ids:
+            pipeline.get(f'{service_key}-{node_id}')
+        redis_results = pipeline.execute()
+        local_decode_results = self.__decode_service_results
+        decoded_results = list(map(lambda result: local_decode_results(result) if result else None, redis_results))
+        return decoded_results
+    def __decode_service_results(self, redis_results):
+        decoded_results = []
+        json_object = json.loads(redis_results)
+        for result in json_object:
+            edge_json = result["edge"]
+            edge_object = SimpleEdge(source_id=edge_json['source_id'],
+                                     target_id=edge_json['target_id'],
+                                     provided_by=edge_json['provided_by'],
+                                     input_id=edge_json['input_id'],
+                                     predicate_id=edge_json['predicate_id'],
+                                     predicate_label=edge_json['predicate_label'],
+                                     ctime=edge_json['ctime'],
+                                     properties=edge_json['properties'])
+            # note that right now we're not caching properties or synonyms for service nodes,
+            # properties aren't used yet, synonyms will come from normalization after the fact
+            node_json = result["node"]
+            node_object = SimpleNode(id=node_json["id"],
+                                     type=node_json["type"],
+                                     name=node_json["name"])
+            decoded_results.append((edge_object,
+                                   node_object))
+        return decoded_results
+    def delete_all_keys_with_prefix(self, prefix: str):
+        keys = self.redis.keys(f'{prefix}*')
+        if keys:
+            self.redis.delete(*keys)

robokop_genetics-0.5.0/robokop_genetics/genetics_normalization.py ADDED Viewed

@@ -0,0 +1,163 @@
+from robokop_genetics.services.clingen import ClinGenService, ClinGenSynonymizationResult, batchable_variant_curie_prefixes
+from robokop_genetics.genetics_cache import GeneticsCache
+import robokop_genetics.node_types as node_types
+from robokop_genetics.util import LoggingUtil, Text
+import logging
+import requests
+import os
+class GeneticsNormalizer(object):
+    DEFAULT_EDGE_NORM_ENDPOINT = f'https://biolink-lookup.transltr.io/'
+    logger = LoggingUtil.init_logging(__name__,
+                                      logging.INFO,
+                                      log_file_path=LoggingUtil.get_logging_path())
+    def __init__(self, use_cache: bool = False, bl_version: str = "latest"):
+        if use_cache:
+            self.cache = GeneticsCache()
+            self.logger.info('Robokop Genetics Normalizer initialized with cache activated.')
+        else:
+            self.cache = None
+            self.logger.info('Robokop Genetics Normalizer initialized with no cache activated.')
+        if 'EDGE_NORMALIZATION_ENDPOINT' in os.environ:
+            self.edge_norm_endpoint = os.environ['EDGE_NORMALIZATION_ENDPOINT']
+            self.logger.info(f'Using EDGE_NORMALIZATION_ENDPOINT from env var: {self.edge_norm_endpoint}')
+        else:
+            self.edge_norm_endpoint = self.DEFAULT_EDGE_NORM_ENDPOINT
+            self.logger.info(f'Using default EDGE_NORMALIZATION_ENDPOINT: {self.edge_norm_endpoint}')
+        self.bl_version = bl_version
+        self.sequence_variant_node_types = self.fetch_sequence_variant_node_types()
+        self.clingen = ClinGenService()
+    def get_sequence_variant_node_types(self):
+        """
+        Returns a list of all normalized node types for sequence variant nodes
+        :return:
+        """
+        return self.sequence_variant_node_types
+    def fetch_sequence_variant_node_types(self):
+        bl_url = f"{self.edge_norm_endpoint}bl/{node_types.SEQUENCE_VARIANT}/ancestors?version={self.bl_version}"
+        with requests.session() as client:
+            response = client.get(bl_url)
+            if response.status_code == 200:
+                sequence_variant_node_types = list(set(response.json() + [node_types.SEQUENCE_VARIANT]))
+            else:
+                sequence_variant_node_types = [node_types.NAMED_THING, node_types.SEQUENCE_VARIANT]
+                self.logger.error(
+                    f'Failed bl-lookup for {node_types.SEQUENCE_VARIANT} ancestor types: (response code: {response.status_code})')
+        return sequence_variant_node_types
+    def normalize_variants(self, variant_ids):
+        """
+        Normalize a list of variants in the most efficient way ie. check the cache, then process in batches if possible.
+        :param variant_ids: a list of variant curie identifiers
+        :return: a dictionary of normalization information, with the provided curie list as keys
+        """
+        # if there is a cache active, check it for existing results and grab them
+        if self.cache:
+            all_normalization_results = self.cache.get_batch_normalization(variant_ids)
+            variants_that_need_normalizing = [variant_id for variant_id in variant_ids if variant_id not in all_normalization_results]
+            self.logger.info(f'Batch normalizing found {len(all_normalization_results)}/{len(variant_ids)} results in the cache.')
+        else:
+            all_normalization_results = {}
+            variants_that_need_normalizing = variant_ids
+        # normalize batches of variants with the same curie prefix because that's how clingen accepts them
+        for curie_prefix in batchable_variant_curie_prefixes:
+            batchable_variant_curies = [v_curie for v_curie in variants_that_need_normalizing if v_curie.startswith(curie_prefix)]
+            batched_normalizations = self.get_batch_sequence_variant_normalization(batchable_variant_curies)
+            all_normalization_results.update(batched_normalizations)
+            if self.cache:
+                # cache the results if possible
+                self.cache.set_batch_normalization(batched_normalizations)
+        # for remaining variants batching is not possible - try to find results one at a time
+        unbatchable_variant_ids = [v_curie for v_curie in variants_that_need_normalizing if v_curie not in all_normalization_results]
+        unbatchable_norm_results = map(self.get_sequence_variant_normalization, unbatchable_variant_ids)
+        # this could probably be done more efficiently, we only create unbatchable_norm_result_map for the cache
+        unbatchable_norm_result_map = {}
+        for i, result in enumerate(unbatchable_norm_results):
+            if self.cache:
+                unbatchable_norm_result_map[unbatchable_variant_ids[i]] = result
+            all_normalization_results[unbatchable_variant_ids[i]] = result
+        if self.cache:
+            # cache the results if possible
+            self.cache.set_batch_normalization(unbatchable_norm_result_map)
+        return all_normalization_results
+    # variant_curie: the id of the variant that needs normalizing
+    def get_sequence_variant_normalization(self, variant_curie: str):
+        normalizations = []
+        # Note that clingen.get_synonyms_by_other_id supports variants which may return multiple synonymization results.
+        # So here we may create more than one normalized node for each provided variant curie.
+        synonymization_results = self.clingen.get_synonyms_by_other_id(variant_curie)
+        for synonymization_result in synonymization_results:
+            if synonymization_result.success:
+                normalized_id, normalized_name = self.get_id_and_name_from_synonyms(synonymization_result.synonyms)
+                normalization_dict = {
+                    "id": normalized_id,
+                    "name": normalized_name,
+                    "equivalent_identifiers": list(synonymization_result.synonyms),
+                    "type": self.sequence_variant_node_types
+                }
+            else:
+                normalization_dict = {
+                    "error_type": synonymization_result.error_type,
+                    "error_message": synonymization_result.error_message,
+                }
+            normalizations.append(normalization_dict)
+        return normalizations
+    # Given a list of batchable curies with the same prefix, return a map of corresponding normalization information.
+    def get_batch_sequence_variant_normalization(self, curies: list):
+        normalization_map = {}
+        # Note that for batch normalization clingen only supports variant types which return a single set of synonyms,
+        # as opposed to potentially returning multiple sets such as when calling get_synonyms_by_other_id.
+        # Here we always only create one normalized node per provided ID.
+        synonymization_results = self.clingen.get_batch_of_synonyms(curies)
+        sequence_variant_node_types = self.sequence_variant_node_types
+        for i, synonymization_result in enumerate(synonymization_results):
+            if synonymization_result.success:
+                normalized_id, normalized_name = self.get_id_and_name_from_synonyms(synonymization_result.synonyms)
+                normalization_dict = {
+                    "id": normalized_id,
+                    "name": normalized_name,
+                    "equivalent_identifiers": list(synonymization_result.synonyms),
+                    "type": sequence_variant_node_types
+                }
+            else:
+                normalization_dict = {
+                    "error_type": synonymization_result.error_type,
+                    "error_message": synonymization_result.error_message,
+                }
+            normalization_map[curies[i]] = [normalization_dict]
+        return normalization_map
+    # extract the preferred curie and name from the synonym set
+    def get_id_and_name_from_synonyms(self, synonyms: set):
+        # find the best ID available - prefer CAID over HGVS over anything else
+        caid_curies = Text.get_curies_by_prefix('CAID', synonyms)
+        if caid_curies:
+            normalized_id = caid_curies.pop()
+        else:
+            hgvs_curies = Text.get_curies_by_prefix('HGVS', synonyms)
+            if hgvs_curies:
+                normalized_id = hgvs_curies.pop()
+            else:
+                # we didn't find a CAID or HGVS, just take the first one as an arbitrary id
+                normalized_id = next(iter(synonyms))
+        rsid_curies = Text.get_curies_by_prefix('DBSNP', synonyms)
+        if rsid_curies:
+            normalized_name = Text.un_curie(rsid_curies.pop())
+        else:
+            normalized_name = Text.un_curie(normalized_id)
+        return normalized_id, normalized_name

robokop_genetics-0.5.0/robokop_genetics/genetics_services.py ADDED Viewed

@@ -0,0 +1,125 @@
+from robokop_genetics.services.myvariant import MyVariantService
+from robokop_genetics.services.ensembl import EnsemblService
+from robokop_genetics.services.hgnc import HGNCService
+from robokop_genetics.util import LoggingUtil
+from robokop_genetics.genetics_cache import GeneticsCache
+from collections import defaultdict
+import logging
+MYVARIANT = "MyVariant"  # service key handled by the MyVariant service wrapper
+ENSEMBL = "Ensembl"  # service key handled by the Ensembl service wrapper
+ALL_VARIANT_TO_GENE_SERVICES = [MYVARIANT, ENSEMBL]  # every service key that supports variant to gene queries
+BATCHABLE_VARIANT_TO_GENE_SERVES = [MYVARIANT]  # NOTE(review): name looks like a typo of ..._SERVICES, kept as is because it is a public name
+class GeneticsServices(object):
+    logger = LoggingUtil.init_logging(__name__,
+                                      logging.INFO,
+                                      log_file_path=LoggingUtil.get_logging_path())
+    def __init__(self, use_cache: bool=True):
+        if use_cache:
+            self.cache = GeneticsCache()
+            self.logger.info('Robokop Genetics Services initialized with cache activated.')
+        else:
+            self.cache = None
+            self.logger.info('Robokop Genetics Services initialized with no cache activated.')
+        self.hgnc = HGNCService()
+        self.myvariant = MyVariantService(hgnc_service=self.hgnc)
+        self.ensembl = EnsemblService(temp_dir=LoggingUtil.get_logging_path())
+    def get_variant_to_gene(self, services: list, variant_nodes: list):
+        self.logger.info(f'Get variant to gene called on {len(variant_nodes)} nodes.')
+        all_results = defaultdict(list)
+        for service in services:
+            if self.cache:
+                cache_key = f'{service}_sequence_variant_to_gene'
+                cached_results = self.cache.get_service_results(cache_key, [node.id for node in variant_nodes])
+                nodes_that_need_results = []
+                for i, node in enumerate(variant_nodes):
+                    cached_result = cached_results[i]
+                    if cached_result is not None:
+                        all_results[node.id].extend(cached_result)
+                    else:
+                        nodes_that_need_results.append(node)
+                self.logger.info(f'{service} variant to gene found results for {len(variant_nodes) - len(nodes_that_need_results)} nodes in the cache.')
+            else:
+                nodes_that_need_results = variant_nodes
+            if service == MYVARIANT:
+                # send batches to myvariant
+                counter = 0
+                myvariant_syn_dict = {}
+                for node in nodes_that_need_results:
+                    myvariant_syn_dict[node.id] = node.synonyms
+                    counter += 1
+                    # this batch size is pretty arbitrary
+                    # myvariant really sends batches of 1000
+                    # but we can probably cache more at a time
+                    if counter == 10000:
+                        new_myvariant_results = self.batch_query_variant_to_gene(MYVARIANT, myvariant_syn_dict)
+                        for node_id, results in new_myvariant_results.items():
+                            all_results[node_id].extend(results)
+                        if self.cache:
+                            self.cache.set_service_results(cache_key, new_myvariant_results)
+                        counter = 0
+                        myvariant_syn_dict = {}
+                if counter > 0:
+                    new_myvariant_results = self.batch_query_variant_to_gene(MYVARIANT, myvariant_syn_dict)
+                    for node_id, results in new_myvariant_results.items():
+                        all_results[node_id].extend(results)
+                    if self.cache:
+                        self.cache.set_service_results(cache_key, new_myvariant_results)
+            elif service == ENSEMBL:
+                new_ensembl_results = {}
+                counter = 0
+                for node in nodes_that_need_results:
+                    variant_id = node.id
+                    variant_syns = node.get_synonyms_by_prefix('ROBO_VARIANT')
+                    new_ensembl_results[variant_id] = self.ensembl.sequence_variant_to_gene(variant_id, variant_syns)
+                    all_results[variant_id].extend(new_ensembl_results[variant_id])
+                    counter += 1
+                    if counter == 10000 and self.cache:
+                        self.cache.set_service_results(cache_key, new_ensembl_results)
+                        new_ensembl_results = {}
+                        counter = 0
+                if counter > 0 and self.cache:
+                    self.cache.set_service_results(cache_key, new_ensembl_results)
+        return all_results
+    # service: the service to query (from ALL_VARIANT_TO_GENE_SERVICES)
+    # variant_id: plain curie string
+    # variant_synonyms: a set of synonym curies
+    #
+    # specify the service and provide variant information to find gene relationships
+    # results will be in a list of tuples
+    # (edge: SimpleEdge, gene_node: SimpleNode)
+    def query_variant_to_gene(self, service: str, variant_id: str, variant_synonyms: set):
+        if service == MYVARIANT:
+            return self.myvariant.sequence_variant_to_gene(variant_id, variant_synonyms)
+        elif service == ENSEMBL:
+            return self.ensembl.sequence_variant_to_gene(variant_id, variant_synonyms)
+        else:
+            self.logger.warning(f'Service ({service}) not found! Variant to gene failed.')
+    # variant_dict: a dictionary of variant_id (curie) to variant_synonyms (set of curies)
+    # these are the same parameters for get_variant_to_gene
+    # returns a dictionary with the variant id curie as keys and the results from get_variant_to_gene as values
+    def batch_query_variant_to_gene(self, service: str, variant_dict: dict):
+        if service == MYVARIANT:
+            return self.myvariant.batch_sequence_variant_to_gene(variant_dict)
+        else:
+            self.logger.warning(f'Service ({service}) not batch-able! Variant to gene failed.')
+    # given a plain string gene_symbol return a valid curie gene ID
+    # eg. BRCA1 -> HGNC:1100
+    def get_gene_id_from_symbol(self, gene_symbol: str):
+        return self.hgnc.get_gene_id_from_symbol(gene_symbol)

robokop_genetics-0.5.0/robokop_genetics/node_types.py ADDED Viewed

@@ -0,0 +1,8 @@
+GENE = 'biolink:Gene'  # biolink type identifier for gene nodes
+NAMED_THING = 'biolink:NamedThing'  # biolink type identifier used as the root type
+BIOLOGICAL_ENTITY = 'biolink:BiologicalEntity'  # biolink type identifier for biological entities
+SEQUENCE_VARIANT = 'biolink:SequenceVariant'  # biolink type identifier for sequence variant nodes
+# The root of all biolink_model entities
+ROOT_ENTITY = NAMED_THING

robokop_genetics-0.5.0/robokop_genetics/services/__init__.py ADDED Viewed

File without changes