aiagents4pharma 1.32.0__py3-none-any.whl → 1.34.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ols.py +62 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_pubchem.py +10 -3
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_reactome.py +44 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py +44 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_pubchem_utils.py +25 -3
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ols_terms.py +76 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py +11 -6
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/reactome_pathways.py +65 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py +90 -0
- aiagents4pharma/talk2knowledgegraphs/utils/pubchem_utils.py +34 -4
- {aiagents4pharma-1.32.0.dist-info → aiagents4pharma-1.34.0.dist-info}/METADATA +1 -1
- {aiagents4pharma-1.32.0.dist-info → aiagents4pharma-1.34.0.dist-info}/RECORD +17 -11
- {aiagents4pharma-1.32.0.dist-info → aiagents4pharma-1.34.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.32.0.dist-info → aiagents4pharma-1.34.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.32.0.dist-info → aiagents4pharma-1.34.0.dist-info}/top_level.txt +0 -0
@@ -5,4 +5,7 @@ defaults:
|
|
5
5
|
- tools/subgraph_summarization: default
|
6
6
|
- tools/graphrag_reasoning: default
|
7
7
|
- utils/pubchem_utils: default
|
8
|
+
- utils/enrichments/uniprot_proteins: default
|
9
|
+
- utils/enrichments/ols_terms: default
|
10
|
+
- utils/enrichments/reactome_pathways: default
|
8
11
|
- app/frontend: default
|
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
|
3
|
+
"""
|
4
|
+
Test cases for utils/enrichments/ols_terms.py
|
5
|
+
"""
|
6
|
+
|
7
|
+
import pytest
|
8
|
+
from ..utils.enrichments.ols_terms import EnrichmentWithOLS
|
9
|
+
|
10
|
+
# In this test, we will consider 6 examples:
|
11
|
+
# 1. CL_0000899: T-helper 17 cell (Cell Ontology)
|
12
|
+
# 2. GO_0046427: positive regulation of receptor signaling pathway via JAK-STAT (GO)
|
13
|
+
# 3. UBERON_0000004: nose (Uberon)
|
14
|
+
# 4. HP_0009739: Hypoplasia of the antihelix (Human Phenotype Ontology)
|
15
|
+
# 5. MONDO_0005011: Crohn disease (MONDO)
|
16
|
+
# 6. XYZ_0000000: Non-existing term (for testing error handling)
|
17
|
+
|
18
|
+
# The expected description for each term starts with:
|
19
|
+
CL_DESC = "CD4-positive, alpha-beta T cell"
|
20
|
+
GO_DESC = "Any process that activates or increases the frequency, rate or extent"
|
21
|
+
UBERON_DESC = "The olfactory organ of vertebrates, consisting of nares"
|
22
|
+
HP_DESC = "Hypoplasia of the antihelix"
|
23
|
+
MONDO_DESC = "A gastrointestinal disorder characterized by chronic inflammation"
|
24
|
+
|
25
|
+
# The expected description for the non-existing term is None
|
26
|
+
|
27
|
+
@pytest.fixture(name="enrich_obj")
def fixture_uniprot_config():
    """Return an EnrichmentWithOLS instance for use by the tests."""
    # NOTE(review): the function name mentions UniProt, but the object built
    # here is the OLS enrichment; tests request it as "enrich_obj".
    return EnrichmentWithOLS()
|
31
|
+
|
32
|
+
def test_enrich_documents(enrich_obj):
    """Check enrich_documents on five known OLS terms and one unknown term."""
    # Each known term is paired with the prefix its description must start with.
    known_terms = (
        ("CL_0000899", CL_DESC),
        ("GO_0046427", GO_DESC),
        ("UBERON_0000004", UBERON_DESC),
        ("HP_0009739", HP_DESC),
        ("MONDO_0005011", MONDO_DESC),
    )
    query = [term_id for term_id, _ in known_terms] + ["XYZ_0000000"]
    enriched = enrich_obj.enrich_documents(query)
    for position, (_, prefix) in enumerate(known_terms):
        assert enriched[position].startswith(prefix)
    # The non-existing identifier must not produce a description.
    assert enriched[5] is None
|
47
|
+
|
48
|
+
def test_enrich_documents_with_rag(enrich_obj):
    """Check enrich_documents_with_rag on five known OLS terms and one unknown term."""
    # Each known term is paired with the prefix its description must start with.
    known_terms = (
        ("CL_0000899", CL_DESC),
        ("GO_0046427", GO_DESC),
        ("UBERON_0000004", UBERON_DESC),
        ("HP_0009739", HP_DESC),
        ("MONDO_0005011", MONDO_DESC),
    )
    query = [term_id for term_id, _ in known_terms] + ["XYZ_0000000"]
    # No supporting documents are supplied for the RAG variant.
    enriched = enrich_obj.enrich_documents_with_rag(query, None)
    for position, (_, prefix) in enumerate(known_terms):
        assert enriched[position].startswith(prefix)
    # The non-existing identifier must not produce a description.
    assert enriched[5] is None
|
@@ -13,7 +13,9 @@ from ..utils.enrichments.pubchem_strings import EnrichmentWithPubChem
|
|
13
13
|
# The expected SMILES representation for the first PubChem ID is:
|
14
14
|
SMILES_FIRST = 'C[C@@H]1C[C@H]2[C@@H]3[C@@H](CC4=CC(=O)C=C[C@@]'
|
15
15
|
SMILES_FIRST += '4([C@H]3[C@H](C[C@@]2([C@]1(C(=O)CO)O)C)O)C)Cl'
|
16
|
-
# The expected
|
16
|
+
# The expected description for the first PubChem ID starts with:
|
17
|
+
DESCRIPTION_FIRST = "Alclometasone is a prednisolone compound having an alpha-chloro substituent"
|
18
|
+
# The expected SMILES representation and description for the second PubChem ID are None.
|
17
19
|
|
18
20
|
@pytest.fixture(name="enrich_obj")
|
19
21
|
def fixture_pubchem_config():
|
@@ -23,11 +25,16 @@ def fixture_pubchem_config():
|
|
23
25
|
def test_enrich_documents(enrich_obj):
|
24
26
|
"""Test the enrich_documents method."""
|
25
27
|
pubchem_ids = ["5311000", "1X"]
|
26
|
-
enriched_strings = enrich_obj.enrich_documents(pubchem_ids)
|
28
|
+
enriched_descriptions, enriched_strings = enrich_obj.enrich_documents(pubchem_ids)
|
27
29
|
assert enriched_strings == [SMILES_FIRST, None]
|
30
|
+
assert enriched_descriptions[0].startswith(DESCRIPTION_FIRST)
|
31
|
+
assert enriched_descriptions[1] is None
|
28
32
|
|
29
33
|
def test_enrich_documents_with_rag(enrich_obj):
|
30
34
|
"""Test the enrich_documents_with_rag method."""
|
31
35
|
pubchem_ids = ["5311000", "1X"]
|
32
|
-
enriched_strings = enrich_obj.enrich_documents_with_rag(pubchem_ids,
|
36
|
+
enriched_descriptions, enriched_strings = enrich_obj.enrich_documents_with_rag(pubchem_ids,
|
37
|
+
None)
|
33
38
|
assert enriched_strings == [SMILES_FIRST, None]
|
39
|
+
assert enriched_descriptions[0].startswith(DESCRIPTION_FIRST)
|
40
|
+
assert enriched_descriptions[1] is None
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
|
3
|
+
"""
|
4
|
+
Test cases for utils/enrichments/reactome_pathways.py
|
5
|
+
"""
|
6
|
+
|
7
|
+
import pytest
|
8
|
+
from ..utils.enrichments.reactome_pathways import EnrichmentWithReactome
|
9
|
+
|
10
|
+
# In this test, we will consider 3 examples:
|
11
|
+
# 1. R-HSA-3244647: cGAS binds cytosolic DNA
|
12
|
+
# 2. R-HSA-9905952: ATP binds P2RX7 in P2RX7 trimer:PANX1 heptamer
|
13
|
+
# 3. R-HSA-1234567: Fake pathway
|
14
|
+
|
15
|
+
# The expected description of pathway R-HSA-3244647 starts with:
|
16
|
+
FIRST_PATHWAY = "Cyclic GMP-AMP (cGAMP) synthase (cGAS) was identified as a cytosolic DNA"
|
17
|
+
# The expected description of pathway R-HSA-9905952 starts with:
|
18
|
+
SECOND_PATHWAY = "The P2RX7 (P2X7, P2Z) trimer binds ATP,"
|
19
|
+
# The expected description of pathway R-HSA-1234567 is None.
|
20
|
+
|
21
|
+
@pytest.fixture(name="enrich_obj")
def fixture_uniprot_config():
    """Return an EnrichmentWithReactome instance for use by the tests."""
    # NOTE(review): the function name mentions UniProt, but the object built
    # here is the Reactome enrichment; tests request it as "enrich_obj".
    return EnrichmentWithReactome()
|
25
|
+
|
26
|
+
def test_enrich_documents(enrich_obj):
    """Check enrich_documents on two known Reactome IDs and one fake ID."""
    pathway_ids = ["R-HSA-3244647", "R-HSA-9905952", "R-HSA-1234567"]
    enriched = enrich_obj.enrich_documents(pathway_ids)
    # The first two identifiers must yield descriptions with known openings.
    for index, prefix in enumerate((FIRST_PATHWAY, SECOND_PATHWAY)):
        assert enriched[index].startswith(prefix)
    # The fake identifier must not produce a description.
    assert enriched[2] is None
|
35
|
+
|
36
|
+
def test_enrich_documents_with_rag(enrich_obj):
    """Check enrich_documents_with_rag on two known Reactome IDs and one fake ID."""
    pathway_ids = ["R-HSA-3244647", "R-HSA-9905952", "R-HSA-1234567"]
    # No supporting documents are supplied for the RAG variant.
    enriched = enrich_obj.enrich_documents_with_rag(pathway_ids, None)
    # The first two identifiers must yield descriptions with known openings.
    for index, prefix in enumerate((FIRST_PATHWAY, SECOND_PATHWAY)):
        assert enriched[index].startswith(prefix)
    # The fake identifier must not produce a description.
    assert enriched[2] is None
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
|
3
|
+
"""
|
4
|
+
Test cases for utils/enrichments/uniprot_proteins.py
|
5
|
+
"""
|
6
|
+
|
7
|
+
import pytest
|
8
|
+
from ..utils.enrichments.uniprot_proteins import EnrichmentWithUniProt
|
9
|
+
|
10
|
+
# In this test, we will consider 3 examples:
|
11
|
+
# 1. Gene Name: TP53
|
12
|
+
# 2. Gene Name: TP5 (Incomplete; must return empty results)
|
13
|
+
# 3. Gene Name: XZ (Shorter than 3 characters; must return empty results)
|
14
|
+
# The expected description of TP53 starts with:
|
15
|
+
START_DESCP = "Multifunctional transcription factor"
|
16
|
+
# The expected amino acid sequence of TP53 starts with:
|
17
|
+
START_SEQ = "MEEPQSDPSV"
|
18
|
+
|
19
|
+
@pytest.fixture(name="enrich_obj")
def fixture_uniprot_config():
    """Return an EnrichmentWithUniProt instance for use by the tests."""
    return EnrichmentWithUniProt()
|
23
|
+
|
24
|
+
def test_enrich_documents(enrich_obj):
    """Check enrich_documents on one valid gene name and two invalid ones."""
    queried_genes = ["TP53", "TP5", "XZ"]
    functions, protein_seqs = enrich_obj.enrich_documents(queried_genes)
    # TP53 must come back with its known function text and sequence start.
    assert functions[0].startswith(START_DESCP)
    assert protein_seqs[0].startswith(START_SEQ)
    # Neither of the two invalid names may yield a result.
    for index in (1, 2):
        assert functions[index] is None
        assert protein_seqs[index] is None
|
34
|
+
|
35
|
+
def test_enrich_documents_with_rag(enrich_obj):
    """Check enrich_documents_with_rag on one valid gene name and two invalid ones."""
    queried_genes = ["TP53", "TP5", "XZ"]
    # No supporting documents are supplied for the RAG variant.
    functions, protein_seqs = enrich_obj.enrich_documents_with_rag(queried_genes, None)
    # TP53 must come back with its known function text and sequence start.
    assert functions[0].startswith(START_DESCP)
    assert protein_seqs[0].startswith(START_SEQ)
    # Neither of the two invalid names may yield a result.
    for index in (1, 2):
        assert functions[index] is None
        assert protein_seqs[index] is None
|
@@ -4,13 +4,35 @@ Test cases for utils/pubchem_utils.py
|
|
4
4
|
|
5
5
|
from ..utils import pubchem_utils
|
6
6
|
|
7
|
-
def
|
7
|
+
def test_external_id2pubchem_cid():
|
8
8
|
"""
|
9
|
-
Test the
|
9
|
+
Test the external_id2pubchem_cid function.
|
10
10
|
|
11
11
|
The DrugBank ID for Alclometasone is DB00240.
|
12
12
|
The PubChem CID for Alclometasone is 5311000.
|
13
|
+
|
14
|
+
The CTD ID for Butylated Hydroxyanisole is D002083
|
15
|
+
The PubChem CID for Butylated Hydroxyanisole is 24667.
|
13
16
|
"""
|
14
17
|
drugbank_id = "DB00240"
|
15
|
-
pubchem_cid = pubchem_utils.
|
18
|
+
pubchem_cid = pubchem_utils.external_id2pubchem_cid('drugbank', drugbank_id)
|
16
19
|
assert pubchem_cid == 5311000
|
20
|
+
|
21
|
+
ctd_id = "D002083"
|
22
|
+
pubchem_cid = pubchem_utils.external_id2pubchem_cid(
|
23
|
+
'comparative toxicogenomics database',
|
24
|
+
ctd_id)
|
25
|
+
assert pubchem_cid == 24667
|
26
|
+
|
27
|
+
def test_pubchem_cid_description():
    """
    Check the pubchem_cid_description function.

    Alclometasone has the PubChem CID 5311000, and the text returned for
    that CID must begin with the expected opening sentence fragment.
    """
    expected_prefix = "Alclometasone is a prednisolone compound having an alpha-chloro substituent"
    text = pubchem_utils.pubchem_cid_description(5311000)
    assert text.startswith(expected_prefix)
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
|
3
|
+
"""
|
4
|
+
Enrichment class for enriching OLS terms with textual descriptions
|
5
|
+
"""
|
6
|
+
|
7
|
+
from typing import List
|
8
|
+
import logging
|
9
|
+
import json
|
10
|
+
import hydra
|
11
|
+
import requests
|
12
|
+
from .enrichments import Enrichments
|
13
|
+
|
14
|
+
# Initialize logger
|
15
|
+
logging.basicConfig(level=logging.INFO)
|
16
|
+
logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
class EnrichmentWithOLS(Enrichments):
    """
    Enrichment class using OLS terms

    Every term is looked up over HTTP at the ``base_url`` read from the Hydra
    configuration group ``utils/enrichments/ols_terms``.
    """

    @staticmethod
    def _as_list(value):
        """Return value as a list: None becomes [], a string becomes [string]."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    def enrich_documents(self, texts: List[str]) -> List[str]:
        """
        Enrich a list of input OLS terms

        Args:
            texts: The list of OLS terms to be enriched. Each one is sent as
                the ``short_form`` query parameter.

        Returns:
            The list of enriched descriptions, one entry per input term and in
            the same order. An entry is the term's description, synonyms and
            label joined by newlines, or None when the request fails or the
            response contains no matching term.
        """
        ols_ids = texts

        logger.info("Load Hydra configuration for OLS enrichments.")
        with hydra.initialize(version_base=None, config_path="../../configs"):
            cfg = hydra.compose(config_name='config',
                                overrides=['utils/enrichments/ols_terms=default'])
            cfg = cfg.utils.enrichments.ols_terms

        descriptions = []
        for ols_id in ols_ids:
            r = requests.get(cfg.base_url,
                             headers={"Accept": "application/json"},
                             params={'short_form': ols_id},
                             timeout=cfg.timeout)
            # A failed request may not carry a JSON body at all, so do not
            # try to parse it (same handling as the sibling enrichments).
            if not r.ok:
                descriptions.append(None)
                continue
            response_body = json.loads(r.text)
            # No '_embedded' section, or an empty 'terms' list, means no match.
            embedded = (response_body.get('_embedded')
                        if isinstance(response_body, dict) else None)
            terms = (embedded or {}).get('terms') or []
            if not terms:
                descriptions.append(None)
                continue
            term = terms[0]
            # Any of the three fields may be absent or null for a given term;
            # the label is a plain string while the others are lists.
            parts = self._as_list(term.get('description'))
            parts += self._as_list(term.get('synonyms'))
            parts += self._as_list(term.get('label'))
            descriptions.append('\n'.join(parts))
        return descriptions

    def enrich_documents_with_rag(self, texts, docs):
        """
        Enrich a list of input OLS terms

        Args:
            texts: The list of OLS terms to be enriched.
            docs: Not used by this implementation.

        Returns:
            The list of enriched descriptions
        """
        return self.enrich_documents(texts)
|
@@ -1,12 +1,13 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
|
3
3
|
"""
|
4
|
-
Enrichment class for enriching PubChem IDs with their STRINGS representation.
|
4
|
+
Enrichment class for enriching PubChem IDs with their STRINGS representation and descriptions.
|
5
5
|
"""
|
6
6
|
|
7
7
|
from typing import List
|
8
8
|
import pubchempy as pcp
|
9
9
|
from .enrichments import Enrichments
|
10
|
+
from ..pubchem_utils import pubchem_cid_description
|
10
11
|
|
11
12
|
class EnrichmentWithPubChem(Enrichments):
|
12
13
|
"""
|
@@ -20,20 +21,24 @@ class EnrichmentWithPubChem(Enrichments):
|
|
20
21
|
texts: The list of pubchem IDs to be enriched.
|
21
22
|
|
22
23
|
Returns:
|
23
|
-
The list of enriched STRINGS
|
24
|
+
The list of enriched STRINGS and their descriptions.
|
24
25
|
"""
|
25
26
|
|
26
|
-
|
27
|
+
enriched_pubchem_ids_smiles = []
|
28
|
+
enriched_pubchem_ids_descriptions = []
|
29
|
+
|
27
30
|
pubchem_cids = texts
|
28
31
|
for pubchem_cid in pubchem_cids:
|
29
32
|
try:
|
30
33
|
c = pcp.Compound.from_cid(pubchem_cid)
|
31
34
|
except pcp.BadRequestError:
|
32
|
-
|
35
|
+
enriched_pubchem_ids_smiles.append(None)
|
36
|
+
enriched_pubchem_ids_descriptions.append(None)
|
33
37
|
continue
|
34
|
-
|
38
|
+
enriched_pubchem_ids_smiles.append(c.isomeric_smiles)
|
39
|
+
enriched_pubchem_ids_descriptions.append(pubchem_cid_description(pubchem_cid))
|
35
40
|
|
36
|
-
return
|
41
|
+
return enriched_pubchem_ids_descriptions, enriched_pubchem_ids_smiles
|
37
42
|
|
38
43
|
def enrich_documents_with_rag(self, texts, docs):
|
39
44
|
"""
|
@@ -0,0 +1,65 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
|
3
|
+
"""
|
4
|
+
Enrichment class for enriching Reactome pathways with textual descriptions
|
5
|
+
"""
|
6
|
+
|
7
|
+
from typing import List
|
8
|
+
import logging
|
9
|
+
import hydra
|
10
|
+
import requests
|
11
|
+
from .enrichments import Enrichments
|
12
|
+
|
13
|
+
# Initialize logger
|
14
|
+
logging.basicConfig(level=logging.INFO)
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
17
|
+
class EnrichmentWithReactome(Enrichments):
    """
    Enrichment class using Reactome pathways

    Every pathway is looked up over HTTP at ``base_url`` from the Hydra
    configuration group ``utils/enrichments/reactome_pathways``.
    """
    def enrich_documents(self, texts: List[str]) -> List[str]:
        """
        Enrich a list of input Reactome pathways

        Args:
            texts: The list of Reactome pathways to be enriched.

        Returns:
            The list of enriched descriptions, one entry per input pathway and
            in the same order. An entry is None when the request is not
            successful or the response text has no tab-separated second field.
        """
        reactome_pathways_ids = texts

        logger.info("Load Hydra configuration for reactome enrichment")
        with hydra.initialize(version_base=None, config_path="../../configs"):
            cfg = hydra.compose(config_name='config',
                                overrides=['utils/enrichments/reactome_pathways=default'])
            cfg = cfg.utils.enrichments.reactome_pathways

        descriptions = []
        for reactome_pathway_id in reactome_pathways_ids:
            r = requests.get(cfg.base_url + reactome_pathway_id + '/summation',
                             headers={"Accept": "text/plain"},
                             timeout=cfg.timeout)
            # if the response is not ok
            if not r.ok:
                descriptions.append(None)
                continue
            # The description is taken from the second tab-separated field
            # (presumably the first field is the identifier - TODO confirm).
            fields = r.text.split('\t')
            # An OK response without a tab has no second field; report it as
            # "no description" rather than failing the whole batch.
            descriptions.append(fields[1] if len(fields) > 1 else None)
        return descriptions

    def enrich_documents_with_rag(self, texts, docs):
        """
        Enrich a list of input Reactome pathways

        Args:
            texts: The list of Reactome pathways to be enriched.
            docs: Not used by this implementation.

        Returns:
            The list of enriched descriptions
        """
        return self.enrich_documents(texts)
|
@@ -0,0 +1,90 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
|
3
|
+
"""
|
4
|
+
Enrichment class for enriching Gene names with their function and sequence using UniProt.
|
5
|
+
"""
|
6
|
+
|
7
|
+
from typing import List
|
8
|
+
import logging
|
9
|
+
import json
|
10
|
+
import hydra
|
11
|
+
import requests
|
12
|
+
from .enrichments import Enrichments
|
13
|
+
|
14
|
+
# Initialize logger
|
15
|
+
logging.basicConfig(level=logging.INFO)
|
16
|
+
logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
class EnrichmentWithUniProt(Enrichments):
    """
    Enrichment class using UniProt

    Every gene is looked up over HTTP at ``uniprot_url`` from the Hydra
    configuration group ``utils/enrichments/uniprot_proteins``.
    """
    def enrich_documents(self, texts: List[str]) -> List[str]:
        """
        Enrich a list of input UniProt gene names with their function and sequence.

        Args:
            texts: The list of gene names to be enriched.

        Returns:
            A pair ``(descriptions, sequences)`` of two lists, each with one
            entry per input gene and in the same order. For a gene whose
            request fails or whose response is empty, both entries are None.
            A description is the concatenated text of the entry's FUNCTION
            comments ('' when it has none).
        """
        # NOTE(review): the return annotation says List[str] but a tuple of
        # two lists is returned; it is left as-is because only List is known
        # to be imported by this module.
        enriched_gene_names = texts

        logger.info(
            "Load Hydra configuration for Gene enrichment with description and sequence.")
        with hydra.initialize(version_base=None, config_path="../../configs"):
            cfg = hydra.compose(config_name='config',
                                overrides=['utils/enrichments/uniprot_proteins=default'])
            cfg = cfg.utils.enrichments.uniprot_proteins

        descriptions = []
        sequences = []
        for gene in enriched_gene_names:
            params = {
                "reviewed": cfg.reviewed,
                "isoform": cfg.isoform,
                "exact_gene": gene,
                "organism": cfg.organism,
                # You can get the list of all available organisms here:
                # https://www.uniprot.org/help/taxonomy
            }

            r = requests.get(cfg.uniprot_url,
                             headers={"Accept": "application/json"},
                             params=params,
                             timeout=cfg.timeout)
            # if the response is not ok
            if not r.ok:
                descriptions.append(None)
                sequences.append(None)
                continue
            response_body = json.loads(r.text)
            # if the response body is empty
            if not response_body:
                descriptions.append(None)
                sequences.append(None)
                continue
            # Only the first returned entry is used.
            entry = response_body[0]
            # An entry is not guaranteed to carry 'comments' (or a comment a
            # 'text' list); treat a missing one as "no function text".
            description = ''.join(
                value['value']
                for comment in entry.get('comments') or []
                if comment.get('type') == 'FUNCTION'
                for value in comment.get('text') or []
            )
            sequence = entry['sequence']['sequence']
            descriptions.append(description)
            sequences.append(sequence)
        return descriptions, sequences

    def enrich_documents_with_rag(self, texts, docs):
        """
        Enrich a list of input UniProt gene names with their function and sequence.

        Args:
            texts: The list of gene names to be enriched.
            docs: Not used by this implementation.

        Returns:
            The pair of lists (functions, sequences) from enrich_documents.
        """
        return self.enrich_documents(texts)
|
@@ -12,12 +12,16 @@ import hydra
|
|
12
12
|
logging.basicConfig(level=logging.INFO)
|
13
13
|
logger = logging.getLogger(__name__)
|
14
14
|
|
15
|
-
def
|
15
|
+
def external_id2pubchem_cid(db, db_id):
|
16
16
|
"""
|
17
|
-
Convert
|
17
|
+
Convert external DB ID to PubChem CID.
|
18
|
+
Please refer to the following URL for more information
|
19
|
+
on data sources:
|
20
|
+
https://pubchem.ncbi.nlm.nih.gov/sources/
|
18
21
|
|
19
22
|
Args:
|
20
|
-
|
23
|
+
db: The database name.
|
24
|
+
db_id: The database ID of the drug.
|
21
25
|
|
22
26
|
Returns:
|
23
27
|
The PubChem CID of the drug.
|
@@ -28,7 +32,7 @@ def drugbank_id2pubchem_cid(drugbank_id):
|
|
28
32
|
overrides=['utils/pubchem_utils=default'])
|
29
33
|
cfg = cfg.utils.pubchem_utils
|
30
34
|
# Prepare the URL
|
31
|
-
pubchem_url_for_drug = cfg.
|
35
|
+
pubchem_url_for_drug = f"{cfg.pubchem_cid_base_url}/{db}/{db_id}/JSON"
|
32
36
|
# Get the data
|
33
37
|
response = requests.get(pubchem_url_for_drug, timeout=60)
|
34
38
|
data = response.json()
|
@@ -40,3 +44,29 @@ def drugbank_id2pubchem_cid(drugbank_id):
|
|
40
44
|
cid = compound["id"].get("id", {}).get("cid")
|
41
45
|
break
|
42
46
|
return cid
|
47
|
+
|
48
|
+
def pubchem_cid_description(cid):
    """
    Get the description of a PubChem CID.

    Args:
        cid: The PubChem CID of the drug.

    Returns:
        The description of the PubChem CID: the concatenation of every
        "Description" value found in the response, or an empty string when
        the response carries no information list or no description.
    """
    logger.info("Load Hydra configuration for PubChem CID description.")
    with hydra.initialize(version_base=None, config_path="../configs"):
        cfg = hydra.compose(config_name='config',
                            overrides=['utils/pubchem_utils=default'])
        cfg = cfg.utils.pubchem_utils
    # Prepare the URL
    description_url = f"{cfg.pubchem_cid_description_url}/{cid}/description/JSON"
    # Get the data
    response = requests.get(description_url, timeout=60)
    data = response.json()
    # Extract the PubChem CID description. A payload without
    # "InformationList" (for instance an error reply) yields '' instead of
    # raising KeyError.
    information_items = data.get("InformationList", {}).get("Information", [])
    return ''.join(item.get("Description", '') for item in information_items)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: aiagents4pharma
|
3
|
-
Version: 1.
|
3
|
+
Version: 1.34.0
|
4
4
|
Summary: AI Agents for drug discovery, drug development, and other pharmaceutical R&D.
|
5
5
|
Classifier: Programming Language :: Python :: 3
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
@@ -75,7 +75,7 @@ aiagents4pharma/talk2knowledgegraphs/__init__.py,sha256=Z0Eo7LTiKk0STsr8VI7wkCLq
|
|
75
75
|
aiagents4pharma/talk2knowledgegraphs/agents/__init__.py,sha256=iOAzuy_8A03tQDFtSBhC9dldUo62z5gfxcVtXAdLOJs,92
|
76
76
|
aiagents4pharma/talk2knowledgegraphs/agents/t2kg_agent.py,sha256=IcXSZ2qQA1m-gS-o0Pj_g1oar8uPdhsbaovloUFka3Q,3058
|
77
77
|
aiagents4pharma/talk2knowledgegraphs/configs/__init__.py,sha256=4_DVdpahaJ55yPl0aZotlFA_MYWLFF2cubWyKtBVI_Q,126
|
78
|
-
aiagents4pharma/talk2knowledgegraphs/configs/config.yaml,sha256=
|
78
|
+
aiagents4pharma/talk2knowledgegraphs/configs/config.yaml,sha256=X91262b-wkygiH4HrEr0bIzHxHDuDWwuxLQAmdUe-E4,367
|
79
79
|
aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/__init__.py,sha256=-fAORvyFmG2iSvFOFDixmt9OTQRR58y89uhhu2EgbA8,46
|
80
80
|
aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/default.yaml,sha256=ENCGROwYFpR6g4QD518h73sshdn3vPVpotBMk1QJcpU,4830
|
81
81
|
aiagents4pharma/talk2knowledgegraphs/configs/app/__init__.py,sha256=fKfc3FR7g5KjY9b6jzrU6cwKTVVpkoVZQS3dvUowu34,69
|
@@ -111,9 +111,12 @@ aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_ollama.py,sha25
|
|
111
111
|
aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_sentencetransformer.py,sha256=Qxo6WeIDRy8aLh1tNKw0kSlzmUj3MtTak63oW2YwB24,1327
|
112
112
|
aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_enrichments.py,sha256=N6HRr4lWHXY7bTHe2uXJe4D_EG9WqZPibZne6qLl9_k,1447
|
113
113
|
aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ollama.py,sha256=JhY7axvVULLywDJ2ctA-gob5YPeaJYWsaMNjHT6L9CU,3021
|
114
|
-
aiagents4pharma/talk2knowledgegraphs/tests/
|
114
|
+
aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ols.py,sha256=woSm723ns9fHieu-QWFiniLlm5h22v1qzO4v6n20K5g,2413
|
115
|
+
aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_pubchem.py,sha256=0SgYvqdvxseUYTHx2KuSNI2hnmQ3VVVz0F-79_-P41o,1769
|
116
|
+
aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_reactome.py,sha256=r1D74mavsnSCm4xnWl0n0nM9PZqgm3doD2dulNrKNVQ,1754
|
117
|
+
aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py,sha256=G13Diw7cA5TGINUNO1CDnN4rM6KbepxRXNjuzY578DI,1611
|
115
118
|
aiagents4pharma/talk2knowledgegraphs/tests/test_utils_kg_utils.py,sha256=pal76wi7WgQWUNk56BrzfFV8jKpbDaHHdbwtgx_gXLI,2410
|
116
|
-
aiagents4pharma/talk2knowledgegraphs/tests/test_utils_pubchem_utils.py,sha256=
|
119
|
+
aiagents4pharma/talk2knowledgegraphs/tests/test_utils_pubchem_utils.py,sha256=K1Y6QM0MDP1IrAdcWkigl8R-O-i-lsL4NCyOrWewhdM,1246
|
117
120
|
aiagents4pharma/talk2knowledgegraphs/tools/__init__.py,sha256=zpD4h7EYtyq0QNOqLd6bkxrPlPb2XN64ceI9ncgESrA,171
|
118
121
|
aiagents4pharma/talk2knowledgegraphs/tools/graphrag_reasoning.py,sha256=OEuOFncDRdb7TQEGq4rkT5On-jI-R7Nt8K5EBzaND8w,5338
|
119
122
|
aiagents4pharma/talk2knowledgegraphs/tools/load_arguments.py,sha256=zhmsRp-8vjB5rRekqTA07d3yb-42HWqng9dDMkvK6hM,623
|
@@ -121,17 +124,20 @@ aiagents4pharma/talk2knowledgegraphs/tools/subgraph_extraction.py,sha256=te06QMF
|
|
121
124
|
aiagents4pharma/talk2knowledgegraphs/tools/subgraph_summarization.py,sha256=mDSBOxopDfNhEJeU8fVI8b5lXTYrRzcc97aLbFgYSy4,4413
|
122
125
|
aiagents4pharma/talk2knowledgegraphs/utils/__init__.py,sha256=cZqb3LZLmBnmyAtWFv2Z-4uJvQmx0M4zKsfiWrlM3Pk,195
|
123
126
|
aiagents4pharma/talk2knowledgegraphs/utils/kg_utils.py,sha256=6vQnPkeOWae_8jePjhma3sJuMTngy0I0tqzdFt6OqKg,2507
|
124
|
-
aiagents4pharma/talk2knowledgegraphs/utils/pubchem_utils.py,sha256=
|
127
|
+
aiagents4pharma/talk2knowledgegraphs/utils/pubchem_utils.py,sha256=8cve_KLtQUhG3uMKYpyelZvpETSsNGRdGE4X0NXMk4M,2442
|
125
128
|
aiagents4pharma/talk2knowledgegraphs/utils/embeddings/__init__.py,sha256=POSDrSdFAWsBCueOPD-Fok-ARdTywJU1ivwpT9EU1Kw,199
|
126
129
|
aiagents4pharma/talk2knowledgegraphs/utils/embeddings/embeddings.py,sha256=1nGznrAj-xT0xuSMBGz2dOujJ7M_IwSR84njxtxsy9A,2523
|
127
130
|
aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py,sha256=2vi_elf6EgzfagFAO5QnL3a_aXZyN7B1EBziu44MTfM,3806
|
128
131
|
aiagents4pharma/talk2knowledgegraphs/utils/embeddings/nim_molmim.py,sha256=XH6JNfmMS38UEU7UGJeeabHfRykharnQpQaqjO86OlQ,1537
|
129
132
|
aiagents4pharma/talk2knowledgegraphs/utils/embeddings/ollama.py,sha256=8w0sjt3Ex5YJ_XvpKl9UbhdTiiaoMIarbPUxLBU-1Uw,2378
|
130
133
|
aiagents4pharma/talk2knowledgegraphs/utils/embeddings/sentence_transformer.py,sha256=36iKlisOpMtGR5xfTAlSHXWvPqVC_Jbezod8kbBBMVg,2136
|
131
|
-
aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py,sha256=
|
134
|
+
aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py,sha256=K157MWJ4dn2fj3G5ClhyAOXg9jI2H02GP07J6UpasJw,230
|
132
135
|
aiagents4pharma/talk2knowledgegraphs/utils/enrichments/enrichments.py,sha256=Bx8x6zzk5614ApWB90N_iv4_Y_Uq0-KwUeBwYSdQMU4,924
|
133
136
|
aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ollama.py,sha256=8eoxR-VHo0G7ReQIwje7xEhE-SJlHdef7_wJRpnvFIc,4116
|
134
|
-
aiagents4pharma/talk2knowledgegraphs/utils/enrichments/
|
137
|
+
aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ols_terms.py,sha256=xSPP-h2q9fABz6Sd6ZlH9WiyoO8KZeEnPI5n2nJpWL4,2443
|
138
|
+
aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py,sha256=CQEGQ6Qsex2T91Vw7zTrclJBbSGGhxeWaVJb8tnURAQ,1691
|
139
|
+
aiagents4pharma/talk2knowledgegraphs/utils/enrichments/reactome_pathways.py,sha256=I0cD0Fk2Uk27_4jEaIhpoGhoMh_RphY1VtkMnk4dkPg,2011
|
140
|
+
aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py,sha256=z0Jb3tt8VzRjzqI9oVcUvRlPPg6BUdmslfKDIEFE_h8,3013
|
135
141
|
aiagents4pharma/talk2knowledgegraphs/utils/extractions/__init__.py,sha256=7gwwtfzKhB8GuOBD47XRi0NprwEXkOzwNl5eeu-hDTI,86
|
136
142
|
aiagents4pharma/talk2knowledgegraphs/utils/extractions/pcst.py,sha256=m5p0yoJb7I19ua5yeQfXPf7c4r6S1XPwttsrM7Qoy94,9336
|
137
143
|
aiagents4pharma/talk2scholars/__init__.py,sha256=NOZxTklAH1j1ggu97Ib8Xn9LCKudEWt-8dx8w7yxVD8,180
|
@@ -219,8 +225,8 @@ aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py,sha256=lyrfpx8NH
|
|
219
225
|
aiagents4pharma/talk2scholars/tools/zotero/utils/review_helper.py,sha256=IPD1V9yrBYaDnRe7sR6PrpwR82OBJbA2P_Tc6RbxAbM,2748
|
220
226
|
aiagents4pharma/talk2scholars/tools/zotero/utils/write_helper.py,sha256=ALwLecy1QVebbsmXJiDj1GhGmyhq2R2tZlAyEl1vfhw,7410
|
221
227
|
aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_path.py,sha256=oIrfbOySgts50ksHKyjcWjRkPRIS88g3Lc0v9mBkU8w,6375
|
222
|
-
aiagents4pharma-1.
|
223
|
-
aiagents4pharma-1.
|
224
|
-
aiagents4pharma-1.
|
225
|
-
aiagents4pharma-1.
|
226
|
-
aiagents4pharma-1.
|
228
|
+
aiagents4pharma-1.34.0.dist-info/licenses/LICENSE,sha256=IcIbyB1Hyk5ZDah03VNQvJkbNk2hkBCDqQ8qtnCvB4Q,1077
|
229
|
+
aiagents4pharma-1.34.0.dist-info/METADATA,sha256=P4bmxMTSkbYdRNmw6mijR5O19PBaYRqqP3SgtFhdtpk,16043
|
230
|
+
aiagents4pharma-1.34.0.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
|
231
|
+
aiagents4pharma-1.34.0.dist-info/top_level.txt,sha256=-AH8rMmrSnJtq7HaAObS78UU-cTCwvX660dSxeM7a0A,16
|
232
|
+
aiagents4pharma-1.34.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|