chemsource 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chemsource-1.0.0/LICENSE +21 -0
- chemsource-1.0.0/PKG-INFO +15 -0
- chemsource-1.0.0/README.md +11 -0
- chemsource-1.0.0/setup.cfg +8 -0
- chemsource-1.0.0/setup.py +23 -0
- chemsource-1.0.0/src/chemsource/__init__.py +3 -0
- chemsource-1.0.0/src/chemsource/chemsource.py +55 -0
- chemsource-1.0.0/src/chemsource/classifier.py +22 -0
- chemsource-1.0.0/src/chemsource/config.py +70 -0
- chemsource-1.0.0/src/chemsource/exceptions.py +29 -0
- chemsource-1.0.0/src/chemsource/retriever.py +119 -0
- chemsource-1.0.0/src/chemsource.egg-info/PKG-INFO +15 -0
- chemsource-1.0.0/src/chemsource.egg-info/SOURCES.txt +15 -0
- chemsource-1.0.0/src/chemsource.egg-info/dependency_links.txt +1 -0
- chemsource-1.0.0/src/chemsource.egg-info/requires.txt +4 -0
- chemsource-1.0.0/src/chemsource.egg-info/top_level.txt +1 -0
chemsource-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Prajit Rajkumar
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: chemsource
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Tool to classify novel drugs and other health-related chemicals by origin
|
|
5
|
+
Author: Prajit Rajkumar
|
|
6
|
+
Author-email: prajkumar@ucsd.edu
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.6
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: lxml>=4.9.4
|
|
13
|
+
Requires-Dist: openai>=1.23.2
|
|
14
|
+
Requires-Dist: requests<3,>=2.0.0
|
|
15
|
+
Requires-Dist: wikipedia>=1.4.0
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# chemsource v1.0.0
|
|
2
|
+
`chemsource` is a tool to classify novel drugs and other chemicals by source that is currently offered in Python. The current iteration, `v1.0.0`, relies on information scraped from [Wikipedia](https://www.wikipedia.org/) and the NLM's [PubMed](https://pubmed.ncbi.nlm.nih.gov/) abstract database. Information retrieved is classified using OpenAI's [ChatGPT API](https://platform.openai.com/docs/api-reference) into a combination of 5 categories, `MEDICAL, ENDOGENOUS, FOOD, PERSONAL CARE,` or `INDUSTRIAL`. Chemicals without enough available information will be classified with the tag `INFO`.
|
|
3
|
+
|
|
4
|
+
## Installation & Setup
|
|
5
|
+
`chemsource` is available on `pypi` or can alternatively be downloaded directly from the GitHub repository. To use the classification feature of `chemsource`, users must have an OpenAI API key, with credits associated with that key, to provide to the model. Information on where to find the key can be found [here](https://help.openai.com/en/articles/4936850-where-do-i-find-my-openai-api-key). Credits can be added to your OpenAI account [here](https://platform.openai.com/account/billing/overview).
|
|
6
|
+
See `Cost` for more information.
|
|
7
|
+
|
|
8
|
+
## Usage
|
|
9
|
+
|
|
10
|
+
## Cost
|
|
11
|
+
`chemsource` as a package is available at no charge to all users. However, the classification step of the package uses OpenAI's ChatGPT models, which is a paid service due to the energetically demanding nature of Large Language Models.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
|
|
3
|
+
# Distribution metadata and build configuration for the chemsource package.
setup(
    name="chemsource",
    author="Prajit Rajkumar",
    author_email="prajkumar@ucsd.edu",
    description=(
        "Tool to classify novel drugs and other health-related"
        " chemicals by origin"
    ),
    # The importable packages are discovered under the src/ directory.
    package_dir={"": "src"},
    packages=find_packages(where="src"),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.6",
    install_requires=[
        "lxml>=4.9.4",
        "openai>=1.23.2",
        "requests>=2.0.0,<3",
        "wikipedia>=1.4.0",
    ],
)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from .config import Config
|
|
2
|
+
from .config import BASE_PROMPT
|
|
3
|
+
|
|
4
|
+
from .classifier import classify as cls
|
|
5
|
+
from .retriever import retrieve as ret
|
|
6
|
+
|
|
7
|
+
class ChemSource(Config):
    """Retrieve descriptive text about a chemical and classify it.

    All settings (``openai_key``, ``model``, ``ncbi_key``, ``prompt`` and
    ``max_tokens``) are stored by the ``Config`` base class; this class only
    forwards them to the retriever and classifier functions.
    """

    def __init__(self,
                 openai_key=None,
                 model="gpt-4-0125-preview",
                 ncbi_key=None,
                 prompt=BASE_PROMPT,
                 max_tokens=250000
                 ):
        """Store the given settings via ``Config.__init__``."""
        super().__init__(openai_key=openai_key,
                         model=model,
                         ncbi_key=ncbi_key,
                         prompt=prompt,
                         max_tokens=max_tokens
                         )

    def chemsource(self, name, priority="WIKIPEDIA", single_source=False):
        """Retrieve information about ``name`` and classify it.

        Args:
            name: Name of the chemical to look up.
            priority: Source to try first, passed through to the retriever.
            single_source: Passed through to the retriever.

        Returns:
            A 2-tuple ``(information, classification)`` where ``information``
            is whatever the retriever returned and ``classification`` is the
            classifier's output for it.

        Raises:
            ValueError: If no OpenAI API key has been configured.
        """
        # The attribute set by Config is ``openai_key``; the previous
        # ``self.openaikey`` spelling raised AttributeError on every call.
        if self.openai_key is None:
            raise ValueError("OpenAI API key must be provided")

        information = ret(name,
                          priority,
                          single_source,
                          ncbikey=self.ncbi_key
                          )

        return information, cls(name,
                                information,
                                self.openai_key,
                                self.prompt,
                                self.model,
                                self.max_tokens)

    def classify(self, name, information):
        """Classify ``name`` using caller-supplied ``information``.

        Args:
            name: Name of the chemical.
            information: Text (or other object) describing the chemical; it
                is handed to the classifier unchanged.

        Returns:
            The classifier's output.

        Raises:
            ValueError: If no OpenAI API key has been configured.
        """
        if self.openai_key is None:
            raise ValueError("OpenAI API key must be provided")

        return cls(name,
                   information,
                   self.openai_key,
                   self.prompt,
                   self.model,
                   self.max_tokens)

    def retrieve(self, name, priority="WIKIPEDIA", single_source=False):
        """Retrieve information about ``name`` without classifying it.

        No OpenAI key is required for this call; only the stored NCBI key
        (which may be ``None``) is forwarded to the retriever.
        """
        return ret(name,
                   priority,
                   single_source,
                   ncbikey=self.ncbi_key
                   )
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from openai import OpenAI
|
|
2
|
+
|
|
3
|
+
def classify(name,
             input_text=None,
             openaikey=None,
             baseprompt=None,
             gpt_model='gpt-4-0125-preview',
             max_length=250000):
    """Ask an OpenAI chat model to classify a compound.

    Args:
        name: Compound name inserted into the prompt.
        input_text: Information about the compound; it is converted with
            ``str()`` and appended to the prompt.
        openaikey: API key handed to the ``OpenAI`` client.
        baseprompt: Either a single string containing the placeholder
            ``COMPOUND_NAME`` (which is replaced by ``name``), or a
            two-element sequence ``(before_name, after_name)``.
        gpt_model: Model identifier sent with the request.
        max_length: Maximum prompt length in characters; longer prompts are
            cut off at this length.

    Returns:
        The text content of the first choice in the model's response.

    Raises:
        TypeError: If ``baseprompt`` is ``None`` (it cannot be indexed).
    """
    client = OpenAI(
        api_key=(openaikey)
    )

    if isinstance(baseprompt, str):
        # Indexing a string prompt with [0] and [1] would yield only its
        # first two characters, so substitute the placeholder instead.
        prompt = (baseprompt.replace("COMPOUND_NAME", str(name))
                  + str(input_text))
    else:
        # Two-part prompt: text before and after the compound name.
        prompt = baseprompt[0] + str(name) + baseprompt[1] + str(input_text)
    prompt = prompt[:max_length]

    response = client.chat.completions.create(
        model=gpt_model,
        messages=[{"role": "system", "content": prompt}]
    )

    return response.choices[0].message.content
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Default classification prompt. COMPOUND_NAME is a placeholder for the
# compound being classified; the retrieved information is appended after
# the trailing "Provided Information:" line.
BASE_PROMPT = ("Classify this compound, COMPOUND_NAME, as any combination of"
               + " the following: MEDICAL, ENDOGENOUS, FOOD, PERSONAL CARE,"
               + " INDUSTRIAL. Note that ENDOGENOUS refers to compounds that"
               + " are human synthesized. ENDOGENOUS excludes essential"
               + " nutrients that cannot be synthesized by human body. Note"
               + " that FOOD refers to compounds present in natural food"
               + " items. Note that INDUSTRIAL should be used only for"
               + " compounds not used as a contributing ingredient in the"
               + " medical, personal care, or food industries. Note that"
               + " PERSONAL CARE refers to non-medicated compounds typically"
               + " used for activities such as skincare, beauty, and fitness."
               + " Specify INFO instead if more information is needed. DO NOT"
               + " MAKE ANY ASSUMPTIONS, USE ONLY THE INFORMATION PROVIDED."
               + " Provide the output as a plain text separated by commas,"
               + " and provide only the categories listed (either list a"
               # Leading space added: the fragments previously joined into
               # "list acombination".
               + " combination of INDUSTRIAL, ENDOGENOUS, PERSONAL CARE,"
               + " MEDICAL, FOOD or list INFO), with no justification."
               + " Provided Information:\n")


class Config:
    """Container for API keys, model name, prompt and prompt length limit."""

    def __init__(self, openai_key=None,
                 model="gpt-4-0125-preview", ncbi_key=None,
                 prompt=BASE_PROMPT, max_tokens=250000):
        """Store every setting as an instance attribute of the same name."""
        self.openai_key = openai_key
        self.model = model
        self.ncbi_key = ncbi_key
        self.prompt = prompt
        self.max_tokens = max_tokens

    # NOTE(review): the four setters below share their names with instance
    # attributes assigned in __init__, so on an instance the attribute value
    # shadows the method (``cfg.ncbi_key`` is the key, not a callable). They
    # are kept for compatibility; assign the attribute directly or use
    # ``configure`` to change a setting.
    def ncbi_key(self, ncbi_key):
        """Set the NCBI API key."""
        self.ncbi_key = ncbi_key

    def openai_key(self, openai_key):
        """Set the OpenAI API key."""
        self.openai_key = openai_key

    def model(self, model):
        """Set the model name."""
        self.model = model

    def prompt(self, prompt):
        """Set the classification prompt."""
        self.prompt = prompt

    def token_limit(self, max_tokens):
        """Set ``max_tokens``."""
        self.max_tokens = max_tokens

    def configure(self, ncbi_key=None, openai_key=None,
                  model="gpt-4-0125-preview",
                  prompt=BASE_PROMPT, max_tokens=250000):
        """Replace all settings at once.

        Every setting is overwritten: arguments that are not passed reset
        the corresponding setting to its default value.
        """
        self.openai_key = openai_key
        self.model = model
        self.ncbi_key = ncbi_key
        self.prompt = prompt
        self.max_tokens = max_tokens

    @staticmethod
    def _mask(secret):
        """Return ``secret`` as asterisks of equal length, or None if unset."""
        if secret is None:
            return None
        return "*" * len(secret)

    def configuration(self):
        """Return the current settings as a dict, with both keys masked.

        Returns:
            A dict with the entries ``ncbi_key``, ``openai_key``, ``model``,
            ``prompt`` and ``token_limit``. Each key entry is ``None`` when
            the key is unset, otherwise one ``*`` per character of the key.
        """
        # Each masked value is reported under its own name; the two entries
        # were previously swapped.
        return {"ncbi_key": self._mask(self.ncbi_key),
                "openai_key": self._mask(self.openai_key),
                "model": self.model,
                "prompt": self.prompt,
                "token_limit": self.max_tokens
                }
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
class ChemSourceError(Exception):
    """Common base class for the errors defined by this package.

    Stores the optional ``message`` on the instance and forwards it to
    ``Exception`` so it also appears in ``args`` and ``str()``.
    """

    def __init__(self, message=None):
        self.message = message
        super().__init__(message)


class XMLParseError(ChemSourceError):
    """The PubMed search response could not be fetched or parsed."""


class XMLParseError2(ChemSourceError):
    """The PubMed abstract-fetch response could not be fetched or parsed."""


class XMLRetrievalError(ChemSourceError):
    """The result count could not be read from the PubMed search response."""


class XMLRetrievalError2(ChemSourceError):
    """Abstract elements could not be looked up in the fetched XML."""


class JoinError(ChemSourceError):
    """The retrieved abstracts could not be joined into one string."""


class DescriptionError(ChemSourceError):
    """Error related to a chemical description."""
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
#!pip install wikipedia
|
|
2
|
+
#IMPORTANT IN FINAL VERSION, FIGURE OUT HOW TO EITHER QUERY WITHOUT
|
|
3
|
+
#PUBMED KEY OR HOW TO GET A PUBMED KEY FROM USER
|
|
4
|
+
|
|
5
|
+
from .exceptions import XMLParseError, XMLRetrievalError
|
|
6
|
+
from .exceptions import XMLParseError2, XMLRetrievalError2, JoinError
|
|
7
|
+
|
|
8
|
+
from lxml import etree
|
|
9
|
+
import re
|
|
10
|
+
import requests as r
|
|
11
|
+
import wikipedia
|
|
12
|
+
|
|
13
|
+
# Template query parameters for the PubMed search request (esearch.fcgi).
# 'term' and 'api_key' are filled in by pubmed_retrieve for each call.
SEARCH_PARAMS = dict(
    db='pubmed',
    term='',
    retmax='3',
    usehistory='n',
    sort='relevance',
    api_key=None,
)

# Template query parameters for the PubMed fetch request (efetch.fcgi).
# 'WebEnv' and 'api_key' are filled in by pubmed_retrieve for each call.
XML_RETRIEVAL_PARAMS = dict(
    db='pubmed',
    query_key='1',
    WebEnv='',
    rettype='abstract',
    retmax='3',
    api_key=None,
)
|
|
28
|
+
|
|
29
|
+
def retrieve(name, priority="WIKIPEDIA", single_source=False, ncbikey=None):
    """Fetch descriptive text for ``name`` from Wikipedia and/or PubMed.

    Args:
        name: Chemical name to look up.
        priority: ``"WIKIPEDIA"`` or ``"PUBMED"``; the source tried first.
            Any other value queries PubMed only.
        single_source: When true, only the first source is tried and no
            fallback to the other source happens.
        ncbikey: Optional NCBI API key forwarded to the PubMed lookup.

    Returns:
        A 2-tuple ``(info_source, description)``. ``info_source`` is the
        name of the source that succeeded (``"WIKIPEDIA"`` or ``"PUBMED"``).
        Both elements are ``None`` when every attempted source raised.
    """
    fetchers = {
        "WIKIPEDIA": lambda: wikipedia_retrieve(name),
        "PUBMED": lambda: pubmed_retrieve(name, ncbikey),
    }

    if priority == "WIKIPEDIA":
        order = ["WIKIPEDIA", "PUBMED"]
    elif priority == "PUBMED":
        order = ["PUBMED", "WIKIPEDIA"]
    else:
        # Unrecognised priorities have always meant "PubMed only".
        order = ["PUBMED"]

    if single_source:
        order = order[:1]

    for source in order:
        try:
            return source, fetchers[source]()
        except Exception:
            # Best effort: a failing source falls through to the next one.
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # interpreter exit are not swallowed.
            continue

    return None, None
|
|
70
|
+
|
|
71
|
+
def pubmed_retrieve(drug, ncbikey=None, timeout=30):
    """Return the concatenated PubMed abstracts whose title matches ``drug``.

    Runs a title search (``drug + '[ti]'``) against esearch.fcgi, then
    fetches the matching records from efetch.fcgi and joins the text of
    every ``AbstractText`` element, each preceded by a single space.

    Args:
        drug: Search term (string).
        ncbikey: Optional NCBI API key; omitted from the requests when None.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The joined abstract text, or the string ``'NO_RESULTS'`` when the
        search reports zero hits.

    Raises:
        XMLParseError: The search request failed or returned unparsable XML.
        XMLRetrievalError: The hit count or WebEnv token could not be read
            from the search response.
        XMLParseError2: The fetch request failed or returned unparsable XML.
        XMLRetrievalError2: Looking up the abstract elements failed.
        JoinError: Joining the abstract text failed.
    """
    # Work on copies so the module-level templates are never mutated.
    search_params = dict(SEARCH_PARAMS)
    search_params['term'] = drug + '[ti]'
    # The fetch step below addresses the results through WebEnv/query_key,
    # which the search only returns when history is enabled.
    search_params['usehistory'] = 'y'
    if ncbikey is None:
        search_params.pop('api_key', None)
    else:
        search_params['api_key'] = ncbikey

    try:
        search_response = r.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?",
            params=search_params,
            timeout=timeout)
        xml_content = etree.fromstring(search_response.content)
    except Exception as err:
        raise XMLParseError from err

    try:
        # Count is text; compare it as an integer (the former string-to-0
        # comparison could never be true).
        hit_count = int(xml_content.find(".//Count").text)
    except Exception as err:
        raise XMLRetrievalError from err
    if hit_count == 0:
        return 'NO_RESULTS'

    try:
        web_env = xml_content.find(".//WebEnv").text
    except Exception as err:
        raise XMLRetrievalError from err

    fetch_params = dict(XML_RETRIEVAL_PARAMS)
    fetch_params['WebEnv'] = web_env
    if ncbikey is None:
        fetch_params.pop('api_key', None)
    else:
        fetch_params['api_key'] = ncbikey

    try:
        fetch_response = r.get(
            'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?',
            params=fetch_params,
            timeout=timeout)
        retrieval_content = etree.fromstring(fetch_response.content)
    except Exception as err:
        raise XMLParseError2 from err

    try:
        abstracts = retrieval_content.findall(".//AbstractText")
    except Exception as err:
        raise XMLRetrievalError2 from err

    try:
        # itertext() also collects text nested inside inline markup and
        # yields nothing for an empty element, where ``.text`` would be
        # truncated or None.
        return ''.join(' ' + ''.join(abstract.itertext())
                       for abstract in abstracts)
    except Exception as err:
        raise JoinError from err
|
|
113
|
+
|
|
114
|
+
def wikipedia_retrieve(drug):
    """Return the Wikipedia page content for ``drug`` on a single line.

    The page is looked up by exact title (``auto_suggest=False``). Every
    run of whitespace, including newlines and tabs, is collapsed to one
    space and leading/trailing whitespace is removed.
    """
    page = wikipedia.page(drug, auto_suggest=False)
    # split() with no argument breaks on any whitespace run, so newlines
    # and tabs need no separate replacement before re-joining.
    words = page.content.split()
    return ' '.join(words)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: chemsource
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Tool to classify novel drugs and other health-related chemicals by origin
|
|
5
|
+
Author: Prajit Rajkumar
|
|
6
|
+
Author-email: prajkumar@ucsd.edu
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.6
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: lxml>=4.9.4
|
|
13
|
+
Requires-Dist: openai>=1.23.2
|
|
14
|
+
Requires-Dist: requests<3,>=2.0.0
|
|
15
|
+
Requires-Dist: wikipedia>=1.4.0
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
setup.cfg
|
|
4
|
+
setup.py
|
|
5
|
+
src/chemsource/__init__.py
|
|
6
|
+
src/chemsource/chemsource.py
|
|
7
|
+
src/chemsource/classifier.py
|
|
8
|
+
src/chemsource/config.py
|
|
9
|
+
src/chemsource/exceptions.py
|
|
10
|
+
src/chemsource/retriever.py
|
|
11
|
+
src/chemsource.egg-info/PKG-INFO
|
|
12
|
+
src/chemsource.egg-info/SOURCES.txt
|
|
13
|
+
src/chemsource.egg-info/dependency_links.txt
|
|
14
|
+
src/chemsource.egg-info/requires.txt
|
|
15
|
+
src/chemsource.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
chemsource
|