chemsource 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Prajit Rajkumar
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.1
2
+ Name: chemsource
3
+ Version: 1.0.0
4
+ Summary: Tool to classify novel drugs and other health-related chemicals by origin
5
+ Author: Prajit Rajkumar
6
+ Author-email: prajkumar@ucsd.edu
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.6
11
+ License-File: LICENSE
12
+ Requires-Dist: lxml>=4.9.4
13
+ Requires-Dist: openai>=1.23.2
14
+ Requires-Dist: requests<3,>=2.0.0
15
+ Requires-Dist: wikipedia>=1.4.0
@@ -0,0 +1,11 @@
1
+ # chemsource v1.0.0
2
+ `chemsource` is a Python tool that classifies novel drugs and other chemicals by source. The current version, `v1.0.0`, relies on information scraped from [Wikipedia](https://www.wikipedia.org/) and the NLM's [PubMed](https://pubmed.ncbi.nlm.nih.gov/) abstract database. The retrieved information is classified using OpenAI's [ChatGPT API](https://platform.openai.com/docs/api-reference) into any combination of 5 categories: `MEDICAL`, `ENDOGENOUS`, `FOOD`, `PERSONAL CARE`, and `INDUSTRIAL`. Chemicals without enough available information are classified with the tag `INFO`.
3
+
4
+ ## Installation & Setup
5
+ `chemsource` is available on PyPI and can alternatively be downloaded directly from the GitHub repository. To use the classification feature of `chemsource`, users must supply an OpenAI API key, and the account that owns the key must have credits available. Information on where to find the key can be found [here](https://help.openai.com/en/articles/4936850-where-do-i-find-my-openai-api-key). Credits can be added to your OpenAI account [here](https://platform.openai.com/account/billing/overview).
6
+ See `Cost` for more information.
7
+
8
+ ## Usage
9
+
10
+ ## Cost
11
+ `chemsource` as a package is available with no additional charge to all users. However, the use of OpenAI's ChatGPT models within the classification step of the package is a service that costs money due to the energetically demanding nature of Large Language Models.
@@ -0,0 +1,8 @@
1
+ [metadata]
2
+ version = attr: chemsource.__version__
3
+ license_files = LICENSE
4
+
5
+ [egg_info]
6
+ tag_build =
7
+ tag_date = 0
8
+
@@ -0,0 +1,23 @@
1
+ from setuptools import setup, find_packages
2
+
3
# Distribution metadata for chemsource (src/ layout).  No version is passed
# here; setup.cfg supplies it via ``version = attr: chemsource.__version__``.
# NOTE(review): openai>=1.23.2 may not support Python 3.6 -- confirm and raise
# ``python_requires`` if so.
setup(
    name="chemsource",
    description=(
        "Tool to classify novel drugs and other health-related"
        " chemicals by origin"
    ),
    author="Prajit Rajkumar",
    author_email="prajkumar@ucsd.edu",
    # Packages live under src/, so discovery and the package root both
    # point there.
    packages=find_packages(where="src"),
    package_dir={"": "src"},
    python_requires=">=3.6",
    install_requires=[
        "lxml>=4.9.4",
        "openai>=1.23.2",
        "requests>=2.0.0,<3",
        "wikipedia>=1.4.0",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
@@ -0,0 +1,3 @@
1
# Package version.  setup.cfg reads this value through
# ``version = attr: chemsource.__version__``.
__version__ = "1.0.0"

# Re-export the main class so callers can write
# ``from chemsource import ChemSource``.
from .chemsource import ChemSource
@@ -0,0 +1,55 @@
1
+ from .config import Config
2
+ from .config import BASE_PROMPT
3
+
4
+ from .classifier import classify as cls
5
+ from .retriever import retrieve as ret
6
+
7
class ChemSource(Config):
    """Retrieve descriptive text for a chemical and classify it with OpenAI.

    All settings (``openai_key``, ``model``, ``ncbi_key``, ``prompt``,
    ``max_tokens``) are stored by the ``Config`` base class; this class only
    wires them into the retriever (``ret``) and the classifier (``cls``).
    """

    def __init__(self,
                 openai_key=None,
                 model="gpt-4-0125-preview",
                 ncbi_key=None,
                 prompt=BASE_PROMPT,
                 max_tokens=250000
                 ):
        """Store the credentials and model settings on the instance.

        Args:
            openai_key: OpenAI API key; required for classification.
            model: Name of the OpenAI chat model to use.
            ncbi_key: Optional NCBI API key forwarded to the retriever.
            prompt: Prompt handed to the classifier.
            max_tokens: Limit forwarded to the classifier as its
                ``max_length`` argument.
        """
        super().__init__(openai_key=openai_key,
                         model=model,
                         ncbi_key=ncbi_key,
                         prompt=prompt,
                         max_tokens=max_tokens
                         )

    def _require_openai_key(self):
        """Raise ``ValueError`` when no OpenAI API key has been configured."""
        # The attribute set by Config.__init__ is ``openai_key``; the
        # previous code read the non-existent ``openaikey`` and therefore
        # always failed with AttributeError.
        if self.openai_key is None:
            raise ValueError("OpenAI API key must be provided")

    def chemsource(self, name, priority="WIKIPEDIA", single_source=False):
        """Retrieve information about ``name`` and classify it in one call.

        Args:
            name: Name of the chemical to look up.
            priority: Source to try first, ``"WIKIPEDIA"`` or ``"PUBMED"``.
            single_source: When true, only the priority source is queried.

        Returns:
            A 2-tuple ``(information, classification)`` where
            ``information`` is exactly what the retriever returned and
            ``classification`` is the classifier's output.

        Raises:
            ValueError: If no OpenAI API key is configured.
        """
        self._require_openai_key()

        information = ret(name,
                          priority,
                          single_source,
                          ncbikey=self.ncbi_key
                          )

        # ``information`` is passed on unchanged, as before.
        return information, cls(name,
                                information,
                                self.openai_key,
                                self.prompt,
                                self.model,
                                self.max_tokens)

    def classify(self, name, information):
        """Classify ``name`` using caller-supplied ``information``.

        Args:
            name: Name of the chemical.
            information: Text (or other object) describing the chemical; it
                is forwarded unchanged to the classifier.

        Returns:
            The classifier's output.

        Raises:
            ValueError: If no OpenAI API key is configured.
        """
        self._require_openai_key()

        return cls(name,
                   information,
                   self.openai_key,
                   self.prompt,
                   self.model,
                   self.max_tokens)

    def retrieve(self, name, priority="WIKIPEDIA", single_source=False):
        """Retrieve information about ``name`` without classifying it.

        Args:
            name: Name of the chemical to look up.
            priority: Source to try first, ``"WIKIPEDIA"`` or ``"PUBMED"``.
            single_source: When true, only the priority source is queried.

        Returns:
            Whatever the retriever returns for these arguments.
        """
        return ret(name,
                   priority,
                   single_source,
                   ncbikey=self.ncbi_key
                   )
@@ -0,0 +1,22 @@
1
+ from openai import OpenAI
2
+
3
def classify(name,
             input_text=None,
             openaikey=None,
             baseprompt=None,
             gpt_model='gpt-4-0125-preview',
             max_length=250000):
    """Ask an OpenAI chat model to classify a compound.

    Args:
        name: Name of the compound; converted with ``str()``.
        input_text: Information about the compound; converted with ``str()``
            and appended to the prompt.
        openaikey: OpenAI API key used to build the client.
        baseprompt: Either a single prompt string, in which every occurrence
            of ``COMPOUND_NAME`` is replaced by ``name``, or a two-item
            sequence ``(prefix, suffix)`` placed around ``name``.
        gpt_model: Name of the chat model to query.
        max_length: Maximum number of *characters* of the assembled prompt
            that are sent; anything beyond is cut off.

    Returns:
        The text content of the first choice in the model's response.

    Raises:
        TypeError: If ``baseprompt`` is ``None``.
    """
    if baseprompt is None:
        raise TypeError("baseprompt must be a prompt string or a "
                        "(prefix, suffix) pair, not None")

    if isinstance(baseprompt, str):
        # Indexing a plain string would yield its first two characters
        # rather than a prefix/suffix, discarding the instructions, so a
        # string prompt is treated as a template instead.
        prompt = baseprompt.replace("COMPOUND_NAME", str(name))
        prompt = prompt + str(input_text)
    else:
        prompt = baseprompt[0] + str(name) + baseprompt[1] + str(input_text)
    prompt = prompt[:max_length]

    client = OpenAI(
        api_key=openaikey
    )

    response = client.chat.completions.create(
        model=gpt_model,
        messages=[{"role": "system", "content": prompt}]
    )

    return response.choices[0].message.content
@@ -0,0 +1,70 @@
1
# Default classification prompt.  The text to classify is expected to be
# appended after the trailing "Provided Information:" line.
# NOTE(review): COMPOUND_NAME looks like a placeholder for the chemical's
# name -- check that the classifier substitutes it before sending.
BASE_PROMPT = (
    "Classify this compound, COMPOUND_NAME, as any combination of"
    " the following: MEDICAL, ENDOGENOUS, FOOD, PERSONAL CARE,"
    " INDUSTRIAL. Note that ENDOGENOUS refers to compounds that"
    " are human synthesized. ENDOGENOUS excludes essential"
    " nutrients that cannot be synthesized by human body. Note"
    " that FOOD refers to compounds present in natural food"
    " items. Note that INDUSTRIAL should be used only for"
    " compounds not used as a contributing ingredient in the"
    " medical, personal care, or food industries. Note that"
    " PERSONAL CARE refers to non-medicated compounds typically"
    " used for activities such as skincare, beauty, and fitness."
    " Specify INFO instead if more information is needed. DO NOT"
    " MAKE ANY ASSUMPTIONS, USE ONLY THE INFORMATION PROVIDED."
    " Provide the output as a plain text separated by commas,"
    " and provide only the categories listed (either list a"
    # Leading space restored here; the pieces used to join as "acombination".
    " combination of INDUSTRIAL, ENDOGENOUS, PERSONAL CARE,"
    " MEDICAL, FOOD or list INFO), with no justification."
    " Provided Information:\n"
)
19
+
20
class Config:
    """Hold the API credentials and model settings used by chemsource.

    Instance attributes: ``openai_key``, ``model``, ``ncbi_key``, ``prompt``
    and ``max_tokens``.

    NOTE: ``__init__`` assigns instance attributes named ``ncbi_key``,
    ``openai_key``, ``model`` and ``prompt``, which shadow the same-named
    setter methods defined below.  On an instance those names therefore
    evaluate to the stored values, not to callables.  To change a setting,
    assign the attribute directly or call ``configure``.  ``token_limit`` is
    not shadowed and works as a setter.  The shadowed methods are kept only
    so the class-level names remain available.
    """

    def __init__(self, openai_key=None,
                 model="gpt-4-0125-preview", ncbi_key=None,
                 prompt=BASE_PROMPT, max_tokens=250000):
        """Store every setting on the instance."""
        self.openai_key = openai_key
        self.model = model
        self.ncbi_key = ncbi_key
        self.prompt = prompt
        self.max_tokens = max_tokens

    def ncbi_key(self, ncbi_key):
        """Set the NCBI API key (shadowed on instances; see class note)."""
        self.ncbi_key = ncbi_key

    def openai_key(self, openai_key):
        """Set the OpenAI API key (shadowed on instances; see class note)."""
        self.openai_key = openai_key

    def model(self, model):
        """Set the model name (shadowed on instances; see class note)."""
        self.model = model

    def prompt(self, prompt):
        """Set the prompt (shadowed on instances; see class note)."""
        self.prompt = prompt

    def token_limit(self, max_tokens):
        """Set ``max_tokens``."""
        self.max_tokens = max_tokens

    def configure(self, ncbi_key=None, openai_key=None,
                  model="gpt-4-0125-preview",
                  prompt=BASE_PROMPT, max_tokens=250000):
        """Replace all settings at once.

        Every setting is overwritten: arguments that are omitted reset the
        corresponding attribute to the default shown in the signature.
        """
        self.openai_key = openai_key
        self.model = model
        self.ncbi_key = ncbi_key
        self.prompt = prompt
        self.max_tokens = max_tokens

    @staticmethod
    def _masked(secret):
        """Return ``None`` for ``None``, else one ``*`` per character."""
        if secret is None:
            return None
        return "*" * len(secret)

    def configuration(self):
        """Return the current settings as a dict with both API keys masked.

        Returns:
            A dict with the keys ``ncbi_key``, ``openai_key``, ``model``,
            ``prompt`` and ``token_limit``.  Each key entry is ``None`` when
            unset, otherwise a string of asterisks as long as the key.
        """
        # The two masked values used to be swapped between the dict keys.
        return {"ncbi_key": self._masked(self.ncbi_key),
                "openai_key": self._masked(self.openai_key),
                "model": self.model,
                "prompt": self.prompt,
                "token_limit": self.max_tokens
                }
@@ -0,0 +1,29 @@
1
class ChemSourceError(Exception):
    """Common base for the exceptions defined by this package.

    Catching this class catches every exception below.  The optional
    ``message`` is stored on the instance and also passed to ``Exception``.
    """

    def __init__(self, message=None):
        self.message = message
        super().__init__(message)


class XMLParseError(ChemSourceError):
    """Raised by the retriever when the PubMed search response cannot be
    fetched or parsed as XML."""


class XMLParseError2(ChemSourceError):
    """Raised by the retriever when the PubMed fetch response cannot be
    fetched or parsed as XML."""


class XMLRetrievalError(ChemSourceError):
    """Raised by the retriever when the result count cannot be read from
    the PubMed search response."""


class XMLRetrievalError2(ChemSourceError):
    """Raised by the retriever when abstract elements cannot be looked up
    in the PubMed fetch response."""


class JoinError(ChemSourceError):
    """Raised by the retriever when the abstract texts cannot be
    concatenated."""


class DescriptionError(ChemSourceError):
    """Error related to a description.

    NOTE(review): not raised anywhere in the visible sources -- confirm the
    intended use.
    """
@@ -0,0 +1,119 @@
1
+ #!pip install wikipedia
2
+ #IMPORTANT IN FINAL VERSION, FIGURE OUT HOW TO EITHER QUERY WITHOUT
3
+ #PUBMED KEY OR HOW TO GET A PUBMED KEY FROM USER
4
+
5
+ from .exceptions import XMLParseError, XMLRetrievalError
6
+ from .exceptions import XMLParseError2, XMLRetrievalError2, JoinError
7
+
8
+ from lxml import etree
9
+ import re
10
+ import requests as r
11
+ import wikipedia
12
+
13
# Query-string template for the E-utilities ESearch request.  'term' and
# 'api_key' are filled in per call.
SEARCH_PARAMS = {'db': 'pubmed',
                 'term': '',
                 'retmax': '3',
                 # Must be 'y': the fetch step below addresses the search
                 # result through WebEnv/query_key, which ESearch only
                 # returns when the result is posted to the history server.
                 'usehistory': 'y',
                 'sort': 'relevance',
                 'api_key': None
                 }

# Query-string template for the E-utilities EFetch request.  'WebEnv' is
# copied from the ESearch response and 'api_key' is filled in per call.
XML_RETRIEVAL_PARAMS = {'db': 'pubmed',
                        'query_key': '1',
                        'WebEnv': '',
                        'rettype': 'abstract',
                        'retmax': '3',
                        'api_key': None
                        }
28
+
29
def retrieve(name, priority="WIKIPEDIA", single_source=False, ncbikey=None):
    """Fetch descriptive text for ``name`` from Wikipedia and/or PubMed.

    Sources are tried in order and the first one that does not raise wins.

    Args:
        name: Name of the chemical to look up.
        priority: ``"WIKIPEDIA"`` or ``"PUBMED"``; the source tried first.
            Any other value queries PubMed only.
        single_source: When true, only the priority source is tried.
        ncbikey: Optional NCBI API key passed to the PubMed lookup.

    Returns:
        A 2-tuple ``(info_source, description)``.  ``info_source`` is
        ``"WIKIPEDIA"`` or ``"PUBMED"``; both items are ``None`` when every
        attempted source failed.
    """
    fetchers = {
        "WIKIPEDIA": lambda: wikipedia_retrieve(name),
        "PUBMED": lambda: pubmed_retrieve(name, ncbikey),
    }

    if priority == "WIKIPEDIA":
        order = ["WIKIPEDIA"] if single_source else ["WIKIPEDIA", "PUBMED"]
    elif priority == "PUBMED" and not single_source:
        order = ["PUBMED", "WIKIPEDIA"]
    else:
        # Single-source PubMed, or an unrecognised priority: PubMed only,
        # exactly as before.
        order = ["PUBMED"]

    for info_source in order:
        try:
            return info_source, fetchers[info_source]()
        except Exception:
            # Best-effort lookup: any failure moves on to the next source.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            continue

    return None, None
70
+
71
def pubmed_retrieve(drug, ncbikey=None):
    """Return the concatenated PubMed abstracts for titles matching ``drug``.

    Runs an ESearch restricted to the title field (``[ti]``) and then an
    EFetch of the posted result set.

    Args:
        drug: Search term; must be a string.
        ncbikey: Optional NCBI API key.  When ``None`` no ``api_key``
            parameter is sent.

    Returns:
        ``'NO_RESULTS'`` when the search reports zero hits; otherwise a
        string made of each abstract's text preceded by a single space.

    Raises:
        XMLParseError: The search request failed or was not valid XML.
        XMLRetrievalError: The hit count or WebEnv could not be read.
        XMLParseError2: The fetch request failed or was not valid XML.
        XMLRetrievalError2: Looking up the abstract elements failed.
        JoinError: The abstract texts could not be concatenated.
    """
    # Work on copies: the module-level templates must not be mutated, or
    # one call's term/key would leak into the next.
    search_params = dict(SEARCH_PARAMS)
    search_params['term'] = drug + '[ti]'
    # The fetch below addresses the result via WebEnv/query_key, which
    # ESearch only returns when the history server is used.
    search_params['usehistory'] = 'y'
    if ncbikey is None:
        search_params.pop('api_key', None)
    else:
        search_params['api_key'] = ncbikey

    try:
        xml_content = etree.fromstring(
            r.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?",
                  params=search_params).content)
    except Exception as err:
        raise XMLParseError from err

    try:
        # Compare as an integer; the old string-to-int comparison was
        # always false, so 'NO_RESULTS' could never be returned.
        hit_count = int(xml_content.find(".//Count").text)
    except Exception as err:
        raise XMLRetrievalError from err
    if hit_count == 0:
        return 'NO_RESULTS'

    web_env = xml_content.find(".//WebEnv")
    if web_env is None or not web_env.text:
        raise XMLRetrievalError("search response contains no WebEnv")

    retrieval_params = dict(XML_RETRIEVAL_PARAMS)
    retrieval_params['WebEnv'] = web_env.text
    if ncbikey is None:
        retrieval_params.pop('api_key', None)
    else:
        retrieval_params['api_key'] = ncbikey

    try:
        retrieval_content = etree.fromstring(
            r.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?',
                  params=retrieval_params).content)
    except Exception as err:
        raise XMLParseError2 from err

    try:
        abstracts = retrieval_content.findall(".//AbstractText")
    except Exception as err:
        raise XMLRetrievalError2 from err

    try:
        # itertext() also collects text nested in inline markup, where
        # ``.text`` alone can be None.
        return ''.join(' ' + ''.join(abstract.itertext())
                       for abstract in abstracts)
    except Exception as err:
        raise JoinError from err
113
+
114
def wikipedia_retrieve(drug):
    """Return the Wikipedia page content for ``drug`` as one line of text.

    The page is looked up by exact title (``auto_suggest=False``).  Every
    run of whitespace is collapsed to a single space and leading/trailing
    whitespace is dropped.  Exceptions from ``wikipedia.page`` are not
    caught here.
    """
    page = wikipedia.page(drug, auto_suggest=False)
    # split() with no separator breaks on any whitespace run, newlines and
    # tabs included, so no separate replace step is needed.
    words = page.content.split()
    return ' '.join(words)
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.1
2
+ Name: chemsource
3
+ Version: 1.0.0
4
+ Summary: Tool to classify novel drugs and other health-related chemicals by origin
5
+ Author: Prajit Rajkumar
6
+ Author-email: prajkumar@ucsd.edu
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.6
11
+ License-File: LICENSE
12
+ Requires-Dist: lxml>=4.9.4
13
+ Requires-Dist: openai>=1.23.2
14
+ Requires-Dist: requests<3,>=2.0.0
15
+ Requires-Dist: wikipedia>=1.4.0
@@ -0,0 +1,15 @@
1
+ LICENSE
2
+ README.md
3
+ setup.cfg
4
+ setup.py
5
+ src/chemsource/__init__.py
6
+ src/chemsource/chemsource.py
7
+ src/chemsource/classifier.py
8
+ src/chemsource/config.py
9
+ src/chemsource/exceptions.py
10
+ src/chemsource/retriever.py
11
+ src/chemsource.egg-info/PKG-INFO
12
+ src/chemsource.egg-info/SOURCES.txt
13
+ src/chemsource.egg-info/dependency_links.txt
14
+ src/chemsource.egg-info/requires.txt
15
+ src/chemsource.egg-info/top_level.txt
@@ -0,0 +1,4 @@
1
+ lxml>=4.9.4
2
+ openai>=1.23.2
3
+ requests<3,>=2.0.0
4
+ wikipedia>=1.4.0
@@ -0,0 +1 @@
1
+ chemsource