scibite-toolkit 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ from .termite import *
2
+ from .texpress import *
3
+
4
+
@@ -0,0 +1,158 @@
1
+ import requests
2
+ import logging
3
+
4
+ # Get the logger for this module
5
+ logger = logging.getLogger(__name__)
6
+
7
class CentreeRequestBuilder:
    """
    Class for creating CENtree Requests.

    Holds the CENtree base URL, a shared ``requests.Session``, the bearer-token
    header set by :meth:`set_authentication`, the request timeout and the SSL
    verification setting that is applied to every request made by this builder.
    """

    def __init__(self, timeout: int = 10):
        """
        Initialize the CentreeRequestBuilder.

        Parameters
        ----------
        timeout : int, optional
            The timeout for HTTP requests in seconds (default is 10 seconds).
        """
        self.centree_url = ''
        self.headers = {}
        self.session = requests.Session()
        self.timeout = timeout
        # SSL verification setting reused by every request; updated by
        # set_authentication() so login and later calls behave the same way.
        self.verify_request = True
        self.logger: logging.Logger = logger

    def set_url(self, centree_url: str):
        """
        Set the URL of the CENtree instance.

        Any trailing ``/`` is stripped so endpoint paths can be appended directly.

        Parameters
        ----------
        centree_url : str
            The URL of the CENtree instance to be hit.

        Examples
        --------
        >>> crb.set_url("http://example.com")
        """
        self.centree_url = centree_url.rstrip('/')
        self.logger.info(f"Set CENtree URL to {self.centree_url}")

    def set_authentication(self, username: str, password: str, remember_me: bool = True, verification: bool = True):
        """
        Authenticates with the CENtree token API using username and password, generates an access token,
        and sets the request header.

        The ``verification`` value is also remembered on the builder so that
        subsequent requests (e.g. :meth:`search_classes`) use the same SSL setting.

        Parameters
        ----------
        username : str
            The username for authentication.
        password : str
            The password for authentication.
        remember_me : bool, optional
            Whether to remember the user (default is True).
        verification : bool, optional
            Whether to verify SSL certificates (default is True).

        Raises
        ------
        requests.exceptions.HTTPError
            If the authentication endpoint answers with an error status.
        requests.exceptions.RequestException
            If the request itself fails.
        ValueError
            If the response does not contain an ``id_token``.

        Examples
        --------
        >>> crb.set_authentication("user", "pass")
        """
        authenticate_url = f"{self.centree_url}/api/authenticate"
        # Keep the caller's SSL choice: without this, later calls would fall
        # back to default verification and fail against the same host.
        self.verify_request = verification

        try:
            token_response = self.session.post(
                authenticate_url,
                json={
                    "rememberMe": remember_me,
                    "username": username,
                    "password": password,
                },
                headers={"Content-Type": "application/json"},
                verify=verification,
                timeout=self.timeout
            )
            token_response.raise_for_status()
            access_token = token_response.json().get("id_token")

            if not access_token:
                raise ValueError("Access token not found in the response.")

            self.headers = {"Authorization": f"Bearer {access_token}"}
            self.logger.info("Authentication successful")

        # Each handler logs and re-raises with a bare ``raise`` so the caller
        # receives the original exception and traceback unchanged.
        except requests.exceptions.HTTPError as http_err:
            self.logger.error(f"HTTP error occurred: {http_err.response.status_code} - {http_err.response.reason}")
            raise
        except requests.exceptions.RequestException as req_err:
            self.logger.error(f"Request error: {req_err}")
            raise
        except ValueError as val_err:
            self.logger.error(f"Value error: {val_err}")
            raise
        except Exception as err:
            self.logger.error(f"An error occurred: {err}")
            raise

    def search_classes(self, query: str, ontology_id: str = None, exact: bool = False, obsolete: bool = False,
                       page_from: int = 0, page_size: int = 10) -> dict:
        """
        Search classes in the CENtree ontology.

        Parameters
        ----------
        query : str
            The search query.
        ontology_id : str, optional
            The ontology ID to search within.
        exact : bool, optional
            Whether to perform an exact search (default is False).
        obsolete : bool, optional
            Whether to include obsolete classes (default is False).
        page_from : int, optional
            The starting page number (default is 0).
        page_size : int, optional
            The number of results per page (default is 10).

        Returns
        -------
        dict
            The JSON response from the search endpoint. If the request fails,
            the error is logged and ``None`` is returned instead.

        Examples
        --------
        >>> result = crb.search_classes("diabetes")
        """
        candidate_params = {
            "q": query,
            "ontology": ontology_id,
            "from": page_from,
            "size": page_size
        }
        # Parameters left as None are omitted from the query string entirely.
        params = {k: v for k, v in candidate_params.items() if v is not None}

        # Endpoint is /api/search, optionally followed by /obsolete and/or /exact.
        endpoint_suffix = ''
        if obsolete:
            endpoint_suffix += '/obsolete'
        if exact:
            endpoint_suffix += '/exact'

        search_endpoint = f"{self.centree_url}/api/search{endpoint_suffix}"

        try:
            response = self.session.get(
                search_endpoint,
                params=params,
                headers=self.headers,
                timeout=self.timeout,
                # Same SSL setting that was used to authenticate.
                verify=self.verify_request,
            )
            response.raise_for_status()
            self.logger.info("Search request successful")
            return response.json()
        except requests.exceptions.HTTPError as http_err:
            self.logger.error(f"HTTP error occurred: {http_err}")
        except requests.exceptions.RequestException as req_err:
            self.logger.error(f"Request error occurred: {req_err}")
        except Exception as err:
            self.logger.error(f"An error occurred: {err}")
        # Failures are logged above rather than raised.
        return None
@@ -0,0 +1,324 @@
1
+ """
2
+ ____ _ ____ _ _ _____ _ _ _ _
3
+ / ___| ___(_) __ )(_) |_ ___ |_ _|__ ___ | | | _(_) |_
4
+ \___ \ / __| | _ \| | __/ _ \ | |/ _ \ / _ \| | |/ / | __|
5
+ ___) | (__| | |_) | | || __/ | | (_) | (_) | | <| | |_
6
+ |____/ \___|_|____/|_|\__\___| |_|\___/ \___/|_|_|\_\_|\__|
7
+
8
+ Preprocessing functions- using your TERMite output to make AI-ready data
9
+
10
+ """
11
+
12
+ __author__ = 'SciBite'
13
+ __copyright__ = '(c) 2024, SciBite Ltd'
14
+ __license__ = 'Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License'
15
+
16
+ import requests
17
+ import pandas as pd
18
+
19
+
20
class DocStoreRequestBuilder():
    """
    Class for creating DOCStore requests

    Holds the DOCStore base URL, the basic-auth credentials and the SSL
    verification setting; every query method issues a GET request with them
    and returns the decoded JSON response.
    """

    def __init__(self):
        self.url = ""
        self.input_file_path = ''
        self.payload = {"output": "json"}
        self.options = {}
        self.binary_content = None
        self.basic_auth = ()
        self.verify_request = True

    def set_basic_auth(self, username='', password='', verification=True):
        """
        Pass basic authentication credentials
        **ONLY change verification if you are calling a known source**
        :param username: username to be used for basic authentication
        :param password: password to be used for basic authentication
        :param verification: if set to False requests will ignore verifying the SSL certificate, can also pass the path
        to a certificate file
        """
        self.basic_auth = (username, password)
        self.verify_request = verification

    def set_url(self, url):
        """
        Set the URL of the DOCStore instance
        :param url: the URL of the DOCStore instance to be hit (a trailing "/" is stripped)
        """
        self.url = url.rstrip('/')

    @staticmethod
    def _merge_options(options, options_dict):
        """
        Overlay caller-supplied search parameters onto the defaults, in place.
        Only keys already present in the defaults are taken; unknown keys are ignored.
        :param options: default query parameters (updated in place)
        :param options_dict: overrides; None or any object without ``items`` leaves the defaults untouched
        :return: the updated defaults
        """
        if options_dict is None or not hasattr(options_dict, "items"):
            return options
        for key, value in options_dict.items():
            if key in options:
                options[key] = value
        return options

    def _get_json(self, path, options):
        """
        Send a GET request to ``path`` on the configured DOCStore instance and decode the JSON reply.
        :param path: endpoint path appended to the base URL
        :param options: query parameters
        :return: decoded JSON response
        """
        response = requests.get(
            self.url + path,
            params=options,
            auth=self.basic_auth,
            # Honour the setting stored by set_basic_auth().
            verify=self.verify_request,
        )
        return response.json()

    def get_dcc_docs(self, entity_list, source='*', options_dict=None):
        """
        - Document co-occurrence -
        Retrieve document co-occurrence of provided entities
        :param entity_list: list of entities to be searched for
        :param source: name of data source(s) to be searched against
        :param options_dict: search parameters
        :return: results of search in json format
        """
        options = {"fmt": "json",
                   "fields": "*",
                   "terms": " ".join(entity_list),
                   "limit": "10",
                   "from": "0",
                   "facettype": "NONE",
                   "significantTerms": "false",
                   "excludehits": "false",
                   "sortby": "document_date:desc",
                   }
        self._merge_options(options, options_dict)
        return self._get_json("/api/ds/v1/search/co/document/{}/*/*/*".format(source), options)

    def get_boolean_docs(self, query_string, source='*', options_dict=None):
        """
        - Document-level query of Doc Store -
        Document-level query of Doc Store, produced both hit and facet data
        :param query_string: query to be completed
        :param source: name of data source(s) to be searched against
        :param options_dict: search parameters
        :return: results of search in json format
        """
        options = {"fmt": "json",
                   "fields": "*",
                   "query": query_string,
                   "limit": "10",
                   "from": "0",
                   "facettype": "NONE",
                   "significantTerms": "false",
                   "excludehits": "false",
                   "sortby": "document_date:desc",
                   "filters": ""
                   }
        self._merge_options(options, options_dict)
        return self._get_json("/api/ds/v1/search/document/{}/*/*/*".format(source), options)

    def get_docs(self, query_string, source='*', options_dict=None):
        """
        - Document-level query of Doc Store, returning only the documents hit,
        no facet data. -
        The output is TERMite/TEXpress ready
        :param query_string: query to be completed
        :param source: name of data source(s) to be searched against
        :param options_dict: search parameters
        :return: results of search in json format
        """
        options = {"fields": "*",
                   "fmt": "json",
                   "query": query_string,
                   "limit": "10",
                   "from": "0",
                   "sortby": "document_date:desc",
                   "filters": "",
                   "zip": "false",
                   "metaonly": "false"
                   }
        self._merge_options(options, options_dict)
        return self._get_json('/api/ds/v1/search/document/docs/{}/*/*/*'.format(source), options)

    def get_scc_docs(self, entity_list, source='*', options_dict=None):
        """
        - Sentence co-occurrence on entity ids or types, returns documents
        containing sentences fulfilling the co-occurrence. -
        :param entity_list: list of entities to be searched for
        :param source: name of data source(s) to be searched against
        :param options_dict: search parameters
        :return: results of search in json format
        """
        options = {"fmt": "json",
                   "fields": "*",
                   "terms": " ".join(entity_list),
                   "inorder": "false",
                   "slop": "2",
                   "limit": "10",
                   "from": "0",
                   "sortby": "document_date:desc",
                   "zip": "false"}
        self._merge_options(options, options_dict)
        return self._get_json(
            "/api/ds/v1/search/co/sentence/sentencedetail/flat/{}/*/*/*".format(source), options)

    def get_doc_by_id(self, doc_id, fmt='json'):
        """
        Retrieves document by its unique ID
        :param doc_id: unique ID of the document, sent as the "uid" parameter
        :param fmt: response format requested from the server
        :return: results of lookup in json format
        """
        options = {"fmt": fmt,
                   "uid": doc_id}
        return self._get_json("/api/ds/v1/lookup/doc", options)

    def entity_lookup_id(self, syn, entity_type, options_dict=None):
        """
        Lookup IDs for a synonym and type
        :param syn: synonym to look up
        :param entity_type: entity type, sent as the "type" parameter
        :param options_dict: accepted but currently not used by this method
        :return: results of lookup in json format
        """
        options = {"syn": syn,
                   "type": entity_type}
        return self._get_json("/api/entity/v1/lookup/id", options)

    def get_facets_only(self, query_string, facetFilter, source='*', significantTerms=False, options_dict=None):
        """
        Document-level query of Doc Store, returning only the facets
        :param query_string: query to be completed
        :param facetFilter: value sent as the "facetFilter" parameter
        :param source: name of data source(s) to be searched against
        :param significantTerms: sent as the "significantTerms" parameter ("true"/"false")
        :param options_dict: search parameters
        :return: results of search in json format
        """
        options = {"fmt": "json",
                   "fields": "*",
                   "query": query_string,
                   "facetFilter": facetFilter,
                   "limit": "10",
                   "from": "0",
                   "facettype": "BY_TYPE",
                   # Lower-cased string form so both booleans and "true"/"false" strings work.
                   "significantTerms": str(significantTerms).lower(),
                   "excludehits": "false",
                   }
        self._merge_options(options, options_dict)
        return self._get_json('/api/ds/v1/search/document/facets/{}/*/*/*'.format(source), options)
238
+
239
def get_docstore_dcc_df(json):
    """
    Converts document co-occurrence json into a dataframe
    :param json: dcc json; must contain a "hits" list whose entries each have an "id"
    :return: dcc dataframe with columns document_id, document_date, title, authors, citation
    (the columns are present even when there are no hits)
    """
    columns = ["document_id", "document_date", "title", "authors", "citation"]
    df_rows = []

    for h in json["hits"]:
        # Document date: first 10 characters only; blank when missing or not sliceable.
        try:
            doc_date = h["documentDate"][0:10]
        except (KeyError, TypeError):
            doc_date = ""

        # Title: rebuilt from the words of the first highlighted section.
        # A hit without highlighted sections gets an empty title instead of failing.
        try:
            title_words = h["highlightedSections"][0]["titleWords"]
        except (KeyError, IndexError, TypeError):
            title_words = []
        title = " ".join(t["p"].rstrip() for t in title_words)

        df_rows.append({
            "document_id": h["id"],
            "document_date": doc_date,
            "title": title,
            # Authors and citation are optional; default to an empty string.
            "authors": h.get("authors", ""),
            "citation": h.get("citation", ""),
        })

    return pd.DataFrame(df_rows, columns=columns)
291
+
292
+
293
def get_docstore_scc_df(json):
    """
    Converts sentence co-occurrence json into a dataframe
    :param json: scc json; must contain a "hits" list whose entries each have "docId" and "sentence"
    :return: scc dataframe with columns document_id, document_date, scc_sentence
    """
    df_rows = []

    for h in json["hits"]:
        # Document date: first 10 characters only; blank when missing or not sliceable.
        # Only the expected lookup failures are caught, so unrelated errors
        # are no longer silently swallowed.
        try:
            doc_date = h["docDate"][0:10]
        except (KeyError, TypeError):
            doc_date = ""

        df_rows.append({
            "document_id": h["docId"],
            "document_date": doc_date,
            "scc_sentence": h["sentence"],
        })

    return pd.DataFrame(df_rows, columns=["document_id", "document_date", "scc_sentence"])
323
+
324
+