ddi-fw 0.0.238__py3-none-any.whl → 0.0.239__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,157 +0,0 @@
1
- # # https://caseolap.github.io/docs/drug/drugbank/
2
- # #https://gist.github.com/rosherbal/56461421c69a8a7da775336c95fa62e0
3
-
4
- import os
5
- import zipfile
6
- import xml.etree.ElementTree as ET
7
- from xml.etree.ElementTree import XMLParser, XMLPullParser
8
- import pandas as pd
9
- import xmlschema
10
- import json as json
11
- import sys
12
- import unicodedata
13
- import re
14
- from importlib import resources as impresources
15
- from ddi_fw.utils import ZipHelper
16
-
17
-
18
def slugify(value, allow_unicode=False):
    """Normalize *value* into a URL/file-name friendly slug.

    Adapted from django.utils.text.slugify: lowercases the text, drops
    characters that are not alphanumerics, underscores, or hyphens,
    collapses whitespace and repeated dashes into single dashes, and
    strips leading/trailing dashes and underscores. With
    allow_unicode=False the text is first transliterated to ASCII.
    """
    text = str(value)
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        text = ascii_bytes.decode('ascii')
    cleaned = re.sub(r'[^\w\s-]', '', text.lower())
    return re.sub(r'[-\s]+', '-', cleaned).strip('-_')
34
-
35
-
36
def replace_key(key: str):
    """Clean one key coming from the schema-decoded XML dict.

    Drops a leading '@' (attribute marker), renames the '$' text node
    to "value", and strips the DrugBank XML namespace prefix.
    """
    cleaned = key[1:] if key.startswith('@') else key
    if cleaned == '$':
        return "value"
    if '{http://www.drugbank.ca}' in cleaned:
        return cleaned.replace('{http://www.drugbank.ca}', '')
    return cleaned
44
-
45
-
46
def modify_keys(d):
    """Recursively rewrite the keys of dict *d* in place via replace_key().

    Nested dicts, and dicts found inside list values, are processed the
    same way. Leaf values are kept untouched, except the placeholder key
    "keyToChange" whose value is coerced to int. Returns the mutated dict.
    """
    for key, value in list(d.items()):
        del d[key]
        if isinstance(value, dict):
            d[replace_key(key)] = value
            modify_keys(value)
        elif isinstance(value, list):
            d[replace_key(key)] = value
            for item in value:
                # Only container items are recursed into; scalars stay as-is.
                if isinstance(item, (list, dict)):
                    modify_keys(item)
        else:
            if key == "keyToChange":
                value = int(value)
            d[replace_key(key)] = value
    return d
66
-
67
-
68
class DrugBankParser:
    """Parses the zipped DrugBank XML release into per-drug JSON files.

    The XML is streamed with ElementTree.iterparse so the (very large)
    dump never has to be fully loaded; each top-level <drug> element is
    decoded against the bundled drugbank.xsd schema and dumped as JSON
    named by its primary DrugBank id.
    """

    def __init__(self, zip_file='drugbank.zip', input_path='./drugbank'):
        # zip_file: file name of the DrugBank XML archive inside input_path.
        # input_path: directory holding that archive.
        HERE = input_path
        xsd_file = 'drugbank.xsd'
        # The XSD ships inside the package itself, not in the data directory.
        DRUGBANK_XSD = impresources.files("ddi_fw.drugbank").joinpath("drugbank.xsd").open()
        DRUGBANK_ZIP = HERE + '/' + zip_file
        xsd = xmlschema.XMLSchema(DRUGBANK_XSD)
        # complex_types[1] is used as the decoder for <drug> elements.
        # NOTE(review): index-based lookup — confirm it stays valid if the
        # xsd is ever revised.
        self.drug_type_schema = xsd.complex_types[1]
        self.zf = zipfile.ZipFile(DRUGBANK_ZIP, 'r')

    def parse(self, save_path='./drugbank/drugs', override = False):
        """Stream every <drug> element and write one JSON file per drug.

        save_path: output directory for the generated JSON files.
        override: safety switch — nothing is parsed unless explicitly True.
        """
        if not override:
            print('No parsing process has been executed!!!')
            return

        # Sliding window of the last two 'end' events. A </drug> whose
        # immediately preceding closed element is </transporters> marks a
        # complete top-level drug entry (nested <drug> elements inside
        # drug-interactions close after other tags).
        elements = []
        k = 0  # count of drugs written so far

        for name in self.zf.namelist():
            f = self.zf.open(name)
            previous_element = None
            for event, element in ET.iterparse(f, events=('end',)):
                # Maintain the two-element window of most recent end events.
                if len(elements) == 0:
                    elements.append(element)
                elif len(elements) == 1:
                    elements.append(element)
                elif len(elements) == 2:
                    elements[0] = elements[1]
                    elements[1] = element
                if len(elements) == 2:
                    previous_element = elements[len(elements)-2]
                drug = None
                if previous_element is not None and previous_element.tag == '{http://www.drugbank.ca}transporters' and event == 'end' and element.tag == "{http://www.drugbank.ca}drug":
                    drug = element
                    elements = []

                if drug is None:
                    continue

                # Shadows the zip-entry name on purpose? NOTE(review):
                # 'name' is rebound here; the outer loop variable is no
                # longer needed at this point.
                name = drug.find("{http://www.drugbank.ca}name")

                d_name = None
                if name is not None:
                    d_name = name.text
                    line = name.text  # NOTE(review): unused local, kept as-is

                # Drugs without a <name> are skipped entirely.
                if d_name is None:
                    continue

                k = k + 1

                # If validation='lax' were used, decode() would return a
                # tuple and d[0] would be needed instead.
                d = self.drug_type_schema.decode(drug, validation='strict')
                pretty_dict = modify_keys(d)

                from pathlib import Path

                Path(save_path).mkdir(parents=True, exist_ok=True)

                # Output file is named after the primary DrugBank id.
                primary_id = [
                    id['value'] for id in pretty_dict["drugbank-id"] if id['primary'] == True][0]
                with open(f'{save_path}/{primary_id}.json', 'w', encoding='utf-8') as f:
                    json.dump(pretty_dict, f, ensure_ascii=False, indent=4)

        print("Done")

    def zip_files(self, chunk_size=1000, input_path='./drugbank/drugs', output_path='./drugbank/zips'):
        # Zip the generated JSON files in chunks via the shared ZipHelper.
        zip_helper = ZipHelper()
        zip_helper.zip(zip_prefix='drugs', input_path=input_path,
                       output_path=output_path, chunk_size=chunk_size)
157
-
@@ -1,355 +0,0 @@
1
- import sqlite3
2
- import pandas as pd
3
- import os
4
- import json
5
- import glob
6
- from tqdm import tqdm
7
-
8
- import csv
9
-
10
- from rdkit import Chem
11
- from rdkit.Chem import AllChem
12
- import numpy as np
13
- from ddi_fw.drugbank.event_extractor import EventExtractor
14
- from ddi_fw.ner.ner import CTakesNER
15
-
16
- from ddi_fw.utils import ZipHelper
17
- # from event_extractor import EventExtractor
18
-
19
-
20
def multiline_to_singleline(multiline):
    """Collapse a multi-line string into a single line.

    Each line is stripped of surrounding whitespace and the pieces are
    joined with single spaces. None yields the empty string.
    """
    if multiline is None:
        return ""
    stripped_parts = (part.strip() for part in multiline.splitlines())
    return " ".join(stripped_parts)
24
-
25
- # targets -> target -> polypeptide
26
- # enzymes -> enzyme -> polypeptide
27
- # pathways from KEGG, KEGG ID is obtained from DrugBank
28
- # https://www.genome.jp/dbget-bin/www_bget?drug:D03136
29
- # https://www.kegg.jp/entry/D03136
30
-
31
-
32
class DrugBankProcessor():
    """Turns the per-drug JSON files produced by DrugBankParser into
    drug and drug-drug-interaction (DDI) dataframes, optionally
    persisting them to a SQLite database and zipped pickles.

    Targets/enzymes polypeptides and SMPDB pathway ids are flattened to
    pipe-joined strings; SMILES are fingerprinted with RDKit Morgan
    hashing (881 bits); cTAKES NER outputs (cui/tui/entities) are merged
    in by drugbank_id.
    """

    def mask_interaction(self, drug_1, drug_2, interaction):
        """Replace both drug names in an interaction sentence with 'DRUG'."""
        return interaction.replace(
            drug_1, "DRUG").replace(drug_2, "DRUG")

    def extract_zip_files(self, input_path='zips', output_path='drugs', override=False):
        """Extract the zipped drug JSONs; no-op unless override is True."""
        if override:
            zip_helper = ZipHelper()
            zip_helper.extract(input_path=input_path, output_path=output_path)

    def get_external_identifiers(self, input_path='drugs'):
        """Collect name, drugbank_id and every external-resource id per drug JSON.

        Returns a list of dicts, one per drug file; resource names are
        lowercased with spaces replaced by underscores (column-friendly).
        """
        external_identifier_list = []
        all_json_files = input_path+'/*.json*'

        for filepath in tqdm(glob.glob(all_json_files)):
            with open(filepath, 'r', encoding="utf8") as f:
                data = json.load(f)
                drug_1 = data['name']
                drug_1_id = [d['value']
                             for d in data['drugbank-id'] if d['primary'] == True][0]
                external_identifiers = data['external-identifiers'] if "external-identifiers" in data else None
                external_identifiers_dict = {}
                external_identifiers_dict['name'] = drug_1
                external_identifiers_dict['drugbank_id'] = drug_1_id
                if external_identifiers is not None:
                    # e.g. "KEGG Drug" -> "kegg_drug"
                    for p in external_identifiers['external-identifier']:
                        external_identifiers_dict[p['resource'].lower().replace(
                            " ", "_")] = p['identifier']
                external_identifier_list.append(external_identifiers_dict)
        return external_identifier_list

    def process(self,
                ner_data_path,
                input_path='drugs',
                output_path='output',
                save_as_sql=True,
                db_path=r"./drugbank.db",
                zip_outputs=True):
        """Build self.drugs_df and self.ddis_df from the drug JSON files.

        ner_data_path: path consumed by CTakesNER().load() for the NER frame.
        input_path: directory of per-drug JSON files.
        output_path: where pickles/zips go; if pickles already exist the
            whole build is skipped and load() is called instead.
        save_as_sql / db_path: also persist both frames (plus external
            identifiers) into a SQLite database and zip it.
        zip_outputs: zip the drug/ddi pickle files.
        """
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        ner_df = CTakesNER().load(ner_data_path)
        drugs_pickle_path = output_path+'/drugs.pkl'
        drugs_csv_path = output_path+'/drugs.gzip'
        ddi_pickle_path = output_path + '/ddi.pkl'
        ddi_csv_path = output_path + '/ddi.gzip'

        if not os.path.exists(drugs_pickle_path) or not os.path.exists(ddi_pickle_path):
            drug_rows = []
            all_ddis = []
            external_identifier_list = []
            all_json_files = input_path+'/*.json*'

            for filepath in tqdm(glob.glob(all_json_files)):
                with open(filepath, 'r', encoding="utf8") as f:
                    data = json.load(f)

                    drug_1 = data['name']
                    drug_1_id = [d['value']
                                 for d in data['drugbank-id'] if d['primary'] == True][0]
                    description = multiline_to_singleline(
                        data['description'])
                    if data['drug-interactions'] is not None:
                        # One DDI record per interaction partner; the masked
                        # text replaces both drug names with 'DRUG'.
                        ddi_dict = [{
                            'drug_1_id': drug_1_id,
                            'drug_1': drug_1,
                            'drug_2_id': interaction['drugbank-id']['value'],
                            'drug_2': interaction['name'],
                            'interaction': interaction['description'],
                            'masked_interaction': self.mask_interaction(drug_1, interaction['name'], interaction['description'])}
                            for interaction in data['drug-interactions']['drug-interaction']]
                        all_ddis.extend(ddi_dict)

                    synthesis_reference = data['synthesis-reference']
                    indication = multiline_to_singleline(
                        data['indication'])
                    pharmacodynamics = multiline_to_singleline(
                        data['pharmacodynamics'])
                    mechanism_of_action = multiline_to_singleline(
                        data['mechanism-of-action'])
                    toxicity = multiline_to_singleline(data['toxicity'])
                    metabolism = multiline_to_singleline(
                        data['metabolism'])
                    absorption = multiline_to_singleline(
                        data['absorption'])
                    half_life = multiline_to_singleline(data['half-life'])
                    protein_binding = multiline_to_singleline(
                        data['protein-binding'])
                    route_of_elimination = multiline_to_singleline(
                        data['route-of-elimination'])
                    volume_of_distribution = multiline_to_singleline(
                        data['volume-of-distribution'])
                    clearance = multiline_to_singleline(data['clearance'])

                    food_interactions = data['food-interactions']
                    sequences = data['sequences'] if "sequences" in data else None

                    external_identifiers = data['external-identifiers'] if "external-identifiers" in data else None
                    experimental_properties = data['experimental-properties'] if "experimental-properties" in data else None
                    calculated_properties = data['calculated-properties'] if "calculated-properties" in data else None

                    enzymes_polypeptides = None
                    targets_polypeptides = None
                    pathways = None

                    # targets -> target -> polypeptide (ids only)
                    if data['targets'] is not None:
                        targets_polypeptides = [
                            p['id'] for d in data['targets']['target'] if 'polypeptide' in d for p in d['polypeptide']]

                    # enzymes -> enzyme -> polypeptide (ids only)
                    if data['enzymes'] is not None:
                        enzymes_polypeptides = [
                            p['id'] for d in data['enzymes']['enzyme'] if 'polypeptide' in d for p in d['polypeptide']]

                    if data['pathways'] is not None:
                        pathways = [
                            d['smpdb-id'] for d in data['pathways']['pathway']]

                    if external_identifiers is not None:
                        external_identifiers_dict = dict(
                            [(p['resource'], p['identifier']) for p in external_identifiers['external-identifier']])
                        external_identifiers_dict['drugbank_id'] = drug_1_id
                        external_identifier_list.append(
                            external_identifiers_dict)

                    smiles = None
                    morgan_hashed = None
                    if calculated_properties is not None:
                        calculated_properties_dict = dict(
                            [(p['kind'], p['value']) for p in calculated_properties['property']])
                        smiles = calculated_properties_dict['SMILES'] if 'SMILES' in calculated_properties_dict else None
                        if smiles is not None:
                            try:
                                mol = Chem.MolFromSmiles(smiles)
                                morgan_hashed = AllChem.GetMorganFingerprintAsBitVect(
                                    mol, 2, nBits=881).ToList()
                            except Exception:
                                # Best-effort: unparsable SMILES fall through
                                # to the zero-vector below.
                                print("An exception occurred")
                    if morgan_hashed is None:
                        morgan_hashed = np.zeros(881).tolist()

                    # NER features keyed by drugbank_id; missing rows -> None.
                    tuis_description = ner_df[ner_df['drugbank_id']
                                              == drug_1_id]['tui_description'].values
                    if len(tuis_description) > 0:
                        tuis_description = tuis_description[0]
                    else:
                        tuis_description = None

                    cuis_description = ner_df[ner_df['drugbank_id']
                                              == drug_1_id]['cui_description'].values
                    if len(cuis_description) > 0:
                        cuis_description = cuis_description[0]
                    else:
                        cuis_description = None

                    entities_description = ner_df[ner_df['drugbank_id']
                                                  == drug_1_id]['entities_description'].values
                    if len(entities_description) > 0:
                        entities_description = entities_description[0]
                    else:
                        entities_description = None

                    # Flatten list-valued features to delimited strings so the
                    # frame stays SQL/CSV friendly.
                    row = {'drugbank_id': drug_1_id,
                           'name': drug_1,
                           'description': description,
                           'synthesis_reference': synthesis_reference,
                           'indication': indication,
                           'pharmacodynamics': pharmacodynamics,
                           'mechanism_of_action': mechanism_of_action,
                           'toxicity': toxicity,
                           'metabolism': metabolism,
                           'absorption': absorption,
                           'half_life': half_life,
                           'protein_binding': protein_binding,
                           'route_of_elimination': route_of_elimination,
                           'volume_of_distribution': volume_of_distribution,
                           'clearance': clearance,
                           'smiles': smiles,
                           'smiles_morgan_fingerprint': ','.join(map(str, morgan_hashed)),
                           'enzymes_polypeptides': '|'.join(enzymes_polypeptides) if enzymes_polypeptides is not None else None,
                           'targets_polypeptides': '|'.join(targets_polypeptides) if targets_polypeptides is not None else None,
                           'pathways': '|'.join(pathways) if pathways is not None else None,
                           'tuis_description': '|'.join(tuis_description) if tuis_description is not None else None,
                           'cuis_description': '|'.join(cuis_description) if cuis_description is not None else None,
                           'entities_description': '|'.join(entities_description) if entities_description is not None else None
                           }
                    drug_rows.append(row)

            print(f"Size of drugs {len(drug_rows)}")
            print(f"Size of DDIs {len(all_ddis)}")
            np.set_printoptions(threshold=np.inf)

            # All interactions are already masked, so the extractor only
            # needs the placeholder name.
            drug_names = ['DRUG']
            event_extractor = EventExtractor(drug_names)

            # Known alias normalization before event extraction.
            replace_dict = {'MYO-029': 'Stamulumab'}
            for ddi in tqdm(all_ddis):
                for key, value in replace_dict.items():
                    ddi['masked_interaction'] = ddi['masked_interaction'].replace(
                        key, value)

            self.drugs_df = pd.DataFrame(drug_rows)
            self.ddis_df = pd.DataFrame(all_ddis)

            # Mutable counter so the progress print survives the closure.
            count = [0]

            def fnc2(interaction, count):
                # Derive 'mechanism__action' label from a masked interaction.
                count[0] = count[0] + 1
                if count[0] % 1000 == 0:
                    print(f'{count[0]}/{len(all_ddis)}')
                mechanism, action, drugA, drugB = event_extractor.extract(
                    interaction)
                return mechanism+'__' + action

            self.ddis_df['mechanism_action'] = self.ddis_df['masked_interaction'].apply(
                fnc2, args=(count,))

            zip_helper = ZipHelper()

            if save_as_sql:
                conn = sqlite3.connect(db_path)
                self.drugs_df.to_sql(
                    '_Drugs', conn, if_exists='replace', index=True)
                self.ddis_df.to_sql('_Interactions', conn,
                                    if_exists='replace', index=True)
                ext_id_df = pd.DataFrame.from_records(external_identifier_list)
                ext_id_df.to_sql('_ExternalIdentifiers', conn,
                                 if_exists='replace', index=True)

                zip_helper.zip_single_file(
                    file_path=db_path, output_path=output_path+'/zips', name='db')
                conn.close()

            if zip_outputs:
                zip_helper.zip_single_file(
                    file_path=drugs_pickle_path, output_path=output_path+'/zips', name='drugs-pickle')
                zip_helper.zip_single_file(
                    file_path=ddi_pickle_path, output_path=output_path+'/zips', name='ddi-pickle')

        else:
            print('Output path has processed data, load function is called')
            self.load(output_path)

    def load(self, path):
        """Load both dataframes from pickles under *path*."""
        drugs_pickle_path = path+'/drugs.pkl'
        ddi_pickle_path = path+'/ddi.pkl'
        if os.path.exists(drugs_pickle_path) and os.path.exists(ddi_pickle_path):
            self.drugs_df = pd.read_pickle(drugs_pickle_path)
            self.ddis_df = pd.read_pickle(ddi_pickle_path)
        else:
            print('One of the given paths could not be found')

    def load_from_csv(self, path):
        """Load both dataframes from gzip-compressed CSVs under *path*."""
        drugs_csv_path = path+'/drugs.gzip'
        ddi_csv_path = path+'/ddi.gzip'
        if os.path.exists(drugs_csv_path) and os.path.exists(ddi_csv_path):
            self.drugs_df = pd.read_csv(drugs_csv_path, compression='gzip')
            self.ddis_df = pd.read_csv(ddi_csv_path, compression='gzip')
        else:
            print('One of the given paths could not be found')

    def load2(self, path):
        """Mixed loader: drugs from pickle, DDIs from gzip CSV."""
        drugs_pickle_path = path+'/drugs.pkl'
        ddi_csv_path = path+'/ddi.gzip'
        if os.path.exists(drugs_pickle_path) and os.path.exists(ddi_csv_path):
            self.drugs_df = pd.read_pickle(drugs_pickle_path)
            self.ddis_df = pd.read_csv(ddi_csv_path, compression='gzip')
        else:
            print('One of the given paths could not be found')

    def drugs_as_dataframe(self):
        """Return the drugs dataframe."""
        return self.drugs_df

    def filtered_drugs_as_dataframe(self, drug_ids):
        """Return drugs whose drugbank_id is in *drug_ids*."""
        return self.drugs_df[self.drugs_df['drugbank_id'].isin(drug_ids)]

    def ddis_as_dataframe(self):
        """Return the DDI dataframe."""
        return self.ddis_df

    def filtered_ddis(self, drugs):
        """Return DDIs where both partner names appear in *drugs*.

        Bug fix: the original used `Series in list`, which raises
        ValueError (ambiguous truth value) in pandas; element-wise
        membership requires Series.isin, matching
        filtered_drugs_as_dataframe above.
        """
        ddis_df = self.ddis_df.copy()
        return ddis_df[ddis_df['drug_1'].isin(drugs) &
                       ddis_df['drug_2'].isin(drugs)]