ddi-fw 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
+ # !pip install -U sentence-transformers
+
+ # from transformers import BertTokenizer, BertForPreTraining, BertModel
+ # from sentence_transformers import SentenceTransformer, util
+ import pandas as pd
+ import numpy as np
+ import nltk
+ from nltk import sent_tokenize
+ from tqdm import tqdm
+
+ nltk.download('punkt')
+
+ import os
+
+
+ def check_file_exists(path):
+     return os.path.isdir(path)
+
+
+ def get_model_name_or_local_path(model_local_path, model_name):
+     if check_file_exists(model_local_path):
+         return model_local_path
+     return model_name
+
+
+ import re
+
+
+ def process_text(text):
+     text = re.sub(r"\[L\d*\]", "", text)
+     text = text.replace("[", "")
+     text = text.replace("]", "")
+     return text
+
+
+ from collections import defaultdict
+ from functools import partial
+
+
+ # NOTE: infer the input size from the model
+ def create_embeddings(model, data, column, drop_column=True):
+     # model._modules['1'].get_sentence_embedding_dimension()
+     # shape = (1, model._modules['0'].get_word_embedding_dimension())
+     shape = model._modules['0'].get_word_embedding_dimension()
+     column_embeddings_dict = defaultdict(lambda: np.zeros(shape))
+     for index, row in tqdm(data.iterrows()):
+         text = data[column][index]
+         # fall back to a zero vector in the else branch
+         if text is None or not isinstance(text, str):
+             embeddings = None
+         else:
+             sentences = sent_tokenize(text)
+             embeddings = model.encode(sentences)
+
+         # TODO: try using the embedding value of the most similar drug
+         if embeddings is None or len(embeddings) == 0:  # embeddings may be None
+             sum_of_embeddings = np.zeros(shape)
+         else:
+             sum_of_embeddings = np.sum(embeddings, axis=0)
+         # column_embeddings_dict[row['id']] = sum_of_embeddings.reshape(1, -1)  # 2d
+         column_embeddings_dict[row['id']] = sum_of_embeddings
+         # data.iloc[index][column+'_embedding'] = sum_of_embeddings
+
+     data[column + '_embedding'] = pd.Series(column_embeddings_dict.values())
+     if drop_column:
+         data.drop([column], axis=1, inplace=True)
+     # data[column+'_embedding'] = [column_embeddings_dict[row['name']] for index, row in data.iterrows()]
+     return column_embeddings_dict
+
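
A minimal usage sketch for create_embeddings, assuming the publicly available all-MiniLM-L6-v2 SentenceTransformer checkpoint and a toy frame with 'id' and 'description' columns (both names are illustrative, not part of the package):

    from sentence_transformers import SentenceTransformer
    import pandas as pd

    model = SentenceTransformer('all-MiniLM-L6-v2')  # hypothetical model choice
    df = pd.DataFrame({'id': ['DB0001', 'DB0002'],
                       'description': ['An anticoagulant drug.', None]})
    # returns {id: summed sentence-embedding vector}; the None row falls back to zeros
    id_to_vector = create_embeddings(model, df, 'description', drop_column=False)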
@@ -0,0 +1,105 @@
+ # !pip install -U sentence-transformers
+
+ # from transformers import BertTokenizer, BertForPreTraining, BertModel
+ # from sentence_transformers import SentenceTransformer, util
+ import pandas as pd
+ import numpy as np
+ from nltk import sent_tokenize
+ import torch
+ from tqdm import tqdm
+
+ from collections import defaultdict
+ from functools import partial
+ from abc import ABC, abstractmethod
+ from transformers import AutoModel, AutoTokenizer
+ from sentence_transformers import SentenceTransformer, util
+
+
+ class EmbeddingGenerator(ABC):
+
+     def __init__(self):
+         self.shape = None
+
+     @abstractmethod
+     def generate(self, text):
+         pass
+
+
+ # https://github.com/huggingface/transformers/issues/1791
+ class PretrainedEmbeddingGenerator(EmbeddingGenerator):
+     def __init__(self, model_name, split_text=True):
+         self.model_name = model_name
+         self.model = AutoModel.from_pretrained(model_name)
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.shape = self.model.get_input_embeddings().weight.shape
+         self.split_text = split_text
+
+     def generate(self, text):
+         if self.split_text:
+             sentences = sent_tokenize(text)
+             output_embeddings = None
+             for sentence in sentences:
+                 input_ids = self.tokenizer.encode(sentence, return_tensors='pt', padding=True)
+                 if output_embeddings is None:
+                     output_embeddings = self.model(input_ids).last_hidden_state.mean(dim=1)
+                 else:
+                     output_embeddings += self.model(input_ids).last_hidden_state.mean(dim=1)
+             if output_embeddings is None:
+                 output_embeddings = torch.empty((1, self.model.get_input_embeddings().weight.shape[1]))
+         else:
+             encoded_input = self.tokenizer(text, return_tensors='pt')
+             input_ids = self.tokenizer.encode(text, add_special_tokens=True, max_length=self.tokenizer.model_max_length, return_tensors='pt')
+             # input_ids = encoded_input.input_ids[:self.tokenizer.model_max_length]
+             output_embeddings = self.model(input_ids)
+             # output_embeddings = self.model(**encoded_input)
+             # sentence embedding
+             output_embeddings = output_embeddings.last_hidden_state.mean(dim=1)
+         return torch.flatten(output_embeddings).detach().numpy()
+
+
+ class LLMEmbeddingGenerator(EmbeddingGenerator):
+     pass
+
+
+ class SBertEmbeddingGenerator(PretrainedEmbeddingGenerator):
+     def __init__(self, model_name, split_text=True):
+         self.model = SentenceTransformer(model_name)
+         self.shape = self.model._modules['0'].get_word_embedding_dimension()
+         self.split_text = split_text
+
+     def generate(self, text):
+         if text is None or not isinstance(text, str):
+             embeddings = None
+         else:
+             if self.split_text:
+                 sentences = sent_tokenize(text)
+                 embeddings = self.model.encode(sentences)
+             else:
+                 embeddings = self.model.encode(text)
+         return embeddings
+
+
+ # NOTE: infer the input size from the model
+ def create_embeddings_new(generator: EmbeddingGenerator, data, column, drop_column=True):
+     column_embeddings_dict = defaultdict(lambda: np.zeros(generator.shape))
+     for index, row in tqdm(data.iterrows()):
+         text = data[column][index]
+         embeddings = generator.generate(text)
+
+         # TODO: try using the embedding value of the most similar drug
+         # embeddings may be None
+         if embeddings is None or len(embeddings) == 0:
+             sum_of_embeddings = np.zeros(generator.shape)
+         else:
+             sum_of_embeddings = np.sum(embeddings, axis=0)
+         # column_embeddings_dict[row['id']] = sum_of_embeddings.reshape(1, -1)  # 2d
+         column_embeddings_dict[row['id']] = sum_of_embeddings
+         # data.iloc[index][column+'_embedding'] = sum_of_embeddings
+
+     data[column + '_embedding'] = pd.Series(column_embeddings_dict.values())
+     if drop_column:
+         data.drop([column], axis=1, inplace=True)
+     # data[column+'_embedding'] = [column_embeddings_dict[row['name']] for index, row in data.iterrows()]
+     return column_embeddings_dict
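
A hedged sketch of driving the generator-based rewrite; the model name is an assumption, and SBertEmbeddingGenerator is chosen here because its generate method tolerates None text:

    import pandas as pd

    generator = SBertEmbeddingGenerator('all-MiniLM-L6-v2')  # hypothetical model choice
    df = pd.DataFrame({'id': ['DB0001', 'DB0002'],
                       'description': ['An anticoagulant drug.', 'A beta blocker.']})
    # per-sentence embeddings are summed into a single vector per drug id
    id_to_vector = create_embeddings_new(generator, df, 'description', drop_column=True)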
@@ -0,0 +1,100 @@
+ import numpy as np
+ from scipy.spatial.distance import pdist, squareform
+
+
+ # TODO: use pd.unique
+ def find_distinct_elements(frame):
+     # y = set(pd.unique(frame))
+     y = set()
+     for x in frame:
+         if x is not None:
+             for k in x:
+                 # if type(k) == list:
+                 #     for i in k:
+                 #         y.add(i)
+                 # else:
+                 y.add(k)
+     return y
+
+
+ def find_distinct_elements_count(frame):
+     y = set()
+     for x in frame:
+         if x is not None:
+             y.update(x)
+     return len(y)
+
+
+ class SimilarityMatrixGenerator:
+     def __init__(self):
+         pass
+
+     def create_jaccard_similarity_matrices_ex(self, array):
+         jaccard_sim = 1 - pdist(array, metric='jaccard')
+         jaccard_sim_matrix = squareform(jaccard_sim)
+         return jaccard_sim_matrix
+
+     # https://github.com/YifanDengWHU/DDIMDL/blob/master/DDIMDL.py , def Jaccard(matrix):
+     def create_jaccard_similarity_matrices(self, matrix):
+         matrix = np.mat(matrix)
+         numerator = matrix * matrix.T
+         denominator = np.ones(np.shape(matrix)) * matrix.T + \
+             matrix * np.ones(np.shape(matrix.T)) - matrix * matrix.T
+         matrix = numerator / denominator
+         # nan_to_num returns a copy; assign it so all-zero rows become 0 similarity
+         matrix = np.nan_to_num(matrix, nan=0.0)
+         return matrix
+
+
+ class VectorGenerator:
+     def __init__(self, df):
+         self.df = df
+
+     def generate_feature_vector(self, column):
+         bit_vectors = []
+         feature_indexes = dict()
+         idx = 0
+         count = find_distinct_elements_count(self.df[column])
+         print(f"find_distinct_elements_count finished, size: {count}")
+         for ind in self.df.index:
+             e = self.df[column][ind]
+             # vector = np.zeros(len(sorted_features))
+             vector = np.zeros(count)
+             if e is not None:
+                 for item in e:
+                     if item in feature_indexes:
+                         vector[feature_indexes[item]] = 1
+                     else:
+                         vector[idx] = 1
+                         feature_indexes[item] = idx
+                         idx += 1
+
+             bit_vectors.append(vector)
+         print("array created")
+         return np.array(bit_vectors)
+
+     # def generate_feature_vector(self, column):
+     #     bit_vectors = []
+     #     distinct_feature = find_distinct_elements(self.df[column])
+     #     sorted_features = sorted(distinct_feature)
+     #     for ind in self.df.index:
+     #         e = self.df[column][ind]
+     #         vector = np.zeros(len(sorted_features))
+     #         if e is not None:
+     #             indexes = [i for i, x in enumerate(sorted_features) if x in e]
+     #             np.put(vector, indexes, np.ones(len(indexes)))
+     #         bit_vectors.append(vector)
+     #     return bit_vectors
+
+     # bit_vectors will be an ndarray
+     def generate_feature_vectors(self, columns):
+         vectors = dict()
+         for column in columns:
+             bit_vectors = self.generate_feature_vector(column)
+             vectors[column] = bit_vectors
+         return vectors
+
+
+ # generate feature vector
+ # np.hstack
+
+ # https://www.datasciencelearner.com/how-to-create-an-array-of-bits-in-python/
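
A short sketch of how the two classes compose, assuming cells hold lists of feature tokens (column name illustrative). The matrix form computes |A∩B| / (|A| + |B| - |A∩B|) over the bit rows, so a None row (all zeros) ends up with 0 similarity after the nan cleanup:

    import pandas as pd

    df = pd.DataFrame({'targets': [['P1', 'P2'], ['P2'], None]})
    vectors = VectorGenerator(df).generate_feature_vectors(['targets'])
    sim_gen = SimilarityMatrixGenerator()
    # 3x3 pairwise drug-drug Jaccard similarity over the bit vectors
    sim = sim_gen.create_jaccard_similarity_matrices(vectors['targets'])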
@@ -0,0 +1,71 @@
+ from collections import defaultdict
+ import numpy as np
+ import pandas as pd
+
+
+ def find_distinct_elements(frame):
+     y = set()
+     for x in frame:
+         if x is not None:
+             for k in x:
+                 y.add(k)
+     return y
+
+
+ class IDF:
+     def __init__(self, dataframe, columns):
+         self.dataframe = dataframe
+         self.columns = columns
+
+     def calculate(self):
+         idf_scores = defaultdict(dict)
+         total_document_number = self.dataframe.shape[0]
+         for column in self.columns:
+             score = dict()
+             idf_scores[column] = score
+             for e in self.dataframe[column]:
+                 if e is not None:
+                     for item in e:
+                         if item in score:
+                             score[item] = score[item] + 1
+                         else:
+                             score[item] = 1.0
+             for key, value in score.items():
+                 score[key] = np.log(1.0 * total_document_number / value)
+         self.idf_scores = idf_scores
+
+     def calculate_old(self):
+         self.idf_scores = defaultdict(dict)
+         for column in self.columns:
+             data = self.dataframe[column]
+             self.distinct_items = find_distinct_elements(data)
+             # sorted_distinct_items = sorted(self.distinct_items)
+             total_document_number = data.shape[0]
+             for item in self.distinct_items:
+                 document_freq = data.map(set([item]).issubset).sum()
+                 idf = np.log(total_document_number / document_freq)
+                 self.idf_scores[column][item] = idf
+
+     def to_dataframe(self):
+         return pd.DataFrame.from_dict(self.idf_scores)
+
+
+ # class IDF:
+ #     def __init__(self, data, threshold=0):
+ #         self.data = data
+ #         self.threshold = threshold
+ #         self.distinct_items = find_distinct_elements(data)
+
+ #     def calculate(self):
+ #         self.idf_scores = {}
+ #         sorted_distinct_items = sorted(self.distinct_items)
+ #         total_document_number = self.data.shape[0]
+ #         for item in sorted_distinct_items:
+ #             document_freq = self.data.map(set([item]).issubset).sum()
+ #             idf = np.log(total_document_number / document_freq)
+ #             self.idf_scores[item] = idf
+
+ #     def find_items_over_threshold(self):
+ #         return [k for k, v in self.idf_scores.items() if v > self.threshold]
+
+ #     def filter_dict_by_threshold(self):
+ #         return {k: v for k, v in self.idf_scores.items() if v > self.threshold}
+
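
A minimal sketch of the IDF calculation on a toy frame (column name illustrative); an item that occurs in every row scores log(1) = 0:

    import pandas as pd

    df = pd.DataFrame({'targets': [['P1', 'P2'], ['P1'], ['P1', 'P3']]})
    idf = IDF(df, ['targets'])
    idf.calculate()
    # 'P1' appears in all 3 rows -> log(3/3) = 0; 'P2' and 'P3' -> log(3/1)
    print(idf.to_dataframe())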
@@ -0,0 +1,2 @@
+ from .drugbank_parser import DrugBankParser
+ from .drugbank_processor import DrugBankProcessor
@@ -0,0 +1,154 @@
+ # https://caseolap.github.io/docs/drug/drugbank/
+ # https://gist.github.com/rosherbal/56461421c69a8a7da775336c95fa62e0
+
+ import os
+ import zipfile
+ import xml.etree.ElementTree as ET
+ from xml.etree.ElementTree import XMLParser, XMLPullParser
+ from pathlib import Path
+ import pandas as pd
+ import xmlschema
+ import json
+ import sys
+ import unicodedata
+ import re
+
+ from ddi_fw.utils import ZipHelper
+
+
+ def slugify(value, allow_unicode=False):
+     """
+     Taken from https://github.com/django/django/blob/master/django/utils/text.py
+     Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
+     dashes to single dashes. Remove characters that aren't alphanumerics,
+     underscores, or hyphens. Convert to lowercase. Also strip leading and
+     trailing whitespace, dashes, and underscores.
+     """
+     value = str(value)
+     if allow_unicode:
+         value = unicodedata.normalize('NFKC', value)
+     else:
+         value = unicodedata.normalize('NFKD', value).encode(
+             'ascii', 'ignore').decode('ascii')
+     value = re.sub(r'[^\w\s-]', '', value.lower())
+     return re.sub(r'[-\s]+', '-', value).strip('-_')
+
+
+ def replace_key(key: str):
+     if key.startswith('@'):
+         key = key[1:]
+     if key == '$':
+         key = "value"
+     elif '{http://www.drugbank.ca}' in key:
+         key = key.replace('{http://www.drugbank.ca}', '')
+     return key
+
+
+ def modify_keys(d):
+     for k, v in d.copy().items():
+         if isinstance(v, dict):
+             d.pop(k)
+             d[replace_key(k)] = v
+             modify_keys(v)
+         elif isinstance(v, list):
+             d.pop(k)
+             d[replace_key(k)] = v
+             for i in v:
+                 if isinstance(i, (list, dict)):
+                     modify_keys(i)
+         else:
+             if k == "keyToChange":
+                 v = int(v)
+             d.pop(k)
+             d[replace_key(k)] = v
+     return d
+
+
+ class DrugBankParser:
+     def __init__(self, xsd_file='drugbank.xsd', zip_file='drugbank.zip', input_path='./drugbank'):
+         # sys.path.insert(0, '/content/drive/My Drive/drugbank')
+         # HERE = '/content/drive/My Drive/drugbank'
+         HERE = input_path
+         DRUGBANK_XSD = HERE + '/' + xsd_file
+         DRUGBANK_ZIP = HERE + '/' + zip_file
+         xsd = xmlschema.XMLSchema(DRUGBANK_XSD)
+         self.drug_type_schema = xsd.complex_types[1]
+         self.zf = zipfile.ZipFile(DRUGBANK_ZIP, 'r')
+
+     def parse(self, save_path='./drugbank/drugs', override=False):
+         if not override:
+             print('No parsing process has been executed!!!')
+             return
+
+         elements = []
+         k = 0
+
+         for name in self.zf.namelist():
+             f = self.zf.open(name)
+             # tree = ET.parse(f)
+             # root = tree.getroot()
+             previous_element = None
+             # keep a two-element window so the parent of the current element
+             # can be checked once a top-level drug entry has been fully parsed
+             for event, element in ET.iterparse(f, events=('end',)):
+                 if len(elements) == 0:
+                     elements.append(element)
+                 elif len(elements) == 1:
+                     elements.append(element)
+                 elif len(elements) == 2:
+                     elements[0] = elements[1]
+                     elements[1] = element
+                 if len(elements) == 2:
+                     previous_element = elements[len(elements) - 2]
+                 drug = None
+                 # previous_element = element.find("..")
+                 # a top-level drug ends right after its 'transporters' child closes
+                 if previous_element is not None and previous_element.tag == '{http://www.drugbank.ca}transporters' and event == 'end' and element.tag == "{http://www.drugbank.ca}drug":
+                     drug = element
+                     elements = []
+
+                 if drug is None:
+                     continue
+
+                 name_element = drug.find("{http://www.drugbank.ca}name")
+
+                 d_name = None
+                 if name_element is not None:
+                     d_name = name_element.text
+
+                 if d_name is None:
+                     continue
+
+                 k = k + 1
+
+                 # print(d_name)
+
+                 # if lax is used we have to send d[0] as a parameter
+                 d = self.drug_type_schema.decode(drug, validation='strict')
+                 # pretty_dict = {replace_key(k): v for k, v in d[0].items()}
+                 pretty_dict = modify_keys(d)
+                 # file_name = slugify(d_name)
+
+                 Path(save_path).mkdir(parents=True, exist_ok=True)
+
+                 primary_id = [
+                     id['value'] for id in pretty_dict["drugbank-id"] if id['primary']][0]
+                 with open(f'{save_path}/{primary_id}.json', 'w', encoding='utf-8') as out:
+                     json.dump(pretty_dict, out, ensure_ascii=False, indent=4)
+
+         print("Done")
+
+     def zip_files(self, chunk_size=1000, input_path='./drugbank/drugs', output_path='./drugbank/zips'):
+         zip_helper = ZipHelper()
+         zip_helper.zip(zip_prefix='drugs', input_path=input_path,
+                        output_path=output_path, chunk_size=chunk_size)
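
An end-to-end sketch under the constructor defaults, assuming ./drugbank already holds the DrugBank schema (drugbank.xsd) and the full-database export (drugbank.zip); note that parse is a no-op unless override=True:

    parser = DrugBankParser(input_path='./drugbank')
    parser.parse(save_path='./drugbank/drugs', override=True)  # writes one JSON file per drug
    parser.zip_files(chunk_size=1000)  # bundle the JSON output into chunked zip archives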