ddi-fw 0.0.149__py3-none-any.whl → 0.0.151__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/__init__.py +1 -1
- ddi_fw/datasets/core.py +147 -341
- ddi_fw/datasets/dataset_splitter.py +39 -0
- ddi_fw/datasets/ddi_mdl/base.py +194 -130
- ddi_fw/datasets/ddi_mdl/debug.log +1 -0
- ddi_fw/datasets/embedding_generator.py +2 -1
- ddi_fw/langchain/embeddings.py +1 -0
- ddi_fw/ml/evaluation_helper.py +47 -178
- ddi_fw/ml/ml_helper.py +125 -81
- ddi_fw/ml/model_wrapper.py +2 -2
- ddi_fw/ml/pytorch_wrapper.py +175 -72
- ddi_fw/ml/tensorflow_wrapper.py +131 -39
- ddi_fw/ner/ner.py +93 -39
- ddi_fw/pipeline/multi_modal_combination_strategy.py +4 -2
- ddi_fw/pipeline/multi_pipeline.py +2 -15
- ddi_fw/pipeline/ner_pipeline.py +15 -6
- ddi_fw/pipeline/pipeline.py +157 -93
- ddi_fw/{test/compress_json_test.py → utils/json_helper.py} +1 -15
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/METADATA +6 -3
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/RECORD +22 -31
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/WHEEL +1 -1
- ddi_fw/test/__init__.py +0 -0
- ddi_fw/test/basic_test.py +0 -15
- ddi_fw/test/combination_test.py +0 -12
- ddi_fw/test/date_test.py +0 -15
- ddi_fw/test/idf_score.py +0 -54
- ddi_fw/test/jaccard_similarity.py +0 -85
- ddi_fw/test/mlfow_test.py +0 -165
- ddi_fw/test/sklearn-tfidf.py +0 -16
- ddi_fw/test/test.py +0 -93
- ddi_fw/test/torch_cuda_test.py +0 -9
- ddi_fw/test/type_guarding_test.py +0 -18
- {ddi_fw-0.0.149.dist-info → ddi_fw-0.0.151.dist-info}/top_level.txt +0 -0
ddi_fw/test/jaccard_similarity.py
DELETED
@@ -1,85 +0,0 @@
-import pandas as pd
-
-# data = {'A': [1, 1, 1, 0, 0],
-#         'B': [0, 1, 1, 1, 0],
-#         'C': [0, 0, 1, 1, 1]}
-
-# df = pd.DataFrame(data)
-
-
-# from scipy.spatial.distance import pdist, squareform
-
-# jaccard_dist = pdist(df.values, metric='jaccard')
-# jaccard_dist_matrix = squareform(jaccard_dist)
-
-# print(jaccard_dist_matrix)
-
-
-# import pandas as pd
-# from scipy.spatial.distance import euclidean, pdist, squareform
-
-
-# def similarity_func(u, v):
-#     return 1/(1+euclidean(u,v))
-
-# DF_var = pd.DataFrame.from_dict({"s1":[1.2,3.4,10.2],"s2":[1.4,3.1,10.7],"s3":[2.1,3.7,11.3],"s4":[1.5,3.2,10.9]})
-# DF_var.index = ["g1","g2","g3"]
-
-# dists = pdist(DF_var, similarity_func)
-# DF_euclid = pd.DataFrame(squareform(dists), columns=DF_var.index, index=DF_var.index)
-
-# print(DF_euclid)
-
-
-from sklearn.metrics import jaccard_score
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-data = [[0, 1, 0], [0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1]]
-
-similarity_matrix = []
-for i in range(len(data)):
-    row = []
-    for j in range(len(data)):
-        row.append(jaccard_score(data[i], data[j]))
-    similarity_matrix.append(row)
-
-sns.heatmap(pd.DataFrame(similarity_matrix), annot=True, cmap="YlGnBu")
-plt.show()
-
-
-# https://stackoverflow.com/questions/35639571/python-pandas-distance-matrix-using-jaccard-similarity
-import pandas as pd
-entries = [
-    {'id':'1', 'category1':'100', 'category2': '0', 'category3':'100'},
-    {'id':'2', 'category1':'100', 'category2': '0', 'category3':'100'},
-    {'id':'3', 'category1':'0', 'category2': '100', 'category3':'100'},
-    {'id':'4', 'category1':'100', 'category2': '100', 'category3':'100'},
-    {'id':'5', 'category1':'100', 'category2': '0', 'category3':'100'}
-]
-df = pd.DataFrame(entries)
-
-from scipy.spatial.distance import squareform
-from scipy.spatial.distance import pdist, jaccard
-
-res = 1 - pdist(df[['category1','category2','category3']], 'jaccard')
-# squareform(res)
-distance = pd.DataFrame(squareform(res), index=df.index, columns= df.index)
-print(distance)
-
-entries2 = [
-    {'id':'1', 'cat':['p1','p2','p3']},
-    {'id':'2', 'cat':['p3','p4','p5']},
-    {'id':'3', 'cat':['p5','p6','p7']},
-]
-df2 = pd.DataFrame(entries2)
-
-c = df2['cat']
-
-y = set()
-
-for x in c:
-    for k in x:
-        y.add(k)
-
-print(y)
ddi_fw/test/mlfow_test.py
DELETED
@@ -1,165 +0,0 @@
-# import mlflow
-# from mlflow import tensorflow
-# mlflow.set_tracking_uri("sqlite:///mlflow.db")
-# mlflow.set_experiment("multi-class-ddi-classification")
-
-
-# def mlp_mlflow_run(
-#     name,
-#     model_params,
-#     train_params,
-#     train_dataset,
-#     val_dataset,
-#     test_dataset,
-#     y_test,
-# ):
-#     with mlflow.start_run(run_name=name):
-#         mlflow.log_params(model_params)
-#         mlflow.log_params(train_params)
-#         mlflow.set_tag("model_name", "MLP")
-
-
-
-
-#         mlflow.log_metric("accuracy", accuracy)
-#         mlflow.tensorflow.log_model(mlp, "tf_models") #folder
-
-import mlflow
-import pandas as pd
-import tensorflow as tf
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import mean_squared_error
-from sklearn.datasets import fetch_california_housing
-import tensorflow_addons as tfa
-
-from tensorflow.keras.callbacks import EarlyStopping
-
-
-from sklearn.preprocessing import StandardScaler
-from sklearn.ensemble import RandomForestRegressor
-import seaborn as sns
-
-from tensorflow.keras.layers import Dense, Dropout
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.losses import MeanSquaredError
-
-
-mlflow.set_tracking_uri("sqlite:///mlflow.db")
-mlflow.set_experiment("income")
-
-
-dset = fetch_california_housing()
-data = dset['data']
-y = dset['target']
-LABEL = dset['target_names'][0]
-
-NUMERIC_FEATURES = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Longitude', 'Latitude']
-FEATURES = NUMERIC_FEATURES
-
-data = pd.DataFrame(data, columns=dset['feature_names'])
-data[LABEL] = y
-
-data.head()
-
-train_data, test_data = train_test_split(data, test_size=0.2)
-print(f"Train dataset shape: {train_data.shape}")
-print(f"Test dataset shape: {test_data.shape}")
-
-
-X_train, X_val = train_test_split(train_data, test_size=0.2)
-
-sc = StandardScaler()
-X_train.loc[:, NUMERIC_FEATURES] = sc.fit_transform(X_train[NUMERIC_FEATURES])
-X_val.loc[:, NUMERIC_FEATURES] = sc.transform(X_val[NUMERIC_FEATURES])
-test_data.loc[:, NUMERIC_FEATURES] = sc.transform(test_data[NUMERIC_FEATURES])
-
-
-def build_mlp(params):
-    mlp = Sequential([
-        Dense(params["layer1_size"], activation=params['activation']),
-        Dropout(params['dropout_rate']),
-        Dense(params["layer2_size"], activation=params['activation']),
-        Dropout(params['dropout_rate']),
-        Dense(params["layer3_size"], activation=params['activation']),
-        Dense(1, activation='relu')
-    ])
-    return mlp
-
-def train_mlp(mlp, train_params, train_dataset, val_dataset):
-    optimizer = tfa.optimizers.AdamW(
-        learning_rate=train_params["learning_rate"],
-        weight_decay=train_params["weight_decay"],
-    )
-    mlp.compile(
-        optimizer=optimizer,
-        loss=MeanSquaredError(name="mse"),
-        metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse")]
-    )
-
-    early = EarlyStopping(
-        monitor="val_loss",
-        mode="min",
-        patience=train_params["early_stop_patience"],
-        restore_best_weights=True,
-    )
-    callback_list = [early]
-
-    hist = mlp.fit(
-        train_dataset,
-        epochs=train_params["num_epochs"],
-        validation_data=val_dataset,
-        callbacks=callback_list,
-    )
-    return mlp
-
-
-def mlp_mlflow_run(
-    name,
-    mlp_params,
-    train_params,
-    train_dataset,
-    val_dataset,
-    test_dataset,
-    y_test,
-):
-    with mlflow.start_run(run_name=name):
-        mlflow.log_params(mlp_params)
-        mlflow.log_params(train_params)
-        mlflow.set_tag("model_name", "MLP")
-        mlp = build_mlp(mlp_params)
-        mlp = train_mlp(mlp, train_params, train_dataset, val_dataset)
-        test_preds = mlp.predict(test_dataset)
-        test_rms = mean_squared_error(
-            y_test, test_preds.ravel(), squared=False
-        )
-        mlflow.log_metric("test_rmse", test_rms)
-        mlflow.tensorflow.log_model(mlp, "tf_models")
-
-
-# To TF Dataset
-mlp_train_ds = tf.data.Dataset.from_tensor_slices((X_train[FEATURES], X_train[LABEL])).batch(512).shuffle(512*4).prefetch(512)
-mlp_val_ds = tf.data.Dataset.from_tensor_slices((X_val[FEATURES], X_val[LABEL])).batch(512).shuffle(512*4).prefetch(512)
-mlp_test_ds = tf.data.Dataset.from_tensor_slices(test_data[FEATURES]).batch(512).prefetch(512)
-
-mlp_params = {
-    "layer1_size": 512,
-    "layer2_size": 128,
-    "layer3_size": 64,
-    "dropout_rate": 0.3,
-    "activation": 'relu'
-
-}
-train_params = dict(
-    learning_rate=0.001, weight_decay=0.00001, early_stop_patience=10, num_epochs=1000
-)
-
-mlp_mlflow_run(
-    "mlp_base",
-    mlp_params,
-    train_params,
-    mlp_train_ds,
-    mlp_val_ds,
-    mlp_test_ds,
-    test_data[LABEL],
-)
ddi_fw/test/sklearn-tfidf.py
DELETED
@@ -1,16 +0,0 @@
-from sklearn.feature_extraction.text import TfidfTransformer
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.pipeline import Pipeline
-corpus = ['this is the first document',
-          'this document is the second document',
-          'and this is the third one',
-          'is this the first document']
-vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
-              'and', 'one']
-pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
-                 ('tfid', TfidfTransformer())]).fit(corpus)
-pipe['count'].transform(corpus).toarray()
-
-pipe['tfid'].idf_
-
-pipe.transform(corpus).shape
ddi_fw/test/test.py
DELETED
@@ -1,93 +0,0 @@
-from rdkit import Chem
-from rdkit.Chem import AllChem
-
-from urllib.request import urlopen
-from urllib.parse import quote
-
-from Bio.KEGG import REST
-
-
-x = REST.kegg_find(database='drug', query='D03136')
-y = x.read()
-print(x)
-def CIRconvert(ids):
-    try:
-        url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
-        ans = urlopen(url).read().decode('utf8')
-        return ans
-    except:
-        return 'Did not work'
-
-# identifiers = ['3-Methylheptane', 'Aspirin', 'Diethylsulfate', 'Diethyl sulfate', '50-78-2', 'Adamant']
-
-# smiles = []
-
-# for ids in identifiers :
-#     smiles.append(CIRconvert(ids))
-#     # print(ids, CIRconvert(ids))
-
-
-# from rdkit.Chem import SaltRemover
-
-# remover = SaltRemover(defnData="[Na+]\\nCC(=O)O", defnFormat=SaltRemover.InputFormat.SMILES)
-# len(remover)
-
-# remover = SaltRemover(defnFormat=SaltRemover.InputFormat.SMILES, defnData="[Cl]")
-# mol = Chem.MolFromSmiles(smiles[0])
-
-# morgan_hashed = AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=881)
-# print(morgan_hashed.ToBitString())
-
-#https://go.drugbank.com/structures/small_molecule_drugs/DB01076.smiles
-
-# targets -> target -> polypeptide
-# enzymes -> enzyme -> polypeptide
-smiles = {'DB001075':'[H][C@]12OC[C@@H](O[N+]([O-])=O)[C@@]1([H])OC[C@@H]2O',
-          'DB001076':'CC(C)C1=C(C(=O)NC2=CC=CC=C2)C(=C(N1CC[C@@H](O)C[C@@H](O)CC(O)=O)C1=CC=C(F)C=C1)C1=CC=CC=C1',
-          'DB001077':'CC(C)C1=C(C(=O)NC2=CC=CC=C2)C(=C(N1CC[C@@H](O)C[C@@H](O)CC(O)=O)C1=CC=C(F)C=C1)C1=CC=CC=C1',
-          }
-morgan_hashed_dict = {}
-# smile = '[H][C@]12OC[C@@H](O[N+]([O-])=O)[C@@]1([H])OC[C@@H]2O'
-# smile = 'CC(C)C1=C(C(=O)NC2=CC=CC=C2)C(=C(N1CC[C@@H](O)C[C@@H](O)CC(O)=O)C1=CC=C(F)C=C1)C1=CC=CC=C1'
-
-for drugbank_id, smile in smiles.items():
-    mol = Chem.MolFromSmiles(smile)
-    morgan_hashed = AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=881)
-    morgan_hashed_dict.update({drugbank_id: morgan_hashed.ToList()})
-    # print(morgan_hashed.ToBitString())
-
-
-import pandas as pd
-df = pd.DataFrame(morgan_hashed_dict.values())
-
-from scipy.spatial.distance import pdist, squareform
-
-jaccard_dist = 1 - pdist(df.values, metric='jaccard')
-jaccard_dist_matrix = squareform(jaccard_dist)
-
-print(jaccard_dist_matrix)
-
-
-
-import numpy as np
-import pandas as pd
-
-# df = pd.DataFrame({'sample':[np.array(range(99999, 99999 + 1000))]})
-df = pd.DataFrame({'sample':[np.random.random_sample((1000,))]})
-
-df['sample'] = df['sample'].apply(lambda x: str(x).replace('\n', ''))
-
-df.to_csv('sample.csv', index=False)
-
-
-from ast import literal_eval
-new_df = pd.read_csv('sample.csv')
-def fnc(x):
-    return np.array(literal_eval(x.replace('[ ', '[').replace(' ', ',')))
-# new_df['array_col'] = new_df['sample'].apply(lambda x: np.array(literal_eval(x.replace('[ ', '[').replace(' ', ','))))
-new_df['array_col'] = new_df['sample'].apply(lambda x: fnc(x))
-
-
-
-print(new_df.loc[0, 'array_col'][0:10])
-
ddi_fw/test/torch_cuda_test.py
DELETED
@@ -1,9 +0,0 @@
-import torch
-print(f'PyTorch version: {torch.__version__}')
-print('*'*10)
-print(f'_CUDA version: ')
-# !nvcc --version
-print('*'*10)
-print(f'CUDNN version: {torch.backends.cudnn.version()}')
-print(f'Available GPU devices: {torch.cuda.device_count()}')
-print(f'Device Name: {torch.cuda.get_device_name()}')
ddi_fw/test/type_guarding_test.py
DELETED
@@ -1,18 +0,0 @@
-# from typing import List
-# from itertools import product
-# from ddi_fw.utils import ZipHelper
-
-# from ddi_fw.utils.enums import DrugBankTextDataTypes, UMLSCodeTypes
-
-# def generate_pairs(umls_code_types: List[UMLSCodeTypes] = None, text_types: List[DrugBankTextDataTypes] = None):
-#     _umls_codes = [t.value[0] for t in umls_code_types]
-#     _text_types = [t.value[0] for t in text_types]
-#     items = [f'{item[0]}_{item[1]}' for item in product(_umls_codes, _text_types)]
-#     print(items)
-
-
-# if __name__ == "__main__":
-#     generate_pairs(umls_code_types=[UMLSCodeTypes.TUI, UMLSCodeTypes.ENTITIES], text_types= [DrugBankTextDataTypes.DESCRIPTION])
-
-
-# # reveal_type(UMLSCodeTypes.ENTITIES) # Revealed type is "Literal[Direction.up]?"