ddi-fw 0.0.149__py3-none-any.whl → 0.0.151__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,85 +0,0 @@
- import pandas as pd
-
- # data = {'A': [1, 1, 1, 0, 0],
- #         'B': [0, 1, 1, 1, 0],
- #         'C': [0, 0, 1, 1, 1]}
-
- # df = pd.DataFrame(data)
-
-
- # from scipy.spatial.distance import pdist, squareform
-
- # jaccard_dist = pdist(df.values, metric='jaccard')
- # jaccard_dist_matrix = squareform(jaccard_dist)
-
- # print(jaccard_dist_matrix)
-
-
- # import pandas as pd
- # from scipy.spatial.distance import euclidean, pdist, squareform
-
-
- # def similarity_func(u, v):
- #     return 1/(1+euclidean(u,v))
-
- # DF_var = pd.DataFrame.from_dict({"s1":[1.2,3.4,10.2],"s2":[1.4,3.1,10.7],"s3":[2.1,3.7,11.3],"s4":[1.5,3.2,10.9]})
- # DF_var.index = ["g1","g2","g3"]
-
- # dists = pdist(DF_var, similarity_func)
- # DF_euclid = pd.DataFrame(squareform(dists), columns=DF_var.index, index=DF_var.index)
-
- # print(DF_euclid)
-
-
- from sklearn.metrics import jaccard_score
- import seaborn as sns
- import matplotlib.pyplot as plt
-
- data = [[0, 1, 0], [0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1]]
-
- similarity_matrix = []
- for i in range(len(data)):
-     row = []
-     for j in range(len(data)):
-         row.append(jaccard_score(data[i], data[j]))
-     similarity_matrix.append(row)
-
- sns.heatmap(pd.DataFrame(similarity_matrix), annot=True, cmap="YlGnBu")
- plt.show()
-
-
- # https://stackoverflow.com/questions/35639571/python-pandas-distance-matrix-using-jaccard-similarity
- import pandas as pd
- entries = [
-     {'id':'1', 'category1':'100', 'category2': '0', 'category3':'100'},
-     {'id':'2', 'category1':'100', 'category2': '0', 'category3':'100'},
-     {'id':'3', 'category1':'0', 'category2': '100', 'category3':'100'},
-     {'id':'4', 'category1':'100', 'category2': '100', 'category3':'100'},
-     {'id':'5', 'category1':'100', 'category2': '0', 'category3':'100'}
- ]
- df = pd.DataFrame(entries)
-
- from scipy.spatial.distance import squareform
- from scipy.spatial.distance import pdist, jaccard
-
- res = 1 - pdist(df[['category1','category2','category3']], 'jaccard')
- # squareform(res)
- distance = pd.DataFrame(squareform(res), index=df.index, columns= df.index)
- print(distance)
-
- entries2 = [
-     {'id':'1', 'cat':['p1','p2','p3']},
-     {'id':'2', 'cat':['p3','p4','p5']},
-     {'id':'3', 'cat':['p5','p6','p7']},
- ]
- df2 = pd.DataFrame(entries2)
-
- c = df2['cat']
-
- y = set()
-
- for x in c:
-     for k in x:
-         y.add(k)
-
- print(y)
ddi_fw/test/mlfow_test.py DELETED
@@ -1,165 +0,0 @@
- # import mlflow
- # from mlflow import tensorflow
- # mlflow.set_tracking_uri("sqlite:///mlflow.db")
- # mlflow.set_experiment("multi-class-ddi-classification")
-
-
- # def mlp_mlflow_run(
- #     name,
- #     model_params,
- #     train_params,
- #     train_dataset,
- #     val_dataset,
- #     test_dataset,
- #     y_test,
- # ):
- #     with mlflow.start_run(run_name=name):
- #         mlflow.log_params(model_params)
- #         mlflow.log_params(train_params)
- #         mlflow.set_tag("model_name", "MLP")
-
-
-
-
- #         mlflow.log_metric("accuracy", accuracy)
- #         mlflow.tensorflow.log_model(mlp, "tf_models") #folder
-
- import mlflow
- import pandas as pd
- import tensorflow as tf
- import matplotlib.pyplot as plt
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error
- from sklearn.datasets import fetch_california_housing
- import tensorflow_addons as tfa
-
- from tensorflow.keras.callbacks import EarlyStopping
-
-
- from sklearn.preprocessing import StandardScaler
- from sklearn.ensemble import RandomForestRegressor
- import seaborn as sns
-
- from tensorflow.keras.layers import Dense, Dropout
- from tensorflow.keras.models import Sequential
- from tensorflow.keras.losses import MeanSquaredError
-
-
- mlflow.set_tracking_uri("sqlite:///mlflow.db")
- mlflow.set_experiment("income")
-
-
- dset = fetch_california_housing()
- data = dset['data']
- y = dset['target']
- LABEL = dset['target_names'][0]
-
- NUMERIC_FEATURES = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Longitude', 'Latitude']
- FEATURES = NUMERIC_FEATURES
-
- data = pd.DataFrame(data, columns=dset['feature_names'])
- data[LABEL] = y
-
- data.head()
-
- train_data, test_data = train_test_split(data, test_size=0.2)
- print(f"Train dataset shape: {train_data.shape}")
- print(f"Test dataset shape: {test_data.shape}")
-
-
- X_train, X_val = train_test_split(train_data, test_size=0.2)
-
- sc = StandardScaler()
- X_train.loc[:, NUMERIC_FEATURES] = sc.fit_transform(X_train[NUMERIC_FEATURES])
- X_val.loc[:, NUMERIC_FEATURES] = sc.transform(X_val[NUMERIC_FEATURES])
- test_data.loc[:, NUMERIC_FEATURES] = sc.transform(test_data[NUMERIC_FEATURES])
-
-
- def build_mlp(params):
-     mlp = Sequential([
-         Dense(params["layer1_size"], activation=params['activation']),
-         Dropout(params['dropout_rate']),
-         Dense(params["layer2_size"], activation=params['activation']),
-         Dropout(params['dropout_rate']),
-         Dense(params["layer3_size"], activation=params['activation']),
-         Dense(1, activation='relu')
-     ])
-     return mlp
-
- def train_mlp(mlp, train_params, train_dataset, val_dataset):
-     optimizer = tfa.optimizers.AdamW(
-         learning_rate=train_params["learning_rate"],
-         weight_decay=train_params["weight_decay"],
-     )
-     mlp.compile(
-         optimizer=optimizer,
-         loss=MeanSquaredError(name="mse"),
-         metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse")]
-     )
-
-     early = EarlyStopping(
-         monitor="val_loss",
-         mode="min",
-         patience=train_params["early_stop_patience"],
-         restore_best_weights=True,
-     )
-     callback_list = [early]
-
-     hist = mlp.fit(
-         train_dataset,
-         epochs=train_params["num_epochs"],
-         validation_data=val_dataset,
-         callbacks=callback_list,
-     )
-     return mlp
-
-
- def mlp_mlflow_run(
-     name,
-     mlp_params,
-     train_params,
-     train_dataset,
-     val_dataset,
-     test_dataset,
-     y_test,
- ):
-     with mlflow.start_run(run_name=name):
-         mlflow.log_params(mlp_params)
-         mlflow.log_params(train_params)
-         mlflow.set_tag("model_name", "MLP")
-         mlp = build_mlp(mlp_params)
-         mlp = train_mlp(mlp, train_params, train_dataset, val_dataset)
-         test_preds = mlp.predict(test_dataset)
-         test_rms = mean_squared_error(
-             y_test, test_preds.ravel(), squared=False
-         )
-         mlflow.log_metric("test_rmse", test_rms)
-         mlflow.tensorflow.log_model(mlp, "tf_models")
-
-
- # To TF Dataset
- mlp_train_ds = tf.data.Dataset.from_tensor_slices((X_train[FEATURES], X_train[LABEL])).batch(512).shuffle(512*4).prefetch(512)
- mlp_val_ds = tf.data.Dataset.from_tensor_slices((X_val[FEATURES], X_val[LABEL])).batch(512).shuffle(512*4).prefetch(512)
- mlp_test_ds = tf.data.Dataset.from_tensor_slices(test_data[FEATURES]).batch(512).prefetch(512)
-
- mlp_params = {
-     "layer1_size": 512,
-     "layer2_size": 128,
-     "layer3_size": 64,
-     "dropout_rate": 0.3,
-     "activation": 'relu'
-
- }
- train_params = dict(
-     learning_rate=0.001, weight_decay=0.00001, early_stop_patience=10, num_epochs=1000
- )
-
- mlp_mlflow_run(
-     "mlp_base",
-     mlp_params,
-     train_params,
-     mlp_train_ds,
-     mlp_val_ds,
-     mlp_test_ds,
-     test_data[LABEL],
- )
@@ -1,16 +0,0 @@
- from sklearn.feature_extraction.text import TfidfTransformer
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.pipeline import Pipeline
- corpus = ['this is the first document',
-           'this document is the second document',
-           'and this is the third one',
-           'is this the first document']
- vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
-               'and', 'one']
- pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
-                  ('tfid', TfidfTransformer())]).fit(corpus)
- pipe['count'].transform(corpus).toarray()
-
- pipe['tfid'].idf_
-
- pipe.transform(corpus).shape
ddi_fw/test/test.py DELETED
@@ -1,93 +0,0 @@
- from rdkit import Chem
- from rdkit.Chem import AllChem
-
- from urllib.request import urlopen
- from urllib.parse import quote
-
- from Bio.KEGG import REST
-
-
- x = REST.kegg_find(database='drug', query='D03136')
- y = x.read()
- print(x)
- def CIRconvert(ids):
-     try:
-         url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
-         ans = urlopen(url).read().decode('utf8')
-         return ans
-     except:
-         return 'Did not work'
-
- # identifiers = ['3-Methylheptane', 'Aspirin', 'Diethylsulfate', 'Diethyl sulfate', '50-78-2', 'Adamant']
-
- # smiles = []
-
- # for ids in identifiers :
- #     smiles.append(CIRconvert(ids))
- #     # print(ids, CIRconvert(ids))
-
-
- # from rdkit.Chem import SaltRemover
-
- # remover = SaltRemover(defnData="[Na+]\\nCC(=O)O", defnFormat=SaltRemover.InputFormat.SMILES)
- # len(remover)
-
- # remover = SaltRemover(defnFormat=SaltRemover.InputFormat.SMILES, defnData="[Cl]")
- # mol = Chem.MolFromSmiles(smiles[0])
-
- # morgan_hashed = AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=881)
- # print(morgan_hashed.ToBitString())
-
- #https://go.drugbank.com/structures/small_molecule_drugs/DB01076.smiles
-
- # targets -> target -> polypeptide
- # enzymes -> enzyme -> polypeptide
- smiles = {'DB001075':'[H][C@]12OC[C@@H](O[N+]([O-])=O)[C@@]1([H])OC[C@@H]2O',
-           'DB001076':'CC(C)C1=C(C(=O)NC2=CC=CC=C2)C(=C(N1CC[C@@H](O)C[C@@H](O)CC(O)=O)C1=CC=C(F)C=C1)C1=CC=CC=C1',
-           'DB001077':'CC(C)C1=C(C(=O)NC2=CC=CC=C2)C(=C(N1CC[C@@H](O)C[C@@H](O)CC(O)=O)C1=CC=C(F)C=C1)C1=CC=CC=C1',
-           }
- morgan_hashed_dict = {}
- # smile = '[H][C@]12OC[C@@H](O[N+]([O-])=O)[C@@]1([H])OC[C@@H]2O'
- # smile = 'CC(C)C1=C(C(=O)NC2=CC=CC=C2)C(=C(N1CC[C@@H](O)C[C@@H](O)CC(O)=O)C1=CC=C(F)C=C1)C1=CC=CC=C1'
-
- for drugbank_id, smile in smiles.items():
-     mol = Chem.MolFromSmiles(smile)
-     morgan_hashed = AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=881)
-     morgan_hashed_dict.update({drugbank_id: morgan_hashed.ToList()})
-     # print(morgan_hashed.ToBitString())
-
-
- import pandas as pd
- df = pd.DataFrame(morgan_hashed_dict.values())
-
- from scipy.spatial.distance import pdist, squareform
-
- jaccard_dist = 1 - pdist(df.values, metric='jaccard')
- jaccard_dist_matrix = squareform(jaccard_dist)
-
- print(jaccard_dist_matrix)
-
-
-
- import numpy as np
- import pandas as pd
-
- # df = pd.DataFrame({'sample':[np.array(range(99999, 99999 + 1000))]})
- df = pd.DataFrame({'sample':[np.random.random_sample((1000,))]})
-
- df['sample'] = df['sample'].apply(lambda x: str(x).replace('\n', ''))
-
- df.to_csv('sample.csv', index=False)
-
-
- from ast import literal_eval
- new_df = pd.read_csv('sample.csv')
- def fnc(x):
-     return np.array(literal_eval(x.replace('[ ', '[').replace(' ', ',')))
- # new_df['array_col'] = new_df['sample'].apply(lambda x: np.array(literal_eval(x.replace('[ ', '[').replace(' ', ','))))
- new_df['array_col'] = new_df['sample'].apply(lambda x: fnc(x))
-
-
-
- print(new_df.loc[0, 'array_col'][0:10])
-
@@ -1,9 +0,0 @@
- import torch
- print(f'PyTorch version: {torch.__version__}')
- print('*'*10)
- print(f'_CUDA version: ')
- # !nvcc --version
- print('*'*10)
- print(f'CUDNN version: {torch.backends.cudnn.version()}')
- print(f'Available GPU devices: {torch.cuda.device_count()}')
- print(f'Device Name: {torch.cuda.get_device_name()}')
@@ -1,18 +0,0 @@
- # from typing import List
- # from itertools import product
- # from ddi_fw.utils import ZipHelper
-
- # from ddi_fw.utils.enums import DrugBankTextDataTypes, UMLSCodeTypes
-
- # def generate_pairs(umls_code_types: List[UMLSCodeTypes] = None, text_types: List[DrugBankTextDataTypes] = None):
- #     _umls_codes = [t.value[0] for t in umls_code_types]
- #     _text_types = [t.value[0] for t in text_types]
- #     items = [f'{item[0]}_{item[1]}' for item in product(_umls_codes, _text_types)]
- #     print(items)
-
-
- # if __name__ == "__main__":
- #     generate_pairs(umls_code_types=[UMLSCodeTypes.TUI, UMLSCodeTypes.ENTITIES], text_types= [DrugBankTextDataTypes.DESCRIPTION])
-
-
- # # reveal_type(UMLSCodeTypes.ENTITIES) # Revealed type is "Literal[Direction.up]?"