ddi-fw 0.0.93__tar.gz → 0.0.95__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/PKG-INFO +1 -1
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/pyproject.toml +1 -1
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/ml/__init__.py +2 -1
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/ml/ml_helper.py +4 -4
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/ml/model_wrapper.py +1 -1
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/ml/pytorch_wrapper.py +1 -1
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/ml/tensorflow_wrapper.py +32 -22
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/pipeline/multi_pipeline.py +5 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/pipeline/pipeline.py +68 -18
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw.egg-info/PKG-INFO +1 -1
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw.egg-info/SOURCES.txt +1 -7
- ddi_fw-0.0.93/src/ddi_fw/experiments/__init__.py +0 -4
- ddi_fw-0.0.93/src/ddi_fw/experiments/custom_torch_model.py +0 -66
- ddi_fw-0.0.93/src/ddi_fw/experiments/pipeline.py +0 -132
- ddi_fw-0.0.93/src/ddi_fw/experiments/pipeline_ner.py +0 -116
- ddi_fw-0.0.93/src/ddi_fw/experiments/tensorflow_helper.py +0 -284
- ddi_fw-0.0.93/src/ddi_fw/experiments/test.py +0 -61
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/README.md +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/setup.cfg +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/__init__.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/core.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/db_utils.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/base.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/data/event.db +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes_old/test_indexes.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_0.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_1.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_2.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_3.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_4.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_indexes.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/ddi_mdl/readme.md +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/embedding_generator.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/feature_vector_generation.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/idf_helper.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/__init__.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/base.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/df_extraction_cleanxiaoyu50.csv +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/drug_information_del_noDDIxiaoyu50.csv +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/mdf_sa_ddi/mdf-sa-ddi.zip +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/datasets/setup_._py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/drugbank/__init__.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/drugbank/drugbank.xsd +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/drugbank/drugbank_parser.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/drugbank/drugbank_processor.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/drugbank/drugbank_processor_org.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/drugbank/event_extractor.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/langchain/__init__.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/langchain/embeddings.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/langchain/storage.py +0 -0
- {ddi_fw-0.0.93/src/ddi_fw/experiments → ddi_fw-0.0.95/src/ddi_fw/ml}/evaluation_helper.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/ner/__init__.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/ner/mmlrestclient.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/ner/ner.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/pipeline/__init__.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/pipeline/ner_pipeline.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/test/basic_test.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/test/combination_test.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/test/compress_json_test.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/test/date_test.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/test/idf_score.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/test/jaccard_similarity.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/test/mlfow_test.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/test/sklearn-tfidf.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/test/test.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/test/torch_cuda_test.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/test/type_guarding_test.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/utils/__init__.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/utils/enums.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/utils/py7zr_helper.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/utils/utils.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw/utils/zip_helper.py +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw.egg-info/requires.txt +0 -0
- {ddi_fw-0.0.93 → ddi_fw-0.0.95}/src/ddi_fw.egg-info/top_level.txt +0 -0
src/ddi_fw/ml/__init__.py
@@ -1,4 +1,5 @@
 from .ml_helper import MultiModalRunner
 from .model_wrapper import ModelWrapper,Result
 from .tensorflow_wrapper import TFModelWrapper
-from .pytorch_wrapper import PTModelWrapper
+from .pytorch_wrapper import PTModelWrapper
+from .evaluation_helper import evaluate
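The re-export above tracks the relocation of `evaluation_helper.py` from `ddi_fw/experiments/` to `ddi_fw/ml/` (see the SOURCES.txt diff below). A minimal sketch of what this means for downstream imports, assuming nothing beyond the names visible in this diff:

```python
# Pre-0.0.95 import path (the experiments package is removed in this release):
# from ddi_fw.experiments.evaluation_helper import evaluate

# From 0.0.95 on, the helper lives under ddi_fw.ml and is re-exported:
from ddi_fw.ml import evaluate

# Equivalent direct import, as used inside ml_helper.py and tensorflow_wrapper.py:
from ddi_fw.ml.evaluation_helper import Metrics, evaluate
```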
src/ddi_fw/ml/ml_helper.py
@@ -16,7 +16,7 @@ from mlflow.utils.autologging_utils import batch_metrics_logger
 import time
 
 from mlflow.models import infer_signature
-from ddi_fw.experiments.evaluation_helper import Metrics, evaluate
+from ddi_fw.ml.evaluation_helper import Metrics, evaluate
 
 # import tf2onnx
 # import onnx
@@ -24,9 +24,9 @@ from ddi_fw.experiments.evaluation_helper import Metrics, evaluate
 import itertools
 import ddi_fw.utils as utils
 
-tf.random.set_seed(1)
-np.random.seed(2)
-np.set_printoptions(precision=4)
+# tf.random.set_seed(1)
+# np.random.seed(2)
+# np.set_printoptions(precision=4)
 
 class MultiModalRunner:
     # todo model related parameters to config
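Commenting out these module-level calls means that importing `ddi_fw.ml` no longer mutates global RNG state as a side effect. Callers that relied on the old behavior for reproducibility now need to seed explicitly, for example with the same values the module used to hard-code:

```python
import numpy as np
import tensorflow as tf

# Formerly executed at import time by ddi_fw.ml.ml_helper; now the caller's job.
tf.random.set_seed(1)
np.random.seed(2)
np.set_printoptions(precision=4)
```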
src/ddi_fw/ml/tensorflow_wrapper.py
@@ -1,24 +1,19 @@
-from matplotlib import pyplot as plt
 from ddi_fw.ml.model_wrapper import ModelWrapper
 import tensorflow as tf
 from tensorflow import keras
-from keras.models import Model, Sequential
-from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
-from keras.callbacks import EarlyStopping
+from keras.callbacks import EarlyStopping,ModelCheckpoint
 from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
 import numpy as np
 
 import mlflow
 from mlflow.utils.autologging_utils import batch_metrics_logger
-import time
 
 from mlflow.models import infer_signature
-from ddi_fw.experiments.evaluation_helper import Metrics, evaluate
+from ddi_fw.ml.evaluation_helper import Metrics, evaluate
 
 # import tf2onnx
 # import onnx
 
-import itertools
 import ddi_fw.utils as utils
 
 
@@ -44,6 +39,15 @@ class TFModelWrapper(ModelWrapper):
                 X_valid_cv = self.train_data[val_idx]
                 y_valid_cv = self.train_label[val_idx]
 
+                checkpoint = ModelCheckpoint(
+                    filepath=f'{self.descriptor}_validation_{i}.weights.h5',
+                    monitor='val_loss',
+                    save_best_only=True,
+                    save_weights_only=True,
+                    verbose=1,
+                    mode='min'
+                )
+
                 early_stopping = EarlyStopping(
                     monitor='val_loss', patience=10, verbose=0, mode='auto')
                 custom_callback = CustomCallback()
@@ -52,15 +56,21 @@ class TFModelWrapper(ModelWrapper):
                     epochs=self.epochs,
                     validation_data=(
                         X_valid_cv, y_valid_cv),
-                    callbacks=[early_stopping, custom_callback])
+                    callbacks=[early_stopping, checkpoint, custom_callback])
                 # histories[f'validation_{i}'] = history
-                models_val_acc[f'validation_{i}'] = history.history['val_accuracy'][-1]
+                # models_val_acc[f'validation_{i}'] = history.history['val_accuracy'][-1]
+                models_val_acc[f'{self.descriptor}_validation_{i}'] = checkpoint.best
+                models[f'{self.descriptor}_validation_{i}'] = checkpoint.model
+                import os
+                if os.path.exists(f'{self.descriptor}_validation_{i}.weights.h5'):
+                    os.remove(f'{self.descriptor}_validation_{i}.weights.h5')
                 # Saving each CV model
 
             best_model_key = max(models_val_acc, key=models_val_acc.get)
             best_model = models[best_model_key]
-
-
+            # mlflow.tensorflow.log_model(best_model, "model")
+            # best_model.evaluate(self.test_data, self.test_label,
+            #                     callbacks=[custom_callback])
             pred = best_model.predict(self.test_data)
 
             logs, metrics = evaluate(
@@ -68,17 +78,17 @@ class TFModelWrapper(ModelWrapper):
             metrics.format_float()
             mlflow.log_metrics(logs)
             mlflow.log_param('best_cv', best_model_key)
-            signature = infer_signature(
-                self.train_data,
-                # generate_signature_output(model,X_valid_cv)
-                # params=params,
-            )
-
-            mlflow.keras.save_model(
-                best_model,
-                path=run.info.artifact_uri + '/model',
-                signature=signature,
-            )
+            # signature = infer_signature(
+            #     self.train_data,
+            #     # generate_signature_output(model,X_valid_cv)
+            #     # params=params,
+            # )
+
+            # mlflow.keras.save_model(
+            #     best_model,
+            #     path=run.info.artifact_uri + '/model',
+            #     signature=signature,
+            # )
             print(run.info.artifact_uri)
             # todo tf2onnx not compatible with keras > 2.15
             # onnx_model, _ = tf2onnx.convert.from_keras(
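The wrapper now attaches a per-fold `ModelCheckpoint` that writes the best weights (lowest `val_loss`) to a temporary `{descriptor}_validation_{i}.weights.h5` file, records `checkpoint.best` as the fold's score, and deletes the file afterwards. Below is a self-contained sketch of that pattern, not the package's exact code: `build_model`, the data, and the fold split are all placeholder assumptions. Note that after `fit` returns, the in-memory model holds the final-epoch weights, so the sketch reloads the checkpoint file to recover the best epoch; and since `val_loss` is minimized, the best fold is the one with the smallest recorded score.

```python
import os
import numpy as np
from keras import layers, models
from keras.callbacks import EarlyStopping, ModelCheckpoint

def build_model(input_dim: int, n_classes: int) -> models.Sequential:
    # Placeholder architecture; ddi_fw builds its models elsewhere.
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(64, activation="relu"),
        layers.Dense(n_classes, activation="softmax"),
    ])
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
    return model

X = np.random.rand(200, 16).astype("float32")
y = np.random.randint(0, 3, size=200)
folds = [(np.arange(150), np.arange(150, 200))]  # one illustrative CV fold

best_scores, trained = {}, {}
for i, (train_idx, val_idx) in enumerate(folds):
    path = f"demo_validation_{i}.weights.h5"
    checkpoint = ModelCheckpoint(filepath=path, monitor="val_loss",
                                 save_best_only=True, save_weights_only=True,
                                 verbose=1, mode="min")
    early_stopping = EarlyStopping(monitor="val_loss", patience=10, mode="auto")
    model = build_model(16, 3)
    model.fit(X[train_idx], y[train_idx], epochs=5,
              validation_data=(X[val_idx], y[val_idx]),
              callbacks=[early_stopping, checkpoint])
    model.load_weights(path)                          # restore the best epoch's weights
    best_scores[f"validation_{i}"] = checkpoint.best  # lowest val_loss seen
    trained[f"validation_{i}"] = model
    os.remove(path)                                   # mirror the wrapper's cleanup step

best_fold = min(best_scores, key=best_scores.get)     # val_loss: lower is better
best_model = trained[best_fold]
```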
src/ddi_fw/pipeline/multi_pipeline.py
@@ -63,6 +63,7 @@ class MultiPipeline():
        columns = config.get("columns")
        ner_data_file = config.get("ner_data_file")
        ner_threshold = config.get("ner_threshold")
+        column_embedding_configs = config.get("column_embedding_configs")
        vector_db_persist_directory = config.get("vector_db_persist_directory")
        vector_db_collection_name = config.get("vector_db_collection_name")
        embedding_pooling_strategy = get_import(
@@ -93,6 +94,7 @@ class MultiPipeline():
            tracking_uri=tracking_uri,
            dataset_type=dataset_type,
            columns=columns,
+            column_embedding_configs=column_embedding_configs,
            vector_db_persist_directory=vector_db_persist_directory,
            vector_db_collection_name=vector_db_collection_name,
            embedding_pooling_strategy_type=embedding_pooling_strategy,
@@ -126,6 +128,7 @@ class MultiPipeline():
        for config in self.experiments_config['experiments']:
            item = self.__create_pipeline(config)
            self.items.append(item)
+        return self
 
    def run(self):
        for item in self.items:
@@ -134,9 +137,11 @@ class MultiPipeline():
            model_type = item['model_type']
            batch_size = item['batch_size']
            epochs = item['epochs']
+            # It can be moved to build function
            pipeline.build()
            result = pipeline.run(model_type, epochs=epochs, batch_size=batch_size)
            self.pipeline_resuts[item['name']] = result
+        return self
 
    def results(self):
        return self.pipeline_resuts
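With `build()` and `run()` now returning `self`, `MultiPipeline` becomes a fluent interface. A hypothetical usage sketch, assuming a constructor that takes the experiments config (the import path and constructor argument are placeholders, not confirmed by this diff):

```python
from ddi_fw.pipeline.multi_pipeline import MultiPipeline

results = (
    MultiPipeline("experiments.json")  # placeholder constructor argument
    .build()    # creates one Pipeline per entry in experiments_config
    .run()      # builds and runs each pipeline, collecting its result
    .results()  # returns the accumulated result dict (self.pipeline_resuts)
)
```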
src/ddi_fw/pipeline/pipeline.py
@@ -21,6 +21,7 @@ class Pipeline:
                 dataset_type: BaseDataset = None,
                 columns=None,
                 embedding_dict=None,
+                column_embedding_configs=None,
                 vector_db_persist_directory=None,
                 vector_db_collection_name=None,
                 embedding_pooling_strategy_type: PoolingStrategy = None,
@@ -37,6 +38,7 @@ class Pipeline:
        self.dataset_type = dataset_type
        self.columns = columns
        self.embedding_dict = embedding_dict
+        self.column_embedding_configs = column_embedding_configs
        self.vector_db_persist_directory = vector_db_persist_directory
        self.vector_db_collection_name = vector_db_collection_name
        self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
@@ -45,6 +47,39 @@ class Pipeline:
        self.combinations = combinations
        self.model = model
 
+    def __create_or_update_embeddings__(embedding_dict, vector_db_persist_directory, vector_db_collection_name, column):
+        """
+        Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
+
+        Args:
+        - vector_db_persist_directory (str): The path to the directory where the Chroma vector database is stored.
+        - vector_db_collection_name (str): The name of the collection to query.
+        - embedding_dict (dict): The existing dictionary to update with embeddings.
+
+        Returns:
+        - embedding_dict (dict): The updated dictionary where embeddings are grouped by 'type' and 'id'.
+        """
+        if vector_db_persist_directory:
+            # Initialize the Chroma client and get the collection
+            vector_db = chromadb.PersistentClient(
+                path=vector_db_persist_directory)
+            collection = vector_db.get_collection(vector_db_collection_name)
+
+            # Fetch the embeddings and metadata
+            if column == None:
+                dictionary = collection.get(include=['embeddings', 'metadatas'])
+            else:
+                dictionary = collection.get(include=['embeddings', 'metadatas'], where= {"type": {"$eq": f"{column}"}})
+            # Populate the embedding dictionary with embeddings from the vector database
+            for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
+                embedding_dict[metadata["type"]
+                               ][metadata["id"]].append(embedding)
+
+            # return dictionary['embeddings'].shape[1]
+        else:
+            raise ValueError(
+                "Persistent directory for the vector DB is not specified.")
+
    def build(self):
        # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
        kwargs = {"columns": self.columns}
@@ -52,27 +87,42 @@ class Pipeline:
        for k, v in self.ner_threshold.items():
            kwargs[k] = v
        if self.embedding_dict == None:
+            embedding_dict = defaultdict(lambda: defaultdict(list))
            if self.vector_db_persist_directory:
-                self.vector_db = chromadb.PersistentClient(
-                    path=self.vector_db_persist_directory)
-                self.collection = self.vector_db.get_collection(
-                    self.vector_db_collection_name)
-                dictionary = self.collection.get(
-                    include=['embeddings', 'metadatas'])
-
-                embedding_dict = defaultdict(lambda: defaultdict(list))
-
-                for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
-                    embedding_dict[metadata["type"]
-                                   ][metadata["id"]].append(embedding)
-
-                embedding_size = dictionary['embeddings'].shape[1]
+                self.__create_or_update_embeddings__(
+                    embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
+
+            if self.column_embedding_configs:
+                for item in self.column_embedding_configs:
+                    col = item["column"]
+                    col_db_dir = item["vector_db_persist_directory"]
+                    col_db_collection = item["vector_db_collection_name"]
+                    self.__create_or_update_embeddings__(embedding_dict, col_db_dir, col_db_collection, col)
+                    print(f"Embedings of {col} is calculated from {col_db_collection}")
+
+            # if self.embedding_dict == None:
+            #     if self.vector_db_persist_directory:
+            #         self.vector_db = chromadb.PersistentClient(
+            #             path=self.vector_db_persist_directory)
+            #         self.collection = self.vector_db.get_collection(
+            #             self.vector_db_collection_name)
+            #         dictionary = self.collection.get(
+            #             include=['embeddings', 'metadatas'])
+
+            #         embedding_dict = defaultdict(lambda: defaultdict(list))
+
+            #         for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
+            #             embedding_dict[metadata["type"]
+            #                            ][metadata["id"]].append(embedding)
+
+            #         embedding_size = dictionary['embeddings'].shape[1]
        else:
            embedding_dict = self.embedding_dict
-            #TODO make generic
-            embedding_size = list(embedding_dict['all_text'].values())[
-                0][0].shape
-
+        # TODO make generic
+        # embedding_size = list(embedding_dict['all_text'].values())[
+        #     0][0].shape
+        key, value = next(iter(embedding_dict.items()))
+        embedding_size = value[next(iter(value))][0].shape[0]
        pooling_strategy = self.embedding_pooling_strategy_type()
 
        self.ner_df = CTakesNER().load(
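The new `column_embedding_configs` parameter lets each column source its embeddings from a different Chroma store; per-column fetches are filtered with a metadata `where` clause on `type`. An illustrative config and the underlying query it triggers (directory paths and collection names are placeholders; the dict keys match those read in `build()` above):

```python
import chromadb

# One entry per column that has its own vector store.
column_embedding_configs = [
    {
        "column": "indication",
        "vector_db_persist_directory": "embeddings/indication_db",
        "vector_db_collection_name": "indication_embeddings",
    },
    {
        "column": "description",
        "vector_db_persist_directory": "embeddings/description_db",
        "vector_db_collection_name": "description_embeddings",
    },
]

# Equivalent of the per-column branch in __create_or_update_embeddings__:
client = chromadb.PersistentClient(path="embeddings/indication_db")
collection = client.get_collection("indication_embeddings")
subset = collection.get(include=["embeddings", "metadatas"],
                        where={"type": {"$eq": "indication"}})
```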
src/ddi_fw.egg-info/SOURCES.txt
@@ -62,18 +62,12 @@ src/ddi_fw/drugbank/drugbank_parser.py
 src/ddi_fw/drugbank/drugbank_processor.py
 src/ddi_fw/drugbank/drugbank_processor_org.py
 src/ddi_fw/drugbank/event_extractor.py
-src/ddi_fw/experiments/__init__.py
-src/ddi_fw/experiments/custom_torch_model.py
-src/ddi_fw/experiments/evaluation_helper.py
-src/ddi_fw/experiments/pipeline.py
-src/ddi_fw/experiments/pipeline_ner.py
-src/ddi_fw/experiments/tensorflow_helper.py
-src/ddi_fw/experiments/test.py
 src/ddi_fw/langchain/__init__.py
 src/ddi_fw/langchain/embeddings.py
 src/ddi_fw/langchain/sentence_splitter.py
 src/ddi_fw/langchain/storage.py
 src/ddi_fw/ml/__init__.py
+src/ddi_fw/ml/evaluation_helper.py
 src/ddi_fw/ml/ml_helper.py
 src/ddi_fw/ml/model_wrapper.py
 src/ddi_fw/ml/pytorch_wrapper.py
ddi_fw-0.0.93/src/ddi_fw/experiments/custom_torch_model.py (deleted)
@@ -1,66 +0,0 @@
-import torch
-
-class ExtendedTorchModule(torch.nn.Module):
-    def __init__(self,model):
-        super().__init__()
-        self.model = model
-
-    def train(self,dataloader_train, criterion, optimizer, epoch_count = 10):
-        for epoch in range(epoch_count): # loop over the dataset multiple times
-
-            running_loss = 0.0
-            for i, data in enumerate(dataloader_train, 0):
-                # get the inputs; data is a list of [inputs, labels]
-                inputs, labels = data
-
-                # zero the parameter gradients
-                optimizer.zero_grad()
-
-                # forward + backward + optimize
-                outputs = self(inputs)
-                loss = criterion(outputs, labels)
-                loss.backward()
-                optimizer.step()
-
-                # print statistics
-                running_loss += loss.item()
-                if i % 5000 == 4999: # print every 2000 mini-batches
-                    print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 5000:.3f}')
-                    running_loss = 0.0
-        print('Finished Training')
-
-    def forward(self, x):
-        x = x.to(torch.float32)
-        # for f in self.module_list:
-        #     x = f(x)
-        # return x
-        return self.model(x)
-
-    def compute_outputs(self, dataloader_test):
-        output_arr = []
-        with torch.no_grad():
-            for data in dataloader_test:
-                inputs, labels = data
-                # calculate outputs by running inputs through the network
-                outputs = self(inputs)
-                output_arr.append(outputs.numpy())
-
-        # <ipython-input-44-114ac3037693>:54: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:245.)
-        t = torch.tensor(output_arr)
-        return torch.squeeze(t)
-
-    # def compute_accuracy(self, dataloader_test):
-    #     correct = 0
-    #     total = 0
-    #     # since we're not training, we don't need to calculate the gradients for our outputs
-    #     with torch.no_grad():
-    #         for data in dataloader_test:
-    #             inputs, labels = data
-    #             # calculate outputs by running inputs through the network
-    #             outputs = self(inputs)
-    #             # the class with the highest energy is what we choose as prediction
-    #             _, predicted = torch.max(outputs.data, 1)
-    #             total += labels.size(0)
-    #             correct += (predicted == labels).sum().item()
-
-    #         print(f'Accuracy of the network: {100 * correct // total} %')
ddi_fw-0.0.93/src/ddi_fw/experiments/pipeline.py (deleted)
@@ -1,132 +0,0 @@
-import sqlite3
-from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
-from keras.models import Model, Sequential
-from keras.callbacks import EarlyStopping
-from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
-from tensorflow import keras
-from ddi_fw.experiments import TFSingleModal, TFMultiModal
-from ddi_fw.experiments import evaluate
-from sklearn.preprocessing import LabelBinarizer
-import numpy as np
-import pandas as pd
-from ddi_fw.utils import ZipHelper, Py7ZipHelper
-import os
-import chromadb
-from collections import defaultdict
-from langchain_community.vectorstores import Chroma
-from ddi_fw.ner.ner import CTakesNER
-from ddi_fw.langchain.embeddings import PoolingStrategy
-
-from ddi_fw.datasets import BaseDataset, DDIMDLDataset
-
-from ddi_fw.langchain.embeddings import SumPoolingStrategy
-from keras import metrics
-from ddi_fw.experiments.evaluation_helper import evaluate
-
-import mlflow
-
-
-class Experiment:
-    def __init__(self,
-                 experiment_name=None,
-                 experiment_description=None,
-                 experiment_tags=None,
-                 tracking_uri=None,
-                 dataset_type:BaseDataset=None,
-                 columns=None,
-                 embedding_dict = None,
-                 vector_db_persist_directory=None,
-                 vector_db_collection_name=None,
-                 embedding_pooling_strategy_type:PoolingStrategy=None,
-                 ner_data_file=None,
-                 ner_threshold=None,
-                 combinations=None,
-                 model=None):
-
-        self.experiment_name = experiment_name
-        self.experiment_description = experiment_description
-        self.experiment_tags = experiment_tags
-        self.tracking_uri = tracking_uri
-        self.dataset_type = dataset_type
-        self.columns = columns
-        self.embedding_dict = embedding_dict
-        self.vector_db_persist_directory = vector_db_persist_directory
-        self.vector_db_collection_name = vector_db_collection_name
-        self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
-        self.ner_data_file = ner_data_file
-        self.ner_threshold = ner_threshold
-        self.combinations = combinations
-        self.model = model
-
-    def build(self):
-        # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
-        kwargs = {"columns": self.columns}
-        for k, v in self.ner_threshold.items():
-            kwargs[k] = v
-        if self.embedding_dict == None:
-            if self.vector_db_persist_directory:
-                self.vector_db = chromadb.PersistentClient(
-                    path=self.vector_db_persist_directory)
-                self.collection = self.vector_db.get_collection(
-                    self.vector_db_collection_name)
-                dictionary = self.collection.get(include=['embeddings', 'metadatas'])
-
-                embedding_dict = defaultdict(lambda: defaultdict(list))
-
-                for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
-                    embedding_dict[metadata["type"]][metadata["id"]].append(embedding)
-
-                embedding_size = dictionary['embeddings'].shape[1]
-        else:
-            embedding_dict = self.embedding_dict
-            embedding_size = list(embedding_dict['all_text'].values())[0][0].shape
-
-        pooling_strategy = self.embedding_pooling_strategy_type()
-
-        self.ner_df = CTakesNER().load(filename=self.ner_data_file) if self.ner_data_file else None
-
-        self.dataset = self.dataset_type(
-            embedding_dict=embedding_dict,
-            embedding_size=embedding_size,
-            embeddings_pooling_strategy=pooling_strategy,
-            ner_df=self.ner_df, **kwargs)
-
-        X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset.load()
-
-        self.dataframe = self.dataset.dataframe
-        # dataframe.dropna()
-        self.X_train = self.dataset.X_train
-        self.X_test = self.dataset.X_test
-        self.y_train = self.dataset.y_train
-        self.y_test = self.dataset.y_test
-        self.train_idx_arr = self.dataset.train_idx_arr
-        self.val_idx_arr = self.dataset.val_idx_arr
-        # Logic to set up the experiment
-        self.items = self.dataset.produce_inputs()
-
-        unique_classes = pd.unique(self.dataframe['event_category'])
-        event_num = len(unique_classes)
-        # droprate = 0.3
-        vector_size = self.dataset.drugs_df.shape[0]
-
-        print("Building the experiment with the following settings:")
-        print(
-            f"Name: {self.experiment_name}, Dataset: {self.dataset}, Model: {self.model}")
-        # Implement additional build logic as needed
-        return self
-
-    def run(self, model_func, batch_size=128, epochs=100):
-        mlflow.set_tracking_uri(self.tracking_uri)
-
-        if mlflow.get_experiment_by_name(self.experiment_name) == None:
-            mlflow.create_experiment(self.experiment_name)
-            mlflow.set_experiment_tags(self.experiment_tags)
-        mlflow.set_experiment(self.experiment_name)
-
-        y_test_label = self.items[0][4]
-        multi_modal = TFMultiModal(
-            model_func=model_func, batch_size=batch_size, epochs=epochs)  # 100
-        multi_modal.set_data(
-            self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
-        result = multi_modal.predict(self.combinations)
-        return result
ddi_fw-0.0.93/src/ddi_fw/experiments/pipeline_ner.py (deleted)
@@ -1,116 +0,0 @@
-from collections import defaultdict
-from enum import Enum
-import numpy as np
-import pandas as pd
-from ddi_fw.datasets.core import BaseDataset
-from ddi_fw.experiments.tensorflow_helper import TFMultiModal
-from ddi_fw.experiments.pipeline import Experiment
-from typing import Dict, List
-from itertools import product
-
-from ddi_fw.utils.enums import DrugBankTextDataTypes, UMLSCodeTypes
-import mlflow
-from ddi_fw.ner.ner import CTakesNER
-
-def stack(df_column):
-    return np.stack(df_column.values)
-
-
-class NerParameterSearch:
-    def __init__(self,
-                 experiment_name,
-                 experiment_description,
-                 experiment_tags,
-                 tracking_uri,
-                 dataset_type: BaseDataset,
-                 ner_data_file,
-                 columns:list,
-                 umls_code_types: List[UMLSCodeTypes],
-                 text_types=List[DrugBankTextDataTypes],
-                 min_threshold_dict: Dict[str, float] = defaultdict(float),
-                 max_threshold_dict: Dict[str, float] = defaultdict(float),
-                 increase_step=0.5):
-        self.experiment_name = experiment_name
-        self.experiment_description = experiment_description
-        self.experiment_tags = experiment_tags
-        self.tracking_uri = tracking_uri
-
-        self.dataset_type = dataset_type
-        self.ner_data_file = ner_data_file
-        self.columns = columns
-        self.umls_code_types = umls_code_types
-        self.text_types = text_types
-        self.min_threshold_dict = min_threshold_dict
-        self.max_threshold_dict = max_threshold_dict
-        self.increase_step = increase_step
-
-    def build(self):
-        self.datasets = {}
-        self.items = []
-        # columns = ['tui', 'cui', 'entities']
-        if self.umls_code_types is not None and self.text_types is not None:
-            # add checking statements
-            _umls_codes = [t.value[0] for t in self.umls_code_types]
-            _text_types = [t.value[0] for t in self.text_types]
-            _columns = [f'{item[0]}_{item[1]}' for item in product(
-                _umls_codes, _text_types)]
-            self.columns.extend(_columns)
-        print(f'Columns: {self.columns}')
-        self.ner_df = CTakesNER().load(filename=self.ner_data_file) if self.ner_data_file else None
-        for column in self.columns:
-            min_threshold = self.min_threshold_dict[column]
-            max_threshold = self.max_threshold_dict[column]
-            kwargs = {}
-            kwargs['threshold_method'] = 'idf'
-            kwargs['tui_threshold'] = 0
-            kwargs['cui_threshold'] = 0
-            kwargs['entities_threshold'] = 0
-
-            for threshold in np.arange(min_threshold, max_threshold, self.increase_step):
-                print(threshold)
-                if column.startswith('tui'):
-                    kwargs['tui_threshold'] = threshold
-                if column.startswith('cui'):
-                    kwargs['cui_threshold'] = threshold
-                if column.startswith('entities'):
-                    kwargs['entities_threshold'] = threshold
-                dataset = self.dataset_type(
-                    # chemical_property_columns=[],
-                    # embedding_columns=[],
-                    # ner_columns=[column],
-                    columns=[column],
-                    ner_df= self.ner_df,
-                    embedding_size = None,
-                    embedding_dict = None,
-                    embeddings_pooling_strategy = None,
-                    **kwargs)
-
-                # computing train_idx_arr and val_idx_arr once would actually be enough
-                X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = dataset.load()
-                group_items = dataset.produce_inputs()
-                for item in group_items:
-                    # item[0] = f'threshold_{threshold}_{item[0]}'
-                    item[0] = f'threshold_{item[0]}_{threshold}'
-                    self.datasets[item[0]] = dataset.ddis_df
-
-                self.items.extend(group_items)
-        self.y_test_label = self.items[0][4]
-        self.train_idx_arr = train_idx_arr
-        self.val_idx_arr = val_idx_arr
-
-
-    def run(self, model_func, batch_size=128, epochs=100):
-        mlflow.set_tracking_uri(self.tracking_uri)
-
-        if mlflow.get_experiment_by_name(self.experiment_name) == None:
-            mlflow.create_experiment(self.experiment_name)
-            mlflow.set_experiment_tags(self.experiment_tags)
-        mlflow.set_experiment(self.experiment_name)
-
-        y_test_label = self.items[0][4]
-        multi_modal = TFMultiModal(
-            model_func=model_func, batch_size=batch_size, epochs=epochs)  # 100
-        multi_modal.set_data(
-            self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
-        result = multi_modal.predict()
-        return result
|