ddi-fw 0.0.94__py3-none-any.whl → 0.0.96__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/ml/__init__.py +2 -1
- ddi_fw/ml/ml_helper.py +1 -1
- ddi_fw/ml/model_wrapper.py +1 -1
- ddi_fw/ml/pytorch_wrapper.py +1 -1
- ddi_fw/ml/tensorflow_wrapper.py +32 -22
- ddi_fw/pipeline/multi_pipeline.py +5 -0
- ddi_fw/pipeline/pipeline.py +66 -18
- {ddi_fw-0.0.94.dist-info → ddi_fw-0.0.96.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.94.dist-info → ddi_fw-0.0.96.dist-info}/RECORD +12 -18
- ddi_fw/experiments/__init__.py +0 -4
- ddi_fw/experiments/custom_torch_model.py +0 -66
- ddi_fw/experiments/pipeline.py +0 -132
- ddi_fw/experiments/pipeline_ner.py +0 -116
- ddi_fw/experiments/tensorflow_helper.py +0 -284
- ddi_fw/experiments/test.py +0 -61
- /ddi_fw/{experiments → ml}/evaluation_helper.py +0 -0
- {ddi_fw-0.0.94.dist-info → ddi_fw-0.0.96.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.94.dist-info → ddi_fw-0.0.96.dist-info}/top_level.txt +0 -0
ddi_fw/ml/__init__.py
CHANGED
@@ -1,4 +1,5 @@
from .ml_helper import MultiModalRunner
from .model_wrapper import ModelWrapper,Result
from .tensorflow_wrapper import TFModelWrapper
- from .pytorch_wrapper import PTModelWrapper
+ from .pytorch_wrapper import PTModelWrapper
+ from .evaluation_helper import evaluate
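With this release, evaluate is re-exported from the ddi_fw.ml package alongside the model wrappers, replacing the old ddi_fw.experiments location. A minimal usage sketch (the arrays are illustrative placeholders; only the import path and the actual/pred/info call pattern come from this diff):

    import numpy as np
    from ddi_fw.ml import evaluate

    # Placeholder one-hot labels and scores, just to show the call shape used
    # in tensorflow_wrapper.py: evaluate(actual=..., pred=..., info=...).
    y_true = np.array([[0, 1], [1, 0]])
    y_score = np.array([[0.2, 0.8], [0.7, 0.3]])
    logs, metrics = evaluate(actual=y_true, pred=y_score, info="example")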
ddi_fw/ml/ml_helper.py
CHANGED
@@ -16,7 +16,7 @@ from mlflow.utils.autologging_utils import batch_metrics_logger
import time

from mlflow.models import infer_signature
- from ddi_fw.experiments.evaluation_helper import Metrics, evaluate
+ from ddi_fw.ml.evaluation_helper import Metrics, evaluate

# import tf2onnx
# import onnx
ddi_fw/ml/model_wrapper.py
CHANGED
ddi_fw/ml/pytorch_wrapper.py
CHANGED
ddi_fw/ml/tensorflow_wrapper.py
CHANGED
@@ -1,24 +1,19 @@
- from matplotlib import pyplot as plt
from ddi_fw.ml.model_wrapper import ModelWrapper
import tensorflow as tf
from tensorflow import keras
- from keras.models import Model, Sequential
- from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
- from keras.callbacks import EarlyStopping
+ from keras.callbacks import EarlyStopping,ModelCheckpoint
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import numpy as np

import mlflow
from mlflow.utils.autologging_utils import batch_metrics_logger
- import time

from mlflow.models import infer_signature
- from ddi_fw.experiments.evaluation_helper import Metrics, evaluate
+ from ddi_fw.ml.evaluation_helper import Metrics, evaluate

# import tf2onnx
# import onnx

- import itertools
import ddi_fw.utils as utils


@@ -44,6 +39,15 @@ class TFModelWrapper(ModelWrapper):
X_valid_cv = self.train_data[val_idx]
y_valid_cv = self.train_label[val_idx]

+ checkpoint = ModelCheckpoint(
+     filepath=f'{self.descriptor}_validation_{i}.weights.h5',
+     monitor='val_loss',
+     save_best_only=True,
+     save_weights_only=True,
+     verbose=1,
+     mode='min'
+ )
+
early_stopping = EarlyStopping(
    monitor='val_loss', patience=10, verbose=0, mode='auto')
custom_callback = CustomCallback()
@@ -52,15 +56,21 @@ class TFModelWrapper(ModelWrapper):
epochs=self.epochs,
validation_data=(
    X_valid_cv, y_valid_cv),
- callbacks=[early_stopping, custom_callback])
+ callbacks=[early_stopping, checkpoint, custom_callback])
# histories[f'validation_{i}'] = history
- models_val_acc[f'validation_{i}'] = history.history['val_accuracy'][-1]
+ # models_val_acc[f'validation_{i}'] = history.history['val_accuracy'][-1]
+ models_val_acc[f'{self.descriptor}_validation_{i}'] = checkpoint.best
+ models[f'{self.descriptor}_validation_{i}'] = checkpoint.model
+ import os
+ if os.path.exists(f'{self.descriptor}_validation_{i}.weights.h5'):
+     os.remove(f'{self.descriptor}_validation_{i}.weights.h5')
# Saving each CV model

best_model_key = max(models_val_acc, key=models_val_acc.get)
best_model = models[best_model_key]
-
-
+ # mlflow.tensorflow.log_model(best_model, "model")
+ # best_model.evaluate(self.test_data, self.test_label,
+ # callbacks=[custom_callback])
pred = best_model.predict(self.test_data)

logs, metrics = evaluate(
@@ -68,17 +78,17 @@ class TFModelWrapper(ModelWrapper):
metrics.format_float()
mlflow.log_metrics(logs)
mlflow.log_param('best_cv', best_model_key)
- signature = infer_signature(
-     self.train_data,
-     # generate_signature_output(model,X_valid_cv)
-     # params=params,
- )
-
- mlflow.keras.save_model(
-     best_model,
-     path=run.info.artifact_uri + '/model',
-     signature=signature,
- )
+ # signature = infer_signature(
+ #     self.train_data,
+ #     # generate_signature_output(model,X_valid_cv)
+ #     # params=params,
+ # )
+
+ # mlflow.keras.save_model(
+ #     best_model,
+ #     path=run.info.artifact_uri + '/model',
+ #     signature=signature,
+ # )
print(run.info.artifact_uri)
# todo tf2onnx not compatible with keras > 2.15
# onnx_model, _ = tf2onnx.convert.from_keras(
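The substantive change above is that each cross-validation fold now saves its best weights through a Keras ModelCheckpoint (monitoring val_loss, weights only, best only), the fold's score is taken from checkpoint.best rather than the last-epoch val_accuracy, and the temporary .weights.h5 file is removed once the fold finishes. A self-contained sketch of that callback pattern outside ddi_fw (the toy model, data, and file name are assumptions, not taken from this diff):

    import os
    import numpy as np
    from tensorflow import keras
    from keras.callbacks import EarlyStopping, ModelCheckpoint

    # Toy data and model standing in for what ddi_fw builds from its datasets.
    X = np.random.rand(64, 10).astype("float32")
    y = keras.utils.to_categorical(np.random.randint(0, 2, 64), 2)
    model = keras.Sequential([
        keras.layers.Dense(16, activation="relu"),
        keras.layers.Dense(2, activation="softmax"),
    ])
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    weights_path = "fold_0.weights.h5"  # ddi_fw uses f'{descriptor}_validation_{i}.weights.h5'
    checkpoint = ModelCheckpoint(filepath=weights_path, monitor="val_loss",
                                 save_best_only=True, save_weights_only=True, mode="min")
    early_stopping = EarlyStopping(monitor="val_loss", patience=10, mode="auto")

    model.fit(X, y, epochs=5, validation_split=0.2,
              callbacks=[early_stopping, checkpoint], verbose=0)

    print("best val_loss for this fold:", checkpoint.best)
    if os.path.exists(weights_path):  # mirror the per-fold cleanup in tensorflow_wrapper.py
        os.remove(weights_path)

One point worth double-checking in the new wrapper code: checkpoint.best holds the monitored val_loss, where lower is better, while the fold selection still uses max(models_val_acc, key=models_val_acc.get), which was written for val_accuracy.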
ddi_fw/pipeline/multi_pipeline.py
CHANGED
@@ -63,6 +63,7 @@ class MultiPipeline():
columns = config.get("columns")
ner_data_file = config.get("ner_data_file")
ner_threshold = config.get("ner_threshold")
+ column_embedding_configs = config.get("column_embedding_configs")
vector_db_persist_directory = config.get("vector_db_persist_directory")
vector_db_collection_name = config.get("vector_db_collection_name")
embedding_pooling_strategy = get_import(
@@ -93,6 +94,7 @@ class MultiPipeline():
tracking_uri=tracking_uri,
dataset_type=dataset_type,
columns=columns,
+ column_embedding_configs=column_embedding_configs,
vector_db_persist_directory=vector_db_persist_directory,
vector_db_collection_name=vector_db_collection_name,
embedding_pooling_strategy_type=embedding_pooling_strategy,
@@ -126,6 +128,7 @@ class MultiPipeline():
for config in self.experiments_config['experiments']:
    item = self.__create_pipeline(config)
    self.items.append(item)
+ return self

def run(self):
for item in self.items:
@@ -134,9 +137,11 @@ class MultiPipeline():
model_type = item['model_type']
batch_size = item['batch_size']
epochs = item['epochs']
+ # It can be moved to build function
pipeline.build()
result = pipeline.run(model_type, epochs=epochs, batch_size=batch_size)
self.pipeline_resuts[item['name']] = result
+ return self

def results(self):
return self.pipeline_resuts
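MultiPipeline now forwards an optional column_embedding_configs entry from the experiment config into each Pipeline, so individual columns can pull their embeddings from their own vector databases. A hypothetical config fragment follows; only the key names (column, vector_db_persist_directory, vector_db_collection_name) are taken from pipeline.py in this release, the paths and collection names are made up for illustration:

    config = {
        "columns": ["all_text", "description"],
        "column_embedding_configs": [
            {
                "column": "description",
                "vector_db_persist_directory": "./chroma/description_db",  # assumed path
                "vector_db_collection_name": "description_embeddings",     # assumed name
            },
        ],
        # ... plus the other keys read by __create_pipeline (ner_data_file, ner_threshold, etc.)
    }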
ddi_fw/pipeline/pipeline.py
CHANGED
@@ -21,6 +21,7 @@ class Pipeline:
dataset_type: BaseDataset = None,
columns=None,
embedding_dict=None,
+ column_embedding_configs=None,
vector_db_persist_directory=None,
vector_db_collection_name=None,
embedding_pooling_strategy_type: PoolingStrategy = None,
@@ -37,6 +38,7 @@ class Pipeline:
self.dataset_type = dataset_type
self.columns = columns
self.embedding_dict = embedding_dict
+ self.column_embedding_configs = column_embedding_configs
self.vector_db_persist_directory = vector_db_persist_directory
self.vector_db_collection_name = vector_db_collection_name
self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
@@ -45,6 +47,37 @@ class Pipeline:
self.combinations = combinations
self.model = model

+ def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column):
+     """
+     Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
+
+     Args:
+     - vector_db_persist_directory (str): The path to the directory where the Chroma vector database is stored.
+     - vector_db_collection_name (str): The name of the collection to query.
+     - embedding_dict (dict): The existing dictionary to update with embeddings.
+
+     """
+     if vector_db_persist_directory:
+         # Initialize the Chroma client and get the collection
+         vector_db = chromadb.PersistentClient(
+             path=vector_db_persist_directory)
+         collection = vector_db.get_collection(vector_db_collection_name)
+
+         # Fetch the embeddings and metadata
+         if column == None:
+             dictionary = collection.get(include=['embeddings', 'metadatas'])
+         else:
+             dictionary = collection.get(include=['embeddings', 'metadatas'], where= {"type": {"$eq": f"{column}"}})
+         # Populate the embedding dictionary with embeddings from the vector database
+         for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
+             embedding_dict[metadata["type"]
+                            ][metadata["id"]].append(embedding)
+
+         # return dictionary['embeddings'].shape[1]
+     else:
+         raise ValueError(
+             "Persistent directory for the vector DB is not specified.")
+
def build(self):
# 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
kwargs = {"columns": self.columns}
@@ -52,27 +85,42 @@ class Pipeline:
for k, v in self.ner_threshold.items():
    kwargs[k] = v
if self.embedding_dict == None:
+     embedding_dict = defaultdict(lambda: defaultdict(list))
    if self.vector_db_persist_directory:
-         self.vector_db = chromadb.PersistentClient(
-             path=self.vector_db_persist_directory)
-         self.collection = self.vector_db.get_collection(
-             self.vector_db_collection_name)
-         dictionary = self.collection.get(
-             include=['embeddings', 'metadatas'])
-
-         embedding_dict = defaultdict(lambda: defaultdict(list))
-
-         for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
-             embedding_dict[metadata["type"]
-                            ][metadata["id"]].append(embedding)
-
-         embedding_size = dictionary['embeddings'].shape[1]
+         self.__create_or_update_embeddings__(
+             embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
+
+     if self.column_embedding_configs:
+         for item in self.column_embedding_configs:
+             col = item["column"]
+             col_db_dir = item["vector_db_persist_directory"]
+             col_db_collection = item["vector_db_collection_name"]
+             self.__create_or_update_embeddings__(embedding_dict, col_db_dir, col_db_collection, col)
+             print(f"Embedings of {col} is calculated from {col_db_collection}")
+
+     # if self.embedding_dict == None:
+     #     if self.vector_db_persist_directory:
+     #         self.vector_db = chromadb.PersistentClient(
+     #             path=self.vector_db_persist_directory)
+     #         self.collection = self.vector_db.get_collection(
+     #             self.vector_db_collection_name)
+     #         dictionary = self.collection.get(
+     #             include=['embeddings', 'metadatas'])
+
+     #         embedding_dict = defaultdict(lambda: defaultdict(list))
+
+     #         for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
+     #             embedding_dict[metadata["type"]
+     #                            ][metadata["id"]].append(embedding)
+
+     #         embedding_size = dictionary['embeddings'].shape[1]
else:
    embedding_dict = self.embedding_dict
-     #TODO make generic
-     embedding_size = list(embedding_dict['all_text'].values())[
-         0][0].shape
-
+ # TODO make generic
+ # embedding_size = list(embedding_dict['all_text'].values())[
+ #     0][0].shape
+ key, value = next(iter(embedding_dict.items()))
+ embedding_size = value[next(iter(value))][0].shape[0]
pooling_strategy = self.embedding_pooling_strategy_type()

self.ner_df = CTakesNER().load(
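The new __create_or_update_embeddings__ helper centralises the Chroma lookup: it opens a persistent client, optionally filters the collection on the column stored in each record's metadata "type" field, and appends every returned embedding into a nested dict keyed by type and id; the embedding size is then inferred from the first stored vector instead of the previously hard-coded 'all_text' column. A standalone sketch of the same lookup (the directory, collection, and column values are assumptions; the chromadb calls and where-filter shape are the ones used above):

    from collections import defaultdict
    import numpy as np
    import chromadb

    embedding_dict = defaultdict(lambda: defaultdict(list))

    client = chromadb.PersistentClient(path="./chroma/example_db")  # assumed path
    collection = client.get_collection("example_collection")        # assumed collection name

    # Fetch only records tagged with a given column via the metadata filter.
    result = collection.get(include=["embeddings", "metadatas"],
                            where={"type": {"$eq": "description"}})
    for metadata, embedding in zip(result["metadatas"], result["embeddings"]):
        embedding_dict[metadata["type"]][metadata["id"]].append(embedding)

    # As in build(): take the dimensionality from the first stored vector.
    first_type = next(iter(embedding_dict.values()))
    embedding_size = np.asarray(first_type[next(iter(first_type))][0]).shape[0]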
{ddi_fw-0.0.94.dist-info → ddi_fw-0.0.96.dist-info}/RECORD
CHANGED
@@ -55,30 +55,24 @@ ddi_fw/drugbank/drugbank_parser.py,sha256=lxUuhB0s8ef_aPNDs0V8ClKF7-KIWugNIV9gVs
ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9PVncGTXtk,18127
ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
- ddi_fw/experiments/__init__.py,sha256=5L2xSolpFycNnflqOMdvJSiqRB16ExA5bbVGORKFX04,195
- ddi_fw/experiments/custom_torch_model.py,sha256=iQ_R_EApzD2JCcASN8cie6D21oh7VCxaOQ45_dkiGwc,2576
- ddi_fw/experiments/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
- ddi_fw/experiments/pipeline.py,sha256=4ltPCcfLZ1fFpiOd8ahPognI6NLmRLzJvUqyFpn3z18,5693
- ddi_fw/experiments/pipeline_ner.py,sha256=unxEJCYrG6wEZjLmqvGdLRTMOBwELbGKkdygSpAR3b8,5043
- ddi_fw/experiments/tensorflow_helper.py,sha256=m3Mppl-tbccTMAKLpZg2YC0xpcukkyQihPw_uwAlRRY,11857
- ddi_fw/experiments/test.py,sha256=z1TfBpK75zGKpp2ZU8f6APjZlgBFthaCBN61YB9ma4o,2049
ddi_fw/langchain/__init__.py,sha256=8dBPZivc01WWaCH8sZ_UV8-XPyo74e9Qy6-fYgAiNLE,248
ddi_fw/langchain/embeddings.py,sha256=8J_SfO9pyET2W-Ltzq0_r9EchFzBsYdUabiOMma42Us,7515
ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
ddi_fw/langchain/storage.py,sha256=uy5clVB07So2eFbRGdAKzHIPdfEk4se33cPktis7Aa4,2716
- ddi_fw/ml/__init__.py,sha256=…
- ddi_fw/ml/…
- ddi_fw/ml/…
- ddi_fw/ml/…
- ddi_fw/ml/…
+ ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
+ ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
+ ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
+ ddi_fw/ml/model_wrapper.py,sha256=W-bed6NOJxuXs7d3nG6iU8bkUX8LBPFQ0gMjZ7Qc0Sw,1135
+ ddi_fw/ml/pytorch_wrapper.py,sha256=AkG-2sKDXr0IBhgmkbjG0i20OuwQv3mhdvqp6UvJDCA,3716
+ ddi_fw/ml/tensorflow_wrapper.py,sha256=E46lC9qMkM5NvFTL-eOuMcMhEUso5UYfP66Du4BOhfQ,6423
ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6859
ddi_fw/ner/ner.py,sha256=BEs9AFljAxOQrC2BEP1raSzRoypcfELS5UTdl4bjTqw,15863
ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=qIst7vxHaOAhRv4lgozszwa3b1QE4aIrN74t41Xnvr4,1637
- ddi_fw/pipeline/multi_pipeline.py,sha256=…
+ ddi_fw/pipeline/multi_pipeline.py,sha256=t_Z7d7xRfDnhpQTlqCf7c0isZ5hZlyXavKhC7ePsnJY,5903
ddi_fw/pipeline/ner_pipeline.py,sha256=wB7hz4YCOv7UAz6bGE6sSpPXXIdoOflOVK5UCc1fO-o,5586
- ddi_fw/pipeline/pipeline.py,sha256=…
+ ddi_fw/pipeline/pipeline.py,sha256=q7jfTt7ryYa3xBscPtxvanB-j5RzWVZUKir0KmAdTKc,8357
ddi_fw/test/basic_test.py,sha256=fEOGcZm1ObnsDvMiXNmdmz6YCeUrGc8V0DwlSwGhsq8,376
ddi_fw/test/combination_test.py,sha256=TWNE8sf-DSh1Q9-yRaRBc774Sn1kSMGXLwQhd2_Qynk,324
ddi_fw/test/compress_json_test.py,sha256=BGny56YqiG-pzhMoDzLKQBQI1E7o3jU0S7VYWtclAx4,1045
@@ -95,7 +89,7 @@ ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
- ddi_fw-0.0.94.dist-info/METADATA,sha256=…
- ddi_fw-0.0.94.dist-info/WHEEL,sha256=…
- ddi_fw-0.0.94.dist-info/top_level.txt,sha256=…
- ddi_fw-0.0.94.dist-info/RECORD,,
+ ddi_fw-0.0.96.dist-info/METADATA,sha256=b2D7e7ub3byUbTwEPdw6FvUqTEK-H_KflNXwqbk4r7s,1966
+ ddi_fw-0.0.96.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ ddi_fw-0.0.96.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ ddi_fw-0.0.96.dist-info/RECORD,,
ddi_fw/experiments/__init__.py
DELETED
ddi_fw/experiments/custom_torch_model.py
DELETED
@@ -1,66 +0,0 @@
-import torch
-
-class ExtendedTorchModule(torch.nn.Module):
-    def __init__(self,model):
-        super().__init__()
-        self.model = model
-
-    def train(self,dataloader_train, criterion, optimizer, epoch_count = 10):
-        for epoch in range(epoch_count):  # loop over the dataset multiple times
-
-            running_loss = 0.0
-            for i, data in enumerate(dataloader_train, 0):
-                # get the inputs; data is a list of [inputs, labels]
-                inputs, labels = data
-
-                # zero the parameter gradients
-                optimizer.zero_grad()
-
-                # forward + backward + optimize
-                outputs = self(inputs)
-                loss = criterion(outputs, labels)
-                loss.backward()
-                optimizer.step()
-
-                # print statistics
-                running_loss += loss.item()
-                if i % 5000 == 4999:  # print every 2000 mini-batches
-                    print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 5000:.3f}')
-                    running_loss = 0.0
-        print('Finished Training')
-
-    def forward(self, x):
-        x = x.to(torch.float32)
-        # for f in self.module_list:
-        #     x = f(x)
-        # return x
-        return self.model(x)
-
-    def compute_outputs(self, dataloader_test):
-        output_arr = []
-        with torch.no_grad():
-            for data in dataloader_test:
-                inputs, labels = data
-                # calculate outputs by running inputs through the network
-                outputs = self(inputs)
-                output_arr.append(outputs.numpy())
-
-        # <ipython-input-44-114ac3037693>:54: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:245.)
-        t = torch.tensor(output_arr)
-        return torch.squeeze(t)
-
-    # def compute_accuracy(self, dataloader_test):
-    #     correct = 0
-    #     total = 0
-    #     # since we're not training, we don't need to calculate the gradients for our outputs
-    #     with torch.no_grad():
-    #         for data in dataloader_test:
-    #             inputs, labels = data
-    #             # calculate outputs by running inputs through the network
-    #             outputs = self(inputs)
-    #             # the class with the highest energy is what we choose as prediction
-    #             _, predicted = torch.max(outputs.data, 1)
-    #             total += labels.size(0)
-    #             correct += (predicted == labels).sum().item()
-
-    # print(f'Accuracy of the network: {100 * correct // total} %')
ddi_fw/experiments/pipeline.py
DELETED
@@ -1,132 +0,0 @@
-import sqlite3
-from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
-from keras.models import Model, Sequential
-from keras.callbacks import EarlyStopping
-from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
-from tensorflow import keras
-from ddi_fw.experiments import TFSingleModal, TFMultiModal
-from ddi_fw.experiments import evaluate
-from sklearn.preprocessing import LabelBinarizer
-import numpy as np
-import pandas as pd
-from ddi_fw.utils import ZipHelper, Py7ZipHelper
-import os
-import chromadb
-from collections import defaultdict
-from langchain_community.vectorstores import Chroma
-from ddi_fw.ner.ner import CTakesNER
-from ddi_fw.langchain.embeddings import PoolingStrategy
-
-from ddi_fw.datasets import BaseDataset, DDIMDLDataset
-
-from ddi_fw.langchain.embeddings import SumPoolingStrategy
-from keras import metrics
-from ddi_fw.experiments.evaluation_helper import evaluate
-
-import mlflow
-
-
-class Experiment:
-    def __init__(self,
-                 experiment_name=None,
-                 experiment_description=None,
-                 experiment_tags=None,
-                 tracking_uri=None,
-                 dataset_type:BaseDataset=None,
-                 columns=None,
-                 embedding_dict = None,
-                 vector_db_persist_directory=None,
-                 vector_db_collection_name=None,
-                 embedding_pooling_strategy_type:PoolingStrategy=None,
-                 ner_data_file=None,
-                 ner_threshold=None,
-                 combinations=None,
-                 model=None):
-
-        self.experiment_name = experiment_name
-        self.experiment_description = experiment_description
-        self.experiment_tags = experiment_tags
-        self.tracking_uri = tracking_uri
-        self.dataset_type = dataset_type
-        self.columns = columns
-        self.embedding_dict = embedding_dict
-        self.vector_db_persist_directory = vector_db_persist_directory
-        self.vector_db_collection_name = vector_db_collection_name
-        self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
-        self.ner_data_file = ner_data_file
-        self.ner_threshold = ner_threshold
-        self.combinations = combinations
-        self.model = model
-
-    def build(self):
-        # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
-        kwargs = {"columns": self.columns}
-        for k, v in self.ner_threshold.items():
-            kwargs[k] = v
-        if self.embedding_dict == None:
-            if self.vector_db_persist_directory:
-                self.vector_db = chromadb.PersistentClient(
-                    path=self.vector_db_persist_directory)
-                self.collection = self.vector_db.get_collection(
-                    self.vector_db_collection_name)
-                dictionary = self.collection.get(include=['embeddings', 'metadatas'])
-
-                embedding_dict = defaultdict(lambda: defaultdict(list))
-
-                for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
-                    embedding_dict[metadata["type"]][metadata["id"]].append(embedding)
-
-                embedding_size = dictionary['embeddings'].shape[1]
-        else:
-            embedding_dict = self.embedding_dict
-            embedding_size = list(embedding_dict['all_text'].values())[0][0].shape
-
-        pooling_strategy = self.embedding_pooling_strategy_type()
-
-        self.ner_df = CTakesNER().load(filename=self.ner_data_file) if self.ner_data_file else None
-
-        self.dataset = self.dataset_type(
-            embedding_dict=embedding_dict,
-            embedding_size=embedding_size,
-            embeddings_pooling_strategy=pooling_strategy,
-            ner_df=self.ner_df, **kwargs)
-
-        X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset.load()
-
-        self.dataframe = self.dataset.dataframe
-        # dataframe.dropna()
-        self.X_train = self.dataset.X_train
-        self.X_test = self.dataset.X_test
-        self.y_train = self.dataset.y_train
-        self.y_test = self.dataset.y_test
-        self.train_idx_arr = self.dataset.train_idx_arr
-        self.val_idx_arr = self.dataset.val_idx_arr
-        # Logic to set up the experiment
-        self.items = self.dataset.produce_inputs()
-
-        unique_classes = pd.unique(self.dataframe['event_category'])
-        event_num = len(unique_classes)
-        # droprate = 0.3
-        vector_size = self.dataset.drugs_df.shape[0]
-
-        print("Building the experiment with the following settings:")
-        print(
-            f"Name: {self.experiment_name}, Dataset: {self.dataset}, Model: {self.model}")
-        # Implement additional build logic as needed
-        return self
-
-    def run(self, model_func, batch_size=128, epochs=100):
-        mlflow.set_tracking_uri(self.tracking_uri)
-
-        if mlflow.get_experiment_by_name(self.experiment_name) == None:
-            mlflow.create_experiment(self.experiment_name)
-            mlflow.set_experiment_tags(self.experiment_tags)
-        mlflow.set_experiment(self.experiment_name)
-
-        y_test_label = self.items[0][4]
-        multi_modal = TFMultiModal(
-            model_func=model_func, batch_size=batch_size, epochs=epochs)  # 100
-        multi_modal.set_data(
-            self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
-        result = multi_modal.predict(self.combinations)
-        return result
ddi_fw/experiments/pipeline_ner.py
DELETED
@@ -1,116 +0,0 @@
-from collections import defaultdict
-from enum import Enum
-import numpy as np
-import pandas as pd
-from ddi_fw.datasets.core import BaseDataset
-from ddi_fw.experiments.tensorflow_helper import TFMultiModal
-from ddi_fw.experiments.pipeline import Experiment
-from typing import Dict, List
-from itertools import product
-
-from ddi_fw.utils.enums import DrugBankTextDataTypes, UMLSCodeTypes
-import mlflow
-from ddi_fw.ner.ner import CTakesNER
-
-def stack(df_column):
-    return np.stack(df_column.values)
-
-
-class NerParameterSearch:
-    def __init__(self,
-                 experiment_name,
-                 experiment_description,
-                 experiment_tags,
-                 tracking_uri,
-                 dataset_type: BaseDataset,
-                 ner_data_file,
-                 columns:list,
-                 umls_code_types: List[UMLSCodeTypes],
-                 text_types=List[DrugBankTextDataTypes],
-                 min_threshold_dict: Dict[str, float] = defaultdict(float),
-                 max_threshold_dict: Dict[str, float] = defaultdict(float),
-                 increase_step=0.5):
-        self.experiment_name = experiment_name
-        self.experiment_description = experiment_description
-        self.experiment_tags = experiment_tags
-        self.tracking_uri = tracking_uri
-
-        self.dataset_type = dataset_type
-        self.ner_data_file = ner_data_file
-        self.columns = columns
-        self.umls_code_types = umls_code_types
-        self.text_types = text_types
-        self.min_threshold_dict = min_threshold_dict
-        self.max_threshold_dict = max_threshold_dict
-        self.increase_step = increase_step
-
-    def build(self):
-        self.datasets = {}
-        self.items = []
-        # columns = ['tui', 'cui', 'entities']
-        if self.umls_code_types is not None and self.text_types is not None:
-            # add checking statements
-            _umls_codes = [t.value[0] for t in self.umls_code_types]
-            _text_types = [t.value[0] for t in self.text_types]
-            _columns = [f'{item[0]}_{item[1]}' for item in product(
-                _umls_codes, _text_types)]
-            self.columns.extend(_columns)
-            print(f'Columns: {self.columns}')
-        self.ner_df = CTakesNER().load(filename=self.ner_data_file) if self.ner_data_file else None
-        for column in self.columns:
-            min_threshold = self.min_threshold_dict[column]
-            max_threshold = self.max_threshold_dict[column]
-            kwargs = {}
-            kwargs['threshold_method'] = 'idf'
-            kwargs['tui_threshold'] = 0
-            kwargs['cui_threshold'] = 0
-            kwargs['entities_threshold'] = 0
-
-            for threshold in np.arange(min_threshold, max_threshold, self.increase_step):
-                print(threshold)
-                if column.startswith('tui'):
-                    kwargs['tui_threshold'] = threshold
-                if column.startswith('cui'):
-                    kwargs['cui_threshold'] = threshold
-                if column.startswith('entities'):
-                    kwargs['entities_threshold'] = threshold
-                dataset = self.dataset_type(
-                    # chemical_property_columns=[],
-                    # embedding_columns=[],
-                    # ner_columns=[column],
-                    columns=[column],
-                    ner_df= self.ner_df,
-                    embedding_size = None,
-                    embedding_dict = None,
-                    embeddings_pooling_strategy = None,
-                    **kwargs)
-
-                # computing train_idx_arr and val_idx_arr once would actually be enough
-                X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = dataset.load()
-                group_items = dataset.produce_inputs()
-                for item in group_items:
-                    # item[0] = f'threshold_{threshold}_{item[0]}'
-                    item[0] = f'threshold_{item[0]}_{threshold}'
-                    self.datasets[item[0]] = dataset.ddis_df
-
-                self.items.extend(group_items)
-        self.y_test_label = self.items[0][4]
-        self.train_idx_arr = train_idx_arr
-        self.val_idx_arr = val_idx_arr
-
-
-    def run(self, model_func, batch_size=128, epochs=100):
-        mlflow.set_tracking_uri(self.tracking_uri)
-
-        if mlflow.get_experiment_by_name(self.experiment_name) == None:
-            mlflow.create_experiment(self.experiment_name)
-            mlflow.set_experiment_tags(self.experiment_tags)
-        mlflow.set_experiment(self.experiment_name)
-
-        y_test_label = self.items[0][4]
-        multi_modal = TFMultiModal(
-            model_func=model_func, batch_size=batch_size, epochs=epochs)  # 100
-        multi_modal.set_data(
-            self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
-        result = multi_modal.predict()
-        return result
ddi_fw/experiments/tensorflow_helper.py
DELETED
@@ -1,284 +0,0 @@
-from matplotlib import pyplot as plt
-import tensorflow as tf
-from tensorflow import keras
-from keras.models import Model, Sequential
-from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
-from keras.callbacks import EarlyStopping
-from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
-import numpy as np
-
-import mlflow
-from mlflow.utils.autologging_utils import batch_metrics_logger
-import time
-
-from mlflow.models import infer_signature
-from ddi_fw.experiments.evaluation_helper import evaluate
-
-# import tf2onnx
-# import onnx
-
-import itertools
-import ddi_fw.utils as utils
-
-# https://github.com/YifanDengWHU/DDIMDL/blob/master/newTask.py
-# from numpy.random import seed
-# seed(1)
-# from tensorflow import set_random_seed
-# set_random_seed(2)
-tf.random.set_seed(1)
-np.random.seed(2)
-np.set_printoptions(precision=4)
-
-
-class Result:
-    def __init__(self) -> None:
-        self.log_dict = {}
-        self.metric_dict = {}
-
-    def add_log(self, key, logs):
-        self.log_dict[key] = logs
-
-    def add_metric(self, key, metrics):
-        self.metric_dict[key] = metrics
-
-
-class TFMultiModal:
-    # todo model related parameters to config
-    def __init__(self, model_func, batch_size=128, epochs=100):
-        self.model_func = model_func
-        self.batch_size = batch_size
-        self.epochs = epochs
-        self.result = Result()
-
-    def set_data(self, items, train_idx_arr, val_idx_arr, y_test_label):
-        self.items = items
-        self.train_idx_arr = train_idx_arr
-        self.val_idx_arr = val_idx_arr
-        self.y_test_label = y_test_label
-
-    def predict(self, combinations: list = [], generate_combinations=False):
-        self.prefix = utils.utc_time_as_string()
-        self.date = utils.utc_time_as_string_simple_format()
-        sum = np.zeros(
-            (self.y_test_label.shape[0], self.y_test_label.shape[1]))
-        single_results = dict()
-
-        if generate_combinations:
-            l = [item[0] for item in self.items]
-            combinations = []
-            for i in range(2, len(l) + 1):
-                combinations.extend(list(itertools.combinations(l, i)))  # all
-
-        with mlflow.start_run(run_name=self.prefix, description="***") as run:
-            self.level_0_run_id = run.info.run_id
-            for item in self.items:
-                print(item[0])
-                single_modal = TFSingleModal(
-                    self.date, item[0], self.model_func, self.batch_size, self.epochs)
-                single_modal.set_data(
-                    self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
-                logs, metrics, prediction = single_modal.predict()
-                self.result.add_log(item[0], logs)
-                # self.result.add_metric(item[0], metrics)
-                # single_results[item[0]] = prediction
-                single_results[item[0]] = tf.nn.softmax(prediction).numpy()
-                # sum = sum + prediction
-
-            if combinations:
-                self.evaluate_combinations(single_results, combinations)
-        # TODO: sum is not needed
-        return self.result
-
-    def evaluate_combinations(self, single_results, combinations):
-        for combination in combinations:
-            combination_descriptor = '-'.join(combination)
-            with mlflow.start_run(run_name=combination_descriptor, description="***", nested=True) as combination_run:
-                prediction = np.zeros(
-                    (self.y_test_label.shape[0], self.y_test_label.shape[1]))
-                for item in combination:
-                    prediction = prediction + single_results[item]
-                logs, metrics = evaluate(
-                    actual=self.y_test_label, pred=prediction, info=combination_descriptor)
-                mlflow.log_metrics(logs)
-                metrics.format_float()
-                # TODO: we got a "path not found" error here
-                print(
-                    f'combination_artifact_uri:{combination_run.info.artifact_uri}')
-                utils.compress_and_save_data(
-                    metrics.__dict__, combination_run.info.artifact_uri, f'{self.date}_metrics.gzip')
-                # self.result.add_log(combination_descriptor,logs)
-                # self.result.add_metric(combination_descriptor,metrics)
-
-
-class TFSingleModal:
-    def __init__(self, date, descriptor, model_func, batch_size=128, epochs=100):
-        self.date = date
-        self.descriptor = descriptor
-        self.model_func = model_func
-        self.batch_size = batch_size
-        self.epochs = epochs
-
-    def set_data(self, train_idx_arr, val_idx_arr, train_data, train_label, test_data, test_label):
-        self.train_idx_arr = train_idx_arr
-        self.val_idx_arr = val_idx_arr
-        self.train_data = train_data
-        self.train_label = train_label
-        self.test_data = test_data
-        self.test_label = test_label
-
-    # https://github.com/mlflow/mlflow/blob/master/examples/tensorflow/train.py
-    def predict(self):
-        print(self.train_data.shape)
-
-        # Failed to convert a NumPy array to a Tensor
-        with mlflow.start_run(run_name=self.descriptor, description="***", nested=True) as run:
-            models = dict()
-            histories = dict()
-            models_val_acc = dict()
-            # with batch_metrics_logger(run_id) as metrics_logger:
-            for i, (train_idx, val_idx) in enumerate(zip(self.train_idx_arr, self.val_idx_arr)):
-                print(f"Validation {i}")
-
-                with mlflow.start_run(run_name=f'Validation {i}', description='CV models', nested=True) as cv_fit:
-                    model = self.model_func(self.train_data.shape[1])
-                    models[f'validation_{i}'] = model
-                    X_train_cv = self.train_data[train_idx]
-                    y_train_cv = self.train_label[train_idx]
-                    X_valid_cv = self.train_data[val_idx]
-                    y_valid_cv = self.train_label[val_idx]
-
-                    early_stopping = EarlyStopping(
-                        monitor='val_loss', patience=10, verbose=0, mode='auto')
-                    custom_callback = CustomCallback()
-                    history = model.fit(X_train_cv, y_train_cv,
-                                        batch_size=self.batch_size,
-                                        epochs=self.epochs,
-                                        validation_data=(
-                                            X_valid_cv, y_valid_cv),
-                                        callbacks=[early_stopping, custom_callback])
-                    # histories[f'validation_{i}'] = history
-                    models_val_acc[f'validation_{i}'] = history.history['val_accuracy'][-1]
-                    # Saving each CV model
-
-            best_model_key = max(models_val_acc, key=models_val_acc.get)
-            best_model = models[best_model_key]
-            best_model.evaluate(self.test_data, self.test_label,
-                                callbacks=[custom_callback])
-            pred = best_model.predict(self.test_data)
-
-            logs, metrics = evaluate(
-                actual=self.test_label, pred=pred, info=self.descriptor)
-            metrics.format_float()
-            mlflow.log_metrics(logs)
-            mlflow.log_param('best_cv', best_model_key)
-            signature = infer_signature(
-                self.train_data,
-                # generate_signature_output(model,X_valid_cv)
-                # params=params,
-            )
-
-            mlflow.keras.save_model(
-                best_model,
-                path=run.info.artifact_uri + '/model',
-                signature=signature,
-            )
-            print(run.info.artifact_uri)
-            # todo tf2onnx not compatible with keras > 2.15
-            # onnx_model, _ = tf2onnx.convert.from_keras(
-            #     best_model, input_signature=None, opset=13)
-            # onnx.save(onnx_model, run.info.artifact_uri +
-            #           '/model/model.onnx')
-            utils.compress_and_save_data(
-                metrics.__dict__, run.info.artifact_uri, f'{self.date}_metrics.gzip')
-
-            return logs, metrics, pred
-
-
-class CustomCallback(keras.callbacks.Callback):
-    def on_train_begin(self, logs=None):
-        keys = list(logs.keys())
-        mlflow.log_param("train_begin_keys", keys)
-        config = self.model.optimizer.get_config()
-        for attribute in config:
-            mlflow.log_param("opt_" + attribute, config[attribute])
-
-        sum_list = []
-        self.model.summary(print_fn=sum_list.append)
-        summary = "\n".join(sum_list)
-        mlflow.log_text(summary, artifact_file="model_summary.txt")
-
-    def on_train_end(self, logs=None):
-        print(logs)
-        mlflow.log_metrics(logs)
-
-    def on_epoch_begin(self, epoch, logs=None):
-        keys = list(logs.keys())
-
-    def on_epoch_end(self, epoch, logs=None):
-        keys = list(logs.keys())
-
-    def on_test_begin(self, logs=None):
-        keys = list(logs.keys())
-
-    def on_test_end(self, logs=None):
-        mlflow.log_metrics(logs)
-        print(logs)
-
-    def on_predict_begin(self, logs=None):
-        keys = list(logs.keys())
-
-    def on_predict_end(self, logs=None):
-        keys = list(logs.keys())
-        mlflow.log_metrics(logs)
-
-    def on_train_batch_begin(self, batch, logs=None):
-        keys = list(logs.keys())
-
-    def on_train_batch_end(self, batch, logs=None):
-        keys = list(logs.keys())
-
-    def on_test_batch_begin(self, batch, logs=None):
-        keys = list(logs.keys())
-
-    def on_test_batch_end(self, batch, logs=None):
-        keys = list(logs.keys())
-
-    def on_predict_batch_begin(self, batch, logs=None):
-        keys = list(logs.keys())
-
-    def on_predict_batch_end(self, batch, logs=None):
-        keys = list(logs.keys())
-    # def on_train_begin(self, logs=None):  # pylint: disable=unused-argument
-    #     config = self.model.optimizer.get_config()
-    #     for attribute in config:
-    #         mlflow.log_param("opt_" + attribute, config[attribute])
-
-    #     sum_list = []
-    #     self.model.summary(print_fn=sum_list.append)
-    #     summary = "\n".join(sum_list)
-    #     mlflow.log_text(summary, artifact_file="model_summary.txt")
-
-    # def on_epoch_end(self, epoch, logs=None):
-    #     # NB: tf.Keras uses zero-indexing for epochs, while other TensorFlow Estimator
-    #     # APIs (e.g., tf.Estimator) use one-indexing. Accordingly, the modular arithmetic
-    #     # used here is slightly different from the arithmetic used in `_log_event`, which
-    #     # provides metric logging hooks for TensorFlow Estimator & other TensorFlow APIs
-    #     if epoch % self.log_every_n_steps == 0:
-    #         self.metrics_logger.record_metrics(logs, epoch)
-
-    # def predict(self):
-    #     model = self.model_func()
-    #     # Failed to convert a NumPy array to a Tensor
-    #     for i, (train_idx, val_idx) in enumerate(zip(self.train_idx_arr, self.val_idx_arr)):
-    #         print(f"Validation {i}")
-    #         X_train_cv = self.train_data[train_idx]
-    #         y_train_cv = self.train_label[train_idx]
-    #         X_valid_cv = self.train_data[val_idx]
-    #         y_valid_cv = self.train_label[val_idx]
-
-    #         early_stopping = EarlyStopping(
-    #             monitor='val_loss', patience=10, verbose=0, mode='auto')
-    #         model.fit(X_train_cv, y_train_cv, batch_size=128, epochs=20, validation_data=(X_valid_cv, y_valid_cv),
-    #                   callbacks=[early_stopping])
-    #         pred = model.predict(self.test_data)
-    #         return pred
ddi_fw/experiments/test.py
DELETED
@@ -1,61 +0,0 @@
-# # https://github.com/kashif/tf-keras-tutorial/blob/tf2/3-imdb.ipynb
-# # TensorFlow and tf.keras
-# import tensorflow as tf
-
-# # Helper libraries
-# import numpy as np
-# import matplotlib.pyplot as plt
-# from tensorflow_helper import CustomCallback
-
-# print(tf.__version__)
-
-
-# imdb = tf.keras.datasets.imdb
-
-# (train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data(num_words=10000)
-
-
-# class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
-#                'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
-
-
-# # Create a model
-# model = tf.keras.Sequential()
-# custom_callback = CustomCallback()
-
-# # input shape here is the length of our movie review vector
-# model.add(tf.keras.layers.Dense(16, activation=tf.nn.relu, input_shape=(10000,)))
-# model.add(tf.keras.layers.Dense(16, activation=tf.nn.relu))
-# model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))
-
-# optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
-
-# model.compile(loss='binary_crossentropy',
-#               optimizer=optimizer,
-#               metrics=['binary_accuracy'])
-
-# model.summary()
-
-# VAL_SIZE = 10000
-# x = np.array(train_data[:VAL_SIZE].tolist())
-
-# val_data = np.asarray(train_data[:VAL_SIZE])
-# partial_train_data = np.asarray(train_data[VAL_SIZE:])
-
-
-# val_labels = train_labels[:VAL_SIZE]
-# partial_train_labels = train_labels[VAL_SIZE:]
-
-# BATCH_SIZE = 512
-# SHUFFLE_SIZE = 1000
-
-# # training_set = tf.data.Dataset.from_tensor_slices((partial_train_data, partial_train_labels))
-# # training_set = training_set.shuffle(SHUFFLE_SIZE).batch(BATCH_SIZE)
-
-# model.fit(partial_train_data , partial_train_labels , batch_size=128, epochs=20, validation_data=(val_data , val_labels ),
-#           callbacks=[custom_callback])
-
-# loss, accuracy = model.evaluate(test_data, test_labels,callbacks=[custom_callback])
-# print('Test accuracy: %.2f' % (accuracy))
-
-from langchain.embeddings import SentenceTransformerEmbeddings
/ddi_fw/{experiments → ml}/evaluation_helper.py
File without changes
{ddi_fw-0.0.94.dist-info → ddi_fw-0.0.96.dist-info}/WHEEL
File without changes
{ddi_fw-0.0.94.dist-info → ddi_fw-0.0.96.dist-info}/top_level.txt
File without changes