ddi-fw 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,296 @@
+ from matplotlib import pyplot as plt
+ import tensorflow as tf
+ from tensorflow import keras
+ from keras.models import Model, Sequential
+ from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
+ from keras.callbacks import EarlyStopping
+ from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
+ import numpy as np
+ 
+ import mlflow
+ from mlflow.utils.autologging_utils import batch_metrics_logger
+ import time
+ 
+ from mlflow.models import infer_signature
+ from ddi_fw.experiments.evaluation_helper import evaluate
+ 
+ import tf2onnx
+ import onnx
+ 
+ import itertools
+ import utils
+ 
+ # https://github.com/YifanDengWHU/DDIMDL/blob/master/newTask.py
+ # from numpy.random import seed
+ # seed(1)
+ # from tensorflow import set_random_seed
+ # set_random_seed(2)
+ tf.random.set_seed(1)
+ np.random.seed(2)
+ np.set_printoptions(precision=4)
+ 
+ 
+ class TFMultiModal:
+     # TODO: move model-related parameters into a config object
+     def __init__(self, model_func, batch_size=128, epochs=100):
+         self.model_func = model_func
+         self.batch_size = batch_size
+         self.epochs = epochs
+ 
+     def set_data(self, items, train_idx_arr, val_idx_arr, y_test_label):
+         self.items = items
+         self.train_idx_arr = train_idx_arr
+         self.val_idx_arr = val_idx_arr
+         self.y_test_label = y_test_label
+ 
+     def predict(self, combinations: list = [], generate_combinations=False):
+         self.prefix = utils.utc_time_as_string()
+         self.date = utils.utc_time_as_string_simple_format()
+         total = np.zeros(
+             (self.y_test_label.shape[0], self.y_test_label.shape[1]))
+         single_results = dict()
+ 
+         if generate_combinations:
+             names = [item[0] for item in self.items]
+             combinations = []
+             for i in range(2, len(names) + 1):
+                 combinations.extend(list(itertools.combinations(names, i)))  # all sizes
+ 
+         with mlflow.start_run(run_name=self.prefix, description="***") as run:
+             self.level_0_run_id = run.info.run_id
+             for item in self.items:
+                 print(item[0])
+                 single_modal = TFSingleModal(
+                     self.date, item[0], self.model_func, self.batch_size, self.epochs)
+                 single_modal.set_data(
+                     self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
+                 r = single_modal.predict()
+                 single_results[item[0]] = r
+                 total = total + r
+ 
+             if combinations:
+                 self.evaluate_combinations(single_results, combinations)
+         # TODO: the summed prediction is not needed
+         return total, single_results
+ 
+     def evaluate_combinations(self, single_results, combinations):
+         for combination in combinations:
+             combination_descriptor = '-'.join(combination)
+             with mlflow.start_run(run_name=combination_descriptor, description="***", nested=True) as combination_run:
+                 prediction = np.zeros(
+                     (self.y_test_label.shape[0], self.y_test_label.shape[1]))
+                 for item in combination:
+                     prediction = prediction + single_results[item]
+                 logs, metrics = evaluate(
+                     actual=self.y_test_label, pred=prediction, info=combination_descriptor)
+                 mlflow.log_metrics(logs)
+                 metrics.format_float()
+                 # TODO: we got a "path not found" error here
+                 print(
+                     f'combination_artifact_uri:{combination_run.info.artifact_uri}')
+                 utils.compress_and_save_data(
+                     metrics.__dict__, combination_run.info.artifact_uri, f'{self.date}_metrics.gzip')
+ 
+ 
+ class TFSingleModal:
+     def __init__(self, date, descriptor, model_func, batch_size=128, epochs=100):
+         self.date = date
+         self.descriptor = descriptor
+         self.model_func = model_func
+         self.batch_size = batch_size
+         self.epochs = epochs
+ 
+     def set_data(self, train_idx_arr, val_idx_arr, train_data, train_label, test_data, test_label):
+         self.train_idx_arr = train_idx_arr
+         self.val_idx_arr = val_idx_arr
+         self.train_data = train_data
+         self.train_label = train_label
+         self.test_data = test_data
+         self.test_label = test_label
+ 
+     # https://github.com/mlflow/mlflow/blob/master/examples/tensorflow/train.py
+     def predict(self):
+         print(self.train_data.shape)
+ 
+         # Failed to convert a NumPy array to a Tensor
+         with mlflow.start_run(run_name=self.descriptor, description="***", nested=True) as run:
+             models = dict()
+             histories = dict()
+             models_val_acc = dict()
+             # with batch_metrics_logger(run_id) as metrics_logger:
+             for i, (train_idx, val_idx) in enumerate(zip(self.train_idx_arr, self.val_idx_arr)):
+                 print(f"Validation {i}")
+ 
+                 with mlflow.start_run(run_name=f'Validation {i}', description='CV models', nested=True) as cv_fit:
+                     model = self.model_func(self.train_data.shape[1])
+                     models[f'validation_{i}'] = model
+                     X_train_cv = self.train_data[train_idx]
+                     y_train_cv = self.train_label[train_idx]
+                     X_valid_cv = self.train_data[val_idx]
+                     y_valid_cv = self.train_label[val_idx]
+ 
+                     early_stopping = EarlyStopping(
+                         monitor='val_loss', patience=10, verbose=0, mode='auto')
+                     custom_callback = CustomCallback()
+                     history = model.fit(X_train_cv, y_train_cv,
+                                         batch_size=self.batch_size,
+                                         epochs=self.epochs,
+                                         validation_data=(
+                                             X_valid_cv, y_valid_cv),
+                                         callbacks=[early_stopping, custom_callback])
+                     # histories[f'validation_{i}'] = history
+                     models_val_acc[f'validation_{i}'] = history.history['val_accuracy'][-1]
+                     # Saving each CV model
+ 
+             best_model_key = max(models_val_acc, key=models_val_acc.get)
+             best_model = models[best_model_key]
+             # NOTE: custom_callback here is the instance created in the last CV fold
+             best_model.evaluate(self.test_data, self.test_label,
+                                 callbacks=[custom_callback])
+             pred = best_model.predict(self.test_data)
+ 
+             logs, metrics = evaluate(
+                 actual=self.test_label, pred=pred, info=self.descriptor)
+             metrics.format_float()
+             mlflow.log_metrics(logs)
+             mlflow.log_param('best_cv', best_model_key)
+             signature = infer_signature(
+                 self.train_data,
+                 # generate_signature_output(model, X_valid_cv)
+                 # params=params,
+             )
+ 
+             mlflow.tensorflow.save_model(
+                 best_model,
+                 path=run.info.artifact_uri + '/model',
+                 signature=signature,
+             )
+             print(run.info.artifact_uri)
+             onnx_model, _ = tf2onnx.convert.from_keras(
+                 best_model, input_signature=None, opset=13)
+             onnx.save(onnx_model, run.info.artifact_uri +
+                       '/model/model.onnx')
+             utils.compress_and_save_data(
+                 metrics.__dict__, run.info.artifact_uri, f'{self.date}_metrics.gzip')
+             # mlflow.log_dict(metrics.__dict__, "metrics.json")
+ 
+             # Plot Precision-Recall curves for each class and micro-average
+             # fig = plt.figure()
+             # plt.step(metrics.recall['micro_event'], metrics.precision['micro_event'],
+             #          color='b', alpha=0.2, where='post')
+             # plt.fill_between(
+             #     metrics.recall["micro_event"], metrics.precision["micro_event"], step='post', alpha=0.2, color='b')
+ 
+             # for i in range(pred.shape[1]):
+             #     plt.step(metrics.recall[i], metrics.precision[i], where='post',
+             #              label='Class {0} (AUC={1:0.2f})'.format(i, metrics.roc_aupr[i]))
+ 
+             # plt.xlabel('Recall')
+             # plt.ylabel('Precision')
+             # plt.ylim([0.0, 1.05])
+             # plt.xlim([0.0, 1.0])
+             # plt.title(
+             #     'Micro-average Precision-Recall curve: AUC={0:0.2f}'.format(metrics.roc_aupr["micro"]))
+             # plt.legend(loc='best')
+             # # plt.savefig(run.info.artifact_uri + '/auprc.png')
+             # mlflow.log_figure(fig, 'auprc.png')
+ 
+             # mlflow.log_model(
+             #     model,
+             #     artifact_path=run.info.artifact_uri + '/model',
+             #     signature=signature,
+             # )
+             # mlflow.log_artifact(run.info.artifact_uri + '/model')
+ 
+             # mlflow.MlflowClient().log_artifact(run.info.run_id,
+             #                                    run.info.artifact_uri, None)
+         return pred
+ 
+ 
+ class CustomCallback(keras.callbacks.Callback):
+     def on_train_begin(self, logs=None):
+         keys = list(logs.keys())
+         mlflow.log_param("train_begin_keys", keys)
+         config = self.model.optimizer.get_config()
+         for attribute in config:
+             mlflow.log_param("opt_" + attribute, config[attribute])
+ 
+         sum_list = []
+         self.model.summary(print_fn=sum_list.append)
+         summary = "\n".join(sum_list)
+         mlflow.log_text(summary, artifact_file="model_summary.txt")
+ 
+     def on_train_end(self, logs=None):
+         print(logs)
+         mlflow.log_metrics(logs)
+ 
+     def on_epoch_begin(self, epoch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_epoch_end(self, epoch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_test_begin(self, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_test_end(self, logs=None):
+         mlflow.log_metrics(logs)
+         print(logs)
+ 
+     def on_predict_begin(self, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_predict_end(self, logs=None):
+         keys = list(logs.keys())
+         mlflow.log_metrics(logs)
+ 
+     def on_train_batch_begin(self, batch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_train_batch_end(self, batch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_test_batch_begin(self, batch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_test_batch_end(self, batch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_predict_batch_begin(self, batch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_predict_batch_end(self, batch, logs=None):
+         keys = list(logs.keys())
+ 
+     # def on_train_begin(self, logs=None):  # pylint: disable=unused-argument
+     #     config = self.model.optimizer.get_config()
+     #     for attribute in config:
+     #         mlflow.log_param("opt_" + attribute, config[attribute])
+ 
+     #     sum_list = []
+     #     self.model.summary(print_fn=sum_list.append)
+     #     summary = "\n".join(sum_list)
+     #     mlflow.log_text(summary, artifact_file="model_summary.txt")
+ 
+     # def on_epoch_end(self, epoch, logs=None):
+     #     # NB: tf.Keras uses zero-indexing for epochs, while other TensorFlow Estimator
+     #     # APIs (e.g., tf.Estimator) use one-indexing. Accordingly, the modular arithmetic
+     #     # used here is slightly different from the arithmetic used in `_log_event`, which
+     #     # provides metric logging hooks for TensorFlow Estimator & other TensorFlow APIs
+     #     if epoch % self.log_every_n_steps == 0:
+     #         self.metrics_logger.record_metrics(logs, epoch)
+ 
+     # def predict(self):
+     #     model = self.model_func()
+     #     # Failed to convert a NumPy array to a Tensor
+     #     for i, (train_idx, val_idx) in enumerate(zip(self.train_idx_arr, self.val_idx_arr)):
+     #         print(f"Validation {i}")
+     #         X_train_cv = self.train_data[train_idx]
+     #         y_train_cv = self.train_label[train_idx]
+     #         X_valid_cv = self.train_data[val_idx]
+     #         y_valid_cv = self.train_label[val_idx]
+ 
+     #         early_stopping = EarlyStopping(
+     #             monitor='val_loss', patience=10, verbose=0, mode='auto')
+     #         model.fit(X_train_cv, y_train_cv, batch_size=128, epochs=20, validation_data=(X_valid_cv, y_valid_cv),
+     #                   callbacks=[early_stopping])
+     #     pred = model.predict(self.test_data)
+     #     return pred
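
For orientation, here is a minimal, hypothetical usage sketch for the classes above (not part of the package). It assumes an MLflow tracking URI is already configured, that `model_func` returns a compiled multi-label Keras model given a feature dimension, and that each item is a `(name, train_data, train_label, test_data, test_label)` tuple, as `TFSingleModal.set_data` expects; the random data and CV folds are stand-ins.

    import numpy as np
    from sklearn.model_selection import KFold
    from keras.models import Sequential
    from keras.layers import Dense

    rng = np.random.default_rng(0)
    n_train, n_test, n_classes = 200, 50, 5
    X_a, X_b = rng.random((n_train, 32)), rng.random((n_train, 16))
    X_a_test, X_b_test = rng.random((n_test, 32)), rng.random((n_test, 16))
    y_train = rng.integers(0, 2, (n_train, n_classes)).astype('float32')
    y_test = rng.integers(0, 2, (n_test, n_classes)).astype('float32')

    # Parallel arrays of train/validation indices, one pair per CV fold.
    folds = list(KFold(n_splits=3).split(X_a))
    train_idx_arr = [tr for tr, _ in folds]
    val_idx_arr = [va for _, va in folds]

    def build_model(input_dim):
        # Hypothetical factory; TFSingleModal.predict passes the feature dimension.
        model = Sequential([Dense(64, activation='relu', input_shape=(input_dim,)),
                            Dense(n_classes, activation='sigmoid')])
        model.compile(optimizer='adam', loss='binary_crossentropy',
                      metrics=['accuracy'])  # 'val_accuracy' is read from history
        return model

    items = [('modality_a', X_a, y_train, X_a_test, y_test),
             ('modality_b', X_b, y_train, X_b_test, y_test)]

    multi = TFMultiModal(model_func=build_model, batch_size=32, epochs=5)
    multi.set_data(items, train_idx_arr, val_idx_arr, y_test_label=y_test)
    total, single_results = multi.predict(generate_combinations=True)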
@@ -0,0 +1,59 @@
+ # # https://github.com/kashif/tf-keras-tutorial/blob/tf2/3-imdb.ipynb
+ # # TensorFlow and tf.keras
+ # import tensorflow as tf
+ 
+ # # Helper libraries
+ # import numpy as np
+ # import matplotlib.pyplot as plt
+ # from tensorflow_helper import CustomCallback
+ 
+ # print(tf.__version__)
+ 
+ 
+ # imdb = tf.keras.datasets.imdb
+ 
+ # (train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data(num_words=10000)
+ 
+ 
+ # class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
+ #                'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
+ 
+ 
+ # # Create a model
+ # model = tf.keras.Sequential()
+ # custom_callback = CustomCallback()
+ 
+ # # input shape here is the length of our movie review vector
+ # model.add(tf.keras.layers.Dense(16, activation=tf.nn.relu, input_shape=(10000,)))
+ # model.add(tf.keras.layers.Dense(16, activation=tf.nn.relu))
+ # model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))
+ 
+ # optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
+ 
+ # model.compile(loss='binary_crossentropy',
+ #               optimizer=optimizer,
+ #               metrics=['binary_accuracy'])
+ 
+ # model.summary()
+ 
+ # VAL_SIZE = 10000
+ # x = np.array(train_data[:VAL_SIZE].tolist())
+ 
+ # val_data = np.asarray(train_data[:VAL_SIZE])
+ # partial_train_data = np.asarray(train_data[VAL_SIZE:])
+ 
+ 
+ # val_labels = train_labels[:VAL_SIZE]
+ # partial_train_labels = train_labels[VAL_SIZE:]
+ 
+ # BATCH_SIZE = 512
+ # SHUFFLE_SIZE = 1000
+ 
+ # # training_set = tf.data.Dataset.from_tensor_slices((partial_train_data, partial_train_labels))
+ # # training_set = training_set.shuffle(SHUFFLE_SIZE).batch(BATCH_SIZE)
+ 
+ # model.fit(partial_train_data, partial_train_labels, batch_size=128, epochs=20, validation_data=(val_data, val_labels),
+ #           callbacks=[custom_callback])
+ 
+ # loss, accuracy = model.evaluate(test_data, test_labels, callbacks=[custom_callback])
+ # print('Test accuracy: %.2f' % (accuracy))
ddi_fw/ner/__init__.py ADDED
@@ -0,0 +1 @@
+ from .ner import CTakesNER
@@ -0,0 +1,155 @@
+ """ MetaMapLite ReST client
+ 
+ Currently allows setting the request content-type and accept content-type
+ fields of the http request.
+ 
+ content fields include:
+     inputtext: input document to be processed
+     docformat: format of input document: freetext, medline, pubmedxml, etc.
+     resultformat: format of result: mmi, brat, etc.
+     sourceString: list of sources to restrict to (comma separated)
+     semanticTypeString: list of semantic types to restrict to (comma separated)
+ 
+ Sample use of mmlrestclient.py:
+ 
+     python mmlrestclient.py https://ii-public1.nlm.nih.gov/metamaplite/rest/annotate \
+         ~/queries/testdoc.txt --output outfile
+     python mmlrestclient.py https://ii-public2vm.nlm.nih.gov/metamaplite/rest/annotate \
+         ~/queries/testdoc.txt --output outfile
+ 
+ Usage:
+ 
+     usage: mmlrestclient.py [-h] [--req-content-type REQ_CONTENT_TYPE]
+                             [--res-content-type RES_CONTENT_TYPE]
+                             [--docformat DOCFORMAT] [--resultformat RESULTFORMAT]
+                             [--sources SOURCES] [--semantic-types SEMANTIC_TYPES]
+                             [--output OUTPUT]
+                             url file
+ 
+     ReST client
+ 
+     positional arguments:
+       url                   url of server
+       file                  file to send in request
+ 
+     optional arguments:
+       -h, --help            show this help message and exit
+       --req-content-type REQ_CONTENT_TYPE
+                             content-type of request
+       --res-content-type RES_CONTENT_TYPE
+                             content-type of response
+       --docformat DOCFORMAT
+                             format of input document
+       --resultformat RESULTFORMAT
+                             format of metamaplite result
+       --sources SOURCES     restrict to list of UMLS source abbreviations separated by commas
+       --semantic-types SEMANTIC_TYPES
+                             restrict to list of UMLS semantic types separated by commas
+       --output OUTPUT       file to send response content, default is standard
+                             output
+ 
+ """
+ import sys
+ import argparse
+ import requests
+ 
+ def readtextfile(filename):
+     """ read text file specified by filename """
+     with open(filename) as textfp:
+         return textfp.read()
+ 
+ def package_payload(argdict):
+     """ generate payload parameters from arguments """
+     if 'inputtext' in argdict:
+         inputtext = argdict['inputtext']
+     else:
+         inputtext = readtextfile(argdict['file'])
+     req_content_type = argdict['req_content_type']
+     print('req_content_type = {}'.format(req_content_type))
+     params = []
+     params.append(('inputtext', inputtext))
+     params.append(('docformat', argdict['docformat']))
+     params.append(('resultformat', argdict['resultformat']))
+     for source in argdict['sources'].split(','):
+         params.append(('sourceString', source))
+     for semtype in argdict['semantic_types'].split(','):
+         params.append(('semanticTypeString', semtype))
+     return params
+ 
+ def handle_request(url, acceptfmt, payload):
+     """
+     Send request to ReST service and return response when received.
+ 
+     >>> url = 'https://ii-public1.nlm.nih.gov/metamaplite/rest/annotate'
+     >>> acceptfmt = 'text/plain'
+     >>> params = [('inputtext', 'Apnea\n'), ('docformat', 'freetext'),
+     ...           ('resultformat', 'json'), ('sourceString', 'all'),
+     ...           ('semanticTypeString', 'all')]
+     >>> resp = handle_request(url, acceptfmt, params)
+     >>> resp.text
+     '[{"matchedtext":"Apnea",
+     "evlist":[{"score":0,"matchedtext":"Apnea","start":0,"length":5,"id":"ev0",
+     "conceptinfo":{"conceptstring":"Apnea",
+     "sources":["MTH","NCI_CTCAE_5","NCI","NCI_CTCAE_3"],
+     "cui":"C1963065","preferredname":"Apnea, CTCAE",
+     "semantictypes":["fndg"]}},
+     {"score":0,"matchedtext":"Apnea","start":0,"length":5,"id":"ev0",
+     "conceptinfo":{"conceptstring":"Apnea",
+     "sources":["LNC","MTH","HPO","NANDA-I","ICPC2P","CHV",
+     "SNMI","SNM","NCI_FDA","LCH_NW","AOD","ICD9CM",
+     "MDR","SNOMEDCT_US","CCPSS","WHO","NCI_NICHD",
+     "CSP","RCDSA","MSH","ICD10CM","CST","OMIM",
+     "NCI_CTCAE","ICPC2ICD10ENG","COSTAR","MEDCIN",
+     "LCH","RCD","RCDAE","NCI","PSY","NDFRT","RCDSY",
+     "DXP","ICNP"],
+     "cui":"C0003578","preferredname":"Apnea",
+     "semantictypes":["sosy"]}}],
+     "docid":"00000000.tx","start":0,"length":5,"id":"en0","fieldid":"text"}]'
+     """
+     headers = {'Accept': acceptfmt}
+     return requests.post(url, payload, headers=headers)
+ 
+ def process(argdict):
+     """Process command line arguments and call handle_request. """
+     payload = package_payload(argdict)
+     sys.stderr.write('%s\n' % payload)
+     # contenttype = argdict['req_content_type']
+     acceptfmt = argdict['res_content_type']
+     return handle_request(argdict['url'], acceptfmt, payload)
+ 
+ if __name__ == '__main__':
+     if len(sys.argv) > 1:
+         parser = argparse.ArgumentParser(description="ReST client")
+         parser.add_argument('url',
+                             help='url of server')
+         parser.add_argument('file', help='file to send in request')
+         parser.add_argument('--req-content-type', default='application/x-www-form-urlencoded',
+                             help='content-type of request')
+         parser.add_argument('--res-content-type', default='text/plain',
+                             help='content-type of response')
+         parser.add_argument('--docformat', default='freetext',
+                             help='format of input document')
+         parser.add_argument('--resultformat', default='mmi',
+                             help='format of metamaplite result')
+         parser.add_argument('--sources', default='all',
+                             help='list of UMLS sources to restrict to (comma separated)')
+         parser.add_argument('--semantic-types', default='all',
+                             help='list of UMLS semantic types to restrict to (comma separated)')
+ 
+         parser.add_argument('--output', default='stdout',
+                             help='file to send response content, default is standard output')
+ 
+         args = parser.parse_args()
+         print(args)
+         resp = process(vars(args))
+         sys.stderr.write('resp = %s\n' % resp)
+         if vars(args)['output'] == 'stdout':
+             sys.stdout.write('%s\n' % resp.text)
+         else:
+             with open(vars(args)['output'], 'w') as fp:
+                 fp.write('%s\n' % resp.text)
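
Beyond the CLI shown in the docstring, the client can be driven programmatically through `process`; a minimal sketch, where the argument dict mirrors the argparse defaults above (availability of the public NLM endpoint is not guaranteed):

    argdict = {
        'url': 'https://ii-public1.nlm.nih.gov/metamaplite/rest/annotate',
        'inputtext': 'Apnea',  # send text directly instead of reading a file
        'req_content_type': 'application/x-www-form-urlencoded',
        'res_content_type': 'text/plain',
        'docformat': 'freetext',
        'resultformat': 'json',
        'sources': 'all',
        'semantic_types': 'all',
    }
    resp = process(argdict)
    print(resp.status_code, resp.text)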