ddi-fw 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,296 @@
+ from matplotlib import pyplot as plt
+ import tensorflow as tf
+ from tensorflow import keras
+ from keras.models import Model, Sequential
+ from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
+ from keras.callbacks import EarlyStopping
+ from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
+ import numpy as np
+ 
+ import mlflow
+ from mlflow.utils.autologging_utils import batch_metrics_logger
+ import time
+ 
+ from mlflow.models import infer_signature
+ from ddi_fw.experiments.evaluation_helper import evaluate
+ 
+ import tf2onnx
+ import onnx
+ 
+ import itertools
+ import utils
+ 
+ # https://github.com/YifanDengWHU/DDIMDL/blob/master/newTask.py
+ # from numpy.random import seed
+ # seed(1)
+ # from tensorflow import set_random_seed
+ # set_random_seed(2)
+ tf.random.set_seed(1)
+ np.random.seed(2)
+ np.set_printoptions(precision=4)
+ 
+ 
+ class TFMultiModal:
+     # TODO: move model-related parameters into a config object
+     def __init__(self, model_func, batch_size=128, epochs=100):
+         self.model_func = model_func
+         self.batch_size = batch_size
+         self.epochs = epochs
+ 
+     def set_data(self, items, train_idx_arr, val_idx_arr, y_test_label):
+         self.items = items
+         self.train_idx_arr = train_idx_arr
+         self.val_idx_arr = val_idx_arr
+         self.y_test_label = y_test_label
+ 
+     def predict(self, combinations: list = [], generate_combinations=False):
+         self.prefix = utils.utc_time_as_string()
+         self.date = utils.utc_time_as_string_simple_format()
+         total = np.zeros(
+             (self.y_test_label.shape[0], self.y_test_label.shape[1]))
+         single_results = dict()
+ 
+         if generate_combinations:
+             names = [item[0] for item in self.items]
+             combinations = []
+             for i in range(2, len(names) + 1):
+                 combinations.extend(list(itertools.combinations(names, i)))  # all sizes
+ 
+         with mlflow.start_run(run_name=self.prefix, description="***") as run:
+             self.level_0_run_id = run.info.run_id
+             for item in self.items:
+                 print(item[0])
+                 single_modal = TFSingleModal(
+                     self.date, item[0], self.model_func, self.batch_size, self.epochs)
+                 single_modal.set_data(
+                     self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
+                 r = single_modal.predict()
+                 single_results[item[0]] = r
+                 total = total + r
+ 
+             if combinations:
+                 self.evaluate_combinations(single_results, combinations)
+         # TODO: the summed prediction is not needed
+         return total, single_results
+ 
+     def evaluate_combinations(self, single_results, combinations):
+         for combination in combinations:
+             combination_descriptor = '-'.join(combination)
+             with mlflow.start_run(run_name=combination_descriptor, description="***", nested=True) as combination_run:
+                 prediction = np.zeros(
+                     (self.y_test_label.shape[0], self.y_test_label.shape[1]))
+                 for item in combination:
+                     prediction = prediction + single_results[item]
+                 logs, metrics = evaluate(
+                     actual=self.y_test_label, pred=prediction, info=combination_descriptor)
+                 mlflow.log_metrics(logs)
+                 metrics.format_float()
+                 # TODO: we got a "path not found" error here
+                 print(
+                     f'combination_artifact_uri:{combination_run.info.artifact_uri}')
+                 utils.compress_and_save_data(
+                     metrics.__dict__, combination_run.info.artifact_uri, f'{self.date}_metrics.gzip')
+ 
+ 
+ class TFSingleModal:
+     def __init__(self, date, descriptor, model_func, batch_size=128, epochs=100):
+         self.date = date
+         self.descriptor = descriptor
+         self.model_func = model_func
+         self.batch_size = batch_size
+         self.epochs = epochs
+ 
+     def set_data(self, train_idx_arr, val_idx_arr, train_data, train_label, test_data, test_label):
+         self.train_idx_arr = train_idx_arr
+         self.val_idx_arr = val_idx_arr
+         self.train_data = train_data
+         self.train_label = train_label
+         self.test_data = test_data
+         self.test_label = test_label
+ 
+     # https://github.com/mlflow/mlflow/blob/master/examples/tensorflow/train.py
+     def predict(self):
+         print(self.train_data.shape)
+ 
+         # Failed to convert a NumPy array to a Tensor
+         with mlflow.start_run(run_name=self.descriptor, description="***", nested=True) as run:
+             models = dict()
+             histories = dict()
+             models_val_acc = dict()
+             # with batch_metrics_logger(run_id) as metrics_logger:
+             for i, (train_idx, val_idx) in enumerate(zip(self.train_idx_arr, self.val_idx_arr)):
+                 print(f"Validation {i}")
+ 
+                 with mlflow.start_run(run_name=f'Validation {i}', description='CV models', nested=True) as cv_fit:
+                     model = self.model_func(self.train_data.shape[1])
+                     models[f'validation_{i}'] = model
+                     X_train_cv = self.train_data[train_idx]
+                     y_train_cv = self.train_label[train_idx]
+                     X_valid_cv = self.train_data[val_idx]
+                     y_valid_cv = self.train_label[val_idx]
+ 
+                     early_stopping = EarlyStopping(
+                         monitor='val_loss', patience=10, verbose=0, mode='auto')
+                     custom_callback = CustomCallback()
+                     history = model.fit(X_train_cv, y_train_cv,
+                                         batch_size=self.batch_size,
+                                         epochs=self.epochs,
+                                         validation_data=(
+                                             X_valid_cv, y_valid_cv),
+                                         callbacks=[early_stopping, custom_callback])
+                     # histories[f'validation_{i}'] = history
+                     models_val_acc[f'validation_{i}'] = history.history['val_accuracy'][-1]
+                     # Saving each CV model
+ 
+             best_model_key = max(models_val_acc, key=models_val_acc.get)
+             best_model = models[best_model_key]
+             # NOTE: custom_callback here is the instance created in the last CV fold
+             best_model.evaluate(self.test_data, self.test_label,
+                                 callbacks=[custom_callback])
+             pred = best_model.predict(self.test_data)
+ 
+             logs, metrics = evaluate(
+                 actual=self.test_label, pred=pred, info=self.descriptor)
+             metrics.format_float()
+             mlflow.log_metrics(logs)
+             mlflow.log_param('best_cv', best_model_key)
+             signature = infer_signature(
+                 self.train_data,
+                 # generate_signature_output(model, X_valid_cv)
+                 # params=params,
+             )
+ 
+             mlflow.tensorflow.save_model(
+                 best_model,
+                 path=run.info.artifact_uri + '/model',
+                 signature=signature,
+             )
+             print(run.info.artifact_uri)
+             onnx_model, _ = tf2onnx.convert.from_keras(
+                 best_model, input_signature=None, opset=13)
+             onnx.save(onnx_model, run.info.artifact_uri +
+                       '/model/model.onnx')
+             utils.compress_and_save_data(
+                 metrics.__dict__, run.info.artifact_uri, f'{self.date}_metrics.gzip')
+             # mlflow.log_dict(metrics.__dict__, "metrics.json")
+ 
+             # Plot Precision-Recall curves for each class and micro-average
+             # fig = plt.figure()
+             # plt.step(metrics.recall['micro_event'], metrics.precision['micro_event'],
+             #          color='b', alpha=0.2, where='post')
+             # plt.fill_between(
+             #     metrics.recall["micro_event"], metrics.precision["micro_event"], step='post', alpha=0.2, color='b')
+ 
+             # for i in range(pred.shape[1]):
+             #     plt.step(metrics.recall[i], metrics.precision[i], where='post',
+             #              label='Class {0} (AUC={1:0.2f})'.format(i, metrics.roc_aupr[i]))
+ 
+             # plt.xlabel('Recall')
+             # plt.ylabel('Precision')
+             # plt.ylim([0.0, 1.05])
+             # plt.xlim([0.0, 1.0])
+             # plt.title(
+             #     'Micro-average Precision-Recall curve: AUC={0:0.2f}'.format(metrics.roc_aupr["micro"]))
+             # plt.legend(loc='best')
+             # # plt.savefig(run.info.artifact_uri + '/auprc.png')
+             # mlflow.log_figure(fig, 'auprc.png')
+ 
+             # mlflow.log_model(
+             #     model,
+             #     artifact_path=run.info.artifact_uri + '/model',
+             #     signature=signature,
+             # )
+             # mlflow.log_artifact(run.info.artifact_uri + '/model')
+ 
+             # mlflow.MlflowClient().log_artifact(run.info.run_id,
+             #                                    run.info.artifact_uri, None)
+         return pred
+ 
+ 
+ class CustomCallback(keras.callbacks.Callback):
+     def on_train_begin(self, logs=None):
+         keys = list(logs.keys())
+         mlflow.log_param("train_begin_keys", keys)
+         config = self.model.optimizer.get_config()
+         for attribute in config:
+             mlflow.log_param("opt_" + attribute, config[attribute])
+ 
+         sum_list = []
+         self.model.summary(print_fn=sum_list.append)
+         summary = "\n".join(sum_list)
+         mlflow.log_text(summary, artifact_file="model_summary.txt")
+ 
+     def on_train_end(self, logs=None):
+         print(logs)
+         mlflow.log_metrics(logs)
+ 
+     def on_epoch_begin(self, epoch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_epoch_end(self, epoch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_test_begin(self, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_test_end(self, logs=None):
+         mlflow.log_metrics(logs)
+         print(logs)
+ 
+     def on_predict_begin(self, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_predict_end(self, logs=None):
+         keys = list(logs.keys())
+         mlflow.log_metrics(logs)
+ 
+     def on_train_batch_begin(self, batch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_train_batch_end(self, batch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_test_batch_begin(self, batch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_test_batch_end(self, batch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_predict_batch_begin(self, batch, logs=None):
+         keys = list(logs.keys())
+ 
+     def on_predict_batch_end(self, batch, logs=None):
+         keys = list(logs.keys())
+ 
+     # def on_train_begin(self, logs=None):  # pylint: disable=unused-argument
+     #     config = self.model.optimizer.get_config()
+     #     for attribute in config:
+     #         mlflow.log_param("opt_" + attribute, config[attribute])
+ 
+     #     sum_list = []
+     #     self.model.summary(print_fn=sum_list.append)
+     #     summary = "\n".join(sum_list)
+     #     mlflow.log_text(summary, artifact_file="model_summary.txt")
+ 
+     # def on_epoch_end(self, epoch, logs=None):
+     #     # NB: tf.Keras uses zero-indexing for epochs, while other TensorFlow Estimator
+     #     # APIs (e.g., tf.Estimator) use one-indexing. Accordingly, the modular arithmetic
+     #     # used here is slightly different from the arithmetic used in `_log_event`, which
+     #     # provides metric logging hooks for TensorFlow Estimator & other TensorFlow APIs
+     #     if epoch % self.log_every_n_steps == 0:
+     #         self.metrics_logger.record_metrics(logs, epoch)
+ 
+     # def predict(self):
+     #     model = self.model_func()
+     #     # Failed to convert a NumPy array to a Tensor
+     #     for i, (train_idx, val_idx) in enumerate(zip(self.train_idx_arr, self.val_idx_arr)):
+     #         print(f"Validation {i}")
+     #         X_train_cv = self.train_data[train_idx]
+     #         y_train_cv = self.train_label[train_idx]
+     #         X_valid_cv = self.train_data[val_idx]
+     #         y_valid_cv = self.train_label[val_idx]
+ 
+     #         early_stopping = EarlyStopping(
+     #             monitor='val_loss', patience=10, verbose=0, mode='auto')
+     #         model.fit(X_train_cv, y_train_cv, batch_size=128, epochs=20, validation_data=(X_valid_cv, y_valid_cv),
+     #                   callbacks=[early_stopping])
+     #     pred = model.predict(self.test_data)
+     #     return pred
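
For orientation, here is a minimal, hypothetical usage sketch for the classes above (not part of the package). It assumes an MLflow tracking URI is already configured, that `model_func` returns a compiled multi-label Keras model given a feature dimension, and that each item is a `(name, train_data, train_label, test_data, test_label)` tuple, as `TFSingleModal.set_data` expects; the random data and CV folds are stand-ins.

    import numpy as np
    from sklearn.model_selection import KFold
    from keras.models import Sequential
    from keras.layers import Dense

    rng = np.random.default_rng(0)
    n_train, n_test, n_classes = 200, 50, 5
    X_a, X_b = rng.random((n_train, 32)), rng.random((n_train, 16))
    X_a_test, X_b_test = rng.random((n_test, 32)), rng.random((n_test, 16))
    y_train = rng.integers(0, 2, (n_train, n_classes)).astype('float32')
    y_test = rng.integers(0, 2, (n_test, n_classes)).astype('float32')

    # Parallel arrays of train/validation indices, one pair per CV fold.
    folds = list(KFold(n_splits=3).split(X_a))
    train_idx_arr = [tr for tr, _ in folds]
    val_idx_arr = [va for _, va in folds]

    def build_model(input_dim):
        # Hypothetical factory; TFSingleModal.predict passes the feature dimension.
        model = Sequential([Dense(64, activation='relu', input_shape=(input_dim,)),
                            Dense(n_classes, activation='sigmoid')])
        model.compile(optimizer='adam', loss='binary_crossentropy',
                      metrics=['accuracy'])  # 'val_accuracy' is read from history
        return model

    items = [('modality_a', X_a, y_train, X_a_test, y_test),
             ('modality_b', X_b, y_train, X_b_test, y_test)]

    multi = TFMultiModal(model_func=build_model, batch_size=32, epochs=5)
    multi.set_data(items, train_idx_arr, val_idx_arr, y_test_label=y_test)
    total, single_results = multi.predict(generate_combinations=True)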
@@ -0,0 +1,59 @@
+ # # https://github.com/kashif/tf-keras-tutorial/blob/tf2/3-imdb.ipynb
+ # # TensorFlow and tf.keras
+ # import tensorflow as tf
+ 
+ # # Helper libraries
+ # import numpy as np
+ # import matplotlib.pyplot as plt
+ # from tensorflow_helper import CustomCallback
+ 
+ # print(tf.__version__)
+ 
+ 
+ # imdb = tf.keras.datasets.imdb
+ 
+ # (train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data(num_words=10000)
+ 
+ 
+ # class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
+ #                'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
+ 
+ 
+ # # Create a model
+ # model = tf.keras.Sequential()
+ # custom_callback = CustomCallback()
+ 
+ # # input shape here is the length of our movie review vector
+ # model.add(tf.keras.layers.Dense(16, activation=tf.nn.relu, input_shape=(10000,)))
+ # model.add(tf.keras.layers.Dense(16, activation=tf.nn.relu))
+ # model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))
+ 
+ # optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
+ 
+ # model.compile(loss='binary_crossentropy',
+ #               optimizer=optimizer,
+ #               metrics=['binary_accuracy'])
+ 
+ # model.summary()
+ 
+ # VAL_SIZE = 10000
+ # x = np.array(train_data[:VAL_SIZE].tolist())
+ 
+ # val_data = np.asarray(train_data[:VAL_SIZE])
+ # partial_train_data = np.asarray(train_data[VAL_SIZE:])
+ 
+ 
+ # val_labels = train_labels[:VAL_SIZE]
+ # partial_train_labels = train_labels[VAL_SIZE:]
+ 
+ # BATCH_SIZE = 512
+ # SHUFFLE_SIZE = 1000
+ 
+ # # training_set = tf.data.Dataset.from_tensor_slices((partial_train_data, partial_train_labels))
+ # # training_set = training_set.shuffle(SHUFFLE_SIZE).batch(BATCH_SIZE)
+ 
+ # model.fit(partial_train_data, partial_train_labels, batch_size=128, epochs=20, validation_data=(val_data, val_labels),
+ #           callbacks=[custom_callback])
+ 
+ # loss, accuracy = model.evaluate(test_data, test_labels, callbacks=[custom_callback])
+ # print('Test accuracy: %.2f' % (accuracy))
ddi_fw/ner/__init__.py ADDED
@@ -0,0 +1 @@
+ from .ner import CTakesNER
@@ -0,0 +1,155 @@
+ """ MetaMapLite ReST client
+ 
+ Currently allows setting the request content-type and accept content-type
+ fields of the http request.
+ 
+ content fields include:
+     inputtext: input document to be processed
+     docformat: format of input document: freetext, medline, pubmedxml, etc.
+     resultformat: format of result: mmi, brat, etc.
+     sourceString: list of sources to restrict to (comma separated)
+     semanticTypeString: list of semantic types to restrict to (comma separated)
+ 
+ Sample use of mmlrestclient.py:
+ 
+     python mmlrestclient.py https://ii-public1.nlm.nih.gov/metamaplite/rest/annotate \
+         ~/queries/testdoc.txt --output outfile
+     python mmlrestclient.py https://ii-public2vm.nlm.nih.gov/metamaplite/rest/annotate \
+         ~/queries/testdoc.txt --output outfile
+ 
+ Usage:
+ 
+     usage: mmlrestclient.py [-h] [--req-content-type REQ_CONTENT_TYPE]
+                             [--res-content-type RES_CONTENT_TYPE]
+                             [--docformat DOCFORMAT] [--resultformat RESULTFORMAT]
+                             [--sources SOURCES] [--semantic-types SEMANTIC_TYPES]
+                             [--output OUTPUT]
+                             url file
+ 
+     ReST client
+ 
+     positional arguments:
+       url                   url of server
+       file                  file to send in request
+ 
+     optional arguments:
+       -h, --help            show this help message and exit
+       --req-content-type REQ_CONTENT_TYPE
+                             content-type of request
+       --res-content-type RES_CONTENT_TYPE
+                             content-type of response
+       --docformat DOCFORMAT
+                             format of input document
+       --resultformat RESULTFORMAT
+                             format of metamaplite result
+       --sources SOURCES     restrict to list of UMLS source abbreviations separated by commas
+       --semantic-types SEMANTIC_TYPES
+                             restrict to list of UMLS semantic types separated by commas
+       --output OUTPUT       file to send response content, default is standard
+                             output
+ 
+ """
+ import sys
+ import argparse
+ import requests
+ 
+ def readtextfile(filename):
+     """ read text file specified by filename """
+     with open(filename) as textfp:
+         return textfp.read()
+ 
+ def package_payload(argdict):
+     """ generate payload parameters from arguments """
+     if 'inputtext' in argdict:
+         inputtext = argdict['inputtext']
+     else:
+         inputtext = readtextfile(argdict['file'])
+     req_content_type = argdict['req_content_type']
+     print('req_content_type = {}'.format(req_content_type))
+     params = []
+     params.append(('inputtext', inputtext))
+     params.append(('docformat', argdict['docformat']))
+     params.append(('resultformat', argdict['resultformat']))
+     for source in argdict['sources'].split(','):
+         params.append(('sourceString', source))
+     for semtype in argdict['semantic_types'].split(','):
+         params.append(('semanticTypeString', semtype))
+     return params
+ 
+ def handle_request(url, acceptfmt, payload):
+     """
+     Send request to ReST service and return response when received.
+ 
+     >>> url = 'https://ii-public1.nlm.nih.gov/metamaplite/rest/annotate'
+     >>> acceptfmt = 'text/plain'
+     >>> params = [('inputtext', 'Apnea\n'), ('docformat', 'freetext'),
+     ...           ('resultformat', 'json'), ('sourceString', 'all'),
+     ...           ('semanticTypeString', 'all')]
+     >>> resp = handle_request(url, acceptfmt, params)
+     >>> resp.text
+     '[{"matchedtext":"Apnea",
+     "evlist":[{"score":0,"matchedtext":"Apnea","start":0,"length":5,"id":"ev0",
+     "conceptinfo":{"conceptstring":"Apnea",
+     "sources":["MTH","NCI_CTCAE_5","NCI","NCI_CTCAE_3"],
+     "cui":"C1963065","preferredname":"Apnea, CTCAE",
+     "semantictypes":["fndg"]}},
+     {"score":0,"matchedtext":"Apnea","start":0,"length":5,"id":"ev0",
+     "conceptinfo":{"conceptstring":"Apnea",
+     "sources":["LNC","MTH","HPO","NANDA-I","ICPC2P","CHV",
+     "SNMI","SNM","NCI_FDA","LCH_NW","AOD","ICD9CM",
+     "MDR","SNOMEDCT_US","CCPSS","WHO","NCI_NICHD",
+     "CSP","RCDSA","MSH","ICD10CM","CST","OMIM",
+     "NCI_CTCAE","ICPC2ICD10ENG","COSTAR","MEDCIN",
+     "LCH","RCD","RCDAE","NCI","PSY","NDFRT","RCDSY",
+     "DXP","ICNP"],
+     "cui":"C0003578","preferredname":"Apnea",
+     "semantictypes":["sosy"]}}],
+     "docid":"00000000.tx","start":0,"length":5,"id":"en0","fieldid":"text"}]'
+     """
+     headers = {'Accept': acceptfmt}
+     return requests.post(url, payload, headers=headers)
+ 
+ def process(argdict):
+     """Process command line arguments and call handle_request. """
+     payload = package_payload(argdict)
+     sys.stderr.write('%s\n' % payload)
+     # contenttype = argdict['req_content_type']
+     acceptfmt = argdict['res_content_type']
+     return handle_request(argdict['url'], acceptfmt, payload)
+ 
+ if __name__ == '__main__':
+     if len(sys.argv) > 1:
+         parser = argparse.ArgumentParser(description="ReST client")
+         parser.add_argument('url',
+                             help='url of server')
+         parser.add_argument('file', help='file to send in request')
+         parser.add_argument('--req-content-type', default='application/x-www-form-urlencoded',
+                             help='content-type of request')
+         parser.add_argument('--res-content-type', default='text/plain',
+                             help='content-type of response')
+         parser.add_argument('--docformat', default='freetext',
+                             help='format of input document')
+         parser.add_argument('--resultformat', default='mmi',
+                             help='format of metamaplite result')
+         parser.add_argument('--sources', default='all',
+                             help='list of UMLS sources to restrict to (comma separated)')
+         parser.add_argument('--semantic-types', default='all',
+                             help='list of UMLS semantic types to restrict to (comma separated)')
+ 
+         parser.add_argument('--output', default='stdout',
+                             help='file to send response content, default is standard output')
+ 
+         args = parser.parse_args()
+         print(args)
+         resp = process(vars(args))
+         sys.stderr.write('resp = %s\n' % resp)
+         if vars(args)['output'] == 'stdout':
+             sys.stdout.write('%s\n' % resp.text)
+         else:
+             with open(vars(args)['output'], 'w') as fp:
+                 fp.write('%s\n' % resp.text)
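
Beyond the CLI shown in the docstring, the client can be driven programmatically through `process`; a minimal sketch, where the argument dict mirrors the argparse defaults above (availability of the public NLM endpoint is not guaranteed):

    argdict = {
        'url': 'https://ii-public1.nlm.nih.gov/metamaplite/rest/annotate',
        'inputtext': 'Apnea',  # send text directly instead of reading a file
        'req_content_type': 'application/x-www-form-urlencoded',
        'res_content_type': 'text/plain',
        'docformat': 'freetext',
        'resultformat': 'json',
        'sources': 'all',
        'semantic_types': 'all',
    }
    resp = process(argdict)
    print(resp.status_code, resp.text)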