ddi-fw 0.0.49__tar.gz → 0.0.51__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/PKG-INFO +1 -1
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/pyproject.toml +1 -1
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/base.py +36 -4
- ddi_fw-0.0.51/src/ddi_fw/experiments/__init__.py +2 -0
- ddi_fw-0.0.51/src/ddi_fw/experiments/pipeline.py +126 -0
- ddi_fw-0.0.51/src/ddi_fw/experiments/pipeline_builder_pattern.py +152 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/experiments/tensorflow_helper.py +23 -37
- ddi_fw-0.0.51/src/ddi_fw/utils/__init__.py +4 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/utils/py7zr_helper.py +7 -4
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/utils/utils.py +17 -3
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw.egg-info/PKG-INFO +1 -1
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw.egg-info/SOURCES.txt +2 -0
- ddi_fw-0.0.49/src/ddi_fw/experiments/__init__.py +0 -2
- ddi_fw-0.0.49/src/ddi_fw/utils/__init__.py +0 -4
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/README.md +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/setup.cfg +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/__init__.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/core.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/db_utils.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/data/event.db +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/test_indexes.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_0.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_1.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_2.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_3.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_4.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_indexes.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/readme.md +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/embedding_generator.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/embedding_generator_new.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/feature_vector_generation.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/idf_helper.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/__init__.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/base.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/df_extraction_cleanxiaoyu50.csv +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/drug_information_del_noDDIxiaoyu50.csv +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/mdf-sa-ddi.zip +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/setup_._py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/drugbank/__init__.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/drugbank/drugbank.xsd +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/drugbank/drugbank_parser.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/drugbank/drugbank_processor.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/drugbank/drugbank_processor_org.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/drugbank/event_extractor.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/experiments/custom_torch_model.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/experiments/evaluation_helper.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/experiments/test.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/ner/__init__.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/ner/mmlrestclient.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/ner/ner.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/basic_test.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/combination_test.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/compress_json_test.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/date_test.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/idf_score.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/jaccard_similarity.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/mlfow_test.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/sklearn-tfidf.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/test.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/torch_cuda_test.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/type_guarding_test.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/utils/enums.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/utils/zip_helper.py +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw.egg-info/requires.txt +0 -0
- {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw.egg-info/top_level.txt +0 -0
@@ -7,16 +7,48 @@ from .. import BaseDataset
|
|
7
7
|
from ..db_utils import create_connection
|
8
8
|
|
9
9
|
HERE = pathlib.Path(__file__).resolve().parent
|
10
|
+
list_of_embedding_columns = ['all_text', 'description',
|
11
|
+
'synthesis_reference', 'indication',
|
12
|
+
'pharmacodynamics', 'mechanism_of_action',
|
13
|
+
'toxicity', 'metabolism',
|
14
|
+
'absorption', 'half_life',
|
15
|
+
'protein_binding', 'route_of_elimination',
|
16
|
+
'volume_of_distribution', 'clearance']
|
17
|
+
|
18
|
+
list_of_chemical_property_columns = ['enzyme',
|
19
|
+
'target',
|
20
|
+
'pathway',
|
21
|
+
'smile']
|
22
|
+
list_of_ner_columns = ['tui', 'cui', 'entities']
|
10
23
|
|
11
24
|
|
12
25
|
class DDIMDLDataset(BaseDataset):
|
13
|
-
def __init__(self, embedding_size,
|
14
|
-
|
15
|
-
|
16
|
-
|
26
|
+
def __init__(self, embedding_size,
|
27
|
+
embedding_dict,
|
28
|
+
embeddings_pooling_strategy: PoolingStrategy,
|
29
|
+
ner_df,
|
30
|
+
chemical_property_columns=['enzyme',
|
31
|
+
'target',
|
32
|
+
'pathway',
|
33
|
+
'smile'],
|
17
34
|
embedding_columns=[],
|
18
35
|
ner_columns=[],
|
19
36
|
**kwargs):
|
37
|
+
columns = kwargs['columns']
|
38
|
+
if columns is not None:
|
39
|
+
chemical_property_columns = []
|
40
|
+
embedding_columns=[]
|
41
|
+
ner_columns=[]
|
42
|
+
for column in columns:
|
43
|
+
if column in list_of_chemical_property_columns:
|
44
|
+
chemical_property_columns.append(column)
|
45
|
+
elif column in list_of_embedding_columns:
|
46
|
+
embedding_columns.append(column)
|
47
|
+
elif column in list_of_ner_columns:
|
48
|
+
ner_columns.append(column)
|
49
|
+
else:
|
50
|
+
raise Exception(f"{column} is not related this dataset")
|
51
|
+
|
20
52
|
|
21
53
|
super().__init__(embedding_size=embedding_size,
|
22
54
|
embedding_dict=embedding_dict,
|
@@ -0,0 +1,126 @@
|
|
1
|
+
import sqlite3
|
2
|
+
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
|
3
|
+
from keras.models import Model, Sequential
|
4
|
+
from keras.callbacks import EarlyStopping
|
5
|
+
from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
|
6
|
+
from tensorflow import keras
|
7
|
+
from ddi_fw.experiments import TFSingleModal, TFMultiModal
|
8
|
+
from ddi_fw.experiments import evaluate
|
9
|
+
from sklearn.preprocessing import LabelBinarizer
|
10
|
+
import numpy as np
|
11
|
+
import pandas as pd
|
12
|
+
from ddi_fw.utils import ZipHelper, Py7ZipHelper
|
13
|
+
import os
|
14
|
+
import chromadb
|
15
|
+
from collections import defaultdict
|
16
|
+
from langchain_community.vectorstores import Chroma
|
17
|
+
from ddi_fw.ner.ner import CTakesNER
|
18
|
+
from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
|
19
|
+
|
20
|
+
from ddi_fw.datasets import BaseDataset, DDIMDLDataset
|
21
|
+
|
22
|
+
from ddi_fw.datasets import SumPoolingStrategy
|
23
|
+
from keras import metrics
|
24
|
+
from ddi_fw.experiments.evaluation_helper import evaluate
|
25
|
+
|
26
|
+
import mlflow
|
27
|
+
|
28
|
+
|
29
|
+
class Experiment:
    """Configure, build and run one experiment tracked with MLflow.

    Every constructor argument is stored unchanged on the instance under the
    same name.  ``build()`` reads embeddings from a persistent Chroma
    collection and instantiates ``dataset_type``; ``run()`` hands the produced
    inputs to ``TFMultiModal`` and returns the value of its ``predict`` call.
    """

    def __init__(self,
                 experiment_name=None,
                 experiment_description=None,
                 experiment_tags=None,
                 tracking_uri=None,
                 dataset_type: "type[BaseDataset]" = None,
                 columns=None,
                 vector_db_persist_directory=None,
                 vector_db_collection_name=None,
                 embedding_pooling_strategy_type: "type[PoolingStrategy]" = None,
                 ner_data_file=None,
                 ner_threshold=None,
                 combinations=None,
                 model=None):
        """Store the experiment settings; nothing is loaded until ``build()``.

        ``dataset_type`` and ``embedding_pooling_strategy_type`` are classes
        (they are called in ``build()``), not instances.  ``ner_threshold`` is
        a mapping whose items are forwarded to the dataset constructor as
        extra keyword arguments; ``None`` means no extra arguments.
        """
        self.experiment_name = experiment_name
        self.experiment_description = experiment_description
        self.experiment_tags = experiment_tags
        self.tracking_uri = tracking_uri
        self.dataset_type = dataset_type
        self.columns = columns
        self.vector_db_persist_directory = vector_db_persist_directory
        self.vector_db_collection_name = vector_db_collection_name
        self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
        self.ner_data_file = ner_data_file
        self.ner_threshold = ner_threshold
        self.combinations = combinations
        self.model = model

    def build(self):
        """Load embeddings and NER data, create the dataset and its inputs.

        Returns:
            ``self``, so that ``Experiment(...).build().run(...)`` can be chained.
        """
        # Example columns: 'enzyme', 'target', 'pathway', 'smile', 'all_text',
        # 'indication', 'description', 'mechanism_of_action',
        # 'pharmacodynamics', 'tui', 'cui', 'entities'
        kwargs = {"columns": self.columns}
        # ner_threshold defaults to None; treat that as "no extra arguments".
        kwargs.update(self.ner_threshold or {})

        self.vector_db = chromadb.PersistentClient(
            path=self.vector_db_persist_directory)
        self.collection = self.vector_db.get_collection(
            self.vector_db_collection_name)
        dictionary = self.collection.get(include=['embeddings', 'metadatas'])

        # Group embeddings as embedding_dict[metadata type][metadata id] -> list.
        embedding_dict = defaultdict(lambda: defaultdict(list))
        for metadata, embedding in zip(dictionary['metadatas'],
                                       dictionary['embeddings']):
            embedding_dict[metadata["type"]][metadata["id"]].append(embedding)

        # Width of one embedding vector; also works when the client returns a
        # plain list of vectors instead of an object with ``.shape``.
        embeddings = dictionary['embeddings']
        if hasattr(embeddings, "shape"):
            embedding_size = embeddings.shape[1]
        else:
            embedding_size = len(embeddings[0])

        pooling_strategy = self.embedding_pooling_strategy_type()

        self.ner_df = (CTakesNER().load(filename=self.ner_data_file)
                       if self.ner_data_file else None)

        # The extra arguments must be unpacked: the dataset constructor looks
        # up kwargs['columns'], which a single ``kwargs=`` keyword never provides.
        self.dataset = self.dataset_type(
            embedding_dict=embedding_dict,
            embedding_size=embedding_size,
            embeddings_pooling_strategy=pooling_strategy,
            ner_df=self.ner_df,
            **kwargs)

        # The returned tuple is not needed: the splits are read back from the
        # dataset's own attributes right below.
        # NOTE(review): assumes load() populates those attributes - confirm.
        self.dataset.load()

        self.dataframe = self.dataset.dataframe
        self.X_train = self.dataset.X_train
        self.X_test = self.dataset.X_test
        self.y_train = self.dataset.y_train
        self.y_test = self.dataset.y_test
        self.train_idx_arr = self.dataset.train_idx_arr
        self.val_idx_arr = self.dataset.val_idx_arr
        self.items = self.dataset.produce_inputs()

        print("Building the experiment with the following settings:")
        print(
            f"Name: {self.experiment_name}, Dataset: {self.dataset}, Model: {self.model}")
        return self

    def run(self, model_func, batch_size=128, epochs=100):
        """Train and evaluate through ``TFMultiModal``; requires ``build()`` first.

        Args:
            model_func: forwarded to ``TFMultiModal`` as ``model_func``.
            batch_size: forwarded to ``TFMultiModal``.
            epochs: forwarded to ``TFMultiModal``.

        Returns:
            The value returned by ``TFMultiModal.predict(self.combinations)``.
        """
        mlflow.set_tracking_uri(self.tracking_uri)

        is_new = mlflow.get_experiment_by_name(self.experiment_name) is None
        if is_new:
            mlflow.create_experiment(self.experiment_name)
        # Activate the experiment before tagging it: set_experiment_tags acts
        # on the active experiment, so tagging first would tag the wrong one.
        mlflow.set_experiment(self.experiment_name)
        if is_new and self.experiment_tags:
            mlflow.set_experiment_tags(self.experiment_tags)

        # NOTE(review): index 4 of the first produced item is used as the test
        # labels - confirm against produce_inputs().
        y_test_label = self.items[0][4]
        multi_modal = TFMultiModal(
            model_func=model_func, batch_size=batch_size, epochs=epochs)
        multi_modal.set_data(
            self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
        return multi_modal.predict(self.combinations)
|
@@ -0,0 +1,152 @@
|
|
1
|
+
import sqlite3
|
2
|
+
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
|
3
|
+
from keras.models import Model, Sequential
|
4
|
+
from keras.callbacks import EarlyStopping
|
5
|
+
from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
|
6
|
+
from tensorflow import keras
|
7
|
+
from ddi_fw.experiments import TFSingleModal, TFMultiModal
|
8
|
+
from ddi_fw.experiments import evaluate
|
9
|
+
from sklearn.preprocessing import LabelBinarizer
|
10
|
+
import numpy as np
|
11
|
+
import pandas as pd
|
12
|
+
from ddi_fw.utils import ZipHelper, Py7ZipHelper
|
13
|
+
import os
|
14
|
+
import chromadb
|
15
|
+
from collections import defaultdict
|
16
|
+
from langchain_community.vectorstores import Chroma
|
17
|
+
from ddi_fw.ner.ner import CTakesNER
|
18
|
+
from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
|
19
|
+
|
20
|
+
from ddi_fw.datasets import BaseDataset, DDIMDLDataset
|
21
|
+
|
22
|
+
from ddi_fw.datasets import SumPoolingStrategy
|
23
|
+
from keras import metrics
|
24
|
+
from ddi_fw.experiments.evaluation_helper import evaluate
|
25
|
+
|
26
|
+
import mlflow
|
27
|
+
|
28
|
+
|
29
|
+
class Experiment:
    """Fluent builder for an experiment tracked with MLflow.

    Each setter stores one setting and returns ``self`` so calls can be
    chained, e.g. ``Experiment.create().name(...).columns(...).build()``.

    Several setters (``tracking_uri``, ``columns``, ``ner_data_file``,
    ``ner_threshold``, ``combinations``, ``model``) store the value under
    their own name, so after the call the instance attribute hides the
    method.  ``build()`` and ``run()`` therefore read those settings through
    ``_setting()``, which yields ``None`` when the setter was never called
    instead of the bound method.
    """

    def __init__(self):
        # Only names that do not collide with a setter can be pre-seeded here.
        self.experiment_name = None
        self.experiment_description = None
        self.experiment_tags = None
        self.dataset_type = None
        self.vector_db_persist_directory = None
        self.vector_db_collection_name = None
        self.embedding_pooling_strategy_type = None
        self.ner_df = None

    @staticmethod
    def create():
        """Return a new, unconfigured ``Experiment``."""
        return Experiment()

    def _setting(self, key, default=None):
        """Return the value stored on the instance under *key*, else *default*."""
        return vars(self).get(key, default)

    def name(self, name):
        """Set the MLflow experiment name."""
        self.experiment_name = name
        return self

    def description(self, description):
        """Set the experiment description."""
        self.experiment_description = description
        return self

    def tags(self, tags):
        """Set the tags applied when the MLflow experiment is created."""
        self.experiment_tags = tags
        return self

    def tracking_uri(self, uri):
        """Set the MLflow tracking URI."""
        self.tracking_uri = uri
        return self

    def dataset(self, dataset_type: "type[BaseDataset]"):
        """Set the dataset class that ``build()`` instantiates."""
        self.dataset_type = dataset_type
        return self

    def columns(self, cols):
        """Set the columns forwarded to the dataset as ``columns``."""
        self.columns = cols
        return self

    def vectordb_collection(self, persist_directory, collection_name):
        """Set the Chroma persist directory and collection name."""
        self.vector_db_persist_directory = persist_directory
        self.vector_db_collection_name = collection_name
        return self

    def embedding_pooling_strategy(self, strategy_type: "type[PoolingStrategy]"):
        """Set the pooling strategy class; it is instantiated in ``build()``."""
        self.embedding_pooling_strategy_type = strategy_type
        return self

    def ner_data_file(self, ner_data_file):
        """Set the NER data file and load it immediately via ``CTakesNER``."""
        self.ner_data_file = ner_data_file
        self.ner_df = CTakesNER().load(filename=ner_data_file)
        return self

    def ner_threshold(self, threshold):
        """Set a mapping of extra keyword arguments for the dataset."""
        self.ner_threshold = threshold
        return self

    def combinations(self, combs):
        """Set the combinations passed to ``TFMultiModal.predict``."""
        self.combinations = combs
        return self

    def model(self, model):
        """Set the model value reported by ``build()``."""
        self.model = model
        return self

    def build(self):
        """Load embeddings, create the dataset and produce its inputs.

        Returns:
            ``self`` for chaining.  Note that ``self.dataset`` holds the
            dataset instance afterwards, hiding the ``dataset`` setter.
        """
        # Example columns: 'enzyme', 'target', 'pathway', 'smile', 'all_text',
        # 'indication', 'description', 'mechanism_of_action',
        # 'pharmacodynamics', 'tui', 'cui', 'entities'
        kwargs = {"columns": self._setting("columns")}
        kwargs.update(self._setting("ner_threshold") or {})

        self.vector_db = chromadb.PersistentClient(
            path=self.vector_db_persist_directory)
        self.collection = self.vector_db.get_collection(
            self.vector_db_collection_name)
        dictionary = self.collection.get(include=['embeddings', 'metadatas'])

        # Group embeddings as embedding_dict[metadata type][metadata id] -> list.
        embedding_dict = defaultdict(lambda: defaultdict(list))
        for metadata, embedding in zip(dictionary['metadatas'],
                                       dictionary['embeddings']):
            embedding_dict[metadata["type"]][metadata["id"]].append(embedding)

        # Width of one embedding vector; also works when the client returns a
        # plain list of vectors instead of an object with ``.shape``.
        embeddings = dictionary['embeddings']
        if hasattr(embeddings, "shape"):
            embedding_size = embeddings.shape[1]
        else:
            embedding_size = len(embeddings[0])

        pooling_strategy = self.embedding_pooling_strategy_type()

        # The extra arguments must be unpacked: the dataset constructor looks
        # up kwargs['columns'], which a single ``kwargs=`` keyword never provides.
        self.dataset = self.dataset_type(
            embedding_dict=embedding_dict,
            embedding_size=embedding_size,
            embeddings_pooling_strategy=pooling_strategy,
            ner_df=self.ner_df,
            **kwargs)

        # The returned tuple is not needed: the splits are read back from the
        # dataset's own attributes right below.
        # NOTE(review): assumes load() populates those attributes - confirm.
        self.dataset.load()

        self.dataframe = self.dataset.dataframe
        self.X_train = self.dataset.X_train
        self.X_test = self.dataset.X_test
        self.y_train = self.dataset.y_train
        self.y_test = self.dataset.y_test
        self.train_idx_arr = self.dataset.train_idx_arr
        self.val_idx_arr = self.dataset.val_idx_arr
        self.items = self.dataset.produce_inputs()

        model = self._setting("model")
        print("Building the experiment with the following settings:")
        print(
            f"Name: {self.experiment_name}, Dataset: {self.dataset}, Model: {model}")
        return self

    def run(self, model_func, batch_size=128, epochs=100):
        """Train and evaluate through ``TFMultiModal``; requires ``build()`` first.

        The object returned by ``TFMultiModal.predict`` is stored as
        ``self.result``.

        Returns:
            ``self`` for chaining.
        """
        tracking_uri = self._setting("tracking_uri")
        if tracking_uri is not None:
            mlflow.set_tracking_uri(tracking_uri)

        is_new = mlflow.get_experiment_by_name(self.experiment_name) is None
        if is_new:
            mlflow.create_experiment(self.experiment_name)
        # Activate the experiment before tagging it: set_experiment_tags acts
        # on the active experiment, so tagging first would tag the wrong one.
        mlflow.set_experiment(self.experiment_name)
        if is_new and self.experiment_tags:
            mlflow.set_experiment_tags(self.experiment_tags)

        # NOTE(review): index 4 of the first produced item is used as the test
        # labels - confirm against produce_inputs().
        y_test_label = self.items[0][4]
        multi_modal = TFMultiModal(
            model_func=model_func, batch_size=batch_size, epochs=epochs)
        multi_modal.set_data(
            self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
        # predict() returns one result object, not a (pred, single_results)
        # pair, so it must not be unpacked into two names.
        self.result = multi_modal.predict(self._setting("combinations"))
        return self
|
@@ -30,12 +30,25 @@ np.random.seed(2)
|
|
30
30
|
np.set_printoptions(precision=4)
|
31
31
|
|
32
32
|
|
33
|
+
class Result:
    """Collect logs and metrics keyed by an arbitrary identifier.

    ``log_dict`` and ``metric_dict`` map each key to the most recently added
    value; adding the same key again replaces the earlier entry.
    """

    def __init__(self) -> None:
        self.log_dict = {}
        self.metric_dict = {}

    def add_log(self, key, logs):
        """Store *logs* under *key*, replacing any previous entry."""
        self.log_dict[key] = logs

    def add_metric(self, key, metrics):
        """Store *metrics* under *key*, replacing any previous entry."""
        self.metric_dict[key] = metrics

    def __repr__(self) -> str:
        # List only the keys: the stored values can be arbitrarily large.
        return (f"{type(self).__name__}(logs={list(self.log_dict)!r}, "
                f"metrics={list(self.metric_dict)!r})")
|
43
|
+
|
44
|
+
|
33
45
|
class TFMultiModal:
|
34
46
|
# todo model related parameters to config
|
35
47
|
def __init__(self, model_func, batch_size=128, epochs=100):
|
36
48
|
self.model_func = model_func
|
37
49
|
self.batch_size = batch_size
|
38
50
|
self.epochs = epochs
|
51
|
+
self.result = Result()
|
39
52
|
|
40
53
|
def set_data(self, items, train_idx_arr, val_idx_arr, y_test_label):
|
41
54
|
self.items = items
|
@@ -64,14 +77,16 @@ class TFMultiModal:
|
|
64
77
|
self.date, item[0], self.model_func, self.batch_size, self.epochs)
|
65
78
|
single_modal.set_data(
|
66
79
|
self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
|
67
|
-
|
68
|
-
|
69
|
-
|
80
|
+
logs, metrics, prediction = single_modal.predict()
|
81
|
+
self.result.add_log(item[0], logs)
|
82
|
+
self.result.add_metric(item[0], metrics)
|
83
|
+
single_results[item[0]] = prediction
|
84
|
+
# sum = sum + prediction
|
70
85
|
|
71
86
|
if combinations:
|
72
87
|
self.evaluate_combinations(single_results, combinations)
|
73
88
|
# TODO: there is no need for the sum
|
74
|
-
return
|
89
|
+
return self.result
|
75
90
|
|
76
91
|
def evaluate_combinations(self, single_results, combinations):
|
77
92
|
for combination in combinations:
|
@@ -90,6 +105,8 @@ class TFMultiModal:
|
|
90
105
|
f'combination_artifact_uri:{combination_run.info.artifact_uri}')
|
91
106
|
utils.compress_and_save_data(
|
92
107
|
metrics.__dict__, combination_run.info.artifact_uri, f'{self.date}_metrics.gzip')
|
108
|
+
self.result.add_log(combination_descriptor,logs)
|
109
|
+
self.result.add_metric(combination_descriptor,metrics)
|
93
110
|
|
94
111
|
|
95
112
|
class TFSingleModal:
|
@@ -172,39 +189,8 @@ class TFSingleModal:
|
|
172
189
|
# '/model/model.onnx')
|
173
190
|
utils.compress_and_save_data(
|
174
191
|
metrics.__dict__, run.info.artifact_uri, f'{self.date}_metrics.gzip')
|
175
|
-
|
176
|
-
|
177
|
-
# Plot Precision-Recall curves for each class and micro-average
|
178
|
-
# fig = plt.figure()
|
179
|
-
# plt.step(metrics.recall['micro_event'], metrics.precision['micro_event'],
|
180
|
-
# color='b', alpha=0.2, where='post')
|
181
|
-
# plt.fill_between(
|
182
|
-
# metrics.recall["micro_event"], metrics.precision["micro_event"], step='post', alpha=0.2, color='b')
|
183
|
-
|
184
|
-
# for i in range(pred.shape[1]):
|
185
|
-
# plt.step(metrics.recall[i], metrics.precision[i], where='post',
|
186
|
-
# label='Class {0} (AUC={1:0.2f})'.format(i, metrics.roc_aupr[i]))
|
187
|
-
|
188
|
-
# plt.xlabel('Recall')
|
189
|
-
# plt.ylabel('Precision')
|
190
|
-
# plt.ylim([0.0, 1.05])
|
191
|
-
# plt.xlim([0.0, 1.0])
|
192
|
-
# plt.title(
|
193
|
-
# 'Micro-average Precision-Recall curve: AUC={0:0.2f}'.format(metrics.roc_aupr["micro"]))
|
194
|
-
# plt.legend(loc='best')
|
195
|
-
# # plt.savefig(run.info.artifact_uri + '/auprc.png')
|
196
|
-
# mlflow.log_figure(fig, 'auprc.png')
|
197
|
-
|
198
|
-
# mlflow.log_model(
|
199
|
-
# model,
|
200
|
-
# artifact_path=run.info.artifact_uri + '/model',
|
201
|
-
# signature=signature,
|
202
|
-
# )
|
203
|
-
# mlflow.log_artifact(run.info.artifact_uri + '/model')
|
204
|
-
|
205
|
-
# mlflow.MlflowClient().log_artifact(run.info.run_id,
|
206
|
-
# run.info.artifact_uri, None)
|
207
|
-
return pred
|
192
|
+
|
193
|
+
return logs, metrics, pred
|
208
194
|
|
209
195
|
|
210
196
|
class CustomCallback(keras.callbacks.Callback):
|
@@ -0,0 +1,4 @@
|
|
1
|
+
from .utils import clear_directory,create_folder_if_not_exists, utc_time_as_string,utc_time_as_string_simple_format, compress_and_save_data
|
2
|
+
from .zip_helper import ZipHelper
|
3
|
+
from .py7zr_helper import Py7ZipHelper
|
4
|
+
from .enums import UMLSCodeTypes, DrugBankTextDataTypes
|
@@ -1,11 +1,13 @@
|
|
1
1
|
from collections import defaultdict
|
2
2
|
import math
|
3
|
-
from ddi_fw.utils.utils import create_folder_if_not_exists
|
3
|
+
from ddi_fw.utils.utils import clear_directory, create_folder_if_not_exists
|
4
4
|
import py7zr
|
5
5
|
import os
|
6
6
|
from os.path import basename
|
7
7
|
|
8
|
-
|
8
|
+
# https://py7zr.readthedocs.io/en/latest/user_guide.html
|
9
|
+
# import multivolumefile
|
10
|
+
#https://github.com/miurahr/py7zr/issues/497
|
9
11
|
class Py7ZipHelper:
|
10
12
|
def __init__(self):
|
11
13
|
pass
|
@@ -24,9 +26,10 @@ class Py7ZipHelper:
|
|
24
26
|
os.path.join(folder_path, '..')))
|
25
27
|
# archive.write(root+"/"+file)
|
26
28
|
|
27
|
-
def create_archive_multiparts(self, zip_name, file_path, output_path, chunk_size):
|
29
|
+
def create_archive_multiparts(self, zip_name, file_path, output_path, chunk_size, delete_existing_files=True):
|
28
30
|
parent_folder = os.path.dirname(file_path)
|
29
|
-
|
31
|
+
if delete_existing_files:
|
32
|
+
clear_directory(output_path)
|
30
33
|
# parts_path = f"{parent_folder}/parts"
|
31
34
|
create_folder_if_not_exists(output_path)
|
32
35
|
# file_name, file_extension = os.path.splitext(file_path)
|
@@ -1,11 +1,9 @@
|
|
1
1
|
import gzip
|
2
2
|
import json
|
3
3
|
import os
|
4
|
-
|
5
4
|
from datetime import datetime, timezone
|
6
|
-
|
7
5
|
from matplotlib import pyplot as plt
|
8
|
-
|
6
|
+
import shutil
|
9
7
|
|
10
8
|
def create_folder_if_not_exists(path):
|
11
9
|
if not os.path.exists(path):
|
@@ -51,6 +49,22 @@ def decompress(gzip_file):
|
|
51
49
|
return data
|
52
50
|
|
53
51
|
|
52
|
+
def clear_directory(directory_path):
    """Delete everything inside *directory_path* but keep the directory itself.

    Real sub-directories are removed recursively.  Every other entry,
    including symbolic links (also links to directories and broken links), is
    unlinked, so a link's target is never touched.  A status line is printed
    in both the success case and when the path is not an existing directory.

    Args:
        directory_path: path of the directory to empty.
    """
    # isdir() is already False for a missing path, so one check is enough.
    if not os.path.isdir(directory_path):
        print(f"The directory does not exist: {directory_path}")
        return

    for item in os.listdir(directory_path):
        item_path = os.path.join(directory_path, item)
        # isdir() follows symlinks and rmtree() refuses to operate on a
        # symlink, so links must be excluded from the recursive branch.
        if os.path.isdir(item_path) and not os.path.islink(item_path):
            shutil.rmtree(item_path)  # Remove directory
        else:
            os.remove(item_path)  # Remove file or link
    print(f"Cleared contents of directory: {directory_path}")
|
66
|
+
|
67
|
+
|
54
68
|
if __name__ == "__main__":
|
55
69
|
# json_file = f'C:\\Users\\kivanc\\Downloads\\metrics.json'
|
56
70
|
# file_data = open(json_file, "r", 1).read()
|
@@ -66,6 +66,8 @@ src/ddi_fw/drugbank/event_extractor.py
|
|
66
66
|
src/ddi_fw/experiments/__init__.py
|
67
67
|
src/ddi_fw/experiments/custom_torch_model.py
|
68
68
|
src/ddi_fw/experiments/evaluation_helper.py
|
69
|
+
src/ddi_fw/experiments/pipeline.py
|
70
|
+
src/ddi_fw/experiments/pipeline_builder_pattern.py
|
69
71
|
src/ddi_fw/experiments/tensorflow_helper.py
|
70
72
|
src/ddi_fw/experiments/test.py
|
71
73
|
src/ddi_fw/ner/__init__.py
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_0.txt
RENAMED
File without changes
|
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_1.txt
RENAMED
File without changes
|
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_2.txt
RENAMED
File without changes
|
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_3.txt
RENAMED
File without changes
|
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_4.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/df_extraction_cleanxiaoyu50.csv
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_0.txt
RENAMED
File without changes
|
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_1.txt
RENAMED
File without changes
|
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_2.txt
RENAMED
File without changes
|
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_3.txt
RENAMED
File without changes
|
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_4.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|