ddi-fw 0.0.49__tar.gz → 0.0.51__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (95)
  1. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/PKG-INFO +1 -1
  2. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/pyproject.toml +1 -1
  3. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/base.py +36 -4
  4. ddi_fw-0.0.51/src/ddi_fw/experiments/__init__.py +2 -0
  5. ddi_fw-0.0.51/src/ddi_fw/experiments/pipeline.py +126 -0
  6. ddi_fw-0.0.51/src/ddi_fw/experiments/pipeline_builder_pattern.py +152 -0
  7. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/experiments/tensorflow_helper.py +23 -37
  8. ddi_fw-0.0.51/src/ddi_fw/utils/__init__.py +4 -0
  9. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/utils/py7zr_helper.py +7 -4
  10. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/utils/utils.py +17 -3
  11. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw.egg-info/PKG-INFO +1 -1
  12. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw.egg-info/SOURCES.txt +2 -0
  13. ddi_fw-0.0.49/src/ddi_fw/experiments/__init__.py +0 -2
  14. ddi_fw-0.0.49/src/ddi_fw/utils/__init__.py +0 -4
  15. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/README.md +0 -0
  16. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/setup.cfg +0 -0
  17. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/__init__.py +0 -0
  18. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/core.py +0 -0
  19. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/db_utils.py +0 -0
  20. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/data/event.db +0 -0
  21. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt +0 -0
  22. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_0.txt +0 -0
  23. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_1.txt +0 -0
  24. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_2.txt +0 -0
  25. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_3.txt +0 -0
  26. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_4.txt +0 -0
  27. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/train_indexes.txt +0 -0
  28. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_0.txt +0 -0
  29. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_1.txt +0 -0
  30. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_2.txt +0 -0
  31. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_3.txt +0 -0
  32. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_4.txt +0 -0
  33. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/test_indexes.txt +0 -0
  34. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_0.txt +0 -0
  35. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_1.txt +0 -0
  36. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_2.txt +0 -0
  37. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_3.txt +0 -0
  38. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_4.txt +0 -0
  39. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_indexes.txt +0 -0
  40. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_0.txt +0 -0
  41. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_1.txt +0 -0
  42. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_2.txt +0 -0
  43. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_3.txt +0 -0
  44. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_4.txt +0 -0
  45. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/readme.md +0 -0
  46. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/embedding_generator.py +0 -0
  47. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/embedding_generator_new.py +0 -0
  48. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/feature_vector_generation.py +0 -0
  49. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/idf_helper.py +0 -0
  50. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/__init__.py +0 -0
  51. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/base.py +0 -0
  52. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/df_extraction_cleanxiaoyu50.csv +0 -0
  53. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/drug_information_del_noDDIxiaoyu50.csv +0 -0
  54. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/test_indexes.txt +0 -0
  55. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_0.txt +0 -0
  56. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_1.txt +0 -0
  57. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_2.txt +0 -0
  58. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_3.txt +0 -0
  59. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_4.txt +0 -0
  60. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_indexes.txt +0 -0
  61. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_0.txt +0 -0
  62. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_1.txt +0 -0
  63. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_2.txt +0 -0
  64. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_3.txt +0 -0
  65. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_4.txt +0 -0
  66. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/mdf_sa_ddi/mdf-sa-ddi.zip +0 -0
  67. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/setup_._py +0 -0
  68. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/drugbank/__init__.py +0 -0
  69. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/drugbank/drugbank.xsd +0 -0
  70. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/drugbank/drugbank_parser.py +0 -0
  71. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/drugbank/drugbank_processor.py +0 -0
  72. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/drugbank/drugbank_processor_org.py +0 -0
  73. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/drugbank/event_extractor.py +0 -0
  74. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/experiments/custom_torch_model.py +0 -0
  75. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/experiments/evaluation_helper.py +0 -0
  76. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/experiments/test.py +0 -0
  77. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/ner/__init__.py +0 -0
  78. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/ner/mmlrestclient.py +0 -0
  79. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/ner/ner.py +0 -0
  80. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/basic_test.py +0 -0
  81. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/combination_test.py +0 -0
  82. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/compress_json_test.py +0 -0
  83. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/date_test.py +0 -0
  84. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/idf_score.py +0 -0
  85. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/jaccard_similarity.py +0 -0
  86. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/mlfow_test.py +0 -0
  87. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/sklearn-tfidf.py +0 -0
  88. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/test.py +0 -0
  89. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/torch_cuda_test.py +0 -0
  90. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/test/type_guarding_test.py +0 -0
  91. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/utils/enums.py +0 -0
  92. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/utils/zip_helper.py +0 -0
  93. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
  94. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw.egg-info/requires.txt +0 -0
  95. {ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw.egg-info/top_level.txt +0 -0
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddi_fw
-Version: 0.0.49
+Version: 0.0.51
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ddi_fw"
-version = "0.0.49"
+version = "0.0.51"
 description = "Do not use :)"
 readme = "README.md"
 authors = [
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/datasets/ddi_mdl/base.py
@@ -7,16 +7,48 @@ from .. import BaseDataset
 from ..db_utils import create_connection
 
 HERE = pathlib.Path(__file__).resolve().parent
+list_of_embedding_columns = ['all_text', 'description',
+                             'synthesis_reference', 'indication',
+                             'pharmacodynamics', 'mechanism_of_action',
+                             'toxicity', 'metabolism',
+                             'absorption', 'half_life',
+                             'protein_binding', 'route_of_elimination',
+                             'volume_of_distribution', 'clearance']
+
+list_of_chemical_property_columns = ['enzyme',
+                                     'target',
+                                     'pathway',
+                                     'smile']
+list_of_ner_columns = ['tui', 'cui', 'entities']
 
 
 class DDIMDLDataset(BaseDataset):
-    def __init__(self, embedding_size, embedding_dict, embeddings_pooling_strategy: PoolingStrategy, ner_df, chemical_property_columns=['enzyme',
-                 'target',
-                 'pathway',
-                 'smile'],
+    def __init__(self, embedding_size,
+                 embedding_dict,
+                 embeddings_pooling_strategy: PoolingStrategy,
+                 ner_df,
+                 chemical_property_columns=['enzyme',
+                                            'target',
+                                            'pathway',
+                                            'smile'],
                  embedding_columns=[],
                  ner_columns=[],
                  **kwargs):
+        columns = kwargs['columns']
+        if columns is not None:
+            chemical_property_columns = []
+            embedding_columns = []
+            ner_columns = []
+            for column in columns:
+                if column in list_of_chemical_property_columns:
+                    chemical_property_columns.append(column)
+                elif column in list_of_embedding_columns:
+                    embedding_columns.append(column)
+                elif column in list_of_ner_columns:
+                    ner_columns.append(column)
+                else:
+                    raise Exception(f"{column} is not related to this dataset")
+
 
         super().__init__(embedding_size=embedding_size,
                          embedding_dict=embedding_dict,
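
The new module-level lists let DDIMDLDataset accept a single flat `columns` keyword and route each name into the matching group (chemical properties, text-embedding columns, NER columns). A minimal sketch of a call under that contract; the embedding inputs here are placeholder stand-ins, not values shipped with the package:

    # Hypothetical call illustrating the new `columns` routing; embedding_dict
    # and the pooling strategy are placeholders for illustration only.
    from ddi_fw.datasets import DDIMDLDataset, SumPoolingStrategy

    dataset = DDIMDLDataset(
        embedding_size=768,
        embedding_dict={},          # type -> id -> list of vectors in the real pipeline
        embeddings_pooling_strategy=SumPoolingStrategy(),
        ner_df=None,
        columns=['enzyme', 'smile', 'description', 'tui'],
    )
    # 'enzyme' and 'smile' are routed to chemical_property_columns,
    # 'description' to embedding_columns, 'tui' to ner_columns;
    # an unknown name raises an Exception.

Note that `__init__` reads `kwargs['columns']` directly, so the keyword must be present in every call (possibly as `columns=None`).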
ddi_fw-0.0.51/src/ddi_fw/experiments/__init__.py
@@ -0,0 +1,2 @@
+from .tensorflow_helper import TFMultiModal, TFSingleModal,Result
+from .evaluation_helper import evaluate, Metrics
ddi_fw-0.0.51/src/ddi_fw/experiments/pipeline.py
@@ -0,0 +1,126 @@
+import sqlite3
+from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
+from keras.models import Model, Sequential
+from keras.callbacks import EarlyStopping
+from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
+from tensorflow import keras
+from ddi_fw.experiments import TFSingleModal, TFMultiModal
+from ddi_fw.experiments import evaluate
+from sklearn.preprocessing import LabelBinarizer
+import numpy as np
+import pandas as pd
+from ddi_fw.utils import ZipHelper, Py7ZipHelper
+import os
+import chromadb
+from collections import defaultdict
+from langchain_community.vectorstores import Chroma
+from ddi_fw.ner.ner import CTakesNER
+from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
+
+from ddi_fw.datasets import BaseDataset, DDIMDLDataset
+
+from ddi_fw.datasets import SumPoolingStrategy
+from keras import metrics
+from ddi_fw.experiments.evaluation_helper import evaluate
+
+import mlflow
+
+
+class Experiment:
+    def __init__(self,
+                 experiment_name=None,
+                 experiment_description=None,
+                 experiment_tags=None,
+                 tracking_uri=None,
+                 dataset_type:BaseDataset=None,
+                 columns=None,
+                 vector_db_persist_directory=None,
+                 vector_db_collection_name=None,
+                 embedding_pooling_strategy_type:PoolingStrategy=None,
+                 ner_data_file=None,
+                 ner_threshold=None,
+                 combinations=None,
+                 model=None):
+
+        self.experiment_name = experiment_name
+        self.experiment_description = experiment_description
+        self.experiment_tags = experiment_tags
+        self.tracking_uri = tracking_uri
+        self.dataset_type = dataset_type
+        self.columns = columns
+        self.vector_db_persist_directory = vector_db_persist_directory
+        self.vector_db_collection_name = vector_db_collection_name
+        self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
+        self.ner_data_file = ner_data_file
+        self.ner_threshold = ner_threshold
+        self.combinations = combinations
+        self.model = model
+
+    def build(self):
+        # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
+        kwargs = {"columns": self.columns}
+        for k, v in self.ner_threshold.items():
+            kwargs[k] = v
+
+        self.vector_db = chromadb.PersistentClient(
+            path=self.vector_db_persist_directory)
+        self.collection = self.vector_db.get_collection(
+            self.vector_db_collection_name)
+        dictionary = self.collection.get(include=['embeddings', 'metadatas'])
+
+        embedding_dict = defaultdict(lambda: defaultdict(list))
+
+        for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
+            embedding_dict[metadata["type"]][metadata["id"]].append(embedding)
+
+        embedding_size = dictionary['embeddings'].shape[1]
+
+        pooling_strategy = self.embedding_pooling_strategy_type()
+
+        self.ner_df = CTakesNER().load(filename=self.ner_data_file) if self.ner_data_file else None
+
+        self.dataset = self.dataset_type(
+            embedding_dict=embedding_dict,
+            embedding_size=embedding_size,
+            embeddings_pooling_strategy=pooling_strategy,
+            ner_df=self.ner_df, kwargs=kwargs)
+
+        X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset.load()
+
+        self.dataframe = self.dataset.dataframe
+        # dataframe.dropna()
+        self.X_train = self.dataset.X_train
+        self.X_test = self.dataset.X_test
+        self.y_train = self.dataset.y_train
+        self.y_test = self.dataset.y_test
+        self.train_idx_arr = self.dataset.train_idx_arr
+        self.val_idx_arr = self.dataset.val_idx_arr
+        # Logic to set up the experiment
+        self.items = self.dataset.produce_inputs()
+
+        unique_classes = pd.unique(self.dataframe['event_category'])
+        event_num = len(unique_classes)
+        # droprate = 0.3
+        vector_size = self.dataset.drugs_df.shape[0]
+
+        print("Building the experiment with the following settings:")
+        print(
+            f"Name: {self.experiment_name}, Dataset: {self.dataset}, Model: {self.model}")
+        # Implement additional build logic as needed
+        return self
+
+    def run(self, model_func, batch_size=128, epochs=100):
+        mlflow.set_tracking_uri(self.tracking_uri)
+
+        if mlflow.get_experiment_by_name(self.experiment_name) == None:
+            mlflow.create_experiment(self.experiment_name)
+            mlflow.set_experiment_tags(self.experiment_tags)
+        mlflow.set_experiment(self.experiment_name)
+
+        y_test_label = self.items[0][4]
+        multi_modal = TFMultiModal(
+            model_func=model_func, batch_size=batch_size, epochs=epochs)  # 100
+        multi_modal.set_data(
+            self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
+        result = multi_modal.predict(self.combinations)
+        return result
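
The new pipeline.py wraps the whole flow — pull embeddings from a Chroma collection, pool them, build the dataset, then fan out to TFMultiModal under an MLflow experiment — behind a single object. A sketch of how it appears intended to be driven; every path, name, and `my_model_func` below is a placeholder, not a value from the package:

    # Hypothetical driver for the new Experiment pipeline.
    from ddi_fw.datasets import DDIMDLDataset, SumPoolingStrategy
    from ddi_fw.experiments.pipeline import Experiment

    experiment = Experiment(
        experiment_name='ddi-demo',
        experiment_tags={'stage': 'dev'},
        tracking_uri='http://localhost:5000',             # MLflow server (assumed)
        dataset_type=DDIMDLDataset,
        columns=['enzyme', 'target', 'pathway', 'smile'],
        vector_db_persist_directory='embeddings/chroma',  # placeholder path
        vector_db_collection_name='ddi-embeddings',       # placeholder name
        embedding_pooling_strategy_type=SumPoolingStrategy,
        ner_threshold={},
        combinations=[],
    )
    result = experiment.build().run(model_func=my_model_func)  # user-supplied model factory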
ddi_fw-0.0.51/src/ddi_fw/experiments/pipeline_builder_pattern.py
@@ -0,0 +1,152 @@
+import sqlite3
+from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
+from keras.models import Model, Sequential
+from keras.callbacks import EarlyStopping
+from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
+from tensorflow import keras
+from ddi_fw.experiments import TFSingleModal, TFMultiModal
+from ddi_fw.experiments import evaluate
+from sklearn.preprocessing import LabelBinarizer
+import numpy as np
+import pandas as pd
+from ddi_fw.utils import ZipHelper, Py7ZipHelper
+import os
+import chromadb
+from collections import defaultdict
+from langchain_community.vectorstores import Chroma
+from ddi_fw.ner.ner import CTakesNER
+from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
+
+from ddi_fw.datasets import BaseDataset, DDIMDLDataset
+
+from ddi_fw.datasets import SumPoolingStrategy
+from keras import metrics
+from ddi_fw.experiments.evaluation_helper import evaluate
+
+import mlflow
+
+
+class Experiment:
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def create():
+        return Experiment()
+
+    def name(self, name):
+        self.experiment_name = name
+        return self
+
+    def description(self, description):
+        self.experiment_description = description
+        return self
+
+    def tags(self, tags):
+        self.experiment_tags = tags
+        return self
+
+    def tracking_uri(self, uri):
+        self.tracking_uri = uri
+        return self
+
+    def dataset(self, dataset_type: BaseDataset):
+        self.dataset_type = dataset_type
+        return self
+
+    def columns(self, cols):
+        self.columns = cols
+        return self
+
+    def vectordb_collection(self, persist_directory, collection_name):
+        self.vector_db_persist_directory = persist_directory
+        self.vector_db_collection_name = collection_name
+        return self
+
+    def embedding_pooling_strategy(self, strategy_type: PoolingStrategy):
+        self.embedding_pooling_strategy_type = strategy_type
+        return self
+
+    def ner_data_file(self, ner_data_file):
+        self.ner_data_file = ner_data_file
+        self.ner_df = CTakesNER().load(filename=ner_data_file)
+        return self
+
+    def ner_threshold(self, threshold):
+        self.ner_threshold = threshold
+        return self
+
+    def combinations(self, combs):
+        self.combinations = combs
+        return self
+
+    def model(self, model):
+        self.model = model
+        return self
+
+    def build(self):
+        # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
+        kwargs = {"columns": self.columns}
+        for k, v in self.ner_threshold.items():
+            kwargs[k] = v
+
+        self.vector_db = chromadb.PersistentClient(
+            path=self.vector_db_persist_directory)
+        self.collection = self.vector_db.get_collection(
+            self.vector_db_collection_name)
+        dictionary = self.collection.get(include=['embeddings', 'metadatas'])
+
+        embedding_dict = defaultdict(lambda: defaultdict(list))
+
+        for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
+            embedding_dict[metadata["type"]][metadata["id"]].append(embedding)
+
+        embedding_size = dictionary['embeddings'].shape[1]
+
+        pooling_strategy = self.embedding_pooling_strategy_type()
+
+        self.dataset = self.dataset_type(
+            embedding_dict=embedding_dict,
+            embedding_size=embedding_size,
+            embeddings_pooling_strategy=pooling_strategy,
+            ner_df=self.ner_df, kwargs=kwargs)
+
+        X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset.load()
+
+        self.dataframe = self.dataset.dataframe
+        # dataframe.dropna()
+        self.X_train = self.dataset.X_train
+        self.X_test = self.dataset.X_test
+        self.y_train = self.dataset.y_train
+        self.y_test = self.dataset.y_test
+        self.train_idx_arr = self.dataset.train_idx_arr
+        self.val_idx_arr = self.dataset.val_idx_arr
+        # Logic to set up the experiment
+        self.items = self.dataset.produce_inputs()
+
+        unique_classes = pd.unique(self.dataframe['event_category'])
+        event_num = len(unique_classes)
+        # droprate = 0.3
+        vector_size = self.dataset.drugs_df.shape[0]
+
+        print("Building the experiment with the following settings:")
+        print(
+            f"Name: {self.experiment_name}, Dataset: {self.dataset}, Model: {self.model}")
+        # Implement additional build logic as needed
+        return self
+
+    def run(self, model_func, batch_size=128, epochs=100):
+        mlflow.set_tracking_uri(self.tracking_uri)
+
+        if mlflow.get_experiment_by_name(self.experiment_name) == None:
+            mlflow.create_experiment(self.experiment_name)
+            mlflow.set_experiment_tags(self.experiment_tags)
+        mlflow.set_experiment(self.experiment_name)
+
+        y_test_label = self.items[0][4]
+        multi_modal = TFMultiModal(
+            model_func=model_func, batch_size=batch_size, epochs=epochs)  # 100
+        multi_modal.set_data(
+            self.items, self.train_idx_arr, self.val_idx_arr, y_test_label)
+        pred, self.single_results = multi_modal.predict(self.combinations)
+        return self
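
pipeline_builder_pattern.py exposes the same pipeline through a fluent builder. A sketch of the intended chain, again with placeholder paths, names, and a user-supplied `my_model_func`:

    # Hypothetical fluent usage; all paths and names are placeholders.
    from ddi_fw.datasets import DDIMDLDataset, SumPoolingStrategy
    from ddi_fw.experiments.pipeline_builder_pattern import Experiment

    experiment = (Experiment.create()
                  .name('ddi-demo')
                  .tags({'stage': 'dev'})
                  .tracking_uri('http://localhost:5000')   # MLflow server (assumed)
                  .dataset(DDIMDLDataset)
                  .columns(['enzyme', 'target', 'pathway', 'smile'])
                  .vectordb_collection('embeddings/chroma', 'ddi-embeddings')
                  .embedding_pooling_strategy(SumPoolingStrategy)
                  .ner_data_file('ner/ctakes_output.pkl')  # placeholder file
                  .ner_threshold({})
                  .combinations([])
                  .build())
    experiment.run(model_func=my_model_func)

Two caveats are visible in the diff itself: each setter stores its value in an instance attribute named after the method (`self.columns`, `self.tracking_uri`, ...), so the attribute shadows the method after the first call and each setter is effectively single-use; and `run()` here still unpacks a `(pred, single_results)` pair, whereas the updated `TFMultiModal.predict` in this release returns a single `Result` object.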
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/experiments/tensorflow_helper.py
@@ -30,12 +30,25 @@ np.random.seed(2)
 np.set_printoptions(precision=4)
 
 
+class Result:
+    def __init__(self) -> None:
+        self.log_dict = {}
+        self.metric_dict = {}
+
+    def add_log(self, key, logs):
+        self.log_dict[key] = logs
+
+    def add_metric(self, key, metrics):
+        self.metric_dict[key] = metrics
+
+
 class TFMultiModal:
     # todo model related parameters to config
     def __init__(self, model_func, batch_size=128, epochs=100):
         self.model_func = model_func
         self.batch_size = batch_size
         self.epochs = epochs
+        self.result = Result()
 
     def set_data(self, items, train_idx_arr, val_idx_arr, y_test_label):
         self.items = items
@@ -64,14 +77,16 @@ class TFMultiModal:
                 self.date, item[0], self.model_func, self.batch_size, self.epochs)
             single_modal.set_data(
                 self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
-            r = single_modal.predict()
-            single_results[item[0]] = r
-            sum = sum + r
+            logs, metrics, prediction = single_modal.predict()
+            self.result.add_log(item[0], logs)
+            self.result.add_metric(item[0], metrics)
+            single_results[item[0]] = prediction
+            # sum = sum + prediction
 
         if combinations:
             self.evaluate_combinations(single_results, combinations)
         # TODO: sum is no longer needed
-        return sum, single_results
+        return self.result
 
     def evaluate_combinations(self, single_results, combinations):
         for combination in combinations:
@@ -90,6 +105,8 @@ class TFMultiModal:
                 f'combination_artifact_uri:{combination_run.info.artifact_uri}')
             utils.compress_and_save_data(
                 metrics.__dict__, combination_run.info.artifact_uri, f'{self.date}_metrics.gzip')
+            self.result.add_log(combination_descriptor,logs)
+            self.result.add_metric(combination_descriptor,metrics)
 
 
 class TFSingleModal:
@@ -172,39 +189,8 @@ class TFSingleModal:
             # '/model/model.onnx')
             utils.compress_and_save_data(
                 metrics.__dict__, run.info.artifact_uri, f'{self.date}_metrics.gzip')
-            # mlflow.log_dict(metrics.__dict__, "metrics.json")
-
-            # Plot Precision-Recall curves for each class and micro-average
-            # fig = plt.figure()
-            # plt.step(metrics.recall['micro_event'], metrics.precision['micro_event'],
-            #          color='b', alpha=0.2, where='post')
-            # plt.fill_between(
-            #     metrics.recall["micro_event"], metrics.precision["micro_event"], step='post', alpha=0.2, color='b')
-
-            # for i in range(pred.shape[1]):
-            #     plt.step(metrics.recall[i], metrics.precision[i], where='post',
-            #              label='Class {0} (AUC={1:0.2f})'.format(i, metrics.roc_aupr[i]))
-
-            # plt.xlabel('Recall')
-            # plt.ylabel('Precision')
-            # plt.ylim([0.0, 1.05])
-            # plt.xlim([0.0, 1.0])
-            # plt.title(
-            #     'Micro-average Precision-Recall curve: AUC={0:0.2f}'.format(metrics.roc_aupr["micro"]))
-            # plt.legend(loc='best')
-            # # plt.savefig(run.info.artifact_uri + '/auprc.png')
-            # mlflow.log_figure(fig, 'auprc.png')
-
-            # mlflow.log_model(
-            #     model,
-            #     artifact_path=run.info.artifact_uri + '/model',
-            #     signature=signature,
-            # )
-            # mlflow.log_artifact(run.info.artifact_uri + '/model')
-
-            # mlflow.MlflowClient().log_artifact(run.info.run_id,
-            #                                    run.info.artifact_uri, None)
-        return pred
+
+        return logs, metrics, pred
 
 
 class CustomCallback(keras.callbacks.Callback):
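
Net effect of these hunks: `TFSingleModal.predict` now returns a `(logs, metrics, prediction)` triple, and `TFMultiModal.predict` accumulates the per-modality logs and metrics into the new `Result` holder and returns it instead of the old `(sum, single_results)` pair. A sketch of consuming the new return type — `items`, the index arrays, `y_test_label`, and `my_model_func` are placeholders:

    # Hypothetical consumer of the new Result return type.
    from ddi_fw.experiments import TFMultiModal

    multi_modal = TFMultiModal(model_func=my_model_func, batch_size=128, epochs=100)
    multi_modal.set_data(items, train_idx_arr, val_idx_arr, y_test_label)
    result = multi_modal.predict([])  # no combination runs in this sketch

    # Keys are the per-modality names (item[0]); values are the Keras training
    # logs and the computed Metrics objects, respectively.
    for key in result.metric_dict:
        print(key, result.log_dict.get(key), result.metric_dict[key])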
ddi_fw-0.0.51/src/ddi_fw/utils/__init__.py
@@ -0,0 +1,4 @@
+from .utils import clear_directory,create_folder_if_not_exists, utc_time_as_string,utc_time_as_string_simple_format, compress_and_save_data
+from .zip_helper import ZipHelper
+from .py7zr_helper import Py7ZipHelper
+from .enums import UMLSCodeTypes, DrugBankTextDataTypes
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/utils/py7zr_helper.py
@@ -1,11 +1,13 @@
 from collections import defaultdict
 import math
-from ddi_fw.utils.utils import create_folder_if_not_exists
+from ddi_fw.utils.utils import clear_directory, create_folder_if_not_exists
 import py7zr
 import os
 from os.path import basename
 
-
+# https://py7zr.readthedocs.io/en/latest/user_guide.html
+# import multivolumefile
+# https://github.com/miurahr/py7zr/issues/497
 class Py7ZipHelper:
     def __init__(self):
         pass
@@ -24,9 +26,10 @@ class Py7ZipHelper:
                     os.path.join(folder_path, '..')))
                 # archive.write(root+"/"+file)
 
-    def create_archive_multiparts(self, zip_name, file_path, output_path, chunk_size):
+    def create_archive_multiparts(self, zip_name, file_path, output_path, chunk_size, delete_existing_files=True):
         parent_folder = os.path.dirname(file_path)
-
+        if delete_existing_files:
+            clear_directory(output_path)
         # parts_path = f"{parent_folder}/parts"
         create_folder_if_not_exists(output_path)
         # file_name, file_extension = os.path.splitext(file_path)
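
The new `delete_existing_files` flag (default `True`) clears stale parts out of the output directory before a fresh multipart archive is written, so chunks from a previous run cannot be mixed in. A small sketch of the call; all paths are placeholders:

    # Hypothetical call; 'data/event.db' and 'output/parts' are placeholders.
    from ddi_fw.utils import Py7ZipHelper

    helper = Py7ZipHelper()
    helper.create_archive_multiparts(
        zip_name='event.7z',
        file_path='data/event.db',
        output_path='output/parts',
        chunk_size=50 * 1024 * 1024,  # assumed to be bytes per part
    )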
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw/utils/utils.py
@@ -1,11 +1,9 @@
 import gzip
 import json
 import os
-
 from datetime import datetime, timezone
-
 from matplotlib import pyplot as plt
-
+import shutil
 
 def create_folder_if_not_exists(path):
     if not os.path.exists(path):
@@ -51,6 +49,22 @@ def decompress(gzip_file):
     return data
 
 
+def clear_directory(directory_path):
+    # Check if the directory exists
+    if os.path.exists(directory_path) and os.path.isdir(directory_path):
+        # Iterate through all files and directories in the directory
+        for item in os.listdir(directory_path):
+            item_path = os.path.join(directory_path, item)
+            # Check if it's a file or a directory and remove it
+            if os.path.isfile(item_path):
+                os.remove(item_path)  # Remove file
+            elif os.path.isdir(item_path):
+                shutil.rmtree(item_path)  # Remove directory
+        print(f"Cleared contents of directory: {directory_path}")
+    else:
+        print(f"The directory does not exist: {directory_path}")
+
+
 if __name__ == "__main__":
     # json_file = f'C:\\Users\\kivanc\\Downloads\\metrics.json'
     # file_data = open(json_file, "r", 1).read()
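
`clear_directory` (used by the py7zr helper above) empties a directory's contents while keeping the directory itself. A minimal sketch with placeholder paths:

    # Hypothetical usage; both paths are placeholders.
    from ddi_fw.utils import clear_directory

    clear_directory('output/parts')  # deletes files and subdirectories inside,
                                     # but leaves 'output/parts' in place
    clear_directory('missing/dir')   # only prints: "The directory does not exist: ..."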
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddi_fw
-Version: 0.0.49
+Version: 0.0.51
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
{ddi_fw-0.0.49 → ddi_fw-0.0.51}/src/ddi_fw.egg-info/SOURCES.txt
@@ -66,6 +66,8 @@ src/ddi_fw/drugbank/event_extractor.py
 src/ddi_fw/experiments/__init__.py
 src/ddi_fw/experiments/custom_torch_model.py
 src/ddi_fw/experiments/evaluation_helper.py
+src/ddi_fw/experiments/pipeline.py
+src/ddi_fw/experiments/pipeline_builder_pattern.py
 src/ddi_fw/experiments/tensorflow_helper.py
 src/ddi_fw/experiments/test.py
 src/ddi_fw/ner/__init__.py
ddi_fw-0.0.49/src/ddi_fw/experiments/__init__.py
@@ -1,2 +0,0 @@
-from .tensorflow_helper import TFMultiModal, TFSingleModal
-from .evaluation_helper import evaluate, Metrics
ddi_fw-0.0.49/src/ddi_fw/utils/__init__.py
@@ -1,4 +0,0 @@
-from .utils import create_folder_if_not_exists, utc_time_as_string,utc_time_as_string_simple_format, compress_and_save_data
-from .zip_helper import ZipHelper
-from .py7zr_helper import Py7ZipHelper
-from .enums import UMLSCodeTypes, DrugBankTextDataTypes