ddi-fw 0.0.217__py3-none-any.whl → 0.0.218__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/datasets/core.py CHANGED
@@ -73,6 +73,7 @@ class BaseDataset(BaseModel, abc.ABC):
73
73
  train_idx_arr: Optional[List[np.ndarray]] = None
74
74
  val_idx_arr: Optional[List[np.ndarray]] = None
75
75
  columns: List[str] = []
76
+ additional_config: Optional[Dict[str, Any]] = None
76
77
 
77
78
  class Config:
78
79
  arbitrary_types_allowed = True
@@ -9,6 +9,8 @@ from abc import ABC, abstractmethod
9
9
  from sklearn.preprocessing import LabelBinarizer
10
10
  import logging
11
11
 
12
+ from ddi_fw.ner.ner import CTakesNER
13
+
12
14
 
13
15
  try:
14
16
  from ddi_fw.vectorization import IDF
@@ -63,6 +65,18 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
63
65
 
64
66
  super().__init__(**kwargs)
65
67
 
68
+ # self.additional_config = kwargs.get('dataset_additional_config', {})
69
+ if self.additional_config:
70
+ ner = self.additional_config.get('ner', {})
71
+ self.ner_data_file = ner.get('data_file', None)
72
+ self.ner_threshold = ner.get('thresholds', None)
73
+ # if self.ner_threshold:
74
+ # for k, v in self.ner_threshold.items():
75
+ # kwargs[k] = v
76
+
77
+ self.ner_df = CTakesNER(df=None).load(
78
+ filename=self.ner_data_file) if self.ner_data_file else None
79
+
66
80
  columns = kwargs['columns']
67
81
  if columns:
68
82
  chemical_property_columns = []
@@ -155,13 +169,14 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
155
169
 
156
170
  # for key in filtered_ner_df.keys():
157
171
  for key in self.ner_columns:
158
- threshold = 0
159
- if key.startswith('tui'):
160
- threshold = self.tui_threshold
161
- if key.startswith('cui'):
162
- threshold = self.cui_threshold
163
- if key.startswith('entities'):
164
- threshold = self.entities_threshold
172
+ threshold = self.ner_threshold.get(key, 0)
173
+ # threshold = 0
174
+ # if key.startswith('tui'):
175
+ # threshold = self.tui_threshold
176
+ # if key.startswith('cui'):
177
+ # threshold = self.cui_threshold
178
+ # if key.startswith('entities'):
179
+ # threshold = self.entities_threshold
165
180
  combined_df[key] = filtered_ner_df[key]
166
181
  valid_codes = idf_scores_df[idf_scores_df[key]
167
182
  > threshold].index
@@ -1,68 +1,157 @@
1
1
  import os
2
2
  import pathlib
3
- import sqlite3
4
- from sqlite3 import Error
3
+ from typing import List, Optional, Tuple
4
+ from ddi_fw.datasets.core import BaseDataset, TextDatasetMixin, generate_sim_matrices_new, generate_vectors
5
+ from ddi_fw.datasets.db_utils import create_connection
6
+ import numpy as np
5
7
  import pandas as pd
8
+ from pydantic import BaseModel, Field, model_validator, root_validator
9
+ from abc import ABC, abstractmethod
10
+ from sklearn.preprocessing import LabelBinarizer
11
+ import logging
6
12
 
7
- from ddi_fw.utils import ZipHelper
13
+ from ddi_fw.ner.ner import CTakesNER
14
+ from ddi_fw.utils.zip_helper import ZipHelper
8
15
 
9
- from .. import BaseDataset
10
- from ddi_fw.langchain.embeddings import PoolingStrategy
11
- from ..db_utils import create_connection
16
+
17
+ try:
18
+ from ddi_fw.vectorization import IDF
19
+ except ImportError:
20
+ raise ImportError(
21
+ "Failed to import vectorization module. Ensure that the module exists and is correctly installed. ")
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # Constants for embedding, chemical properties, and NER columns
26
+ LIST_OF_EMBEDDING_COLUMNS = [
27
+ 'all_text', 'description', 'synthesis_reference', 'indication',
28
+ 'pharmacodynamics', 'mechanism_of_action', 'toxicity', 'metabolism',
29
+ 'absorption', 'half_life', 'protein_binding', 'route_of_elimination',
30
+ 'volume_of_distribution', 'clearance'
31
+ ]
32
+
33
+ LIST_OF_CHEMICAL_PROPERTY_COLUMNS = ['enzyme', 'target', 'smile']
34
+ LIST_OF_NER_COLUMNS = ['tui', 'cui', 'entities']
12
35
 
13
36
  HERE = pathlib.Path(__file__).resolve().parent
14
- list_of_embedding_columns = ['all_text', 'description',
15
- 'synthesis_reference', 'indication',
16
- 'pharmacodynamics', 'mechanism_of_action',
17
- 'toxicity', 'metabolism',
18
- 'absorption', 'half_life',
19
- 'protein_binding', 'route_of_elimination',
20
- 'volume_of_distribution', 'clearance']
21
-
22
- list_of_chemical_property_columns = ['enzyme',
23
- 'target',
24
- 'smile']
25
-
26
- list_of_ner_columns = ['tui', 'cui', 'entities']
27
- class MDFSADDIDataset(BaseDataset):
28
- def __init__(self, embedding_size,
29
- embedding_dict,
30
- embeddings_pooling_strategy: PoolingStrategy,
31
- ner_df,
32
- chemical_property_columns=['enzyme',
33
- 'target',
34
- 'smile'],
35
- embedding_columns=[],
36
- ner_columns=[],
37
- **kwargs):
38
37
 
38
+ class MDFSADDIDataset(BaseDataset,TextDatasetMixin):
39
+ # def __init__(self, embedding_size,
40
+ # embedding_dict,
41
+ # embeddings_pooling_strategy: PoolingStrategy,
42
+ # ner_df,
43
+ # chemical_property_columns=['enzyme',
44
+ # 'target',
45
+ # 'smile'],
46
+ # embedding_columns=[],
47
+ # ner_columns=[],
48
+ # **kwargs):
49
+
50
+ # columns = kwargs['columns']
51
+ # if columns:
52
+ # chemical_property_columns = []
53
+ # embedding_columns=[]
54
+ # ner_columns=[]
55
+ # for column in columns:
56
+ # if column in list_of_chemical_property_columns:
57
+ # chemical_property_columns.append(column)
58
+ # elif column in list_of_embedding_columns:
59
+ # embedding_columns.append(column)
60
+ # elif column in list_of_ner_columns:
61
+ # ner_columns.append(column)
62
+ # # elif column == 'smile_2':
63
+ # # continue
64
+ # else:
65
+ # raise Exception(f"{column} is not related this dataset")
66
+
67
+
68
+ # super().__init__(embedding_size=embedding_size,
69
+ # embedding_dict=embedding_dict,
70
+ # embeddings_pooling_strategy=embeddings_pooling_strategy,
71
+ # ner_df=ner_df,
72
+ # chemical_property_columns=chemical_property_columns,
73
+ # embedding_columns=embedding_columns,
74
+ # ner_columns=ner_columns,
75
+ # **kwargs)
76
+
77
+ # db_zip_path = HERE.joinpath('mdf-sa-ddi.zip')
78
+ # db_path = HERE.joinpath('mdf-sa-ddi.db')
79
+ # if not os.path.exists(db_zip_path):
80
+ # self.__to_db__(db_path)
81
+ # else:
82
+ # ZipHelper().extract(
83
+ # input_path=str(HERE), output_path=str(HERE))
84
+ # conn = create_connection(db_path)
85
+ # self.drugs_df = select_all_drugs_as_dataframe(conn)
86
+ # self.ddis_df = select_all_events_as_dataframe(conn)
87
+ # # kwargs = {'index_path': str(HERE.joinpath('indexes'))}
88
+ # kwargs['index_path'] = str(HERE.joinpath('indexes'))
89
+
90
+ # self.index_path = kwargs.get('index_path')
91
+
92
+ dataset_name: str = "MDFSADDIDataset"
93
+ index_path: str = Field(default_factory=lambda: str(
94
+ pathlib.Path(__file__).resolve().parent.joinpath('indexes')))
95
+ # drugs_df: pd.DataFrame = Field(default_factory=pd.DataFrame)
96
+ # ddis_df: pd.DataFrame = Field(default_factory=pd.DataFrame)
97
+ drugs_df: Optional[pd.DataFrame] = None
98
+ ddis_df: Optional[pd.DataFrame] = None
99
+
100
+ chemical_property_columns: list[str] = Field(
101
+ default_factory=lambda: LIST_OF_CHEMICAL_PROPERTY_COLUMNS)
102
+ embedding_columns: list[str] = Field(default_factory=list)
103
+ ner_columns: list[str] = Field(default_factory=list)
104
+ ner_df: pd.DataFrame | None = None
105
+ tui_threshold: float | None = None
106
+ cui_threshold: float | None = None
107
+ entities_threshold: float | None = None
108
+
109
+ # @model_validator
110
+
111
+ def validate_columns(self, values):
112
+ if not set(values['chemical_property_columns']).issubset(LIST_OF_CHEMICAL_PROPERTY_COLUMNS):
113
+ raise ValueError("Invalid chemical property columns")
114
+ if not set(values['ner_columns']).issubset(LIST_OF_NER_COLUMNS):
115
+ raise ValueError("Invalid NER columns")
116
+ return values
117
+
118
+ def __init__(self, **kwargs):
119
+
120
+ super().__init__(**kwargs)
121
+
122
+ # self.additional_config = kwargs.get('dataset_additional_config', {})
123
+ if self.additional_config:
124
+ ner = self.additional_config.get('ner', {})
125
+ self.ner_data_file = ner.get('data_file', None)
126
+ self.ner_threshold = ner.get('thresholds', None)
127
+ # if self.ner_threshold:
128
+ # for k, v in self.ner_threshold.items():
129
+ # kwargs[k] = v
130
+
131
+ self.ner_df = CTakesNER(df=None).load(
132
+ filename=self.ner_data_file) if self.ner_data_file else None
133
+
39
134
  columns = kwargs['columns']
40
135
  if columns:
41
136
  chemical_property_columns = []
42
- embedding_columns=[]
43
- ner_columns=[]
137
+ embedding_columns = []
138
+ ner_columns = []
44
139
  for column in columns:
45
- if column in list_of_chemical_property_columns:
140
+ if column in LIST_OF_CHEMICAL_PROPERTY_COLUMNS:
46
141
  chemical_property_columns.append(column)
47
- elif column in list_of_embedding_columns:
142
+ elif column in LIST_OF_EMBEDDING_COLUMNS:
48
143
  embedding_columns.append(column)
49
- elif column in list_of_ner_columns:
144
+ elif column in LIST_OF_NER_COLUMNS:
50
145
  ner_columns.append(column)
51
- # elif column == 'smile_2':
52
- # continue
53
146
  else:
54
147
  raise Exception(f"{column} is not related this dataset")
55
-
56
-
57
- super().__init__(embedding_size=embedding_size,
58
- embedding_dict=embedding_dict,
59
- embeddings_pooling_strategy=embeddings_pooling_strategy,
60
- ner_df=ner_df,
61
- chemical_property_columns=chemical_property_columns,
62
- embedding_columns=embedding_columns,
63
- ner_columns=ner_columns,
64
- **kwargs)
65
-
148
+
149
+ self.chemical_property_columns = chemical_property_columns
150
+ self.embedding_columns = embedding_columns
151
+ self.ner_columns = ner_columns
152
+ self.columns = [] # this variable is modified in the prep method
153
+
154
+
66
155
  db_zip_path = HERE.joinpath('mdf-sa-ddi.zip')
67
156
  db_path = HERE.joinpath('mdf-sa-ddi.db')
68
157
  if not os.path.exists(db_zip_path):
@@ -70,13 +159,19 @@ class MDFSADDIDataset(BaseDataset):
70
159
  else:
71
160
  ZipHelper().extract(
72
161
  input_path=str(HERE), output_path=str(HERE))
73
- conn = create_connection(db_path)
162
+ conn = create_connection(db_path.absolute().as_posix())
74
163
  self.drugs_df = select_all_drugs_as_dataframe(conn)
75
164
  self.ddis_df = select_all_events_as_dataframe(conn)
76
165
  # kwargs = {'index_path': str(HERE.joinpath('indexes'))}
77
- kwargs['index_path'] = str(HERE.joinpath('indexes'))
166
+
167
+
168
+ self.class_column = 'event_category'
78
169
 
79
- self.index_path = kwargs.get('index_path')
170
+ self.__similarity_related_columns__ = []
171
+ self.__similarity_related_columns__.extend(
172
+ self.chemical_property_columns)
173
+ self.__similarity_related_columns__.extend(self.ner_columns)
174
+ logger.info(f'{self.dataset_name} is initialized')
80
175
 
81
176
  def __to_db__(self, db_path):
82
177
  conn = create_connection(db_path)
@@ -118,10 +213,11 @@ class MDFSADDIDataset(BaseDataset):
118
213
  lambda_fnc1) # , axis=1
119
214
  self.ddis_df['id2'] = self.ddis_df['name2'].apply(
120
215
  lambda_fnc1) # , axis=1
121
- self.drugs_df.to_sql('drug', conn, if_exists='replace', index=False)
122
- self.ddis_df.to_sql('event', conn, if_exists='replace', index=False)
123
- ZipHelper().zip_single_file(
124
- file_path=db_path, output_path=HERE, name='mdf-sa-ddi')
216
+ if conn:
217
+ self.drugs_df.to_sql('drug', conn, if_exists='replace', index=False)
218
+ self.ddis_df.to_sql('event', conn, if_exists='replace', index=False)
219
+ ZipHelper().zip_single_file(
220
+ file_path=db_path, output_path=HERE, zip_name='mdf-sa-ddi')
125
221
 
126
222
 
127
223
  def select_all_drugs(conn):
ddi_fw/ml/__init__.py CHANGED
@@ -2,4 +2,5 @@ from .ml_helper import MultiModalRunner
2
2
  from .model_wrapper import ModelWrapper,Result
3
3
  from .tensorflow_wrapper import TFModelWrapper
4
4
  from .pytorch_wrapper import PTModelWrapper
5
- from .evaluation_helper import evaluate
5
+ from .evaluation_helper import evaluate
6
+ from .tracking_service import TrackingService
ddi_fw/ml/ml_helper.py CHANGED
@@ -1,23 +1,9 @@
1
- from typing import Callable, Dict, List, Tuple
2
- from matplotlib import pyplot as plt
3
1
  from ddi_fw.ml.model_wrapper import Result
4
2
  from ddi_fw.ml.pytorch_wrapper import PTModelWrapper
5
3
  from ddi_fw.ml.tensorflow_wrapper import TFModelWrapper
6
4
  from ddi_fw.utils.package_helper import get_import
7
- import tensorflow as tf
8
- from tensorflow.python import keras
9
- from tensorflow.python.keras import Model, Sequential
10
- from tensorflow.python.keras.layers import Dense, Dropout, Input, Activation
11
- from tensorflow.python.keras.callbacks import EarlyStopping
12
- from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
13
5
  import numpy as np
14
-
15
- import mlflow
16
- from mlflow.utils.autologging_utils import batch_metrics_logger
17
- import time
18
-
19
- from mlflow.models import infer_signature
20
- from ddi_fw.ml.evaluation_helper import Metrics, evaluate
6
+ from ddi_fw.ml.evaluation_helper import evaluate
21
7
 
22
8
  # import tf2onnx
23
9
  # import onnx
@@ -32,16 +18,16 @@ import ddi_fw.utils as utils
32
18
 
33
19
  class MultiModalRunner:
34
20
  # todo model related parameters to config
35
- def __init__(self, library, multi_modal, default_model, use_mlflow=False):
21
+ def __init__(self, library, multi_modal, default_model, tracking_service):
36
22
  self.library = library
37
23
  self.multi_modal = multi_modal
38
24
  self.default_model = default_model
39
- self.use_mlflow = use_mlflow
25
+ self.tracking_service = tracking_service
40
26
  self.result = Result()
41
27
 
42
- def _mlflow_(self, func: Callable):
43
- if self.use_mlflow:
44
- func()
28
+ # def _mlflow_(self, func: Callable):
29
+ # if self.use_mlflow:
30
+ # func()
45
31
 
46
32
  def set_data(self, items, train_idx_arr, val_idx_arr, y_test_label):
47
33
  self.items = items
@@ -74,7 +60,7 @@ class MultiModalRunner:
74
60
  kwargs = m.get('params')
75
61
  T = self.__create_model(self.library)
76
62
  single_modal = T(self.date, name, model_type,
77
- use_mlflow=self.use_mlflow, **kwargs)
63
+ tracking_service=self.tracking_service, **kwargs)
78
64
 
79
65
  if input is not None and inputs is not None:
80
66
  raise Exception("input and inputs should not be used together")
@@ -110,7 +96,7 @@ class MultiModalRunner:
110
96
  name = item[0]
111
97
  T = self.__create_model(self.library)
112
98
  single_modal = T(self.date, name, model_type,
113
- use_mlflow=self.use_mlflow, **kwargs)
99
+ tracking_service=self.tracking_service, **kwargs)
114
100
  single_modal.set_data(
115
101
  self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
116
102
 
@@ -130,9 +116,12 @@ class MultiModalRunner:
130
116
  combinations = []
131
117
  for i in range(2, len(l) + 1):
132
118
  combinations.extend(list(itertools.combinations(l, i))) # all
133
- if self.use_mlflow:
134
- with mlflow.start_run(run_name=self.prefix, description="***") as run:
135
- self.__predict(single_results)
119
+
120
+ def _f():
121
+ self.__predict(single_results)
122
+
123
+ if self.tracking_service:
124
+ self.tracking_service.run(run_name=self.prefix, description="***", func = _f , nested_run=False)
136
125
  else:
137
126
  self.__predict(single_results)
138
127
  if combinations:
@@ -143,10 +132,17 @@ class MultiModalRunner:
143
132
  def evaluate_combinations(self, single_results, combinations):
144
133
  for combination in combinations:
145
134
  combination_descriptor = '-'.join(combination)
146
- if self.use_mlflow:
147
- with mlflow.start_run(run_name=combination_descriptor, description="***", nested=True) as combination_run:
135
+ if self.tracking_service:
136
+ def evaluate_combination(artifact_uri=None):
148
137
  self.__evaluate_combinations(
149
- single_results, combination, combination_descriptor, combination_run.info.artifact_uri)
138
+ single_results, combination, combination_descriptor, artifact_uri
139
+ )
140
+
141
+ self.tracking_service.run(run_name=combination_descriptor, description="***", nested_run=True, func=evaluate_combination)
142
+
143
+ # with mlflow.start_run(run_name=combination_descriptor, description="***", nested=True) as combination_run:
144
+ # self.__evaluate_combinations(
145
+ # single_results, combination, combination_descriptor, combination_run.info.artifact_uri)
150
146
  else:
151
147
  self.__evaluate_combinations(
152
148
  single_results, combination, combination_descriptor, None)
@@ -159,8 +155,8 @@ class MultiModalRunner:
159
155
  prediction = utils.to_one_hot_encode(prediction)
160
156
  logs, metrics = evaluate(
161
157
  actual=self.y_test_label, pred=prediction, info=combination_descriptor)
162
- if self.use_mlflow:
163
- mlflow.log_metrics(logs)
158
+ if self.tracking_service:
159
+ self.tracking_service.log_metrics(logs)
164
160
  metrics.format_float()
165
161
  # TODO: we got a "path not found" error
166
162
  if artifact_uri:
@@ -29,7 +29,6 @@ class ModelWrapper:
29
29
  self.train_label = train_label
30
30
  self.test_data = test_data
31
31
  self.test_label = test_label
32
- # https://github.com/mlflow/mlflow/blob/master/examples/tensorflow/train.py
33
32
 
34
33
  def predict(self)-> Any:
35
34
  pass