ddi-fw 0.0.149__py3-none-any.whl → 0.0.151__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,149 +1,213 @@
1
+ import glob
1
2
  import pathlib
2
-
3
+ from typing import List, Optional, Tuple
4
+ from ddi_fw.datasets.core import BaseDataset, TextDatasetMixin, generate_sim_matrices_new, generate_vectors
5
+ from ddi_fw.datasets.dataset_splitter import DatasetSplitter
6
+ from ddi_fw.datasets.db_utils import create_connection
7
+ from ddi_fw.datasets.idf_helper import IDF
8
+ from ddi_fw.utils.utils import create_folder_if_not_exists
3
9
  import numpy as np
4
10
  import pandas as pd
5
-
6
- from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator
11
+ from pydantic import BaseModel, Field, model_validator, root_validator
12
+ from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator,VectorGenerator
7
13
  from ddi_fw.langchain.embeddings import PoolingStrategy
8
- from .. import BaseDataset
9
- from ..db_utils import create_connection
14
+ from abc import ABC, abstractmethod
15
+ from sklearn.preprocessing import LabelBinarizer
16
+
17
+ from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
18
+
19
+ # Constants for embedding, chemical properties, and NER columns
20
+ LIST_OF_EMBEDDING_COLUMNS = [
21
+ 'all_text', 'description', 'synthesis_reference', 'indication',
22
+ 'pharmacodynamics', 'mechanism_of_action', 'toxicity', 'metabolism',
23
+ 'absorption', 'half_life', 'protein_binding', 'route_of_elimination',
24
+ 'volume_of_distribution', 'clearance'
25
+ ]
26
+
27
+ LIST_OF_CHEMICAL_PROPERTY_COLUMNS = ['enzyme', 'target', 'pathway', 'smile']
28
+ LIST_OF_NER_COLUMNS = ['tui', 'cui', 'entities']
10
29
 
11
30
  HERE = pathlib.Path(__file__).resolve().parent
12
- list_of_embedding_columns = ['all_text', 'description',
13
- 'synthesis_reference', 'indication',
14
- 'pharmacodynamics', 'mechanism_of_action',
15
- 'toxicity', 'metabolism',
16
- 'absorption', 'half_life',
17
- 'protein_binding', 'route_of_elimination',
18
- 'volume_of_distribution', 'clearance']
19
-
20
- list_of_chemical_property_columns = ['enzyme',
21
- 'target',
22
- 'pathway',
23
- 'smile']
24
- list_of_ner_columns = ['tui', 'cui', 'entities']
25
-
26
-
27
- def indices_to_binary_vector(indices, vector_length=881):
28
- # vector_length = len(indices)
29
- # Initialize a zero vector of the given length
30
- binary_vector = [0] * vector_length
31
-
32
- # Set the positions specified by indices to 1
33
- for index in indices:
34
- if 0 <= index < vector_length:
35
- binary_vector[index] = 1
36
-
37
- return binary_vector
38
-
39
- class DDIMDLDataset(BaseDataset):
40
- def __init__(self, embedding_size,
41
- embedding_dict,
42
- embeddings_pooling_strategy: PoolingStrategy,
43
- ner_df,
44
- chemical_property_columns=['enzyme',
45
- 'target',
46
- 'pathway',
47
- 'smile'],
48
- embedding_columns=[],
49
- ner_columns=[],
50
- **kwargs):
51
- columns = kwargs['columns']
52
- if columns:
53
- chemical_property_columns = []
54
- embedding_columns=[]
55
- ner_columns=[]
56
- for column in columns:
57
- if column in list_of_chemical_property_columns:
58
- chemical_property_columns.append(column)
59
- elif column in list_of_embedding_columns:
60
- embedding_columns.append(column)
61
- elif column in list_of_ner_columns:
62
- ner_columns.append(column)
63
- # elif column == 'smile_2':
64
- # continue
65
- else:
66
- raise Exception(f"{column} is not related this dataset")
67
-
68
-
69
- super().__init__(embedding_size=embedding_size,
70
- embedding_dict=embedding_dict,
71
- embeddings_pooling_strategy=embeddings_pooling_strategy,
72
- ner_df=ner_df,
73
- chemical_property_columns=chemical_property_columns,
74
- embedding_columns=embedding_columns,
75
- ner_columns=ner_columns,
76
- **kwargs)
77
-
78
- # kwargs = {'index_path': str(HERE.joinpath('indexes'))}
79
- kwargs['index_path'] = str(HERE.joinpath('indexes'))
80
-
81
- db = HERE.joinpath('data/event.db')
82
- conn = create_connection(db)
83
- print("db prep")
84
- self.drugs_df = self.__select_all_drugs_as_dataframe__(conn)
85
- self.ddis_df = self.__select_all_events__(conn)
86
- print("db bitti")
87
- self.index_path = kwargs.get('index_path')
88
-
89
- # jaccard_sim_dict = {}
90
- # sim_matrix_gen = SimilarityMatrixGenerator()
91
- # jaccard_sim_dict["smile_2"] = sim_matrix_gen.create_jaccard_similarity_matrices(
92
- # self.drugs_df["smile_2"].to_list())
93
-
94
- # similarity_matrices = {}
95
- # drugbank_ids = self.drugs_df['id'].to_list()
96
- # new_columns = {}
97
- # for idx in range(len(drugbank_ids)):
98
- # new_columns[idx] = drugbank_ids[idx]
99
- # new_df = pd.DataFrame.from_dict(jaccard_sim_dict["smile_2"])
100
- # new_df = new_df.rename(index=new_columns, columns=new_columns)
101
- # similarity_matrices["smile_2"] = new_df
102
-
31
+
32
+ class DDIMDLDataset(TextDatasetMixin):
33
+ index_path: str = Field(default_factory=lambda: str(
34
+ pathlib.Path(__file__).resolve().parent.joinpath('indexes')))
35
+ # drugs_df: pd.DataFrame = Field(default_factory=pd.DataFrame)
36
+ # ddis_df: pd.DataFrame = Field(default_factory=pd.DataFrame)
37
+ drugs_df: Optional[pd.DataFrame] = None
38
+ ddis_df: Optional[pd.DataFrame] = None
39
+
40
+ chemical_property_columns: list[str] = Field(
41
+ default_factory=lambda: LIST_OF_CHEMICAL_PROPERTY_COLUMNS)
42
+ embedding_columns: list[str] = Field(default_factory=list)
43
+ ner_columns: list[str] = Field(default_factory=list)
44
+ ner_df: pd.DataFrame | None = None
45
+ tui_threshold: float | None = None
46
+ cui_threshold: float | None = None
47
+ entities_threshold: float | None = None
48
+
49
+
50
+ # @model_validator
51
+ def validate_columns(self, values):
52
+ if not set(values['chemical_property_columns']).issubset(LIST_OF_CHEMICAL_PROPERTY_COLUMNS):
53
+ raise ValueError("Invalid chemical property columns")
54
+ if not set(values['ner_columns']).issubset(LIST_OF_NER_COLUMNS):
55
+ raise ValueError("Invalid NER columns")
56
+ return values
57
+
58
+ def __init__(self, **kwargs):
59
+ super().__init__(**kwargs)
60
+ self.class_column = 'event_category'
61
+ _db_path = HERE.joinpath('data/event.db')
103
62
 
104
- # def lambda_fnc(row, value):
105
- # if row['id1'] in value and row['id2'] in value:
106
- # return np.float16(np.hstack(
107
- # (value[row['id1']], value[row['id2']])))
108
- # for key, value in similarity_matrices.items():
109
-
110
- # print(f'sim matrix: {key}')
111
- # self.ddis_df[key] = self.ddis_df.apply(
112
- # lambda_fnc, args=(value,), axis=1)
113
- # print(self.ddis_df[key].head())
114
- # print("init finished")
115
-
116
- def __select_all_drugs_as_dataframe__(self, conn):
63
+ self.__similarity_related_columns__ = []
64
+ self.__similarity_related_columns__.extend(self.chemical_property_columns)
65
+ self.__similarity_related_columns__.extend(self.ner_columns)
66
+ # TODO with resource
67
+ self._conn = create_connection(_db_path.absolute().as_posix())
68
+ self.load_drugs_and_events()
69
+
70
+ def load_drugs_and_events(self):
71
+ self.drugs_df = self.__select_all_drugs_as_dataframe__()
72
+ self.ddis_df = self.__select_all_events__()
73
+
74
+ def __select_all_drugs_as_dataframe__(self):
117
75
  headers = ['index', 'id', 'name',
118
76
  'target', 'enzyme', 'pathway', 'smile']
119
- cur = conn.cursor()
77
+ if self._conn is None:
78
+ raise Exception("There is no connection")
79
+ cur = self._conn.cursor()
120
80
  cur.execute(
121
- '''select "index", id, name, target, enzyme, pathway, smile from drug''')
81
+ '''SELECT "index", id, name, target, enzyme, pathway, smile FROM drug'''
82
+ )
122
83
  rows = cur.fetchall()
123
84
  df = pd.DataFrame(columns=headers, data=rows)
124
- df['enzyme'] = df['enzyme'].apply(lambda x: x.split('|'))
125
- df['target'] = df['target'].apply(lambda x: x.split('|'))
126
- df['pathway'] = df['pathway'].apply(lambda x: x.split('|'))
127
- # df['smile_2'] = df['smile'].apply(lambda x: indices_to_binary_vector(indices = list(map(int, x.split('|'))), vector_length = 881))
128
- df['smile'] = df['smile'].apply(lambda x: x.split('|'))
129
-
130
-
85
+
86
+ # Convert string fields to lists
87
+ for col in ['enzyme', 'target', 'pathway', 'smile']:
88
+ df[col] = df[col].apply(lambda x: x.split('|'))
89
+
131
90
  return df
132
91
 
133
- def __select_all_events__(self, conn):
134
- """
135
- Query all rows in the event table
136
- :param conn: the Connection object
137
- :return:
138
- """
139
- cur = conn.cursor()
92
+ def __select_all_events__(self):
93
+ if self._conn is None:
94
+ raise Exception("There is no connection")
95
+ cur = self._conn.cursor()
140
96
  cur.execute('''
141
- select ex."index", d1.id, d1.name, d2.id, d2.name, mechanism || ' ' ||action from extraction ex
142
- join drug d1 on d1.name = ex.drugA
143
- join drug d2 on d2.name = ex.drugB
97
+ SELECT ex."index", d1.id, d1.name, d2.id, d2.name, mechanism || ' ' || action
98
+ FROM extraction ex
99
+ JOIN drug d1 ON d1.name = ex.drugA
100
+ JOIN drug d2 ON d2.name = ex.drugB
144
101
  ''')
145
-
146
102
  rows = cur.fetchall()
147
-
148
103
  headers = ["index", "id1", "name1", "id2", "name2", "event_category"]
149
104
  return pd.DataFrame(columns=headers, data=rows)
105
+
106
+ def prep(self):
107
+ if self.drugs_df is None or self.ddis_df is None:
108
+ raise Exception("There is no data")
109
+
110
+ drug_ids = self.drugs_df['id'].to_list()
111
+
112
+ filtered_df = self.drugs_df
113
+ combined_df = filtered_df.copy()
114
+
115
+ if self.ner_df is not None and not self.ner_df.empty:
116
+ filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
117
+ drug_ids)]
118
+ filtered_ner_df = self.ner_df.copy()
119
+
120
+ # TODO: eğer kullanılan veri setinde tui, cui veya entity bilgileri yoksa o veri setine bu sütunları eklemek için aşağısı gerekli
121
+
122
+ # idf_calc = IDF(filtered_ner_df, [f for f in filtered_ner_df.keys()])
123
+ idf_calc = IDF(filtered_ner_df, self.ner_columns)
124
+ idf_calc.calculate()
125
+ idf_scores_df = idf_calc.to_dataframe()
126
+
127
+ # for key in filtered_ner_df.keys():
128
+ for key in self.ner_columns:
129
+ threshold = 0
130
+ if key.startswith('tui'):
131
+ threshold = self.tui_threshold
132
+ if key.startswith('cui'):
133
+ threshold = self.cui_threshold
134
+ if key.startswith('entities'):
135
+ threshold = self.entities_threshold
136
+ combined_df[key] = filtered_ner_df[key]
137
+ valid_codes = idf_scores_df[idf_scores_df[key] > threshold].index
138
+
139
+ # print(f'{key}: valid code size = {len(valid_codes)}')
140
+ combined_df[key] = combined_df[key].apply(lambda items:
141
+ [item for item in items if item in valid_codes])
142
+
143
+ moved_columns = ['id']
144
+ moved_columns.extend(self.__similarity_related_columns__)
145
+ chemical_properties_df = combined_df[moved_columns]
146
+
147
+ chemical_properties_df = chemical_properties_df.fillna("").apply(list)
148
+
149
+ # generate vectors dictionary içinde ndarray dönecek
150
+ generated_vectors = generate_vectors(chemical_properties_df, self.__similarity_related_columns__)
151
+
152
+ similarity_matrices = generate_sim_matrices_new(
153
+ chemical_properties_df,generated_vectors, self.__similarity_related_columns__, key_column= "id")
154
+
155
+ event_categories = self.ddis_df['event_category']
156
+ labels = event_categories.tolist()
157
+ lb = LabelBinarizer()
158
+ lb.fit(labels)
159
+ classes = lb.transform(labels)
160
+
161
+ def similarity_lambda_fnc(row, value):
162
+ if row['id1'] in value:
163
+ return value[row['id1']]
164
+
165
+ def lambda_fnc(row: pd.Series, value)-> Optional[np.float16]:
166
+ if row['id1'] in value and row['id2'] in value:
167
+ return np.float16(np.hstack(
168
+ (value[row['id1']], value[row['id2']])))
169
+ return None
170
+ # return np.hstack(
171
+ # (value[row['id1']], value[row['id2']]), dtype=np.float16)
172
+
173
+ def x_fnc(row, embeddings_after_pooling):
174
+ if row['id1'] in embeddings_after_pooling:
175
+ v1 = embeddings_after_pooling[row['id1']]
176
+ else:
177
+ v1 = np.zeros(self.embedding_size)
178
+ if row['id2'] in embeddings_after_pooling:
179
+ v2 = embeddings_after_pooling[row['id2']]
180
+ else:
181
+ v2 = np.zeros(self.embedding_size)
182
+ return np.float16(np.hstack(
183
+ (v1, v2)))
184
+
185
+ for key, value in similarity_matrices.items():
186
+
187
+ print(f'sim matrix: {key}')
188
+ self.ddis_df[key] = self.ddis_df.apply(
189
+ lambda_fnc, args=(value,), axis=1)
190
+ self.columns.append(key)
191
+ print(self.ddis_df[key].head())
192
+
193
+ for embedding_column in self.embedding_columns:
194
+ print(f"concat {embedding_column} embeddings")
195
+ embeddings_after_pooling = {k: self.embeddings_pooling_strategy.apply(
196
+ v) for k, v in self.embedding_dict[embedding_column].items()}
197
+ # column_embeddings_dict = embedding_values[embedding_column]
198
+ self.ddis_df[embedding_column+'_embedding'] = self.ddis_df.apply(
199
+ x_fnc, args=(embeddings_after_pooling,), axis=1)
200
+ self.columns.append(embedding_column+'_embedding')
201
+
202
+ dataframe = self.ddis_df.copy()
203
+ if not isinstance(classes, (list, pd.Series, np.ndarray)):
204
+ raise TypeError("classes must be an iterable (list, Series, or ndarray)")
205
+
206
+ if len(classes) != len(dataframe):
207
+ raise ValueError("Length of classes must match the number of rows in the DataFrame")
208
+
209
+ dataframe['class'] = list(classes)
210
+ self.set_dataframe(dataframe)
211
+
212
+
213
+
@@ -0,0 +1 @@
1
+ [0217/121135.683:ERROR:registration_protocol_win.cc(108)] CreateFile: Sistem belirtilen dosyayı bulamıyor. (0x2)
@@ -58,7 +58,8 @@ def create_embeddings(model, data, column, drop_column=True):
58
58
  column_embeddings_dict[row['id']] = sum_of_embeddings
59
59
  # data.iloc[index][column+'_embedding']=sum_of_embeddings
60
60
 
61
- data[column+'_embedding'] = pd.Series(column_embeddings_dict.values())
61
+ # data[column+'_embedding'] = pd.Series(column_embeddings_dict.values())
62
+ data[column+'_embedding'] = pd.Series(list(column_embeddings_dict.values()))
62
63
  if(drop_column):
63
64
  data.drop([column], axis = 1, inplace = True)
64
65
  # data[column+'_embedding'] = [column_embeddings_dict[row['name']] for index, row in data.iterrows()]
@@ -82,6 +82,7 @@ class PretrainedEmbeddings(Embeddings):
82
82
  text, return_tensors='pt', padding=True)
83
83
  output_embeddings.append(self.model(
84
84
  input_ids).last_hidden_state.mean(dim=1))
85
+ return output_embeddings
85
86
 
86
87
  def embed_query(self, text: str) -> List[float]:
87
88
  return self.embed_documents([text])[0]