ddi-fw 0.0.96__py3-none-any.whl → 0.0.98__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/datasets/core.py CHANGED
@@ -268,81 +268,39 @@ class BaseDataset(ABC):
268
268
 
269
269
  # her bir metin tipi için embedding oluşturursan burayı düzenle
270
270
  def prep(self):
271
- # if self.embedding_columns:
272
- # zip_helper = ZipHelper()
273
- # zip_helper.extract(str(HERE.joinpath('zips/embeddings')),
274
- # str(HERE.joinpath('zips/embeddings')))
275
-
276
- # embedding_dict = dict()
277
- # for embedding_column in self.embedding_columns:
278
- # embedding_file = HERE.joinpath(
279
- # f'zips/embeddings/{embedding_column}_embeddings.pkl')
280
- # embedding_values = pd.read_pickle(embedding_file)
281
- # d = embedding_values.apply(
282
- # lambda x: {x.id: x[f'{embedding_column}_embedding']}, axis=1)
283
- # x = {k: v for l in d.values.tolist() for k, v in l.items()}
284
- # embedding_dict[embedding_column] = x
285
-
286
- # self.ner_df = CTakesNER().load()
287
271
  drug_names = self.drugs_df['name'].to_list()
288
272
  drug_ids = self.drugs_df['id'].to_list()
289
273
 
290
- # self.ddis_df = self.ddis_df[(self.ddis_df['name1'].isin(drug_names)) & (
291
- # self.ddis_df['name2'].isin(drug_names))]
292
-
293
274
  filtered_df = self.drugs_df
294
- filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
295
- drug_ids)]
296
- filtered_ner_df = self.ner_df.copy()
297
-
298
275
  combined_df = filtered_df.copy()
299
- # TODO: eğer kullanılan veri setinde tui, cui veya entity bilgileri yoksa o veri setine bu sütunları eklemek için aşağısı gerekli
300
-
301
- # idf_calc = IDF(filtered_ner_df, [f for f in filtered_ner_df.keys()])
302
- idf_calc = IDF(filtered_ner_df, self.ner_columns)
303
- idf_calc.calculate()
304
- idf_scores_df = idf_calc.to_dataframe()
305
-
306
- # for key in filtered_ner_df.keys():
307
- for key in self.ner_columns:
308
- threshold = 0
309
- if key.startswith('tui'):
310
- threshold = self.tui_threshold
311
- if key.startswith('cui'):
312
- threshold = self.cui_threshold
313
- if key.startswith('entities'):
314
- threshold = self.entities_threshold
315
- combined_df[key] = filtered_ner_df[key]
316
- valid_codes = idf_scores_df[idf_scores_df[key] > threshold].index
317
-
318
- # print(f'{key}: valid code size = {len(valid_codes)}')
319
- combined_df[key] = combined_df[key].apply(lambda items:
320
- [item for item in items if item in valid_codes])
321
-
322
- # Yukarıdaki koda evrildi
323
- # combined_df['tui_description'] = filtered_ner_df['description_tuis']
324
- # combined_df['cui_description'] = filtered_ner_df['description_cuis']
325
- # combined_df['entities_description'] = filtered_ner_df['description_entities']
326
-
327
- # tui_idf = IDF(combined_df['tui_description'], self.tui_threshold)
328
- # cui_idf = IDF(combined_df['cui_description'], self.cui_threshold)
329
- # entities_idf = IDF(
330
- # combined_df['entities_description'], self.entities_threshold)
331
-
332
- # tui_idf.calculate()
333
- # cui_idf.calculate()
334
- # entities_idf.calculate()
335
-
336
- # valid_tui_codes = tui_idf.find_items_over_threshold()
337
- # valid_cui_codes = cui_idf.find_items_over_threshold()
338
- # valid_entities_codes = entities_idf.find_items_over_threshold()
339
-
340
- # combined_df['tui_description'] = combined_df['tui_description'].apply(lambda items:
341
- # [item for item in items if item in valid_tui_codes])
342
- # combined_df['cui_description'] = combined_df['cui_description'].apply(lambda items:
343
- # [item for item in items if item in valid_cui_codes])
344
- # combined_df['entities_description'] = combined_df['entities_description'].apply(lambda items:
345
- # [item for item in items if item in valid_entities_codes])
276
+
277
+ if self.ner_df:
278
+ filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
279
+ drug_ids)]
280
+ filtered_ner_df = self.ner_df.copy()
281
+
282
+ # TODO: eğer kullanılan veri setinde tui, cui veya entity bilgileri yoksa o veri setine bu sütunları eklemek için aşağısı gerekli
283
+
284
+ # idf_calc = IDF(filtered_ner_df, [f for f in filtered_ner_df.keys()])
285
+ idf_calc = IDF(filtered_ner_df, self.ner_columns)
286
+ idf_calc.calculate()
287
+ idf_scores_df = idf_calc.to_dataframe()
288
+
289
+ # for key in filtered_ner_df.keys():
290
+ for key in self.ner_columns:
291
+ threshold = 0
292
+ if key.startswith('tui'):
293
+ threshold = self.tui_threshold
294
+ if key.startswith('cui'):
295
+ threshold = self.cui_threshold
296
+ if key.startswith('entities'):
297
+ threshold = self.entities_threshold
298
+ combined_df[key] = filtered_ner_df[key]
299
+ valid_codes = idf_scores_df[idf_scores_df[key] > threshold].index
300
+
301
+ # print(f'{key}: valid code size = {len(valid_codes)}')
302
+ combined_df[key] = combined_df[key].apply(lambda items:
303
+ [item for item in items if item in valid_codes])
346
304
 
347
305
  moved_columns = ['id']
348
306
  moved_columns.extend(self.__similarity_related_columns__)
@@ -409,7 +367,6 @@ class BaseDataset(ABC):
409
367
  x_fnc, args=(embeddings_after_pooling,), axis=1)
410
368
 
411
369
  self.dataframe = self.ddis_df.copy()
412
- self.dataframe['class_as_txt'] = labels
413
370
  self.dataframe['class'] = list(classes)
414
371
  print(self.dataframe.shape)
415
372
 
@@ -47,7 +47,7 @@ class Pipeline:
47
47
  self.combinations = combinations
48
48
  self.model = model
49
49
 
50
- def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column):
50
+ def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
51
51
  """
52
52
  Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
53
53
 
@@ -65,9 +65,11 @@ class Pipeline:
65
65
 
66
66
  # Fetch the embeddings and metadata
67
67
  if column == None:
68
- dictionary = collection.get(include=['embeddings', 'metadatas'])
68
+ dictionary = collection.get(
69
+ include=['embeddings', 'metadatas'])
69
70
  else:
70
- dictionary = collection.get(include=['embeddings', 'metadatas'], where= {"type": {"$eq": f"{column}"}})
71
+ dictionary = collection.get(include=['embeddings', 'metadatas'], where={
72
+ "type": {"$eq": f"{column}"}})
71
73
  # Populate the embedding dictionary with embeddings from the vector database
72
74
  for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
73
75
  embedding_dict[metadata["type"]
@@ -77,7 +79,7 @@ class Pipeline:
77
79
  else:
78
80
  raise ValueError(
79
81
  "Persistent directory for the vector DB is not specified.")
80
-
82
+
81
83
  def build(self):
82
84
  # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
83
85
  kwargs = {"columns": self.columns}
@@ -89,14 +91,16 @@ class Pipeline:
89
91
  if self.vector_db_persist_directory:
90
92
  self.__create_or_update_embeddings__(
91
93
  embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
92
-
94
+
93
95
  if self.column_embedding_configs:
94
96
  for item in self.column_embedding_configs:
95
97
  col = item["column"]
96
98
  col_db_dir = item["vector_db_persist_directory"]
97
99
  col_db_collection = item["vector_db_collection_name"]
98
- self.__create_or_update_embeddings__(embedding_dict, col_db_dir, col_db_collection, col)
99
- print(f"Embedings of {col} is calculated from {col_db_collection}")
100
+ self.__create_or_update_embeddings__(
101
+ embedding_dict, col_db_dir, col_db_collection, col)
102
+ print(
103
+ f"Embedings of {col} is calculated from {col_db_collection}")
100
104
 
101
105
  # if self.embedding_dict == None:
102
106
  # if self.vector_db_persist_directory:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ddi_fw
3
- Version: 0.0.96
3
+ Version: 0.0.98
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -27,25 +27,25 @@ Requires-Dist: python-stopwatch==1.1.11
27
27
  Requires-Dist: lxml==5.3.0
28
28
  Requires-Dist: matplotlib==3.8.0
29
29
  Requires-Dist: mlflow==2.16.1
30
- Requires-Dist: nltk==3.8.1
31
- Requires-Dist: numpy==1.26.4
32
- Requires-Dist: pandas==2.2.2
30
+ Requires-Dist: nltk>=3.8.1
31
+ Requires-Dist: numpy>=1.26.4
32
+ Requires-Dist: pandas>=2.2.0
33
33
  Requires-Dist: plotly==5.24.1
34
34
  Requires-Dist: rdkit==2023.3.3
35
35
  Requires-Dist: scikit-learn==1.5.2
36
36
  Requires-Dist: scipy==1.13.1
37
- Requires-Dist: accelerate==0.33.0
38
- Requires-Dist: sentence-transformers==3.0.1
39
- Requires-Dist: transformers==4.42.4
37
+ Requires-Dist: accelerate>=0.33.0
38
+ Requires-Dist: sentence-transformers>=3.0.1
39
+ Requires-Dist: transformers>=4.42.4
40
40
  Requires-Dist: stanza==1.9.2
41
- Requires-Dist: tokenizers==0.19.1
42
- Requires-Dist: tqdm==4.66.6
41
+ Requires-Dist: tokenizers>=0.19.1
42
+ Requires-Dist: tqdm>=4.66.6
43
43
  Requires-Dist: xmlschema==3.4.2
44
- Requires-Dist: zipp==3.20.2
44
+ Requires-Dist: zipp>=3.20.2
45
45
  Requires-Dist: py7zr==0.22.0
46
- Requires-Dist: openai==1.52.2
47
- Requires-Dist: langchain==0.3.4
48
- Requires-Dist: chromadb==0.5.15
46
+ Requires-Dist: openai>=1.52.2
47
+ Requires-Dist: langchain>=0.3.4
48
+ Requires-Dist: chromadb>=0.5.15
49
49
  Requires-Dist: langchain-community==0.3.3
50
50
  Requires-Dist: datasets==3.0.2
51
51
  Requires-Dist: unstructured==0.16.3
@@ -1,5 +1,5 @@
1
1
  ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,351
2
- ddi_fw/datasets/core.py,sha256=cL_H7-osGTNG5W8X8LLpIcSJ-GUXoI3LjNwvffmEGzA,19452
2
+ ddi_fw/datasets/core.py,sha256=9RaUPhAYCn4RDeTZpHATtJaqNWsO17bduYyVqxAZWs0,17001
3
3
  ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
4
4
  ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
5
5
  ddi_fw/datasets/feature_vector_generation.py,sha256=EImavcALxkIB0YG_smOzagMNzuWMbK9SaWSKwARx_qU,3254
@@ -72,7 +72,7 @@ ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,2
72
72
  ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=qIst7vxHaOAhRv4lgozszwa3b1QE4aIrN74t41Xnvr4,1637
73
73
  ddi_fw/pipeline/multi_pipeline.py,sha256=t_Z7d7xRfDnhpQTlqCf7c0isZ5hZlyXavKhC7ePsnJY,5903
74
74
  ddi_fw/pipeline/ner_pipeline.py,sha256=wB7hz4YCOv7UAz6bGE6sSpPXXIdoOflOVK5UCc1fO-o,5586
75
- ddi_fw/pipeline/pipeline.py,sha256=q7jfTt7ryYa3xBscPtxvanB-j5RzWVZUKir0KmAdTKc,8357
75
+ ddi_fw/pipeline/pipeline.py,sha256=VX3GcoTjY7_ehX_rmpa-uh0NyBFoxF24AxbwruEWw4A,8457
76
76
  ddi_fw/test/basic_test.py,sha256=fEOGcZm1ObnsDvMiXNmdmz6YCeUrGc8V0DwlSwGhsq8,376
77
77
  ddi_fw/test/combination_test.py,sha256=TWNE8sf-DSh1Q9-yRaRBc774Sn1kSMGXLwQhd2_Qynk,324
78
78
  ddi_fw/test/compress_json_test.py,sha256=BGny56YqiG-pzhMoDzLKQBQI1E7o3jU0S7VYWtclAx4,1045
@@ -89,7 +89,7 @@ ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
89
89
  ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
90
90
  ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
91
91
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
92
- ddi_fw-0.0.96.dist-info/METADATA,sha256=b2D7e7ub3byUbTwEPdw6FvUqTEK-H_KflNXwqbk4r7s,1966
93
- ddi_fw-0.0.96.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
94
- ddi_fw-0.0.96.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
95
- ddi_fw-0.0.96.dist-info/RECORD,,
92
+ ddi_fw-0.0.98.dist-info/METADATA,sha256=W4ZdrQs8YgQp6aHxr4Py5_lO4zrzKnk1XjDfFhrlsq8,1966
93
+ ddi_fw-0.0.98.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
94
+ ddi_fw-0.0.98.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
95
+ ddi_fw-0.0.98.dist-info/RECORD,,