ddi-fw 0.0.96__py3-none-any.whl → 0.0.98__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/core.py +28 -71
- ddi_fw/pipeline/pipeline.py +11 -7
- {ddi_fw-0.0.96.dist-info → ddi_fw-0.0.98.dist-info}/METADATA +13 -13
- {ddi_fw-0.0.96.dist-info → ddi_fw-0.0.98.dist-info}/RECORD +6 -6
- {ddi_fw-0.0.96.dist-info → ddi_fw-0.0.98.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.96.dist-info → ddi_fw-0.0.98.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/core.py
CHANGED
@@ -268,81 +268,39 @@ class BaseDataset(ABC):
|
|
268
268
|
|
269
269
|
# her bir metin tipi için embedding oluşturursan burayı düzenle
|
270
270
|
def prep(self):
|
271
|
-
# if self.embedding_columns:
|
272
|
-
# zip_helper = ZipHelper()
|
273
|
-
# zip_helper.extract(str(HERE.joinpath('zips/embeddings')),
|
274
|
-
# str(HERE.joinpath('zips/embeddings')))
|
275
|
-
|
276
|
-
# embedding_dict = dict()
|
277
|
-
# for embedding_column in self.embedding_columns:
|
278
|
-
# embedding_file = HERE.joinpath(
|
279
|
-
# f'zips/embeddings/{embedding_column}_embeddings.pkl')
|
280
|
-
# embedding_values = pd.read_pickle(embedding_file)
|
281
|
-
# d = embedding_values.apply(
|
282
|
-
# lambda x: {x.id: x[f'{embedding_column}_embedding']}, axis=1)
|
283
|
-
# x = {k: v for l in d.values.tolist() for k, v in l.items()}
|
284
|
-
# embedding_dict[embedding_column] = x
|
285
|
-
|
286
|
-
# self.ner_df = CTakesNER().load()
|
287
271
|
drug_names = self.drugs_df['name'].to_list()
|
288
272
|
drug_ids = self.drugs_df['id'].to_list()
|
289
273
|
|
290
|
-
# self.ddis_df = self.ddis_df[(self.ddis_df['name1'].isin(drug_names)) & (
|
291
|
-
# self.ddis_df['name2'].isin(drug_names))]
|
292
|
-
|
293
274
|
filtered_df = self.drugs_df
|
294
|
-
filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
|
295
|
-
drug_ids)]
|
296
|
-
filtered_ner_df = self.ner_df.copy()
|
297
|
-
|
298
275
|
combined_df = filtered_df.copy()
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
threshold =
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
# tui_idf = IDF(combined_df['tui_description'], self.tui_threshold)
|
328
|
-
# cui_idf = IDF(combined_df['cui_description'], self.cui_threshold)
|
329
|
-
# entities_idf = IDF(
|
330
|
-
# combined_df['entities_description'], self.entities_threshold)
|
331
|
-
|
332
|
-
# tui_idf.calculate()
|
333
|
-
# cui_idf.calculate()
|
334
|
-
# entities_idf.calculate()
|
335
|
-
|
336
|
-
# valid_tui_codes = tui_idf.find_items_over_threshold()
|
337
|
-
# valid_cui_codes = cui_idf.find_items_over_threshold()
|
338
|
-
# valid_entities_codes = entities_idf.find_items_over_threshold()
|
339
|
-
|
340
|
-
# combined_df['tui_description'] = combined_df['tui_description'].apply(lambda items:
|
341
|
-
# [item for item in items if item in valid_tui_codes])
|
342
|
-
# combined_df['cui_description'] = combined_df['cui_description'].apply(lambda items:
|
343
|
-
# [item for item in items if item in valid_cui_codes])
|
344
|
-
# combined_df['entities_description'] = combined_df['entities_description'].apply(lambda items:
|
345
|
-
# [item for item in items if item in valid_entities_codes])
|
276
|
+
|
277
|
+
if self.ner_df:
|
278
|
+
filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
|
279
|
+
drug_ids)]
|
280
|
+
filtered_ner_df = self.ner_df.copy()
|
281
|
+
|
282
|
+
# TODO: eğer kullanılan veri setinde tui, cui veya entity bilgileri yoksa o veri setine bu sütunları eklemek için aşağısı gerekli
|
283
|
+
|
284
|
+
# idf_calc = IDF(filtered_ner_df, [f for f in filtered_ner_df.keys()])
|
285
|
+
idf_calc = IDF(filtered_ner_df, self.ner_columns)
|
286
|
+
idf_calc.calculate()
|
287
|
+
idf_scores_df = idf_calc.to_dataframe()
|
288
|
+
|
289
|
+
# for key in filtered_ner_df.keys():
|
290
|
+
for key in self.ner_columns:
|
291
|
+
threshold = 0
|
292
|
+
if key.startswith('tui'):
|
293
|
+
threshold = self.tui_threshold
|
294
|
+
if key.startswith('cui'):
|
295
|
+
threshold = self.cui_threshold
|
296
|
+
if key.startswith('entities'):
|
297
|
+
threshold = self.entities_threshold
|
298
|
+
combined_df[key] = filtered_ner_df[key]
|
299
|
+
valid_codes = idf_scores_df[idf_scores_df[key] > threshold].index
|
300
|
+
|
301
|
+
# print(f'{key}: valid code size = {len(valid_codes)}')
|
302
|
+
combined_df[key] = combined_df[key].apply(lambda items:
|
303
|
+
[item for item in items if item in valid_codes])
|
346
304
|
|
347
305
|
moved_columns = ['id']
|
348
306
|
moved_columns.extend(self.__similarity_related_columns__)
|
@@ -409,7 +367,6 @@ class BaseDataset(ABC):
|
|
409
367
|
x_fnc, args=(embeddings_after_pooling,), axis=1)
|
410
368
|
|
411
369
|
self.dataframe = self.ddis_df.copy()
|
412
|
-
self.dataframe['class_as_txt'] = labels
|
413
370
|
self.dataframe['class'] = list(classes)
|
414
371
|
print(self.dataframe.shape)
|
415
372
|
|
ddi_fw/pipeline/pipeline.py
CHANGED
@@ -47,7 +47,7 @@ class Pipeline:
|
|
47
47
|
self.combinations = combinations
|
48
48
|
self.model = model
|
49
49
|
|
50
|
-
def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column):
|
50
|
+
def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
|
51
51
|
"""
|
52
52
|
Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
|
53
53
|
|
@@ -65,9 +65,11 @@ class Pipeline:
|
|
65
65
|
|
66
66
|
# Fetch the embeddings and metadata
|
67
67
|
if column == None:
|
68
|
-
dictionary = collection.get(
|
68
|
+
dictionary = collection.get(
|
69
|
+
include=['embeddings', 'metadatas'])
|
69
70
|
else:
|
70
|
-
dictionary = collection.get(include=['embeddings', 'metadatas'], where=
|
71
|
+
dictionary = collection.get(include=['embeddings', 'metadatas'], where={
|
72
|
+
"type": {"$eq": f"{column}"}})
|
71
73
|
# Populate the embedding dictionary with embeddings from the vector database
|
72
74
|
for metadata, embedding in zip(dictionary['metadatas'], dictionary['embeddings']):
|
73
75
|
embedding_dict[metadata["type"]
|
@@ -77,7 +79,7 @@ class Pipeline:
|
|
77
79
|
else:
|
78
80
|
raise ValueError(
|
79
81
|
"Persistent directory for the vector DB is not specified.")
|
80
|
-
|
82
|
+
|
81
83
|
def build(self):
|
82
84
|
# 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
|
83
85
|
kwargs = {"columns": self.columns}
|
@@ -89,14 +91,16 @@ class Pipeline:
|
|
89
91
|
if self.vector_db_persist_directory:
|
90
92
|
self.__create_or_update_embeddings__(
|
91
93
|
embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
|
92
|
-
|
94
|
+
|
93
95
|
if self.column_embedding_configs:
|
94
96
|
for item in self.column_embedding_configs:
|
95
97
|
col = item["column"]
|
96
98
|
col_db_dir = item["vector_db_persist_directory"]
|
97
99
|
col_db_collection = item["vector_db_collection_name"]
|
98
|
-
self.__create_or_update_embeddings__(
|
99
|
-
|
100
|
+
self.__create_or_update_embeddings__(
|
101
|
+
embedding_dict, col_db_dir, col_db_collection, col)
|
102
|
+
print(
|
103
|
+
f"Embedings of {col} is calculated from {col_db_collection}")
|
100
104
|
|
101
105
|
# if self.embedding_dict == None:
|
102
106
|
# if self.vector_db_persist_directory:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: ddi_fw
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.98
|
4
4
|
Summary: Do not use :)
|
5
5
|
Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
|
6
6
|
Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
|
@@ -27,25 +27,25 @@ Requires-Dist: python-stopwatch==1.1.11
|
|
27
27
|
Requires-Dist: lxml==5.3.0
|
28
28
|
Requires-Dist: matplotlib==3.8.0
|
29
29
|
Requires-Dist: mlflow==2.16.1
|
30
|
-
Requires-Dist: nltk
|
31
|
-
Requires-Dist: numpy
|
32
|
-
Requires-Dist: pandas
|
30
|
+
Requires-Dist: nltk>=3.8.1
|
31
|
+
Requires-Dist: numpy>=1.26.4
|
32
|
+
Requires-Dist: pandas>=2.2.0
|
33
33
|
Requires-Dist: plotly==5.24.1
|
34
34
|
Requires-Dist: rdkit==2023.3.3
|
35
35
|
Requires-Dist: scikit-learn==1.5.2
|
36
36
|
Requires-Dist: scipy==1.13.1
|
37
|
-
Requires-Dist: accelerate
|
38
|
-
Requires-Dist: sentence-transformers
|
39
|
-
Requires-Dist: transformers
|
37
|
+
Requires-Dist: accelerate>=0.33.0
|
38
|
+
Requires-Dist: sentence-transformers>=3.0.1
|
39
|
+
Requires-Dist: transformers>=4.42.4
|
40
40
|
Requires-Dist: stanza==1.9.2
|
41
|
-
Requires-Dist: tokenizers
|
42
|
-
Requires-Dist: tqdm
|
41
|
+
Requires-Dist: tokenizers>=0.19.1
|
42
|
+
Requires-Dist: tqdm>=4.66.6
|
43
43
|
Requires-Dist: xmlschema==3.4.2
|
44
|
-
Requires-Dist: zipp
|
44
|
+
Requires-Dist: zipp>=3.20.2
|
45
45
|
Requires-Dist: py7zr==0.22.0
|
46
|
-
Requires-Dist: openai
|
47
|
-
Requires-Dist: langchain
|
48
|
-
Requires-Dist: chromadb
|
46
|
+
Requires-Dist: openai>=1.52.2
|
47
|
+
Requires-Dist: langchain>=0.3.4
|
48
|
+
Requires-Dist: chromadb>=0.5.15
|
49
49
|
Requires-Dist: langchain-community==0.3.3
|
50
50
|
Requires-Dist: datasets==3.0.2
|
51
51
|
Requires-Dist: unstructured==0.16.3
|
@@ -1,5 +1,5 @@
|
|
1
1
|
ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,351
|
2
|
-
ddi_fw/datasets/core.py,sha256=
|
2
|
+
ddi_fw/datasets/core.py,sha256=9RaUPhAYCn4RDeTZpHATtJaqNWsO17bduYyVqxAZWs0,17001
|
3
3
|
ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
|
4
4
|
ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
|
5
5
|
ddi_fw/datasets/feature_vector_generation.py,sha256=EImavcALxkIB0YG_smOzagMNzuWMbK9SaWSKwARx_qU,3254
|
@@ -72,7 +72,7 @@ ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,2
|
|
72
72
|
ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=qIst7vxHaOAhRv4lgozszwa3b1QE4aIrN74t41Xnvr4,1637
|
73
73
|
ddi_fw/pipeline/multi_pipeline.py,sha256=t_Z7d7xRfDnhpQTlqCf7c0isZ5hZlyXavKhC7ePsnJY,5903
|
74
74
|
ddi_fw/pipeline/ner_pipeline.py,sha256=wB7hz4YCOv7UAz6bGE6sSpPXXIdoOflOVK5UCc1fO-o,5586
|
75
|
-
ddi_fw/pipeline/pipeline.py,sha256=
|
75
|
+
ddi_fw/pipeline/pipeline.py,sha256=VX3GcoTjY7_ehX_rmpa-uh0NyBFoxF24AxbwruEWw4A,8457
|
76
76
|
ddi_fw/test/basic_test.py,sha256=fEOGcZm1ObnsDvMiXNmdmz6YCeUrGc8V0DwlSwGhsq8,376
|
77
77
|
ddi_fw/test/combination_test.py,sha256=TWNE8sf-DSh1Q9-yRaRBc774Sn1kSMGXLwQhd2_Qynk,324
|
78
78
|
ddi_fw/test/compress_json_test.py,sha256=BGny56YqiG-pzhMoDzLKQBQI1E7o3jU0S7VYWtclAx4,1045
|
@@ -89,7 +89,7 @@ ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
|
|
89
89
|
ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
|
90
90
|
ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
|
91
91
|
ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
|
92
|
-
ddi_fw-0.0.
|
93
|
-
ddi_fw-0.0.
|
94
|
-
ddi_fw-0.0.
|
95
|
-
ddi_fw-0.0.
|
92
|
+
ddi_fw-0.0.98.dist-info/METADATA,sha256=W4ZdrQs8YgQp6aHxr4Py5_lO4zrzKnk1XjDfFhrlsq8,1966
|
93
|
+
ddi_fw-0.0.98.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
94
|
+
ddi_fw-0.0.98.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
|
95
|
+
ddi_fw-0.0.98.dist-info/RECORD,,
|
File without changes
|
File without changes
|