ddi-fw 0.0.193__py3-none-any.whl → 0.0.194__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/core.py +95 -3
- ddi_fw/datasets/ddi_mdl/base.py +1 -0
- ddi_fw/pipeline/pipeline.py +59 -81
- {ddi_fw-0.0.193.dist-info → ddi_fw-0.0.194.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.193.dist-info → ddi_fw-0.0.194.dist-info}/RECORD +7 -7
- {ddi_fw-0.0.193.dist-info → ddi_fw-0.0.194.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.193.dist-info → ddi_fw-0.0.194.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/core.py
CHANGED
@@ -1,6 +1,9 @@
+from collections import defaultdict
 import glob
 import logging
 from typing import Any, Dict, List, Optional, Type
+import chromadb
+from chromadb.api.types import IncludeEnum
 import numpy as np
 import pandas as pd
 from pydantic import BaseModel, Field, computed_field
@@ -132,6 +135,9 @@ class BaseDataset(BaseModel):
         skip deriving them. Otherwise, derive them from the dataframe and indices.
         """
         self.prep()
+
+        if isinstance(self, TextDatasetMixin):
+            self.process_text()
 
         if self.X_train is not None or self.y_train is not None or self.X_test is not None or self.y_test is not None:
             # Data is already provided, no need to calculate
@@ -254,12 +260,98 @@ class BaseDataset(BaseModel):
 
 class TextDatasetMixin(BaseDataset):
     embedding_size: Optional[int] = None
-    embedding_dict: Dict[str, Any] = Field(
+    embedding_dict: Dict[str, Any] | None = Field(
         default_factory=dict, description="Dictionary for embeddings")
-
+    pooling_strategy: PoolingStrategy | None = None
+    column_embedding_configs: Optional[Dict] = None
+    vector_db_persist_directory: Optional[str] = None
+    vector_db_collection_name: Optional[str] = None
 
+    def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
+        """
+        Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
+
+        Args:
+        - vector_db_persist_directory (str): The path to the directory where the Chroma vector database is stored.
+        - vector_db_collection_name (str): The name of the collection to query.
+        - embedding_dict (dict): The existing dictionary to update with embeddings.
+
+        """
+        if vector_db_persist_directory:
+            # Initialize the Chroma client and get the collection
+            vector_db = chromadb.PersistentClient(
+                path=vector_db_persist_directory)
+            collection = vector_db.get_collection(vector_db_collection_name)
+            include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
+            dictionary: chromadb.GetResult
+            # Fetch the embeddings and metadata
+            if column == None:
+                dictionary = collection.get(
+                    include=include
+                    # include=['embeddings', 'metadatas']
+                )
+                print(
+                    f"Embeddings are calculated from {vector_db_collection_name}")
+            else:
+                dictionary = collection.get(
+                    include=include,
+                    # include=['embeddings', 'metadatas'],
+                    where={
+                        "type": {"$eq": f"{column}"}})
+                print(
+                    f"Embeddings of {column} are calculated from {vector_db_collection_name}")
+
+            # Populate the embedding dictionary with embeddings from the vector database
+            metadatas = dictionary["metadatas"]
+            embeddings = dictionary["embeddings"]
+            if metadatas is None or embeddings is None:
+                raise ValueError(
+                    "The collection does not contain embeddings or metadatas.")
+            for metadata, embedding in zip(metadatas, embeddings):
+                embedding_dict[metadata["type"]
+                               ][metadata["id"]].append(embedding)
+
+        else:
+            raise ValueError(
+                "Persistent directory for the vector DB is not specified.")
+
     def process_text(self):
-
+        # key, value = next(iter(embedding_dict.items()))
+        # embedding_size = value[next(iter(value))][0].shape[0]
+        # pooling_strategy = self.embedding_pooling_strategy_type(
+        # ) if self.embedding_pooling_strategy_type else None
+
+
+        # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
+        # kwargs = {"columns": self.columns}
+        # if self.ner_threshold:
+        #     for k, v in self.ner_threshold.items():
+        #         kwargs[k] = v
+        if self.embedding_dict == None:
+            embedding_dict = defaultdict(lambda: defaultdict(list))
+            # TODO find more effective solution
+
+            if self.column_embedding_configs:
+                for item in self.column_embedding_configs:
+                    col = item["column"]
+                    col_db_dir = item["vector_db_persist_directory"]
+                    col_db_collection = item["vector_db_collection_name"]
+                    self.__create_or_update_embeddings__(
+                        embedding_dict, col_db_dir, col_db_collection, col)
+
+            elif self.vector_db_persist_directory:
+                self.__create_or_update_embeddings__(
+                    embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
+
+            else:
+                print(
+                    f"There is no configuration of Embeddings")
+
+        else:
+            embedding_dict = self.embedding_dict
+            # TODO make generic
+            # embedding_size = list(embedding_dict['all_text'].values())[
+            #     0][0].shape
 
 
 # class ImageDatasetMixin(BaseModel):
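Taken together, the core.py changes move embedding retrieval onto the dataset itself: `load()` now calls `process_text()` for any `TextDatasetMixin`, which pulls vectors from a persistent Chroma collection. The new method assumes every record's metadata carries a `type` key (the text column) and an `id` key, and groups results into a nested type → id → [embedding] dictionary. A minimal, self-contained sketch of that collection contract using the public chromadb API; the path, collection name, record IDs, and vectors below are hypothetical:

from collections import defaultdict

import chromadb

client = chromadb.PersistentClient(path="embeddings/chroma")  # hypothetical path
collection = client.get_or_create_collection("drug_texts")    # hypothetical name

# Each record's metadata must carry "type" and "id" for the grouping below.
collection.add(
    ids=["rec-1", "rec-2"],
    embeddings=[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
    metadatas=[{"type": "description", "id": "DB00001"},
               {"type": "description", "id": "DB00002"}],
)

# Same read path as __create_or_update_embeddings__ (collection.get accepts
# plain strings in place of IncludeEnum members):
result = collection.get(include=["embeddings", "metadatas"])
embedding_dict = defaultdict(lambda: defaultdict(list))
for metadata, embedding in zip(result["metadatas"], result["embeddings"]):
    embedding_dict[metadata["type"]][metadata["id"]].append(embedding)

Note that `process_text()` only builds this local `embedding_dict` when `self.embedding_dict` is `None`; callers that already hold embeddings can pass them in unchanged.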
ddi_fw/datasets/ddi_mdl/base.py
CHANGED
@@ -158,6 +158,7 @@ class DDIMDLDataset(TextDatasetMixin):
         generated_vectors = generate_vectors(
             chemical_properties_df, self.__similarity_related_columns__)
 
+        # TODO if necessary
         similarity_matrices = generate_sim_matrices_new(
             chemical_properties_df, generated_vectors, self.__similarity_related_columns__, key_column="id")
 
ddi_fw/pipeline/pipeline.py
CHANGED
@@ -61,54 +61,55 @@ class Pipeline(BaseModel):
     class Config:
         arbitrary_types_allowed = True
 
-    def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
-        """
-        Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
-
-        Args:
-        - vector_db_persist_directory (str): The path to the directory where the Chroma vector database is stored.
-        - vector_db_collection_name (str): The name of the collection to query.
-        - embedding_dict (dict): The existing dictionary to update with embeddings.
-
-        """
-        if vector_db_persist_directory:
-            # Initialize the Chroma client and get the collection
-            vector_db = chromadb.PersistentClient(
-                path=vector_db_persist_directory)
-            collection = vector_db.get_collection(vector_db_collection_name)
-            include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
-            dictionary: chromadb.GetResult
-            # Fetch the embeddings and metadata
-            if column == None:
-                dictionary = collection.get(
-                    include=include
-                    # include=['embeddings', 'metadatas']
-                )
-                print(
-                    f"Embeddings are calculated from {vector_db_collection_name}")
-            else:
-                dictionary = collection.get(
-                    include=include,
-                    # include=['embeddings', 'metadatas'],
-                    where={
-                        "type": {"$eq": f"{column}"}})
-                print(
-                    f"Embeddings of {column} are calculated from {vector_db_collection_name}")
-
-            # Populate the embedding dictionary with embeddings from the vector database
-            metadatas = dictionary["metadatas"]
-            embeddings = dictionary["embeddings"]
-            if metadatas is None or embeddings is None:
-                raise ValueError(
-                    "The collection does not contain embeddings or metadatas.")
-            for metadata, embedding in zip(metadatas, embeddings):
-                embedding_dict[metadata["type"]
-                               ][metadata["id"]].append(embedding)
-
-        else:
-            raise ValueError(
-                "Persistent directory for the vector DB is not specified.")
-
+    # def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
+    #     """
+    #     Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
+
+    #     Args:
+    #     - vector_db_persist_directory (str): The path to the directory where the Chroma vector database is stored.
+    #     - vector_db_collection_name (str): The name of the collection to query.
+    #     - embedding_dict (dict): The existing dictionary to update with embeddings.
+
+    #     """
+    #     if vector_db_persist_directory:
+    #         # Initialize the Chroma client and get the collection
+    #         vector_db = chromadb.PersistentClient(
+    #             path=vector_db_persist_directory)
+    #         collection = vector_db.get_collection(vector_db_collection_name)
+    #         include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
+    #         dictionary: chromadb.GetResult
+    #         # Fetch the embeddings and metadata
+    #         if column == None:
+    #             dictionary = collection.get(
+    #                 include=include
+    #                 # include=['embeddings', 'metadatas']
+    #             )
+    #             print(
+    #                 f"Embeddings are calculated from {vector_db_collection_name}")
+    #         else:
+    #             dictionary = collection.get(
+    #                 include=include,
+    #                 # include=['embeddings', 'metadatas'],
+    #                 where={
+    #                     "type": {"$eq": f"{column}"}})
+    #             print(
+    #                 f"Embeddings of {column} are calculated from {vector_db_collection_name}")
+
+    #         # Populate the embedding dictionary with embeddings from the vector database
+    #         metadatas = dictionary["metadatas"]
+    #         embeddings = dictionary["embeddings"]
+    #         if metadatas is None or embeddings is None:
+    #             raise ValueError(
+    #                 "The collection does not contain embeddings or metadatas.")
+    #         for metadata, embedding in zip(metadatas, embeddings):
+    #             embedding_dict[metadata["type"]
+    #                            ][metadata["id"]].append(embedding)
+
+    #     else:
+    #         raise ValueError(
+    #             "Persistent directory for the vector DB is not specified.")
+
+    # TODO: whose job is it to set the embeddings
     def build(self):
         if self.embedding_pooling_strategy_type is not None and not isinstance(self.embedding_pooling_strategy_type, type):
             raise TypeError(
@@ -122,49 +123,25 @@ class Pipeline(BaseModel):
         if self.ner_threshold:
             for k, v in self.ner_threshold.items():
                 kwargs[k] = v
-
-        embedding_dict = defaultdict(lambda: defaultdict(list))
-        # TODO find more effective solution
-
-        if self.column_embedding_configs:
-            for item in self.column_embedding_configs:
-                col = item["column"]
-                col_db_dir = item["vector_db_persist_directory"]
-                col_db_collection = item["vector_db_collection_name"]
-                self.__create_or_update_embeddings__(
-                    embedding_dict, col_db_dir, col_db_collection, col)
-
-        elif self.vector_db_persist_directory:
-            self.__create_or_update_embeddings__(
-                embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
-
-        else:
-            print(
-                f"There is no configuration of Embeddings")
-
-        else:
-            embedding_dict = self.embedding_dict
-            # TODO make generic
-            # embedding_size = list(embedding_dict['all_text'].values())[
-            #     0][0].shape
+
 
         # self.ner_df = CTakesNER(df=None).load(
         #     filename=self.ner_data_file) if self.ner_data_file else None
 
         dataset_splitter = self.dataset_splitter_type()
-
+        pooling_strategy = self.embedding_pooling_strategy_type(
+        ) if self.embedding_pooling_strategy_type else None
         if issubclass(self.dataset_type, TextDatasetMixin):
-            key, value = next(iter(embedding_dict.items()))
-            embedding_size = value[next(iter(value))][0].shape[0]
-            pooling_strategy = self.embedding_pooling_strategy_type(
-            ) if self.embedding_pooling_strategy_type else None
 
             dataset = self.dataset_type(
-                embedding_dict=embedding_dict,
-
-
+                embedding_dict=self.embedding_dict,
+                pooling_strategy=pooling_strategy,
+                column_embedding_configs=self.column_embedding_configs,
+                vector_db_persist_directory=self.vector_db_persist_directory,
+                vector_db_collection_name=self.vector_db_collection_name,
                 dataset_splitter_type=self.dataset_splitter_type,
                 **kwargs)
+
         elif self.dataset_type == BaseDataset:
             dataset = self.dataset_type(
                 dataset_splitter_type=self.dataset_splitter_type,
@@ -175,6 +152,7 @@ class Pipeline(BaseModel):
         # X_train, X_test, y_train, y_test, train_indexes, test_indexes, train_idx_arr, val_idx_arr = dataset.load()
 
         dataset.load()
+
         self._dataset = dataset
 
         dataframe = dataset.dataframe
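The pipeline.py side is the mirror image of the core.py change: Pipeline.build() no longer materializes embeddings itself but forwards `embedding_dict`, the instantiated pooling strategy, and the vector-DB settings to the dataset, whose `load()` now triggers `process_text()`. The per-column branch driven by `column_embedding_configs` narrows the Chroma fetch with a `where` filter on the `type` metadata key; a short sketch of that filter, again with a hypothetical path and collection name:

import chromadb

client = chromadb.PersistentClient(path="embeddings/chroma")  # hypothetical path
collection = client.get_or_create_collection("drug_texts")    # hypothetical name

# Equivalent to __create_or_update_embeddings__ with column="description":
# only records whose "type" metadata equals the requested column are returned.
result = collection.get(
    include=["embeddings", "metadatas"],
    where={"type": {"$eq": "description"}},
)
print(f"Fetched {len(result['ids'])} 'description' records")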
{ddi_fw-0.0.193.dist-info → ddi_fw-0.0.194.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
 ddi_fw/datasets/__init__.py,sha256=_I3iDHARwzmg7_EL5XKtB_TgG1yAkLSOVTujLL9Wz9Q,280
-ddi_fw/datasets/core.py,sha256=
+ddi_fw/datasets/core.py,sha256=HXU09CTbe3zpdBiUcE2w2Yxx_3yHfY_rqa31oS959jw,15531
 ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
-ddi_fw/datasets/ddi_mdl/base.py,sha256=
+ddi_fw/datasets/ddi_mdl/base.py,sha256=UevqzUUQozjRW9rnVaW2dogV_wahcEujH8c6MMvSQEo,9343
 ddi_fw/datasets/ddi_mdl/debug.log,sha256=eWz05j8RFqZuHFDTCF7Rck5w4rvtTanFN21iZsgxO7Y,115
 ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
 ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
@@ -85,7 +85,7 @@ ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,2
 ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
 ddi_fw/pipeline/multi_pipeline.py,sha256=fYyvwIOscUahjXd3QO5RSFrp1LliGR7RzOZyAXrXXz4,5637
 ddi_fw/pipeline/ner_pipeline.py,sha256=Bp6BA6nozfWFaMHH6jKlzesnCGO6qiMkzdGy_ed6nh0,5947
-ddi_fw/pipeline/pipeline.py,sha256=
+ddi_fw/pipeline/pipeline.py,sha256=CUHuy1nNgGD-eUcLnWFXcmSoTGssmg4ZFRAY1Cufey0,9047
 ddi_fw/utils/__init__.py,sha256=HC32XkYQTYH_9vt0eX6tqQngEFG-R70hGrYkT-BcHCk,519
 ddi_fw/utils/categorical_data_encoding_checker.py,sha256=gzb_vUDBrCMUhBxY1fBYTe8hmK72p0_uw3DTga8cqP8,1580
 ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
@@ -99,7 +99,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
 ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
 ddi_fw/vectorization/feature_vector_generation.py,sha256=EBf-XAiwQwr68az91erEYNegfeqssBR29kVgrliIyac,4765
 ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
+ddi_fw-0.0.194.dist-info/METADATA,sha256=Nv82MVq4n0p6vpkGa_mBn0kw0rRAKaYIauDLJhZVUkI,2542
+ddi_fw-0.0.194.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ddi_fw-0.0.194.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.194.dist-info/RECORD,,
{ddi_fw-0.0.193.dist-info → ddi_fw-0.0.194.dist-info}/WHEEL
File without changes

{ddi_fw-0.0.193.dist-info → ddi_fw-0.0.194.dist-info}/top_level.txt
File without changes