ddi-fw 0.0.193__py3-none-any.whl → 0.0.194__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/datasets/core.py CHANGED
@@ -1,6 +1,9 @@
+ from collections import defaultdict
  import glob
  import logging
  from typing import Any, Dict, List, Optional, Type
+ import chromadb
+ from chromadb.api.types import IncludeEnum
  import numpy as np
  import pandas as pd
  from pydantic import BaseModel, Field, computed_field
@@ -132,6 +135,9 @@ class BaseDataset(BaseModel):
          skip deriving them. Otherwise, derive them from the dataframe and indices.
          """
          self.prep()
+
+         if isinstance(self, TextDatasetMixin):
+             self.process_text()

          if self.X_train is not None or self.y_train is not None or self.X_test is not None or self.y_test is not None:
              # Data is already provided, no need to calculate
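
With this hook, load() now runs text preparation itself: after prep(), any dataset that mixes in TextDatasetMixin has its embeddings resolved via process_text() before the train/test splits are derived. A minimal sketch of the new call order, assuming a hypothetical TextDatasetMixin subclass named MyTextDataset:

    # MyTextDataset is illustrative only; in this package, DDIMDLDataset plays this role.
    dataset = MyTextDataset(
        vector_db_persist_directory="embeddings/chroma",  # assumed path
        vector_db_collection_name="all_text",             # assumed collection name
    )
    dataset.load()  # prep() -> process_text() (via the isinstance check) -> split derivation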
@@ -254,12 +260,98 @@ class BaseDataset(BaseModel):

  class TextDatasetMixin(BaseDataset):
      embedding_size: Optional[int] = None
-     embedding_dict: Dict[str, Any] = Field(
+     embedding_dict: Dict[str, Any] | None = Field(
          default_factory=dict, description="Dictionary for embeddings")
-     embeddings_pooling_strategy: PoolingStrategy | None = None
+     pooling_strategy: PoolingStrategy | None = None
+     column_embedding_configs: Optional[Dict] = None
+     vector_db_persist_directory: Optional[str] = None
+     vector_db_collection_name: Optional[str] = None

+     def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
+         """
+         Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
+
+         Args:
+         - vector_db_persist_directory (str): The path to the directory where the Chroma vector database is stored.
+         - vector_db_collection_name (str): The name of the collection to query.
+         - embedding_dict (dict): The existing dictionary to update with embeddings.
+
+         """
+         if vector_db_persist_directory:
+             # Initialize the Chroma client and get the collection
+             vector_db = chromadb.PersistentClient(
+                 path=vector_db_persist_directory)
+             collection = vector_db.get_collection(vector_db_collection_name)
+             include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
+             dictionary: chromadb.GetResult
+             # Fetch the embeddings and metadata
+             if column == None:
+                 dictionary = collection.get(
+                     include=include
+                     # include=['embeddings', 'metadatas']
+                 )
+                 print(
+                     f"Embeddings are calculated from {vector_db_collection_name}")
+             else:
+                 dictionary = collection.get(
+                     include=include,
+                     # include=['embeddings', 'metadatas'],
+                     where={
+                         "type": {"$eq": f"{column}"}})
+                 print(
+                     f"Embeddings of {column} are calculated from {vector_db_collection_name}")
+
+             # Populate the embedding dictionary with embeddings from the vector database
+             metadatas = dictionary["metadatas"]
+             embeddings = dictionary["embeddings"]
+             if metadatas is None or embeddings is None:
+                 raise ValueError(
+                     "The collection does not contain embeddings or metadatas.")
+             for metadata, embedding in zip(metadatas, embeddings):
+                 embedding_dict[metadata["type"]
+                                ][metadata["id"]].append(embedding)
+
+         else:
+             raise ValueError(
+                 "Persistent directory for the vector DB is not specified.")
+
      def process_text(self):
-         pass
+         # key, value = next(iter(embedding_dict.items()))
+         # embedding_size = value[next(iter(value))][0].shape[0]
+         # pooling_strategy = self.embedding_pooling_strategy_type(
+         # ) if self.embedding_pooling_strategy_type else None
+
+
+         # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
+         # kwargs = {"columns": self.columns}
+         # if self.ner_threshold:
+         #     for k, v in self.ner_threshold.items():
+         #         kwargs[k] = v
+         if self.embedding_dict == None:
+             embedding_dict = defaultdict(lambda: defaultdict(list))
+             # TODO find more effective solution
+
+             if self.column_embedding_configs:
+                 for item in self.column_embedding_configs:
+                     col = item["column"]
+                     col_db_dir = item["vector_db_persist_directory"]
+                     col_db_collection = item["vector_db_collection_name"]
+                     self.__create_or_update_embeddings__(
+                         embedding_dict, col_db_dir, col_db_collection, col)
+
+             elif self.vector_db_persist_directory:
+                 self.__create_or_update_embeddings__(
+                     embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
+
+             else:
+                 print(
+                     f"There is no configuration of Embeddings")
+
+         else:
+             embedding_dict = self.embedding_dict
+             # TODO make generic
+             # embedding_size = list(embedding_dict['all_text'].values())[
+             #     0][0].shape


  # class ImageDatasetMixin(BaseModel):
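
A note on the data layout the new mixin assumes: each record in the Chroma collection carries metadata with a "type" key (the source column) and an "id" key (the drug identifier), and process_text() groups the fetched vectors into a two-level dictionary. A minimal sketch of that grouping, using the same shapes as the code above (the concrete column and id values are illustrative only):

    from collections import defaultdict

    # embedding_dict[column][drug_id] -> list of embedding vectors
    embedding_dict = defaultdict(lambda: defaultdict(list))
    for metadata, embedding in zip(metadatas, embeddings):
        embedding_dict[metadata["type"]][metadata["id"]].append(embedding)
    # e.g. embedding_dict["description"]["DB00001"][0] would be one embedding
    # vector for that drug's description text.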
ddi_fw/datasets/ddi_mdl/base.py CHANGED
@@ -158,6 +158,7 @@ class DDIMDLDataset(TextDatasetMixin):
          generated_vectors = generate_vectors(
              chemical_properties_df, self.__similarity_related_columns__)

+         # TODO if necessary
          similarity_matrices = generate_sim_matrices_new(
              chemical_properties_df, generated_vectors, self.__similarity_related_columns__, key_column="id")

ddi_fw/pipeline/pipeline.py CHANGED
@@ -61,54 +61,55 @@ class Pipeline(BaseModel):
      class Config:
          arbitrary_types_allowed = True

-     def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
-         """
-         Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
-
-         Args:
-         - vector_db_persist_directory (str): The path to the directory where the Chroma vector database is stored.
-         - vector_db_collection_name (str): The name of the collection to query.
-         - embedding_dict (dict): The existing dictionary to update with embeddings.
-
-         """
-         if vector_db_persist_directory:
-             # Initialize the Chroma client and get the collection
-             vector_db = chromadb.PersistentClient(
-                 path=vector_db_persist_directory)
-             collection = vector_db.get_collection(vector_db_collection_name)
-             include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
-             dictionary: chromadb.GetResult
-             # Fetch the embeddings and metadata
-             if column == None:
-                 dictionary = collection.get(
-                     include=include
-                     # include=['embeddings', 'metadatas']
-                 )
-                 print(
-                     f"Embeddings are calculated from {vector_db_collection_name}")
-             else:
-                 dictionary = collection.get(
-                     include=include,
-                     # include=['embeddings', 'metadatas'],
-                     where={
-                         "type": {"$eq": f"{column}"}})
-                 print(
-                     f"Embeddings of {column} are calculated from {vector_db_collection_name}")
-
-             # Populate the embedding dictionary with embeddings from the vector database
-             metadatas = dictionary["metadatas"]
-             embeddings = dictionary["embeddings"]
-             if metadatas is None or embeddings is None:
-                 raise ValueError(
-                     "The collection does not contain embeddings or metadatas.")
-             for metadata, embedding in zip(metadatas, embeddings):
-                 embedding_dict[metadata["type"]
-                                ][metadata["id"]].append(embedding)
-
-         else:
-             raise ValueError(
-                 "Persistent directory for the vector DB is not specified.")
-
+     # def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
+     #     """
+     #     Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
+
+     #     Args:
+     #     - vector_db_persist_directory (str): The path to the directory where the Chroma vector database is stored.
+     #     - vector_db_collection_name (str): The name of the collection to query.
+     #     - embedding_dict (dict): The existing dictionary to update with embeddings.
+
+     #     """
+     #     if vector_db_persist_directory:
+     #         # Initialize the Chroma client and get the collection
+     #         vector_db = chromadb.PersistentClient(
+     #             path=vector_db_persist_directory)
+     #         collection = vector_db.get_collection(vector_db_collection_name)
+     #         include = [IncludeEnum.embeddings, IncludeEnum.metadatas]
+     #         dictionary: chromadb.GetResult
+     #         # Fetch the embeddings and metadata
+     #         if column == None:
+     #             dictionary = collection.get(
+     #                 include=include
+     #                 # include=['embeddings', 'metadatas']
+     #             )
+     #             print(
+     #                 f"Embeddings are calculated from {vector_db_collection_name}")
+     #         else:
+     #             dictionary = collection.get(
+     #                 include=include,
+     #                 # include=['embeddings', 'metadatas'],
+     #                 where={
+     #                     "type": {"$eq": f"{column}"}})
+     #             print(
+     #                 f"Embeddings of {column} are calculated from {vector_db_collection_name}")
+
+     #         # Populate the embedding dictionary with embeddings from the vector database
+     #         metadatas = dictionary["metadatas"]
+     #         embeddings = dictionary["embeddings"]
+     #         if metadatas is None or embeddings is None:
+     #             raise ValueError(
+     #                 "The collection does not contain embeddings or metadatas.")
+     #         for metadata, embedding in zip(metadatas, embeddings):
+     #             embedding_dict[metadata["type"]
+     #                            ][metadata["id"]].append(embedding)
+
+     #     else:
+     #         raise ValueError(
+     #             "Persistent directory for the vector DB is not specified.")
+
+     # TODO: whose job is it to set the embeddings?
      def build(self):
          if self.embedding_pooling_strategy_type is not None and not isinstance(self.embedding_pooling_strategy_type, type):
              raise TypeError(
@@ -122,49 +123,25 @@ class Pipeline(BaseModel):
          if self.ner_threshold:
              for k, v in self.ner_threshold.items():
                  kwargs[k] = v
-         if self.embedding_dict == None:
-             embedding_dict = defaultdict(lambda: defaultdict(list))
-             # TODO find more effective solution
-
-             if self.column_embedding_configs:
-                 for item in self.column_embedding_configs:
-                     col = item["column"]
-                     col_db_dir = item["vector_db_persist_directory"]
-                     col_db_collection = item["vector_db_collection_name"]
-                     self.__create_or_update_embeddings__(
-                         embedding_dict, col_db_dir, col_db_collection, col)
-
-             elif self.vector_db_persist_directory:
-                 self.__create_or_update_embeddings__(
-                     embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
-
-             else:
-                 print(
-                     f"There is no configuration of Embeddings")
-
-         else:
-             embedding_dict = self.embedding_dict
-             # TODO make generic
-             # embedding_size = list(embedding_dict['all_text'].values())[
-             #     0][0].shape
+

          # self.ner_df = CTakesNER(df=None).load(
          #     filename=self.ner_data_file) if self.ner_data_file else None

          dataset_splitter = self.dataset_splitter_type()
-
+         pooling_strategy = self.embedding_pooling_strategy_type(
+         ) if self.embedding_pooling_strategy_type else None
          if issubclass(self.dataset_type, TextDatasetMixin):
-             key, value = next(iter(embedding_dict.items()))
-             embedding_size = value[next(iter(value))][0].shape[0]
-             pooling_strategy = self.embedding_pooling_strategy_type(
-             ) if self.embedding_pooling_strategy_type else None

              dataset = self.dataset_type(
-                 embedding_dict=embedding_dict,
-                 embedding_size=embedding_size,
-                 embeddings_pooling_strategy=pooling_strategy,
+                 embedding_dict=self.embedding_dict,
+                 pooling_strategy=pooling_strategy,
+                 column_embedding_configs=self.column_embedding_configs,
+                 vector_db_persist_directory=self.vector_db_persist_directory,
+                 vector_db_collection_name=self.vector_db_collection_name,
                  dataset_splitter_type=self.dataset_splitter_type,
                  **kwargs)
+
          elif self.dataset_type == BaseDataset:
              dataset = self.dataset_type(
                  dataset_splitter_type=self.dataset_splitter_type,
@@ -175,6 +152,7 @@ class Pipeline(BaseModel):
          # X_train, X_test, y_train, y_test, train_indexes, test_indexes, train_idx_arr, val_idx_arr = dataset.load()

          dataset.load()
+
          self._dataset = dataset

          dataframe = dataset.dataframe
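
The net effect of these pipeline changes: build() no longer queries the vector database itself; it forwards embedding_dict, the pooling strategy, and the vector-DB settings to the dataset, and the actual fetch happens inside dataset.load() via process_text(). A hedged sketch of how a caller might wire this up after the change (DatasetSplitter and MeanPooling are placeholder class names, not confirmed APIs; the paths are assumptions):

    pipeline = Pipeline(
        dataset_type=DDIMDLDataset,
        dataset_splitter_type=DatasetSplitter,            # placeholder splitter class
        embedding_pooling_strategy_type=MeanPooling,      # hypothetical PoolingStrategy subclass
        vector_db_persist_directory="embeddings/chroma",  # assumed path
        vector_db_collection_name="all_text",             # assumed collection name
    )
    pipeline.build()  # constructs DDIMDLDataset with the vector-DB settings,
                      # then calls dataset.load(), which triggers process_text()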
{ddi_fw-0.0.193.dist-info → ddi_fw-0.0.194.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ddi_fw
- Version: 0.0.193
+ Version: 0.0.194
  Summary: Do not use :)
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
{ddi_fw-0.0.193.dist-info → ddi_fw-0.0.194.dist-info}/RECORD RENAMED
@@ -1,9 +1,9 @@
  ddi_fw/datasets/__init__.py,sha256=_I3iDHARwzmg7_EL5XKtB_TgG1yAkLSOVTujLL9Wz9Q,280
- ddi_fw/datasets/core.py,sha256=4705a94kKBueyWFXRJ3cnivAGKjrR89uBBKpxtMozOM,11080
+ ddi_fw/datasets/core.py,sha256=HXU09CTbe3zpdBiUcE2w2Yxx_3yHfY_rqa31oS959jw,15531
  ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
  ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
  ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
- ddi_fw/datasets/ddi_mdl/base.py,sha256=bdcGmEbY_2Fe8fg0pKxfMuDopgaPUTUfQasCy8Bhcvc,9313
+ ddi_fw/datasets/ddi_mdl/base.py,sha256=UevqzUUQozjRW9rnVaW2dogV_wahcEujH8c6MMvSQEo,9343
  ddi_fw/datasets/ddi_mdl/debug.log,sha256=eWz05j8RFqZuHFDTCF7Rck5w4rvtTanFN21iZsgxO7Y,115
  ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
  ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
@@ -85,7 +85,7 @@ ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,2
  ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
  ddi_fw/pipeline/multi_pipeline.py,sha256=fYyvwIOscUahjXd3QO5RSFrp1LliGR7RzOZyAXrXXz4,5637
  ddi_fw/pipeline/ner_pipeline.py,sha256=Bp6BA6nozfWFaMHH6jKlzesnCGO6qiMkzdGy_ed6nh0,5947
- ddi_fw/pipeline/pipeline.py,sha256=fRNUmKMrIiN_pX0aU57FGxaZ-1gdaI9IPBwAv3qgO7o,9961
+ ddi_fw/pipeline/pipeline.py,sha256=CUHuy1nNgGD-eUcLnWFXcmSoTGssmg4ZFRAY1Cufey0,9047
  ddi_fw/utils/__init__.py,sha256=HC32XkYQTYH_9vt0eX6tqQngEFG-R70hGrYkT-BcHCk,519
  ddi_fw/utils/categorical_data_encoding_checker.py,sha256=gzb_vUDBrCMUhBxY1fBYTe8hmK72p0_uw3DTga8cqP8,1580
  ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
@@ -99,7 +99,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
  ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
  ddi_fw/vectorization/feature_vector_generation.py,sha256=EBf-XAiwQwr68az91erEYNegfeqssBR29kVgrliIyac,4765
  ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
- ddi_fw-0.0.193.dist-info/METADATA,sha256=onl9mPw_lsOyg_rdTxnYXNVGFGULp-SMmDGz82vPZE0,2542
- ddi_fw-0.0.193.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- ddi_fw-0.0.193.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
- ddi_fw-0.0.193.dist-info/RECORD,,
+ ddi_fw-0.0.194.dist-info/METADATA,sha256=Nv82MVq4n0p6vpkGa_mBn0kw0rRAKaYIauDLJhZVUkI,2542
+ ddi_fw-0.0.194.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ ddi_fw-0.0.194.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ ddi_fw-0.0.194.dist-info/RECORD,,