ddi-fw 0.0.197__tar.gz → 0.0.199__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/PKG-INFO +1 -1
  2. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/pyproject.toml +1 -1
  3. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/core.py +56 -41
  4. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/base.py +12 -9
  5. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ml/ml_helper.py +14 -5
  6. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/pipeline/multi_pipeline.py +2 -0
  7. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/pipeline/pipeline.py +2 -1
  8. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw.egg-info/PKG-INFO +1 -1
  9. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/README.md +0 -0
  10. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/setup.cfg +0 -0
  11. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/__init__.py +0 -0
  12. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/dataset_splitter.py +0 -0
  13. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/db_utils.py +0 -0
  14. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/data/event.db +0 -0
  15. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/debug.log +0 -0
  16. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt +0 -0
  17. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_0.txt +0 -0
  18. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_1.txt +0 -0
  19. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_2.txt +0 -0
  20. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_3.txt +0 -0
  21. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_4.txt +0 -0
  22. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/train_indexes.txt +0 -0
  23. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_0.txt +0 -0
  24. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_1.txt +0 -0
  25. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_2.txt +0 -0
  26. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_3.txt +0 -0
  27. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_4.txt +0 -0
  28. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/test_indexes.txt +0 -0
  29. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_0.txt +0 -0
  30. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_1.txt +0 -0
  31. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_2.txt +0 -0
  32. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_3.txt +0 -0
  33. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_4.txt +0 -0
  34. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_indexes.txt +0 -0
  35. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_0.txt +0 -0
  36. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_1.txt +0 -0
  37. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_2.txt +0 -0
  38. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_3.txt +0 -0
  39. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_4.txt +0 -0
  40. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/readme.md +0 -0
  41. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/base.py +0 -0
  42. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/data/event.db +0 -0
  43. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/test_indexes.txt +0 -0
  44. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_0.txt +0 -0
  45. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_1.txt +0 -0
  46. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_2.txt +0 -0
  47. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_3.txt +0 -0
  48. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_4.txt +0 -0
  49. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_indexes.txt +0 -0
  50. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_0.txt +0 -0
  51. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_1.txt +0 -0
  52. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_2.txt +0 -0
  53. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_3.txt +0 -0
  54. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_4.txt +0 -0
  55. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/__init__.py +0 -0
  56. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/base.py +0 -0
  57. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/df_extraction_cleanxiaoyu50.csv +0 -0
  58. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/drug_information_del_noDDIxiaoyu50.csv +0 -0
  59. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/test_indexes.txt +0 -0
  60. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_0.txt +0 -0
  61. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_1.txt +0 -0
  62. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_2.txt +0 -0
  63. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_3.txt +0 -0
  64. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_4.txt +0 -0
  65. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_indexes.txt +0 -0
  66. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_0.txt +0 -0
  67. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_1.txt +0 -0
  68. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_2.txt +0 -0
  69. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_3.txt +0 -0
  70. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_4.txt +0 -0
  71. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/mdf-sa-ddi.zip +0 -0
  72. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/setup_._py +0 -0
  73. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/drugbank/__init__.py +0 -0
  74. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/drugbank/drugbank.xsd +0 -0
  75. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/drugbank/drugbank_parser.py +0 -0
  76. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/drugbank/drugbank_processor.py +0 -0
  77. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/drugbank/drugbank_processor_org.py +0 -0
  78. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/drugbank/event_extractor.py +0 -0
  79. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/langchain/__init__.py +0 -0
  80. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/langchain/embeddings.py +0 -0
  81. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
  82. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/langchain/storage.py +0 -0
  83. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ml/__init__.py +0 -0
  84. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ml/evaluation_helper.py +0 -0
  85. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ml/model_wrapper.py +0 -0
  86. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ml/pytorch_wrapper.py +0 -0
  87. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ml/tensorflow_wrapper.py +0 -0
  88. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ner/__init__.py +0 -0
  89. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ner/mmlrestclient.py +0 -0
  90. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ner/ner.py +0 -0
  91. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/pipeline/__init__.py +0 -0
  92. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +0 -0
  93. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/pipeline/ner_pipeline.py +0 -0
  94. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/__init__.py +0 -0
  95. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/categorical_data_encoding_checker.py +0 -0
  96. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/enums.py +0 -0
  97. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/json_helper.py +0 -0
  98. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/kaggle.py +0 -0
  99. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/numpy_utils.py +0 -0
  100. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/package_helper.py +0 -0
  101. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/py7zr_helper.py +0 -0
  102. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/utils.py +0 -0
  103. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/zip_helper.py +0 -0
  104. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/vectorization/__init__.py +0 -0
  105. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/vectorization/feature_vector_generation.py +0 -0
  106. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/vectorization/idf_helper.py +0 -0
  107. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw.egg-info/SOURCES.txt +0 -0
  108. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
  109. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw.egg-info/requires.txt +0 -0
  110. {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ddi_fw
3
- Version: 0.0.197
3
+ Version: 0.0.199
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
6
6
 
7
7
  [project]
8
8
  name = "ddi_fw"
9
- version = "0.0.197"
9
+ version = "0.0.199"
10
10
  description = "Do not use :)"
11
11
  readme = "README.md"
12
12
  authors = [
@@ -1,3 +1,4 @@
1
+ import abc
1
2
  from collections import defaultdict
2
3
  import glob
3
4
  import logging
@@ -57,7 +58,7 @@ def generate_sim_matrices_new(df, generated_vectors, columns, key_column="id"):
57
58
  return similarity_matrices
58
59
 
59
60
 
60
- class BaseDataset(BaseModel):
61
+ class BaseDataset(BaseModel, abc.ABC):
61
62
  dataset_name: str
62
63
  index_path: Optional[str] = None
63
64
  dataset_splitter_type: Type[DatasetSplitter]
@@ -125,19 +126,26 @@ class BaseDataset(BaseModel):
125
126
  def set_dataframe(self, dataframe: pd.DataFrame):
126
127
  self.dataframe = dataframe
127
128
 
128
- # @abstractmethod
129
+ @abc.abstractmethod
129
130
  def prep(self):
130
- pass
131
+ """Prepare the dataset. This method should be overridden in subclasses."""
132
+
131
133
 
134
+ def handle_mixins(self):
135
+ """Handle mixin-specific logic."""
136
+ if isinstance(self, TextDatasetMixin):
137
+ self.process_text()
138
+ # if isinstance(self, ImageDatasetMixin):
139
+ # self.process_image_data()
140
+ # Add other mixin-specific logic here
141
+
132
142
  def load(self):
133
143
  """
134
144
  Load the dataset. If X_train, y_train, X_test, and y_test are already provided,
135
145
  skip deriving them. Otherwise, derive them from the dataframe and indices.
136
146
  """
137
- self.prep()
138
-
139
- if isinstance(self, TextDatasetMixin):
140
- self.process_text()
147
+ self.handle_mixins() # Centralized mixin handling
148
+ self.prep() # Prepare the dataset
141
149
 
142
150
  if self.X_train is not None or self.y_train is not None or self.X_test is not None or self.y_test is not None:
143
151
  # Data is already provided, no need to calculate
@@ -158,9 +166,12 @@ class BaseDataset(BaseModel):
158
166
  self.index_path)
159
167
  except FileNotFoundError as e:
160
168
  raise FileNotFoundError(f"Index files not found: {e.filename}")
161
-
162
- train = self.dataframe[self.dataframe.index.isin(train_idx_all)]
163
- test = self.dataframe[self.dataframe.index.isin(test_idx_all)]
169
+
170
+ # train = self.dataframe[self.dataframe.index.isin(train_idx_all)]
171
+ # test = self.dataframe[self.dataframe.index.isin(test_idx_all)]
172
+ columns = self.columns + [self.class_column]
173
+ train = self.dataframe.loc[self.dataframe.index.isin(train_idx_all), columns]
174
+ test = self.dataframe.loc[self.dataframe.index.isin(test_idx_all), columns]
164
175
  X_train = train.drop(self.class_column, axis=1)
165
176
  X_train = train.drop(self.class_column, axis=1)
166
177
  y_train = train[self.class_column]
@@ -259,13 +270,18 @@ class BaseDataset(BaseModel):
259
270
 
260
271
 
261
272
  class TextDatasetMixin(BaseModel):
262
- embedding_size: Optional[int] = None
263
273
  embedding_dict: Dict[str, Any] | None = Field(
264
274
  default_factory=dict, description="Dictionary for embeddings")
265
275
  pooling_strategy: PoolingStrategy | None = None
266
276
  column_embedding_configs: Optional[Dict] = None
267
277
  vector_db_persist_directory: Optional[str] = None
268
278
  vector_db_collection_name: Optional[str] = None
279
+ _embedding_size: int
280
+
281
+ @computed_field
282
+ @property
283
+ def embedding_size(self) -> int:
284
+ return self._embedding_size
269
285
 
270
286
  class Config:
271
287
  arbitrary_types_allowed = True
@@ -317,44 +333,43 @@ class TextDatasetMixin(BaseModel):
317
333
  else:
318
334
  raise ValueError(
319
335
  "Persistent directory for the vector DB is not specified.")
336
+
337
+ def __initialize_embedding_dict(self):
338
+ embedding_dict = defaultdict(lambda: defaultdict(list))
339
+ if self.column_embedding_configs:
340
+ for item in self.column_embedding_configs:
341
+ col = item["column"]
342
+ col_db_dir = item["vector_db_persist_directory"]
343
+ col_db_collection = item["vector_db_collection_name"]
344
+ self.__create_or_update_embeddings__(embedding_dict, col_db_dir, col_db_collection, col)
345
+ elif self.vector_db_persist_directory:
346
+ self.__create_or_update_embeddings__(embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
347
+ else:
348
+ logging.warning("There is no configuration of Embeddings")
349
+ raise ValueError(
350
+ "There is no configuration of Embeddings. Please provide a vector database directory and collection name.")
351
+ return embedding_dict
320
352
 
321
- def process_text(self):
322
- # key, value = next(iter(embedding_dict.items()))
323
- # embedding_size = value[next(iter(value))][0].shape[0]
324
- # pooling_strategy = self.embedding_pooling_strategy_type(
325
- # ) if self.embedding_pooling_strategy_type else None
353
+ def __calculate_embedding_size(self):
354
+ if self.embedding_dict is None:
355
+ raise ValueError("Embedding dictionary is not initialized, embedding size cannot be calculated.")
356
+
357
+ key, value = next(iter(self.embedding_dict.items()))
358
+ self._embedding_size = value[next(iter(value))][0].shape[0]
326
359
 
360
+ def process_text(self):
361
+ logging.info("Processing text data...")
362
+
327
363
  # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
328
364
  # kwargs = {"columns": self.columns}
329
365
  # if self.ner_threshold:
330
366
  # for k, v in self.ner_threshold.items():
331
367
  # kwargs[k] = v
332
- if self.embedding_dict == None:
333
- embedding_dict = defaultdict(lambda: defaultdict(list))
334
- # TODO find more effective solution
335
-
336
- if self.column_embedding_configs:
337
- for item in self.column_embedding_configs:
338
- col = item["column"]
339
- col_db_dir = item["vector_db_persist_directory"]
340
- col_db_collection = item["vector_db_collection_name"]
341
- self.__create_or_update_embeddings__(
342
- embedding_dict, col_db_dir, col_db_collection, col)
343
-
344
- elif self.vector_db_persist_directory:
345
- self.__create_or_update_embeddings__(
346
- embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
368
+ if self.embedding_dict is None:
369
+ self.embedding_dict = self.__initialize_embedding_dict()
347
370
 
348
- else:
349
- print(
350
- f"There is no configuration of Embeddings")
351
- self.embedding_dict = embedding_dict
352
-
353
- # else:
354
- # embedding_dict = self.embedding_dict
355
- # TODO make generic
356
- # embedding_size = list(embedding_dict['all_text'].values())[
357
- # 0][0].shape
371
+ self.__calculate_embedding_size()
372
+
358
373
 
359
374
 
360
375
  # class ImageDatasetMixin(BaseModel):
@@ -81,6 +81,7 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
81
81
  self.chemical_property_columns = chemical_property_columns
82
82
  self.embedding_columns = embedding_columns
83
83
  self.ner_columns = ner_columns
84
+ self.columns = [] # these variable is modified in prep method
84
85
 
85
86
  self.class_column = 'event_category'
86
87
  _db_path = HERE.joinpath('data/event.db')
@@ -91,7 +92,7 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
91
92
  self.__similarity_related_columns__.extend(self.ner_columns)
92
93
  # TODO with resource
93
94
  self._conn = create_connection(_db_path.absolute().as_posix())
94
- self.load_drugs_and_events()
95
+ # self.load_drugs_and_events()
95
96
  logger.info(f'{self.dataset_name} is initialized')
96
97
 
97
98
  def load_drugs_and_events(self):
@@ -131,6 +132,7 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
131
132
  return pd.DataFrame(columns=headers, data=rows)
132
133
 
133
134
  def prep(self):
135
+ self.load_drugs_and_events()
134
136
  if self.drugs_df is None or self.ddis_df is None:
135
137
  raise Exception("There is no data")
136
138
 
@@ -220,14 +222,15 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
220
222
  self.columns.append(key)
221
223
  print(self.ddis_df[key].head())
222
224
 
223
- for embedding_column in self.embedding_columns:
224
- print(f"concat {embedding_column} embeddings")
225
- embeddings_after_pooling = {k: self.embeddings_pooling_strategy.apply(
226
- v) for k, v in self.embedding_dict[embedding_column].items()}
227
- # column_embeddings_dict = embedding_values[embedding_column]
228
- self.ddis_df[embedding_column+'_embedding'] = self.ddis_df.apply(
229
- x_fnc, args=(embeddings_after_pooling,), axis=1)
230
- self.columns.append(embedding_column+'_embedding')
225
+ if self.embedding_dict is not None:
226
+ for embedding_column in self.embedding_columns:
227
+ print(f"concat {embedding_column} embeddings")
228
+ embeddings_after_pooling = {k: self.pooling_strategy.apply(
229
+ v) for k, v in self.embedding_dict[embedding_column].items()}
230
+ # column_embeddings_dict = embedding_values[embedding_column]
231
+ self.ddis_df[embedding_column+'_embedding'] = self.ddis_df.apply(
232
+ x_fnc, args=(embeddings_after_pooling,), axis=1)
233
+ self.columns.append(embedding_column+'_embedding')
231
234
 
232
235
  dataframe = self.ddis_df.copy()
233
236
  if not isinstance(classes, (list, pd.Series, np.ndarray)):
@@ -32,9 +32,10 @@ import ddi_fw.utils as utils
32
32
 
33
33
  class MultiModalRunner:
34
34
  # todo model related parameters to config
35
- def __init__(self, library, multi_modal, use_mlflow=False):
35
+ def __init__(self, library, multi_modal, default_model, use_mlflow=False):
36
36
  self.library = library
37
37
  self.multi_modal = multi_modal
38
+ self.default_model = default_model
38
39
  self.use_mlflow = use_mlflow
39
40
  self.result = Result()
40
41
 
@@ -60,14 +61,13 @@ class MultiModalRunner:
60
61
  # TODO check single_results, 1d,2d ...
61
62
  def __predict(self, single_results):
62
63
  item_dict = {t[0]: t for t in self.items}
63
- print("multi_modal")
64
- print(self.multi_modal)
65
- print(item_dict.keys())
64
+ if self.default_model is None and not self.multi_modal:
65
+ raise Exception("Default model and multi modal cannot be None at the same time")
66
66
 
67
67
  if self.multi_modal:
68
68
  for m in self.multi_modal:
69
69
  name = m.get('name')
70
- input_type = m.get('input_type')
70
+ # input_type = m.get('input_type')
71
71
  input = m.get('input')
72
72
  inputs = m.get('inputs')
73
73
  model_type = get_import(m.get("model_type"))
@@ -100,6 +100,15 @@ class MultiModalRunner:
100
100
  else:
101
101
  raise Exception("check configurations")
102
102
  else: # TODO default model maybe?
103
+ print("Default model will be used")
104
+ name = self.default_model.get('name')
105
+ # input_type = m.get('input_type')
106
+ input = self.default_model.get('input')
107
+ inputs = self.default_model.get('inputs')
108
+ model_type = get_import(self.default_model.get("model_type"))
109
+ kwargs = self.default_model.get('params')
110
+ single_modal = T(self.date, name, model_type,
111
+ use_mlflow=self.use_mlflow, **kwargs)
103
112
  item = self.items[0]
104
113
  single_modal.set_data(
105
114
  self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
@@ -57,6 +57,7 @@ class MultiPipeline():
57
57
  tracking_uri = config.get("tracking_uri")
58
58
  artifact_location = config.get("artifact_location")
59
59
  #new
60
+ default_model = config.get("default_model"),
60
61
  multi_modal = config.get("multi_modal")
61
62
  columns = config.get("columns")
62
63
  ner_data_file = config.get("ner_data_file")
@@ -101,6 +102,7 @@ class MultiPipeline():
101
102
  ner_data_file=ner_data_file,
102
103
  ner_threshold=ner_threshold,
103
104
  combinations=combinations,
105
+ default_model=default_model,
104
106
  multi_modal= multi_modal)
105
107
  elif type== "ner_search":
106
108
  pipeline = NerParameterSearch(
@@ -35,6 +35,7 @@ class Pipeline(BaseModel):
35
35
  ner_threshold: Optional[dict] = None
36
36
  combinations: Optional[List[str]] = None
37
37
  model: Optional[Any] = None
38
+ default_model: Optional[Any] = None
38
39
  multi_modal: Optional[Any] = None
39
40
  use_mlflow: bool = False
40
41
  _dataset: BaseDataset = []
@@ -193,7 +194,7 @@ class Pipeline(BaseModel):
193
194
 
194
195
  y_test_label = self.items[0][4]
195
196
  multi_modal_runner = MultiModalRunner(
196
- library=self.library, multi_modal=self.multi_modal, use_mlflow=self.use_mlflow)
197
+ library=self.library, multi_modal=self.multi_modal, default_model= self.default_model , use_mlflow=self.use_mlflow)
197
198
  # multi_modal_runner = MultiModalRunner(
198
199
  # library=self.library, model_func=model_func, batch_size=batch_size, epochs=epochs)
199
200
  # multi_modal = TFMultiModal(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ddi_fw
3
- Version: 0.0.197
3
+ Version: 0.0.199
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
File without changes
File without changes
File without changes