ddi-fw 0.0.197__tar.gz → 0.0.199__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/PKG-INFO +1 -1
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/pyproject.toml +1 -1
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/core.py +56 -41
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/base.py +12 -9
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ml/ml_helper.py +14 -5
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/pipeline/multi_pipeline.py +2 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/pipeline/pipeline.py +2 -1
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw.egg-info/PKG-INFO +1 -1
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/README.md +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/setup.cfg +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/__init__.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/dataset_splitter.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/db_utils.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/data/event.db +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/debug.log +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/test_indexes.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_0.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_1.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_2.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_3.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_4.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_indexes.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/readme.md +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/base.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/data/event.db +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/__init__.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/base.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/df_extraction_cleanxiaoyu50.csv +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/drug_information_del_noDDIxiaoyu50.csv +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/mdf-sa-ddi.zip +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/setup_._py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/drugbank/__init__.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/drugbank/drugbank.xsd +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/drugbank/drugbank_parser.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/drugbank/drugbank_processor.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/drugbank/drugbank_processor_org.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/drugbank/event_extractor.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/langchain/__init__.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/langchain/embeddings.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/langchain/storage.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ml/__init__.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ml/evaluation_helper.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ml/model_wrapper.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ml/pytorch_wrapper.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ml/tensorflow_wrapper.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ner/__init__.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ner/mmlrestclient.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/ner/ner.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/pipeline/__init__.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/pipeline/ner_pipeline.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/__init__.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/categorical_data_encoding_checker.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/enums.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/json_helper.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/kaggle.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/numpy_utils.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/package_helper.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/py7zr_helper.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/utils.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/utils/zip_helper.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/vectorization/__init__.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/vectorization/feature_vector_generation.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/vectorization/idf_helper.py +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw.egg-info/SOURCES.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw.egg-info/requires.txt +0 -0
- {ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw.egg-info/top_level.txt +0 -0
@@ -1,3 +1,4 @@
|
|
1
|
+
import abc
|
1
2
|
from collections import defaultdict
|
2
3
|
import glob
|
3
4
|
import logging
|
@@ -57,7 +58,7 @@ def generate_sim_matrices_new(df, generated_vectors, columns, key_column="id"):
|
|
57
58
|
return similarity_matrices
|
58
59
|
|
59
60
|
|
60
|
-
class BaseDataset(BaseModel):
|
61
|
+
class BaseDataset(BaseModel, abc.ABC):
|
61
62
|
dataset_name: str
|
62
63
|
index_path: Optional[str] = None
|
63
64
|
dataset_splitter_type: Type[DatasetSplitter]
|
@@ -125,19 +126,26 @@ class BaseDataset(BaseModel):
|
|
125
126
|
def set_dataframe(self, dataframe: pd.DataFrame):
|
126
127
|
self.dataframe = dataframe
|
127
128
|
|
128
|
-
|
129
|
+
@abc.abstractmethod
|
129
130
|
def prep(self):
|
130
|
-
|
131
|
+
"""Prepare the dataset. This method should be overridden in subclasses."""
|
132
|
+
|
131
133
|
|
134
|
+
def handle_mixins(self):
|
135
|
+
"""Handle mixin-specific logic."""
|
136
|
+
if isinstance(self, TextDatasetMixin):
|
137
|
+
self.process_text()
|
138
|
+
# if isinstance(self, ImageDatasetMixin):
|
139
|
+
# self.process_image_data()
|
140
|
+
# Add other mixin-specific logic here
|
141
|
+
|
132
142
|
def load(self):
|
133
143
|
"""
|
134
144
|
Load the dataset. If X_train, y_train, X_test, and y_test are already provided,
|
135
145
|
skip deriving them. Otherwise, derive them from the dataframe and indices.
|
136
146
|
"""
|
137
|
-
self.
|
138
|
-
|
139
|
-
if isinstance(self, TextDatasetMixin):
|
140
|
-
self.process_text()
|
147
|
+
self.handle_mixins() # Centralized mixin handling
|
148
|
+
self.prep() # Prepare the dataset
|
141
149
|
|
142
150
|
if self.X_train is not None or self.y_train is not None or self.X_test is not None or self.y_test is not None:
|
143
151
|
# Data is already provided, no need to calculate
|
@@ -158,9 +166,12 @@ class BaseDataset(BaseModel):
|
|
158
166
|
self.index_path)
|
159
167
|
except FileNotFoundError as e:
|
160
168
|
raise FileNotFoundError(f"Index files not found: {e.filename}")
|
161
|
-
|
162
|
-
train = self.dataframe[self.dataframe.index.isin(train_idx_all)]
|
163
|
-
test = self.dataframe[self.dataframe.index.isin(test_idx_all)]
|
169
|
+
|
170
|
+
# train = self.dataframe[self.dataframe.index.isin(train_idx_all)]
|
171
|
+
# test = self.dataframe[self.dataframe.index.isin(test_idx_all)]
|
172
|
+
columns = self.columns + [self.class_column]
|
173
|
+
train = self.dataframe.loc[self.dataframe.index.isin(train_idx_all), columns]
|
174
|
+
test = self.dataframe.loc[self.dataframe.index.isin(test_idx_all), columns]
|
164
175
|
X_train = train.drop(self.class_column, axis=1)
|
165
176
|
X_train = train.drop(self.class_column, axis=1)
|
166
177
|
y_train = train[self.class_column]
|
@@ -259,13 +270,18 @@ class BaseDataset(BaseModel):
|
|
259
270
|
|
260
271
|
|
261
272
|
class TextDatasetMixin(BaseModel):
|
262
|
-
embedding_size: Optional[int] = None
|
263
273
|
embedding_dict: Dict[str, Any] | None = Field(
|
264
274
|
default_factory=dict, description="Dictionary for embeddings")
|
265
275
|
pooling_strategy: PoolingStrategy | None = None
|
266
276
|
column_embedding_configs: Optional[Dict] = None
|
267
277
|
vector_db_persist_directory: Optional[str] = None
|
268
278
|
vector_db_collection_name: Optional[str] = None
|
279
|
+
_embedding_size: int
|
280
|
+
|
281
|
+
@computed_field
|
282
|
+
@property
|
283
|
+
def embedding_size(self) -> int:
|
284
|
+
return self._embedding_size
|
269
285
|
|
270
286
|
class Config:
|
271
287
|
arbitrary_types_allowed = True
|
@@ -317,44 +333,43 @@ class TextDatasetMixin(BaseModel):
|
|
317
333
|
else:
|
318
334
|
raise ValueError(
|
319
335
|
"Persistent directory for the vector DB is not specified.")
|
336
|
+
|
337
|
+
def __initialize_embedding_dict(self):
|
338
|
+
embedding_dict = defaultdict(lambda: defaultdict(list))
|
339
|
+
if self.column_embedding_configs:
|
340
|
+
for item in self.column_embedding_configs:
|
341
|
+
col = item["column"]
|
342
|
+
col_db_dir = item["vector_db_persist_directory"]
|
343
|
+
col_db_collection = item["vector_db_collection_name"]
|
344
|
+
self.__create_or_update_embeddings__(embedding_dict, col_db_dir, col_db_collection, col)
|
345
|
+
elif self.vector_db_persist_directory:
|
346
|
+
self.__create_or_update_embeddings__(embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
|
347
|
+
else:
|
348
|
+
logging.warning("There is no configuration of Embeddings")
|
349
|
+
raise ValueError(
|
350
|
+
"There is no configuration of Embeddings. Please provide a vector database directory and collection name.")
|
351
|
+
return embedding_dict
|
320
352
|
|
321
|
-
def
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
353
|
+
def __calculate_embedding_size(self):
|
354
|
+
if self.embedding_dict is None:
|
355
|
+
raise ValueError("Embedding dictionary is not initialized, embedding size cannot be calculated.")
|
356
|
+
|
357
|
+
key, value = next(iter(self.embedding_dict.items()))
|
358
|
+
self._embedding_size = value[next(iter(value))][0].shape[0]
|
326
359
|
|
360
|
+
def process_text(self):
|
361
|
+
logging.info("Processing text data...")
|
362
|
+
|
327
363
|
# 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
|
328
364
|
# kwargs = {"columns": self.columns}
|
329
365
|
# if self.ner_threshold:
|
330
366
|
# for k, v in self.ner_threshold.items():
|
331
367
|
# kwargs[k] = v
|
332
|
-
if self.embedding_dict
|
333
|
-
embedding_dict =
|
334
|
-
# TODO find more effective solution
|
335
|
-
|
336
|
-
if self.column_embedding_configs:
|
337
|
-
for item in self.column_embedding_configs:
|
338
|
-
col = item["column"]
|
339
|
-
col_db_dir = item["vector_db_persist_directory"]
|
340
|
-
col_db_collection = item["vector_db_collection_name"]
|
341
|
-
self.__create_or_update_embeddings__(
|
342
|
-
embedding_dict, col_db_dir, col_db_collection, col)
|
343
|
-
|
344
|
-
elif self.vector_db_persist_directory:
|
345
|
-
self.__create_or_update_embeddings__(
|
346
|
-
embedding_dict, self.vector_db_persist_directory, self.vector_db_collection_name)
|
368
|
+
if self.embedding_dict is None:
|
369
|
+
self.embedding_dict = self.__initialize_embedding_dict()
|
347
370
|
|
348
|
-
|
349
|
-
|
350
|
-
f"There is no configuration of Embeddings")
|
351
|
-
self.embedding_dict = embedding_dict
|
352
|
-
|
353
|
-
# else:
|
354
|
-
# embedding_dict = self.embedding_dict
|
355
|
-
# TODO make generic
|
356
|
-
# embedding_size = list(embedding_dict['all_text'].values())[
|
357
|
-
# 0][0].shape
|
371
|
+
self.__calculate_embedding_size()
|
372
|
+
|
358
373
|
|
359
374
|
|
360
375
|
# class ImageDatasetMixin(BaseModel):
|
@@ -81,6 +81,7 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
|
|
81
81
|
self.chemical_property_columns = chemical_property_columns
|
82
82
|
self.embedding_columns = embedding_columns
|
83
83
|
self.ner_columns = ner_columns
|
84
|
+
self.columns = [] # these variable is modified in prep method
|
84
85
|
|
85
86
|
self.class_column = 'event_category'
|
86
87
|
_db_path = HERE.joinpath('data/event.db')
|
@@ -91,7 +92,7 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
|
|
91
92
|
self.__similarity_related_columns__.extend(self.ner_columns)
|
92
93
|
# TODO with resource
|
93
94
|
self._conn = create_connection(_db_path.absolute().as_posix())
|
94
|
-
self.load_drugs_and_events()
|
95
|
+
# self.load_drugs_and_events()
|
95
96
|
logger.info(f'{self.dataset_name} is initialized')
|
96
97
|
|
97
98
|
def load_drugs_and_events(self):
|
@@ -131,6 +132,7 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
|
|
131
132
|
return pd.DataFrame(columns=headers, data=rows)
|
132
133
|
|
133
134
|
def prep(self):
|
135
|
+
self.load_drugs_and_events()
|
134
136
|
if self.drugs_df is None or self.ddis_df is None:
|
135
137
|
raise Exception("There is no data")
|
136
138
|
|
@@ -220,14 +222,15 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
|
|
220
222
|
self.columns.append(key)
|
221
223
|
print(self.ddis_df[key].head())
|
222
224
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
225
|
+
if self.embedding_dict is not None:
|
226
|
+
for embedding_column in self.embedding_columns:
|
227
|
+
print(f"concat {embedding_column} embeddings")
|
228
|
+
embeddings_after_pooling = {k: self.pooling_strategy.apply(
|
229
|
+
v) for k, v in self.embedding_dict[embedding_column].items()}
|
230
|
+
# column_embeddings_dict = embedding_values[embedding_column]
|
231
|
+
self.ddis_df[embedding_column+'_embedding'] = self.ddis_df.apply(
|
232
|
+
x_fnc, args=(embeddings_after_pooling,), axis=1)
|
233
|
+
self.columns.append(embedding_column+'_embedding')
|
231
234
|
|
232
235
|
dataframe = self.ddis_df.copy()
|
233
236
|
if not isinstance(classes, (list, pd.Series, np.ndarray)):
|
@@ -32,9 +32,10 @@ import ddi_fw.utils as utils
|
|
32
32
|
|
33
33
|
class MultiModalRunner:
|
34
34
|
# todo model related parameters to config
|
35
|
-
def __init__(self, library, multi_modal, use_mlflow=False):
|
35
|
+
def __init__(self, library, multi_modal, default_model, use_mlflow=False):
|
36
36
|
self.library = library
|
37
37
|
self.multi_modal = multi_modal
|
38
|
+
self.default_model = default_model
|
38
39
|
self.use_mlflow = use_mlflow
|
39
40
|
self.result = Result()
|
40
41
|
|
@@ -60,14 +61,13 @@ class MultiModalRunner:
|
|
60
61
|
# TODO check single_results, 1d,2d ...
|
61
62
|
def __predict(self, single_results):
|
62
63
|
item_dict = {t[0]: t for t in self.items}
|
63
|
-
|
64
|
-
|
65
|
-
print(item_dict.keys())
|
64
|
+
if self.default_model is None and not self.multi_modal:
|
65
|
+
raise Exception("Default model and multi modal cannot be None at the same time")
|
66
66
|
|
67
67
|
if self.multi_modal:
|
68
68
|
for m in self.multi_modal:
|
69
69
|
name = m.get('name')
|
70
|
-
input_type = m.get('input_type')
|
70
|
+
# input_type = m.get('input_type')
|
71
71
|
input = m.get('input')
|
72
72
|
inputs = m.get('inputs')
|
73
73
|
model_type = get_import(m.get("model_type"))
|
@@ -100,6 +100,15 @@ class MultiModalRunner:
|
|
100
100
|
else:
|
101
101
|
raise Exception("check configurations")
|
102
102
|
else: # TODO default model maybe?
|
103
|
+
print("Default model will be used")
|
104
|
+
name = self.default_model.get('name')
|
105
|
+
# input_type = m.get('input_type')
|
106
|
+
input = self.default_model.get('input')
|
107
|
+
inputs = self.default_model.get('inputs')
|
108
|
+
model_type = get_import(self.default_model.get("model_type"))
|
109
|
+
kwargs = self.default_model.get('params')
|
110
|
+
single_modal = T(self.date, name, model_type,
|
111
|
+
use_mlflow=self.use_mlflow, **kwargs)
|
103
112
|
item = self.items[0]
|
104
113
|
single_modal.set_data(
|
105
114
|
self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
|
@@ -57,6 +57,7 @@ class MultiPipeline():
|
|
57
57
|
tracking_uri = config.get("tracking_uri")
|
58
58
|
artifact_location = config.get("artifact_location")
|
59
59
|
#new
|
60
|
+
default_model = config.get("default_model"),
|
60
61
|
multi_modal = config.get("multi_modal")
|
61
62
|
columns = config.get("columns")
|
62
63
|
ner_data_file = config.get("ner_data_file")
|
@@ -101,6 +102,7 @@ class MultiPipeline():
|
|
101
102
|
ner_data_file=ner_data_file,
|
102
103
|
ner_threshold=ner_threshold,
|
103
104
|
combinations=combinations,
|
105
|
+
default_model=default_model,
|
104
106
|
multi_modal= multi_modal)
|
105
107
|
elif type== "ner_search":
|
106
108
|
pipeline = NerParameterSearch(
|
@@ -35,6 +35,7 @@ class Pipeline(BaseModel):
|
|
35
35
|
ner_threshold: Optional[dict] = None
|
36
36
|
combinations: Optional[List[str]] = None
|
37
37
|
model: Optional[Any] = None
|
38
|
+
default_model: Optional[Any] = None
|
38
39
|
multi_modal: Optional[Any] = None
|
39
40
|
use_mlflow: bool = False
|
40
41
|
_dataset: BaseDataset = []
|
@@ -193,7 +194,7 @@ class Pipeline(BaseModel):
|
|
193
194
|
|
194
195
|
y_test_label = self.items[0][4]
|
195
196
|
multi_modal_runner = MultiModalRunner(
|
196
|
-
library=self.library, multi_modal=self.multi_modal, use_mlflow=self.use_mlflow)
|
197
|
+
library=self.library, multi_modal=self.multi_modal, default_model= self.default_model , use_mlflow=self.use_mlflow)
|
197
198
|
# multi_modal_runner = MultiModalRunner(
|
198
199
|
# library=self.library, model_func=model_func, batch_size=batch_size, epochs=epochs)
|
199
200
|
# multi_modal = TFMultiModal(
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_0.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_1.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_2.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_3.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_4.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_indexes.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_0.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_1.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_2.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_3.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_4.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/df_extraction_cleanxiaoyu50.csv
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_0.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_1.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_2.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_3.txt
RENAMED
File without changes
|
{ddi_fw-0.0.197 → ddi_fw-0.0.199}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_4.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|