ddi-fw 0.0.195__py3-none-any.whl → 0.0.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/core.py +11 -8
- ddi_fw/datasets/ddi_mdl/base.py +2 -2
- {ddi_fw-0.0.195.dist-info → ddi_fw-0.0.197.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.195.dist-info → ddi_fw-0.0.197.dist-info}/RECORD +6 -6
- {ddi_fw-0.0.195.dist-info → ddi_fw-0.0.197.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.195.dist-info → ddi_fw-0.0.197.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/core.py
CHANGED
@@ -135,7 +135,7 @@ class BaseDataset(BaseModel):
         skip deriving them. Otherwise, derive them from the dataframe and indices.
         """
         self.prep()
-
+
         if isinstance(self, TextDatasetMixin):
             self.process_text()

@@ -258,7 +258,7 @@ class BaseDataset(BaseModel):
         # return X_train, X_test, y_train, y_test, folds


-class TextDatasetMixin(BaseDataset):
+class TextDatasetMixin(BaseModel):
     embedding_size: Optional[int] = None
     embedding_dict: Dict[str, Any] | None = Field(
         default_factory=dict, description="Dictionary for embeddings")
@@ -267,6 +267,9 @@ class TextDatasetMixin(BaseDataset):
     vector_db_persist_directory: Optional[str] = None
     vector_db_collection_name: Optional[str] = None

+    class Config:
+        arbitrary_types_allowed = True
+
     def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
         """
         Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
@@ -314,14 +317,13 @@ class TextDatasetMixin(BaseDataset):
             else:
                 raise ValueError(
                     "Persistent directory for the vector DB is not specified.")
-
+
     def process_text(self):
         # key, value = next(iter(embedding_dict.items()))
         # embedding_size = value[next(iter(value))][0].shape[0]
         # pooling_strategy = self.embedding_pooling_strategy_type(
-        # ) if self.embedding_pooling_strategy_type else None
-
-
+        # ) if self.embedding_pooling_strategy_type else None
+
         # 'enzyme','target','pathway','smile','all_text','indication', 'description','mechanism_of_action','pharmacodynamics', 'tui', 'cui', 'entities'
         # kwargs = {"columns": self.columns}
         # if self.ner_threshold:
@@ -346,9 +348,10 @@ class TextDatasetMixin(BaseDataset):
             else:
                 print(
                     f"There is no configuration of Embeddings")
+            self.embedding_dict = embedding_dict

-            else:
-
+            # else:
+            #     embedding_dict = self.embedding_dict
         # TODO make generic
         # embedding_size = list(embedding_dict['all_text'].values())[
         #     0][0].shape
ddi_fw/datasets/ddi_mdl/base.py
CHANGED
@@ -1,6 +1,6 @@
 import pathlib
 from typing import List, Optional, Tuple
-from ddi_fw.datasets.core import TextDatasetMixin, generate_sim_matrices_new, generate_vectors
+from ddi_fw.datasets.core import BaseDataset, TextDatasetMixin, generate_sim_matrices_new, generate_vectors
 from ddi_fw.datasets.db_utils import create_connection
 import numpy as np
 import pandas as pd
@@ -32,7 +32,7 @@ LIST_OF_NER_COLUMNS = ['tui', 'cui', 'entities']
 HERE = pathlib.Path(__file__).resolve().parent


-class DDIMDLDataset(TextDatasetMixin):
+class DDIMDLDataset(BaseDataset,TextDatasetMixin):
     dataset_name: str = "DDIMDLDataset"
     index_path: str = Field(default_factory=lambda: str(
         pathlib.Path(__file__).resolve().parent.joinpath('indexes')))
{ddi_fw-0.0.195.dist-info → ddi_fw-0.0.197.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
 ddi_fw/datasets/__init__.py,sha256=_I3iDHARwzmg7_EL5XKtB_TgG1yAkLSOVTujLL9Wz9Q,280
-ddi_fw/datasets/core.py,sha256=
+ddi_fw/datasets/core.py,sha256=eLS4TtQN1_1kI0huMt7eTOCz5hY3da9PHhEeiLjWtQg,15605
 ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
-ddi_fw/datasets/ddi_mdl/base.py,sha256=
+ddi_fw/datasets/ddi_mdl/base.py,sha256=rS8lSGE-SLeoE3GuElJ-TNaRHIGhaZBeOM2UH3JUS4M,10218
 ddi_fw/datasets/ddi_mdl/debug.log,sha256=eWz05j8RFqZuHFDTCF7Rck5w4rvtTanFN21iZsgxO7Y,115
 ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
 ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
@@ -99,7 +99,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
 ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
 ddi_fw/vectorization/feature_vector_generation.py,sha256=EBf-XAiwQwr68az91erEYNegfeqssBR29kVgrliIyac,4765
 ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
+ddi_fw-0.0.197.dist-info/METADATA,sha256=pVnij5JFvkPUgjVqvHmFLdI2OKSFRYxt7-vLVXhpldU,2542
+ddi_fw-0.0.197.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ddi_fw-0.0.197.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.197.dist-info/RECORD,,
{ddi_fw-0.0.195.dist-info → ddi_fw-0.0.197.dist-info}/WHEEL
File without changes
{ddi_fw-0.0.195.dist-info → ddi_fw-0.0.197.dist-info}/top_level.txt
File without changes