ddi-fw 0.0.148__tar.gz → 0.0.150__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/PKG-INFO +6 -3
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/pyproject.toml +19 -4
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/__init__.py +1 -1
- ddi_fw-0.0.150/src/ddi_fw/datasets/core.py +211 -0
- ddi_fw-0.0.150/src/ddi_fw/datasets/dataset_splitter.py +39 -0
- ddi_fw-0.0.150/src/ddi_fw/datasets/ddi_mdl/base.py +213 -0
- ddi_fw-0.0.150/src/ddi_fw/datasets/ddi_mdl/debug.log +1 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/embedding_generator.py +2 -1
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/langchain/embeddings.py +1 -0
- ddi_fw-0.0.150/src/ddi_fw/ml/evaluation_helper.py +195 -0
- ddi_fw-0.0.150/src/ddi_fw/ml/ml_helper.py +187 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/ml/model_wrapper.py +2 -2
- ddi_fw-0.0.150/src/ddi_fw/ml/pytorch_wrapper.py +186 -0
- ddi_fw-0.0.150/src/ddi_fw/ml/tensorflow_wrapper.py +260 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/ner/ner.py +93 -39
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +4 -2
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/pipeline/multi_pipeline.py +2 -15
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/pipeline/ner_pipeline.py +15 -6
- ddi_fw-0.0.150/src/ddi_fw/pipeline/pipeline.py +250 -0
- ddi_fw-0.0.148/src/ddi_fw/test/compress_json_test.py → ddi_fw-0.0.150/src/ddi_fw/utils/json_helper.py +1 -15
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw.egg-info/PKG-INFO +6 -3
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw.egg-info/SOURCES.txt +3 -12
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw.egg-info/requires.txt +4 -1
- ddi_fw-0.0.148/src/ddi_fw/datasets/core.py +0 -405
- ddi_fw-0.0.148/src/ddi_fw/datasets/ddi_mdl/base.py +0 -149
- ddi_fw-0.0.148/src/ddi_fw/ml/evaluation_helper.py +0 -326
- ddi_fw-0.0.148/src/ddi_fw/ml/ml_helper.py +0 -143
- ddi_fw-0.0.148/src/ddi_fw/ml/pytorch_wrapper.py +0 -83
- ddi_fw-0.0.148/src/ddi_fw/ml/tensorflow_wrapper.py +0 -168
- ddi_fw-0.0.148/src/ddi_fw/pipeline/pipeline.py +0 -192
- ddi_fw-0.0.148/src/ddi_fw/test/__init__.py +0 -0
- ddi_fw-0.0.148/src/ddi_fw/test/basic_test.py +0 -15
- ddi_fw-0.0.148/src/ddi_fw/test/combination_test.py +0 -12
- ddi_fw-0.0.148/src/ddi_fw/test/date_test.py +0 -15
- ddi_fw-0.0.148/src/ddi_fw/test/idf_score.py +0 -54
- ddi_fw-0.0.148/src/ddi_fw/test/jaccard_similarity.py +0 -85
- ddi_fw-0.0.148/src/ddi_fw/test/mlfow_test.py +0 -165
- ddi_fw-0.0.148/src/ddi_fw/test/sklearn-tfidf.py +0 -16
- ddi_fw-0.0.148/src/ddi_fw/test/test.py +0 -93
- ddi_fw-0.0.148/src/ddi_fw/test/torch_cuda_test.py +0 -9
- ddi_fw-0.0.148/src/ddi_fw/test/type_guarding_test.py +0 -18
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/README.md +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/setup.cfg +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/db_utils.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/data/event.db +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/test_indexes.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_0.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_1.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_2.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_3.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_4.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_indexes.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/readme.md +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/base.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/data/event.db +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/feature_vector_generation.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/idf_helper.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/__init__.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/base.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/df_extraction_cleanxiaoyu50.csv +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/drug_information_del_noDDIxiaoyu50.csv +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/mdf-sa-ddi.zip +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/setup_._py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/drugbank/__init__.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/drugbank/drugbank.xsd +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/drugbank/drugbank_parser.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/drugbank/drugbank_processor.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/drugbank/drugbank_processor_org.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/drugbank/event_extractor.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/langchain/__init__.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/langchain/storage.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/ml/__init__.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/ner/__init__.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/ner/mmlrestclient.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/pipeline/__init__.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/__init__.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/enums.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/kaggle.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/package_helper.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/py7zr_helper.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/utils.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/zip_helper.py +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
- {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw.egg-info/top_level.txt +0 -0
{ddi_fw-0.0.148 → ddi_fw-0.0.150}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.148
+Version: 0.0.150
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -22,6 +22,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
+Requires-Dist: pydantic==2.10.6
 Requires-Dist: importlib-resources==6.4.5
 Requires-Dist: python-stopwatch==1.1.11
 Requires-Dist: lxml==5.3.0
@@ -35,7 +36,7 @@ Requires-Dist: rdkit==2023.3.3
 Requires-Dist: scikit-learn==1.5.2
 Requires-Dist: scipy==1.13.1
 Requires-Dist: accelerate>=0.33.0
-Requires-Dist: sentence-transformers
+Requires-Dist: sentence-transformers<=3.3.1,>=3.0.1
 Requires-Dist: transformers>=4.42.4
 Requires-Dist: stanza==1.9.2
 Requires-Dist: tokenizers>=0.19.1
@@ -49,3 +50,5 @@ Requires-Dist: chromadb>=0.5.15
 Requires-Dist: langchain_community==0.3.3
 Requires-Dist: datasets==3.0.2
 Requires-Dist: unstructured==0.16.3
+Requires-Dist: tensorflow<2.18.0,>=2.17.0
+Requires-Dist: tf-keras==2.17.0
{ddi_fw-0.0.148 → ddi_fw-0.0.150}/pyproject.toml

@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ddi_fw"
-version = "0.0.148"
+version = "0.0.150"
 description = "Do not use :)"
 readme = "README.md"
 authors = [
@@ -45,7 +45,8 @@ license = { file = "LICENSE" }
 
 requires-python = ">=3.10"
 dependencies = [
-    "
+    "pydantic==2.10.6"
+    ,"importlib-resources==6.4.5"
     ,"python-stopwatch==1.1.11"
     ,"lxml==5.3.0"
     ,"matplotlib==3.8.0"
@@ -58,7 +59,7 @@ dependencies = [
     ,"scikit-learn==1.5.2"
     ,"scipy==1.13.1"
     ,"accelerate>=0.33.0"
-    ,"sentence-transformers>=3.0.1"
+    ,"sentence-transformers>=3.0.1,<=3.3.1"
     ,"transformers>=4.42.4"
     ,"stanza==1.9.2"
     ,"tokenizers>=0.19.1"
@@ -71,7 +72,9 @@ dependencies = [
     ,"chromadb>=0.5.15"
     ,"langchain_community==0.3.3"
     ,"datasets==3.0.2"
-    ,"unstructured==0.16.3"
+    ,"unstructured==0.16.3",
+    "tensorflow>=2.17.0,<2.18.0"
+    ,"tf-keras==2.17.0"
 ]
 
 
@@ -85,3 +88,15 @@ where = ["src"] # list of folders that contain the packages (["."] by default)
 # See https://setuptools.pypa.io/en/latest/userguide/datafiles.html
 [tool.setuptools.package-data]
 "*" = ["*.*"]
+
+[tool.coverage.run]
+source = ["src"]
+
+# pyproject.toml
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = "-ra -q"
+
+testpaths = [
+    "tests"
+]
{ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/__init__.py

@@ -5,7 +5,7 @@ from .mdf_sa_ddi.base import MDFSADDIDataset
 from .embedding_generator import create_embeddings
 from .idf_helper import IDF
 from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
-
+from .dataset_splitter import DatasetSplitter
 __all__ = ['BaseDataset','DDIMDLDataset','MDFSADDIDataset']
 
 
ddi_fw-0.0.150/src/ddi_fw/datasets/core.py (new file)

@@ -0,0 +1,211 @@
+import glob
+from typing import List, Optional, Type
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, Field, computed_field
+from ddi_fw.datasets.dataset_splitter import DatasetSplitter
+from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
+from ddi_fw.langchain.embeddings import PoolingStrategy
+from ddi_fw.utils.utils import create_folder_if_not_exists
+
+
+def stack(df_column):
+    return np.stack(df_column.values)
+
+
+def generate_vectors(df, columns):
+    vectorGenerator = VectorGenerator(df)
+    generated_vectors = vectorGenerator.generate_feature_vectors(
+        columns)
+    return generated_vectors
+
+
+def generate_sim_matrices_new(df, generated_vectors, columns, key_column="id"):
+    jaccard_sim_dict = {}
+    sim_matrix_gen = SimilarityMatrixGenerator()
+
+    for column in columns:
+        # key = '2D_'+column
+        key = column
+        jaccard_sim_dict[column] = sim_matrix_gen.create_jaccard_similarity_matrices(
+            generated_vectors[key])
+
+    similarity_matrices = {}
+    keys = df[key_column].to_list()
+    new_columns = {}
+    for idx in range(len(keys)):
+        new_columns[idx] = keys[idx]
+    for column in columns:
+        new_df = pd.DataFrame.from_dict(jaccard_sim_dict[column])
+        new_df = new_df.rename(index=new_columns, columns=new_columns)
+        similarity_matrices[column] = new_df
+    return similarity_matrices
+
+
+class BaseDataset(BaseModel):
+    dataset_name: str
+    index_path: str
+    dataset_splitter_type: Type[DatasetSplitter]
+    class_column: str = 'class'
+    dataframe: Optional[pd.DataFrame] = None
+    X_train: Optional[pd.DataFrame] = None
+    X_test: Optional[pd.DataFrame] = None
+    y_train: Optional[pd.Series] = None
+    y_test: Optional[pd.Series] = None
+    train_indexes: Optional[pd.Index] = None
+    test_indexes: Optional[pd.Index] = None
+    train_idx_arr: List|None = None
+    val_idx_arr: List|None = None
+    # train_idx_arr: Optional[List[np.ndarray]] = None
+    # val_idx_arr: Optional[List[np.ndarray]] = None
+    columns: List[str] = []
+
+    # feature_process: FeatureProcessor
+    # similarity_matrix_service: SimilarityMatrixService
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def produce_inputs(self):
+        items = []
+        if self.X_train is None or self.X_test is None:
+            raise Exception("There is no data to produce inputs")
+        y_train_label, y_test_label = stack(self.y_train), stack(self.y_test)
+
+        for column in self.columns:
+            train_data, test_data = stack(
+                self.X_train[column]), stack(self.X_test[column])
+            items.append([f'{column}', np.nan_to_num(train_data),
+                          y_train_label, np.nan_to_num(test_data), y_test_label])
+
+            # items.append([f'{column}_embedding', train_data,
+            #               y_train_label, test_data, y_test_label])
+        return items
+
+    @computed_field
+    @property
+    def dataset_splitter(self) -> DatasetSplitter:
+        return self.dataset_splitter_type()
+
+    def set_dataframe(self, dataframe: pd.DataFrame):
+        self.dataframe = dataframe
+
+    # @abstractmethod
+    def prep(self):
+        pass
+
+    def load(self):
+        if self.index_path is None:
+            raise Exception(
+                "There is no index path, please call split function")
+
+        try:
+            train_idx_all, test_idx_all, train_idx_arr, val_idx_arr = self.__get_indexes__(
+                self.index_path)
+        except FileNotFoundError as e:
+            raise FileNotFoundError(f"Index files not found: {e.filename}")
+
+        self.prep()
+
+        if self.dataframe is None:
+            raise Exception("There is no dataframe")
+
+        train = self.dataframe[self.dataframe.index.isin(train_idx_all)]
+        test = self.dataframe[self.dataframe.index.isin(test_idx_all)]
+
+        self.X_train = train.drop(self.class_column, axis=1)
+        self.y_train = train[self.class_column]
+        self.X_test = test.drop(self.class_column, axis=1)
+        self.y_test = test[self.class_column]
+
+        self.train_indexes = self.X_train.index
+        self.test_indexes = self.X_test.index
+        self.train_idx_arr = train_idx_arr
+        self.val_idx_arr = val_idx_arr
+
+        return self.X_train, self.X_test, self.y_train, self.y_test, self.X_train.index, self.X_test.index, train_idx_arr, val_idx_arr
+
+    def __get_indexes__(self, path):
+        train_index_path = path+'/train_indexes.txt'
+        test_index_path = path+'/test_indexes.txt'
+        train_fold_files = f'{path}/train_fold_*.txt'
+        val_fold_files = f'{path}/validation_fold_*.txt'
+        train_idx_arr = []
+        val_idx_arr = []
+        with open(train_index_path, 'r', encoding="utf8") as f:
+            train_idx_all = [int(r) for r in f.readlines()]
+        with open(test_index_path, 'r', encoding="utf8") as f:
+            test_idx_all = [int(r) for r in f.readlines()]
+
+        for filepath in glob.glob(train_fold_files):
+            with open(filepath, 'r', encoding="utf8") as f:
+                train_idx = [int(r) for r in f.readlines()]
+                train_idx_arr.append(train_idx)
+        for filepath in glob.glob(val_fold_files):
+            with open(filepath, 'r', encoding="utf8") as f:
+                val_idx = [int(r) for r in f.readlines()]
+                val_idx_arr.append(val_idx)
+        return train_idx_all, test_idx_all, train_idx_arr, val_idx_arr
+
+    def __save_indexes__(self, path, filename, indexes):
+        create_folder_if_not_exists(path)
+        file_path = path + '/'+filename
+        str_indexes = [str(index) for index in indexes]
+        with open(file_path, 'w') as f:
+            f.write('\n'.join(str_indexes))
+
+    def split_dataset(self, save_indexes: bool = False):
+        # TODO class type should be parametric
+
+        save_path = self.index_path
+        self.prep()
+
+        if self.dataframe is None:
+            raise Exception("There is no data")
+
+        X = self.dataframe.drop(self.class_column, axis=1)
+        y = self.dataframe[self.class_column]
+
+        X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset_splitter.split(
+            X=X, y=y)
+        self.X_train = X_train
+        self.X_test = X_test
+        self.y_train = y_train
+        self.y_test = y_test
+        self.train_indexes = X_train.index
+        self.test_indexes = X_test.index
+        self.train_idx_arr = train_idx_arr
+        self.val_idx_arr = val_idx_arr
+
+        if save_indexes:
+            # train_pairs = [row['id1'].join(',').row['id2'] for index, row in X_train.iterrows()]
+            self.__save_indexes__(
+                save_path, 'train_indexes.txt', self.train_indexes.values)
+            self.__save_indexes__(
+                save_path, 'test_indexes.txt', self.test_indexes.values)
+
+            for i, (train_idx, val_idx) in enumerate(zip(train_idx_arr, val_idx_arr)):
+                self.__save_indexes__(
+                    save_path, f'train_fold_{i}.txt', train_idx)
+                self.__save_indexes__(
+                    save_path, f'validation_fold_{i}.txt', val_idx)
+
+        # return X_train, X_test, y_train, y_test, folds
+
+
+class TextDatasetMixin(BaseDataset):
+    embedding_size: int
+    embedding_dict: dict
+    embeddings_pooling_strategy: PoolingStrategy | None = None
+
+    def process_text(self):
+        pass
+
+
+# class ImageDatasetMixin(BaseModel):
+#     image_size: tuple[int, int] = Field(default=(224, 224))
+#     augmentations: list[str] = Field(default_factory=list)
+
+#     def process_image_data(self):
+#         print(
+#             f"Processing image data with size {self.image_size} and augmentations {self.augmentations}...")
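
For orientation only (not part of the package): judging from the `core.py` above, a concrete dataset is meant to subclass `BaseDataset`, build `self.dataframe` inside `prep()`, and then call `split_dataset()` or `load()`. The sketch below follows that reading; `ToyDataset`, its data, and the `indexes` path are hypothetical.

```python
# Hedged sketch of the new BaseDataset flow; ToyDataset and its data are made up.
import numpy as np
import pandas as pd
from ddi_fw.datasets.core import BaseDataset
from ddi_fw.datasets.dataset_splitter import DatasetSplitter


class ToyDataset(BaseDataset):
    def prep(self):
        # The 'class' column holds one-hot labels because DatasetSplitter
        # stratifies on np.argmax over the stacked label column.
        df = pd.DataFrame({
            'feature': [np.array([0.1, 0.2])] * 20,
            'class': [np.array([1, 0])] * 10 + [np.array([0, 1])] * 10,
        })
        self.set_dataframe(df)


ds = ToyDataset(dataset_name='toy', index_path='indexes',
                dataset_splitter_type=DatasetSplitter, columns=['feature'])
ds.split_dataset(save_indexes=True)  # also writes train/test and per-fold index files
items = ds.produce_inputs()          # [[column, X_train, y_train, X_test, y_test], ...]
```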
ddi_fw-0.0.150/src/ddi_fw/datasets/dataset_splitter.py (new file)

@@ -0,0 +1,39 @@
+from typing import List, Tuple
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, Field
+from sklearn.model_selection import StratifiedKFold, train_test_split
+
+
+class DatasetSplitter(BaseModel):
+    fold_size: int = Field(default=5, ge=2)
+    test_size: float = Field(default=0.2, ge=0.0, le=1.0)
+    shuffle: bool = True
+    random_state: int = Field(default=42)
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def split(self, X: pd.DataFrame, y: pd.Series) -> Tuple[
+            pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.Index, pd.Index, List[np.ndarray], List[np.ndarray]]:
+        print(
+            f"Splitting dataset into {self.fold_size} folds with shuffle={self.shuffle}...")
+        # TODO check it
+        if len(y.shape) == 1:
+            y = pd.Series(np.expand_dims(y.to_numpy(), axis=1).flatten())
+        stacked = np.vstack(tuple(y.to_numpy()))
+        stratify = np.argmax(stacked, axis=1)
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, shuffle=self.shuffle, test_size=self.test_size, stratify=stratify)
+
+        k_fold = StratifiedKFold(
+            n_splits=self.fold_size, shuffle=self.shuffle, random_state=self.random_state)
+        folds = k_fold.split(X_train, np.argmax(
+            np.vstack(y_train.to_numpy()), axis=1))
+        train_idx_arr = []
+        val_idx_arr = []
+        for i, (train_index, val_index) in enumerate(folds):
+            train_idx_arr.append(train_index)
+            val_idx_arr.append(val_index)
+
+        return X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr
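
A quick, hedged illustration (not from the package) of how `DatasetSplitter.split` appears meant to be called: it stratifies on `np.argmax` over the stacked label column, so labels are assumed to be one-hot vectors. The toy data below is invented.

```python
# Usage sketch under the assumption that labels are one-hot encoded.
import numpy as np
import pandas as pd
from ddi_fw.datasets.dataset_splitter import DatasetSplitter

X = pd.DataFrame({'f1': range(20)})
y = pd.Series([np.array([1, 0])] * 10 + [np.array([0, 1])] * 10)  # two classes

splitter = DatasetSplitter(fold_size=5, test_size=0.2, random_state=42)
(X_train, X_test, y_train, y_test,
 train_idx, test_idx, fold_train_idx, fold_val_idx) = splitter.split(X=X, y=y)
print(len(X_train), len(X_test), len(fold_train_idx))  # expected: 16 4 5
```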
ddi_fw-0.0.150/src/ddi_fw/datasets/ddi_mdl/base.py (new file)

@@ -0,0 +1,213 @@
+import glob
+import pathlib
+from typing import List, Optional, Tuple
+from ddi_fw.datasets.core import BaseDataset, TextDatasetMixin, generate_sim_matrices_new, generate_vectors
+from ddi_fw.datasets.dataset_splitter import DatasetSplitter
+from ddi_fw.datasets.db_utils import create_connection
+from ddi_fw.datasets.idf_helper import IDF
+from ddi_fw.utils.utils import create_folder_if_not_exists
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, Field, model_validator, root_validator
+from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
+from ddi_fw.langchain.embeddings import PoolingStrategy
+from abc import ABC, abstractmethod
+from sklearn.preprocessing import LabelBinarizer
+
+from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+
+# Constants for embedding, chemical properties, and NER columns
+LIST_OF_EMBEDDING_COLUMNS = [
+    'all_text', 'description', 'synthesis_reference', 'indication',
+    'pharmacodynamics', 'mechanism_of_action', 'toxicity', 'metabolism',
+    'absorption', 'half_life', 'protein_binding', 'route_of_elimination',
+    'volume_of_distribution', 'clearance'
+]
+
+LIST_OF_CHEMICAL_PROPERTY_COLUMNS = ['enzyme', 'target', 'pathway', 'smile']
+LIST_OF_NER_COLUMNS = ['tui', 'cui', 'entities']
+
+HERE = pathlib.Path(__file__).resolve().parent
+
+class DDIMDLDataset(TextDatasetMixin):
+    index_path: str = Field(default_factory=lambda: str(
+        pathlib.Path(__file__).resolve().parent.joinpath('indexes')))
+    # drugs_df: pd.DataFrame = Field(default_factory=pd.DataFrame)
+    # ddis_df: pd.DataFrame = Field(default_factory=pd.DataFrame)
+    drugs_df: Optional[pd.DataFrame] = None
+    ddis_df: Optional[pd.DataFrame] = None
+
+    chemical_property_columns: list[str] = Field(
+        default_factory=lambda: LIST_OF_CHEMICAL_PROPERTY_COLUMNS)
+    embedding_columns: list[str] = Field(default_factory=list)
+    ner_columns: list[str] = Field(default_factory=list)
+    ner_df: pd.DataFrame | None = None
+    tui_threshold: float | None = None
+    cui_threshold: float | None = None
+    entities_threshold: float | None = None
+
+
+    # @model_validator
+    def validate_columns(self, values):
+        if not set(values['chemical_property_columns']).issubset(LIST_OF_CHEMICAL_PROPERTY_COLUMNS):
+            raise ValueError("Invalid chemical property columns")
+        if not set(values['ner_columns']).issubset(LIST_OF_NER_COLUMNS):
+            raise ValueError("Invalid NER columns")
+        return values
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.class_column = 'event_category'
+        _db_path = HERE.joinpath('data/event.db')
+
+        self.__similarity_related_columns__ = []
+        self.__similarity_related_columns__.extend(self.chemical_property_columns)
+        self.__similarity_related_columns__.extend(self.ner_columns)
+        # TODO with resource
+        self._conn = create_connection(_db_path.absolute().as_posix())
+        self.load_drugs_and_events()
+
+    def load_drugs_and_events(self):
+        self.drugs_df = self.__select_all_drugs_as_dataframe__()
+        self.ddis_df = self.__select_all_events__()
+
+    def __select_all_drugs_as_dataframe__(self):
+        headers = ['index', 'id', 'name',
+                   'target', 'enzyme', 'pathway', 'smile']
+        if self._conn is None:
+            raise Exception("There is no connection")
+        cur = self._conn.cursor()
+        cur.execute(
+            '''SELECT "index", id, name, target, enzyme, pathway, smile FROM drug'''
+        )
+        rows = cur.fetchall()
+        df = pd.DataFrame(columns=headers, data=rows)
+
+        # Convert string fields to lists
+        for col in ['enzyme', 'target', 'pathway', 'smile']:
+            df[col] = df[col].apply(lambda x: x.split('|'))
+
+        return df
+
+    def __select_all_events__(self):
+        if self._conn is None:
+            raise Exception("There is no connection")
+        cur = self._conn.cursor()
+        cur.execute('''
+            SELECT ex."index", d1.id, d1.name, d2.id, d2.name, mechanism || ' ' || action
+            FROM extraction ex
+            JOIN drug d1 ON d1.name = ex.drugA
+            JOIN drug d2 ON d2.name = ex.drugB
+        ''')
+        rows = cur.fetchall()
+        headers = ["index", "id1", "name1", "id2", "name2", "event_category"]
+        return pd.DataFrame(columns=headers, data=rows)
+
+    def prep(self):
+        if self.drugs_df is None or self.ddis_df is None:
+            raise Exception("There is no data")
+
+        drug_ids = self.drugs_df['id'].to_list()
+
+        filtered_df = self.drugs_df
+        combined_df = filtered_df.copy()
+
+        if self.ner_df is not None and not self.ner_df.empty:
+            filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
+                drug_ids)]
+            filtered_ner_df = self.ner_df.copy()
+
+            # TODO: eğer kullanılan veri setinde tui, cui veya entity bilgileri yoksa o veri setine bu sütunları eklemek için aşağısı gerekli
+
+            # idf_calc = IDF(filtered_ner_df, [f for f in filtered_ner_df.keys()])
+            idf_calc = IDF(filtered_ner_df, self.ner_columns)
+            idf_calc.calculate()
+            idf_scores_df = idf_calc.to_dataframe()
+
+            # for key in filtered_ner_df.keys():
+            for key in self.ner_columns:
+                threshold = 0
+                if key.startswith('tui'):
+                    threshold = self.tui_threshold
+                if key.startswith('cui'):
+                    threshold = self.cui_threshold
+                if key.startswith('entities'):
+                    threshold = self.entities_threshold
+                combined_df[key] = filtered_ner_df[key]
+                valid_codes = idf_scores_df[idf_scores_df[key] > threshold].index
+
+                # print(f'{key}: valid code size = {len(valid_codes)}')
+                combined_df[key] = combined_df[key].apply(lambda items:
+                                                          [item for item in items if item in valid_codes])
+
+        moved_columns = ['id']
+        moved_columns.extend(self.__similarity_related_columns__)
+        chemical_properties_df = combined_df[moved_columns]
+
+        chemical_properties_df = chemical_properties_df.fillna("").apply(list)
+
+        # generate vectors dictionary içinde ndarray dönecek
+        generated_vectors = generate_vectors(chemical_properties_df, self.__similarity_related_columns__)
+
+        similarity_matrices = generate_sim_matrices_new(
+            chemical_properties_df, generated_vectors, self.__similarity_related_columns__, key_column="id")
+
+        event_categories = self.ddis_df['event_category']
+        labels = event_categories.tolist()
+        lb = LabelBinarizer()
+        lb.fit(labels)
+        classes = lb.transform(labels)
+
+        def similarity_lambda_fnc(row, value):
+            if row['id1'] in value:
+                return value[row['id1']]
+
+        def lambda_fnc(row: pd.Series, value) -> Optional[np.float16]:
+            if row['id1'] in value and row['id2'] in value:
+                return np.float16(np.hstack(
+                    (value[row['id1']], value[row['id2']])))
+            return None
+            # return np.hstack(
+            #     (value[row['id1']], value[row['id2']]), dtype=np.float16)
+
+        def x_fnc(row, embeddings_after_pooling):
+            if row['id1'] in embeddings_after_pooling:
+                v1 = embeddings_after_pooling[row['id1']]
+            else:
+                v1 = np.zeros(self.embedding_size)
+            if row['id2'] in embeddings_after_pooling:
+                v2 = embeddings_after_pooling[row['id2']]
+            else:
+                v2 = np.zeros(self.embedding_size)
+            return np.float16(np.hstack(
+                (v1, v2)))
+
+        for key, value in similarity_matrices.items():
+
+            print(f'sim matrix: {key}')
+            self.ddis_df[key] = self.ddis_df.apply(
+                lambda_fnc, args=(value,), axis=1)
+            self.columns.append(key)
+            print(self.ddis_df[key].head())
+
+        for embedding_column in self.embedding_columns:
+            print(f"concat {embedding_column} embeddings")
+            embeddings_after_pooling = {k: self.embeddings_pooling_strategy.apply(
+                v) for k, v in self.embedding_dict[embedding_column].items()}
+            # column_embeddings_dict = embedding_values[embedding_column]
+            self.ddis_df[embedding_column+'_embedding'] = self.ddis_df.apply(
+                x_fnc, args=(embeddings_after_pooling,), axis=1)
+            self.columns.append(embedding_column+'_embedding')
+
+        dataframe = self.ddis_df.copy()
+        if not isinstance(classes, (list, pd.Series, np.ndarray)):
+            raise TypeError("classes must be an iterable (list, Series, or ndarray)")
+
+        if len(classes) != len(dataframe):
+            raise ValueError("Length of classes must match the number of rows in the DataFrame")
+
+        dataframe['class'] = list(classes)
+        self.set_dataframe(dataframe)
+
+
+
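
As a reading aid (not taken from the package docs), the rewritten `DDIMDLDataset` looks intended to be constructed roughly as below and then driven through `load()`, which reads the bundled index files and calls `prep()` to build similarity-matrix and embedding features. The argument values are assumptions inferred from the field definitions above.

```python
# Hedged construction sketch for DDIMDLDataset; field values are illustrative only.
from ddi_fw.datasets.ddi_mdl.base import DDIMDLDataset
from ddi_fw.datasets.dataset_splitter import DatasetSplitter

dataset = DDIMDLDataset(
    dataset_name='ddi_mdl',
    dataset_splitter_type=DatasetSplitter,
    embedding_size=768,   # size of whatever text embeddings are supplied
    embedding_dict={},    # per-column {drug_id: embeddings} maps; empty when unused
    chemical_property_columns=['enzyme', 'target', 'pathway', 'smile'],
    embedding_columns=[],
    ner_columns=[],
)
# load() reads train/test/fold indexes shipped under ddi_mdl/indexes, runs prep(),
# and returns X_train, X_test, y_train, y_test plus the fold index arrays.
X_train, X_test, y_train, y_test, *rest = dataset.load()
```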
ddi_fw-0.0.150/src/ddi_fw/datasets/ddi_mdl/debug.log (new file; the Turkish error message translates to "The system cannot find the file specified.")

@@ -0,0 +1 @@
+[0217/121135.683:ERROR:registration_protocol_win.cc(108)] CreateFile: Sistem belirtilen dosyayı bulamıyor. (0x2)
{ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/embedding_generator.py

@@ -58,7 +58,8 @@ def create_embeddings(model, data, column, drop_column=True):
         column_embeddings_dict[row['id']] = sum_of_embeddings
         # data.iloc[index][column+'_embedding']=sum_of_embeddings
 
-    data[column+'_embedding'] = pd.Series(column_embeddings_dict.values())
+    # data[column+'_embedding'] = pd.Series(column_embeddings_dict.values())
+    data[column+'_embedding'] = pd.Series(list(column_embeddings_dict.values()))
     if(drop_column):
         data.drop([column], axis = 1, inplace = True)
     # data[column+'_embedding'] = [column_embeddings_dict[row['name']] for index, row in data.iterrows()]
{ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/langchain/embeddings.py

@@ -82,6 +82,7 @@ class PretrainedEmbeddings(Embeddings):
                 text, return_tensors='pt', padding=True)
             output_embeddings.append(self.model(
                 input_ids).last_hidden_state.mean(dim=1))
+        return output_embeddings
 
     def embed_query(self, text: str) -> List[float]:
         return self.embed_documents([text])[0]