ddi-fw 0.0.43__tar.gz → 0.0.45__tar.gz
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/PKG-INFO +2 -2
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/pyproject.toml +2 -2
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/__init__.py +1 -1
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/core.py +8 -5
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/base.py +4 -2
- ddi_fw-0.0.45/src/ddi_fw/datasets/embedding_generator_new.py +186 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/experiments/tensorflow_helper.py +1 -1
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/experiments/test.py +3 -1
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw.egg-info/PKG-INFO +2 -2
- ddi_fw-0.0.43/src/ddi_fw/datasets/embedding_generator_new.py +0 -105
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/README.md +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/setup.cfg +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/db_utils.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/data/event.db +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes_old/test_indexes.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_0.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_1.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_2.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_3.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_4.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_indexes.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/readme.md +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/embedding_generator.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/feature_vector_generation.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/idf_helper.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/__init__.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/base.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/df_extraction_cleanxiaoyu50.csv +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/drug_information_del_noDDIxiaoyu50.csv +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/mdf_sa_ddi/mdf-sa-ddi.zip +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/setup_._py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/drugbank/__init__.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/drugbank/drugbank.xsd +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/drugbank/drugbank_parser.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/drugbank/drugbank_processor.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/drugbank/drugbank_processor_org.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/drugbank/event_extractor.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/experiments/__init__.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/experiments/custom_torch_model.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/experiments/evaluation_helper.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/ner/__init__.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/ner/mmlrestclient.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/ner/ner.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/test/basic_test.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/test/combination_test.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/test/compress_json_test.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/test/date_test.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/test/idf_score.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/test/jaccard_similarity.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/test/mlfow_test.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/test/sklearn-tfidf.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/test/test.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/test/torch_cuda_test.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/test/type_guarding_test.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/utils/__init__.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/utils/enums.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/utils/py7zr_helper.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/utils/utils.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/utils/zip_helper.py +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw.egg-info/SOURCES.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw.egg-info/requires.txt +0 -0
- {ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw.egg-info/top_level.txt +0 -0
{ddi_fw-0.0.43 → ddi_fw-0.0.45}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddi_fw
-Version: 0.0.43
+Version: 0.0.45
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -20,7 +20,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
-Requires-Python: >=3.
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: python-stopwatch
 Requires-Dist: importlib-resources
{ddi_fw-0.0.43 → ddi_fw-0.0.45}/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ddi_fw"
-version = "0.0.43"
+version = "0.0.45"
 description = "Do not use :)"
 readme = "README.md"
 authors = [
@@ -43,7 +43,7 @@ keywords = [
 # See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license
 license = { file = "LICENSE" }
 
-requires-python = ">=3.
+requires-python = ">=3.10"
 dependencies = [
     "python-stopwatch"
     ,"importlib-resources"
{ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/__init__.py
@@ -2,7 +2,7 @@ from .core import BaseDataset
 from .ddi_mdl.base import DDIMDLDataset
 from .mdf_sa_ddi.base import MDFSADDIDataset
 from .embedding_generator import create_embeddings
-from .embedding_generator_new import 
+from .embedding_generator_new import PoolingStrategy,SumPoolingStrategy,MeanPoolingStrategy,SentenceTransformerDecorator,PretrainedEmbeddings,SBertEmbeddings
 from .idf_helper import IDF
 from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
 
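The widened import above re-exports the new pooling and embedding classes at package level. A minimal sketch of what that enables downstream (the vectors are illustrative, not from the package):

```python
from ddi_fw.datasets import MeanPoolingStrategy, SumPoolingStrategy

# Two sentence-level vectors for one drug description, pooled two ways.
sentence_vectors = [[1.0, 2.0], [3.0, 4.0]]
print(MeanPoolingStrategy().apply(sentence_vectors))  # -> array([2., 3.])
print(SumPoolingStrategy().apply(sentence_vectors))   # -> array([4., 6.])
```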
{ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/core.py
@@ -5,6 +5,7 @@ from abc import ABC, abstractmethod
 import numpy as np
 import pandas as pd
 import pathlib
+from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
 from ddi_fw.datasets.idf_helper import IDF
 
 from ddi_fw.utils.zip_helper import ZipHelper
@@ -21,10 +22,11 @@ def stack(df_column):
 
 
 class BaseDataset(ABC):
-    def __init__(self,embedding_size,embedding_dict, ner_df, chemical_property_columns, embedding_columns, ner_columns,
+    def __init__(self,embedding_size,embedding_dict, embeddings_pooling_strategy:PoolingStrategy, ner_df, chemical_property_columns, embedding_columns, ner_columns,
                  **kwargs):
         self.embedding_size = embedding_size
         self.embedding_dict = embedding_dict
+        self.embeddings_pooling_strategy = embeddings_pooling_strategy
         self.ner_df = ner_df
         self.__similarity_related_columns__ = []
         self.__similarity_related_columns__.extend(chemical_property_columns)
@@ -364,13 +366,13 @@ class BaseDataset(ABC):
         # return np.hstack(
         #     (value[row['id1']], value[row['id2']]), dtype=np.float16)
 
-        def x_fnc(row, embedding_column):
+        def x_fnc(row, embedding_column,embeddings_after_pooling):
             if row['id1'] in self.embedding_dict[embedding_column]:
-                v1 = 
+                v1 = embeddings_after_pooling[embedding_column][row['id1']]
             else:
                 v1 = np.zeros(self.embedding_size)
             if row['id2'] in self.embedding_dict[embedding_column]:
-                v2 = 
+                v2 = embeddings_after_pooling[embedding_column][row['id2']]
             else:
                 v2 = np.zeros(self.embedding_size)
             return np.float16(np.hstack(
@@ -385,9 +387,10 @@ class BaseDataset(ABC):
 
         for embedding_column in self.embedding_columns:
             print(f"concat {embedding_column} embeddings")
+            embeddings_after_pooling = {k: self.embeddings_pooling_strategy.apply(v) for k,v in self.embedding_dict[embedding_column].items()}
             # column_embeddings_dict = embedding_values[embedding_column]
             self.ddis_df[embedding_column+'_embedding'] = self.ddis_df.apply(
-                x_fnc, args=(embedding_column,), axis=1)
+                x_fnc, args=(embedding_column,embeddings_after_pooling), axis=1)
 
         self.dataframe = self.ddis_df.copy()
         self.dataframe['class'] = list(classes)
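The core.py hunk threads the pooling strategy through BaseDataset: for each embedding column, every drug's list of vectors is collapsed to one fixed-size vector before the id1/id2 concatenation. A self-contained sketch of that pooling step, assuming embedding_dict[column] maps a drug id to a list of sentence-level vectors (note that, as released, x_fnc indexes the pooled dict by embedding_column first while the comprehension keys it by drug id, so the two accesses look inconsistent):

```python
import numpy as np

class MeanPoolingStrategy:
    def apply(self, embeddings):
        # Collapse a variable-length list of vectors into one vector.
        return np.mean(embeddings, axis=0)

# Assumed layout: {column: {drug_id: [vector, ...]}}
embedding_dict = {'description': {'DB00001': [[1.0, 2.0], [3.0, 4.0]],
                                  'DB00002': [[5.0, 6.0]]}}

strategy = MeanPoolingStrategy()
embeddings_after_pooling = {k: strategy.apply(v)
                            for k, v in embedding_dict['description'].items()}
# -> {'DB00001': array([2., 3.]), 'DB00002': array([5., 6.])}
```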
{ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/datasets/ddi_mdl/base.py
@@ -1,6 +1,8 @@
 import pathlib
 
 import pandas as pd
+
+from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
 from .. import BaseDataset
 from ..db_utils import create_connection
 
@@ -8,7 +10,7 @@ HERE = pathlib.Path(__file__).resolve().parent
 
 
 class DDIMDLDataset(BaseDataset):
-    def __init__(self, embedding_size, embedding_dict, ner_df, chemical_property_columns=['enzyme',
+    def __init__(self, embedding_size, embedding_dict,embeddings_pooling_strategy:PoolingStrategy, ner_df, chemical_property_columns=['enzyme',
                                                                                           'target',
                                                                                           'pathway',
                                                                                           'smile'],
@@ -16,7 +18,7 @@ class DDIMDLDataset(BaseDataset):
                  ner_columns=[],
                  **kwargs):
 
-        super().__init__(embedding_size, embedding_dict,ner_df, chemical_property_columns, embedding_columns,
+        super().__init__(embedding_size, embedding_dict,ner_df,embeddings_pooling_strategy, chemical_property_columns, embedding_columns,
                          ner_columns, **kwargs)
 
         # kwargs = {'index_path': str(HERE.joinpath('indexes'))}
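With this change, DDIMDLDataset callers must supply a pooling strategy. Note that the super().__init__ call forwards ner_df before embeddings_pooling_strategy while BaseDataset declares them in the opposite order, so the two arguments appear swapped relative to the base signature as released. A hedged construction sketch (argument values are illustrative; only the parameter names come from the diff):

```python
from ddi_fw.datasets import DDIMDLDataset, MeanPoolingStrategy

# Assumed layout: {column: {drug_id: list of sentence-level vectors}}
embedding_dict = {'description': {'DB00001': [[0.1] * 384],
                                  'DB00002': [[0.2] * 384]}}

dataset = DDIMDLDataset(
    embedding_size=384,                                 # assumed model dimension
    embedding_dict=embedding_dict,
    embeddings_pooling_strategy=MeanPoolingStrategy(),  # new required argument in 0.0.45
    ner_df=None,
    embedding_columns=['description'],
)
```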
ddi_fw-0.0.45/src/ddi_fw/datasets/embedding_generator_new.py
@@ -0,0 +1,186 @@
+# !pip install -U sentence-transformers
+
+# from transformers import BertTokenizer,BertForPreTraining,BertModel
+# from sentence_transformers import SentenceTransformer, util
+import pandas as pd
+import numpy as np
+from nltk import sent_tokenize
+import torch
+from tqdm import tqdm
+
+
+from collections import defaultdict
+from functools import partial
+from abc import ABC, abstractmethod
+from transformers import AutoModel, AutoTokenizer
+from sentence_transformers import SentenceTransformer, util
+
+from typing import Any, Dict, List, Optional
+from langchain_core.embeddings import Embeddings
+from pydantic import BaseModel, ConfigDict, Field, SecretStr
+from langchain.embeddings import SentenceTransformerEmbeddings
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+import chromadb
+
+
+
+# def split_docs(documents, chunk_size=1000, chunk_overlap=20):
+#     text_splitter = RecursiveCharacterTextSplitter(
+#         chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+#     docs = text_splitter.split_documents(documents)
+#     return docs
+
+class PoolingStrategy():
+    def __init__(self):
+        pass
+
+    def apply(self, embeddings: List[List[float]]):
+        pass
+
+
+class MeanPoolingStrategy(PoolingStrategy):
+    def __init__(self):
+        pass
+
+    def apply(self, embeddings: List[List[float]]):
+        return np.mean(embeddings, axis=0)
+
+
+class SumPoolingStrategy(PoolingStrategy):
+    def __init__(self):
+        pass
+
+    def apply(self, embeddings: List[List[float]]):
+        return np.sum(embeddings, axis=0)
+
+
+class SentenceTransformerDecorator(BaseModel, Embeddings):
+    def __init__(self, model_name="all-MiniLM-L6-v2", **kwargs: Any):
+        self.embeddings = SentenceTransformerEmbeddings(model_name=model_name)
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        return self.embeddings.embed_documents(texts)
+
+    def embed_query(self, text: str) -> List[float]:
+        return self.embeddings.embed_query(text)
+
+
+class PretrainedEmbeddings(BaseModel, Embeddings):
+    def __init__(self, model_name):
+        self.mmodel_name = model_name
+        self.model = AutoModel.from_pretrained(model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.shape = self.model.get_input_embeddings().weight.shape
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        output_embeddings = []
+        texts = list(map(lambda x: x.replace("\n", " "), texts))
+        for text in texts:
+            input_ids = self.tokenizer.encode(
+                text, return_tensors='pt', padding=True)
+            output_embeddings.append(self.model(
+                input_ids).last_hidden_state.mean(dim=1))
+
+    def embed_query(self, text: str) -> List[float]:
+        return self.embed_documents([text])[0]
+
+
+class SBertEmbeddings(BaseModel, Embeddings):
+    def __init__(self, model_name):
+        self.model = SentenceTransformer(model_name)
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        return self.model.encode(texts)
+
+    def embed_query(self, text: str) -> List[float]:
+        return self.embed_documents([text])[0]
+
+# class EmbeddingGenerator(ABC):
+
+#     def __init__(self):
+#         self.shape = None
+
+#     @abstractmethod
+#     def generate(self, text):
+#         pass
+
+# # https://github.com/huggingface/transformers/issues/1791
+# class PretrainedEmbeddingGenerator(EmbeddingGenerator):
+#     def __init__(self, model_name, split_text=True):
+#         self.model_name = model_name
+#         self.model = AutoModel.from_pretrained(model_name)
+#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+#         self.shape = self.model.get_input_embeddings().weight.shape
+#         self.split_text = split_text
+
+#     def generate(self, text):
+#         if self.split_text:
+#             sentences = sent_tokenize(text)
+#             output_embeddings = None
+#             for sentence in sentences:
+#                 input_ids = self.tokenizer.encode(sentence, return_tensors='pt', padding=True)
+#                 if output_embeddings == None:
+#                     output_embeddings = self.model(input_ids).last_hidden_state.mean(dim=1)
+#                 else:
+#                     output_embeddings += self.model(input_ids).last_hidden_state.mean(dim=1)
+#             if output_embeddings == None:
+#                 output_embeddings = torch.empty((1,self.model.get_input_embeddings().weight.shape[1]))
+#         else:
+#             encoded_input = self.tokenizer(text, return_tensors='pt')
+#             input_ids = self.tokenizer.encode(text, add_special_tokens=True, max_length=self.tokenizer.model_max_length, return_tensors='pt')
+#             # input_ids = encoded_input.input_ids[:self.tokenizer.model_max_length]
+#             output_embeddings = self.model(input_ids)
+#             # output_embeddings = self.model(**encoded_input)
+#         # sentence embedding
+#         output_embeddings = output_embeddings.last_hidden_state.mean(dim=1)
+#         return torch.flatten(output_embeddings).detach().numpy()
+
+
+# class LLMEmbeddingGenerator(EmbeddingGenerator):
+#     pass
+
+
+# class SBertEmbeddingGenerator(PretrainedEmbeddingGenerator):
+#     def __init__(self, model_name, split_text=True):
+#         self.model = SentenceTransformer(model_name)
+#         self.shape = self.model._modules['0'].get_word_embedding_dimension()
+#         self.split_text = split_text
+
+#     def generate(self, text):
+#         if text == None or type(text) != str:
+#             embeddings = None
+#         else:
+#             if self.split_text:
+#                 sentences = sent_tokenize(text)
+#                 embeddings = self.model.encode(sentences)
+#             else:
+#                 embeddings = self.model.encode(text)
+#         return embeddings
+
+
+# # NOTE: infer the input size from the model
+# def create_embeddings_new(generator: EmbeddingGenerator, data, column, drop_column=True):
+#     column_embeddings_dict = defaultdict(lambda: np.zeros(generator.shape))
+#     for index, row in tqdm(data.iterrows()):
+#         # if index == 10:
+#         #     break
+#         text = data[column][index]
+#         embeddings = generator.generate(text)
+
+#         # TODO: try supplying the embedding of a similar drug
+#         # embedding check none type
+#         if embeddings is None or len(embeddings) == 0:
+#             sum_of_embeddings = np.zeros(generator.shape)
+#         else:
+#             sum_of_embeddings = np.sum(embeddings, axis=0)
+#         # column_embeddings_dict[row['id']] = sum_of_embeddings.reshape(1, -1) # 2d
+#         column_embeddings_dict[row['id']] = sum_of_embeddings
+#         # data.iloc[index][column+'_embedding']=sum_of_embeddings
+
+#     data[column+'_embedding'] = pd.Series(column_embeddings_dict.values())
+#     if (drop_column):
+#         data.drop([column], axis=1, inplace=True)
+#     # data[column+'_embedding'] = [column_embeddings_dict[row['name']] for index, row in data.iterrows()]
+#     return column_embeddings_dict
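Two caveats in the released module: PretrainedEmbeddings.embed_documents builds output_embeddings but never returns it, and all three wrapper classes assign attributes inside __init__ on pydantic BaseModel subclasses without declaring fields, which pydantic normally rejects at runtime. A minimal plain-Python sketch of the SBert wrapper's intent (an editorial reconstruction, not the released code; "all-MiniLM-L6-v2" is the default checkpoint named above):

```python
from typing import List
from sentence_transformers import SentenceTransformer

class SBertEmbeddingsSketch:
    """Plain-Python stand-in for SBertEmbeddings (hypothetical name)."""
    def __init__(self, model_name: str):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # encode() returns a numpy array; tolist() matches the declared type
        return self.model.encode(texts).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.embed_documents([text])[0]

embedder = SBertEmbeddingsSketch('all-MiniLM-L6-v2')
vectors = embedder.embed_documents(['aspirin inhibits platelet aggregation'])
```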
{ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/experiments/tensorflow_helper.py
@@ -171,7 +171,7 @@ class TFSingleModal:
         # onnx.save(onnx_model, run.info.artifact_uri +
         #           '/model/model.onnx')
         utils.compress_and_save_data(
-            metrics.__dict__, run.info.artifact_uri, f'{self.date}
+            metrics.__dict__, run.info.artifact_uri, f'{self.date}_metrics.gzip')
         # mlflow.log_dict(metrics.__dict__, "metrics.json")
 
         # Plot Precision-Recall curves for each class and micro-average
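This completes a previously truncated f-string so metrics are written as '{date}_metrics.gzip' under the run's artifact URI. The helper's implementation is not part of this diff; a plausible sketch, assuming it gzips a JSON serialization (the signature is inferred from this call site only):

```python
import gzip
import json
import os

def compress_and_save_data(data: dict, artifact_uri: str, file_name: str) -> None:
    # Hypothetical reconstruction of ddi_fw.utils.compress_and_save_data.
    path = os.path.join(artifact_uri, file_name)
    with gzip.open(path, 'wt', encoding='utf-8') as f:
        json.dump(data, f, default=str)  # default=str tolerates numpy/datetime values
```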
{ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw/experiments/test.py
@@ -56,4 +56,6 @@
 #           callbacks=[custom_callback])
 
 # loss, accuracy = model.evaluate(test_data, test_labels,callbacks=[custom_callback])
-# print('Test accuracy: %.2f' % (accuracy))
+# print('Test accuracy: %.2f' % (accuracy))
+
+from langchain.embeddings import SentenceTransformerEmbeddings
{ddi_fw-0.0.43 → ddi_fw-0.0.45}/src/ddi_fw.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddi_fw
-Version: 0.0.43
+Version: 0.0.45
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -20,7 +20,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
-Requires-Python: >=3.
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: python-stopwatch
 Requires-Dist: importlib-resources
ddi_fw-0.0.43/src/ddi_fw/datasets/embedding_generator_new.py
@@ -1,105 +0,0 @@
-# !pip install -U sentence-transformers
-
-# from transformers import BertTokenizer,BertForPreTraining,BertModel
-# from sentence_transformers import SentenceTransformer, util
-import pandas as pd
-import numpy as np
-from nltk import sent_tokenize
-import torch
-from tqdm import tqdm
-
-
-from collections import defaultdict
-from functools import partial
-from abc import ABC, abstractmethod
-from transformers import AutoModel, AutoTokenizer
-from sentence_transformers import SentenceTransformer, util
-
-
-class EmbeddingGenerator(ABC):
-
-    def __init__(self):
-        self.shape = None
-
-    @abstractmethod
-    def generate(self, text):
-        pass
-
-# https://github.com/huggingface/transformers/issues/1791
-class PretrainedEmbeddingGenerator(EmbeddingGenerator):
-    def __init__(self, model_name, split_text=True):
-        self.model_name = model_name
-        self.model = AutoModel.from_pretrained(model_name)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.shape = self.model.get_input_embeddings().weight.shape
-        self.split_text = split_text
-
-    def generate(self, text):
-        if self.split_text:
-            sentences = sent_tokenize(text)
-            output_embeddings = None
-            for sentence in sentences:
-                input_ids = self.tokenizer.encode(sentence, return_tensors='pt', padding=True)
-                if output_embeddings == None:
-                    output_embeddings = self.model(input_ids).last_hidden_state.mean(dim=1)
-                else:
-                    output_embeddings += self.model(input_ids).last_hidden_state.mean(dim=1)
-            if output_embeddings == None:
-                output_embeddings = torch.empty((1,self.model.get_input_embeddings().weight.shape[1]))
-        else:
-            encoded_input = self.tokenizer(text, return_tensors='pt')
-            input_ids = self.tokenizer.encode(text, add_special_tokens=True, max_length=self.tokenizer.model_max_length, return_tensors='pt')
-            # input_ids = encoded_input.input_ids[:self.tokenizer.model_max_length]
-            output_embeddings = self.model(input_ids)
-            # output_embeddings = self.model(**encoded_input)
-        # sentence embedding
-        output_embeddings = output_embeddings.last_hidden_state.mean(dim=1)
-        return torch.flatten(output_embeddings).detach().numpy()
-
-
-class LLMEmbeddingGenerator(EmbeddingGenerator):
-    pass
-
-
-class SBertEmbeddingGenerator(PretrainedEmbeddingGenerator):
-    def __init__(self, model_name, split_text=True):
-        self.model = SentenceTransformer(model_name)
-        self.shape = self.model._modules['0'].get_word_embedding_dimension()
-        self.split_text = split_text
-
-    def generate(self, text):
-        if text == None or type(text) != str:
-            embeddings = None
-        else:
-            if self.split_text:
-                sentences = sent_tokenize(text)
-                embeddings = self.model.encode(sentences)
-            else:
-                embeddings = self.model.encode(text)
-        return embeddings
-
-
-# NOTE: infer the input size from the model
-def create_embeddings_new(generator: EmbeddingGenerator, data, column, drop_column=True):
-    column_embeddings_dict = defaultdict(lambda: np.zeros(generator.shape))
-    for index, row in tqdm(data.iterrows()):
-        # if index == 10:
-        #     break
-        text = data[column][index]
-        embeddings = generator.generate(text)
-
-        # TODO: try supplying the embedding of a similar drug
-        # embedding check none type
-        if embeddings is None or len(embeddings) == 0:
-            sum_of_embeddings = np.zeros(generator.shape)
-        else:
-            sum_of_embeddings = np.sum(embeddings, axis=0)
-        # column_embeddings_dict[row['id']] = sum_of_embeddings.reshape(1, -1) # 2d
-        column_embeddings_dict[row['id']] = sum_of_embeddings
-        # data.iloc[index][column+'_embedding']=sum_of_embeddings
-
-    data[column+'_embedding'] = pd.Series(column_embeddings_dict.values())
-    if (drop_column):
-        data.drop([column], axis=1, inplace=True)
-    # data[column+'_embedding'] = [column_embeddings_dict[row['name']] for index, row in data.iterrows()]
-    return column_embeddings_dict