ddi-fw 0.0.42__tar.gz → 0.0.44__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/PKG-INFO +2 -4
  2. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/pyproject.toml +2 -4
  3. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/__init__.py +1 -1
  4. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/core.py +8 -5
  5. ddi_fw-0.0.44/src/ddi_fw/datasets/embedding_generator_new.py +186 -0
  6. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/experiments/tensorflow_helper.py +9 -8
  7. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/experiments/test.py +3 -1
  8. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw.egg-info/PKG-INFO +2 -4
  9. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw.egg-info/requires.txt +0 -2
  10. ddi_fw-0.0.42/src/ddi_fw/datasets/embedding_generator_new.py +0 -105
  11. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/README.md +0 -0
  12. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/setup.cfg +0 -0
  13. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/db_utils.py +0 -0
  14. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/base.py +0 -0
  15. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/data/event.db +0 -0
  16. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt +0 -0
  17. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_0.txt +0 -0
  18. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_1.txt +0 -0
  19. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_2.txt +0 -0
  20. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_3.txt +0 -0
  21. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_4.txt +0 -0
  22. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes/train_indexes.txt +0 -0
  23. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_0.txt +0 -0
  24. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_1.txt +0 -0
  25. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_2.txt +0 -0
  26. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_3.txt +0 -0
  27. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_4.txt +0 -0
  28. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes_old/test_indexes.txt +0 -0
  29. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_0.txt +0 -0
  30. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_1.txt +0 -0
  31. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_2.txt +0 -0
  32. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_3.txt +0 -0
  33. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_4.txt +0 -0
  34. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_indexes.txt +0 -0
  35. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_0.txt +0 -0
  36. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_1.txt +0 -0
  37. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_2.txt +0 -0
  38. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_3.txt +0 -0
  39. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_4.txt +0 -0
  40. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/ddi_mdl/readme.md +0 -0
  41. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/embedding_generator.py +0 -0
  42. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/feature_vector_generation.py +0 -0
  43. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/idf_helper.py +0 -0
  44. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/__init__.py +0 -0
  45. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/base.py +0 -0
  46. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/df_extraction_cleanxiaoyu50.csv +0 -0
  47. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/drug_information_del_noDDIxiaoyu50.csv +0 -0
  48. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/test_indexes.txt +0 -0
  49. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_0.txt +0 -0
  50. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_1.txt +0 -0
  51. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_2.txt +0 -0
  52. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_3.txt +0 -0
  53. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_4.txt +0 -0
  54. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_indexes.txt +0 -0
  55. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_0.txt +0 -0
  56. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_1.txt +0 -0
  57. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_2.txt +0 -0
  58. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_3.txt +0 -0
  59. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_4.txt +0 -0
  60. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/mdf_sa_ddi/mdf-sa-ddi.zip +0 -0
  61. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/setup_._py +0 -0
  62. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/drugbank/__init__.py +0 -0
  63. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/drugbank/drugbank.xsd +0 -0
  64. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/drugbank/drugbank_parser.py +0 -0
  65. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/drugbank/drugbank_processor.py +0 -0
  66. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/drugbank/drugbank_processor_org.py +0 -0
  67. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/drugbank/event_extractor.py +0 -0
  68. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/experiments/__init__.py +0 -0
  69. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/experiments/custom_torch_model.py +0 -0
  70. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/experiments/evaluation_helper.py +0 -0
  71. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/ner/__init__.py +0 -0
  72. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/ner/mmlrestclient.py +0 -0
  73. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/ner/ner.py +0 -0
  74. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/test/basic_test.py +0 -0
  75. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/test/combination_test.py +0 -0
  76. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/test/compress_json_test.py +0 -0
  77. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/test/date_test.py +0 -0
  78. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/test/idf_score.py +0 -0
  79. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/test/jaccard_similarity.py +0 -0
  80. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/test/mlfow_test.py +0 -0
  81. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/test/sklearn-tfidf.py +0 -0
  82. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/test/test.py +0 -0
  83. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/test/torch_cuda_test.py +0 -0
  84. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/test/type_guarding_test.py +0 -0
  85. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/utils/__init__.py +0 -0
  86. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/utils/enums.py +0 -0
  87. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/utils/py7zr_helper.py +0 -0
  88. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/utils/utils.py +0 -0
  89. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/utils/zip_helper.py +0 -0
  90. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw.egg-info/SOURCES.txt +0 -0
  91. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
  92. {ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw.egg-info/top_level.txt +0 -0
{ddi_fw-0.0.42 → ddi_fw-0.0.44}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddi_fw
-Version: 0.0.42
+Version: 0.0.44
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -20,7 +20,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
-Requires-Python: >=3.8
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: python-stopwatch
 Requires-Dist: importlib-resources
@@ -41,5 +41,3 @@ Requires-Dist: tqdm
 Requires-Dist: xmlschema
 Requires-Dist: zipp
 Requires-Dist: py7zr
-Requires-Dist: tf2onnx
-Requires-Dist: tensorflow==2.15.0
{ddi_fw-0.0.42 → ddi_fw-0.0.44}/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ddi_fw"
-version = "0.0.42"
+version = "0.0.44"
 description = "Do not use :)"
 readme = "README.md"
 authors = [
@@ -43,7 +43,7 @@ keywords = [
 # See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license
 license = { file = "LICENSE" }
 
-requires-python = ">=3.8"
+requires-python = ">=3.10"
 dependencies = [
 "python-stopwatch"
 ,"importlib-resources"
@@ -64,8 +64,6 @@ dependencies = [
 ,"xmlschema"
 ,"zipp"
 ,"py7zr"
-,"tf2onnx"
-,"tensorflow==2.15.0"
 ]
 
 
{ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/__init__.py
@@ -2,7 +2,7 @@ from .core import BaseDataset
 from .ddi_mdl.base import DDIMDLDataset
 from .mdf_sa_ddi.base import MDFSADDIDataset
 from .embedding_generator import create_embeddings
-from .embedding_generator_new import EmbeddingGenerator,PretrainedEmbeddingGenerator,SBertEmbeddingGenerator,LLMEmbeddingGenerator,create_embeddings_new
+from .embedding_generator_new import PoolingStrategy,SumPoolingStrategy,MeanPoolingStrategy,SentenceTransformerDecorator,PretrainedEmbeddings,SBertEmbeddings
 from .idf_helper import IDF
 from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
 
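The re-export above swaps the old generator classes for caller-selected pooling strategies. A minimal sketch of the new surface, assuming per-drug sentence embeddings with an arbitrary dimension of 384:

import numpy as np
from ddi_fw.datasets import MeanPoolingStrategy, SumPoolingStrategy

# Sentence-level embeddings for one drug text (n_sentences x embedding_dim).
sentence_embeddings = np.random.rand(3, 384)

# Each strategy collapses the 2-D array into a single fixed-size vector.
mean_vec = MeanPoolingStrategy().apply(sentence_embeddings)  # shape (384,)
sum_vec = SumPoolingStrategy().apply(sentence_embeddings)    # shape (384,)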
{ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/datasets/core.py
@@ -5,6 +5,7 @@ from abc import ABC, abstractmethod
 import numpy as np
 import pandas as pd
 import pathlib
+from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
 from ddi_fw.datasets.idf_helper import IDF
 
 from ddi_fw.utils.zip_helper import ZipHelper
@@ -21,10 +22,11 @@ def stack(df_column):
 
 
 class BaseDataset(ABC):
-    def __init__(self,embedding_size,embedding_dict, ner_df, chemical_property_columns, embedding_columns, ner_columns,
+    def __init__(self,embedding_size,embedding_dict, embeddings_pooling_strategy:PoolingStrategy, ner_df, chemical_property_columns, embedding_columns, ner_columns,
                  **kwargs):
         self.embedding_size = embedding_size
         self.embedding_dict = embedding_dict
+        self.embeddings_pooling_strategy = embeddings_pooling_strategy
         self.ner_df = ner_df
         self.__similarity_related_columns__ = []
         self.__similarity_related_columns__.extend(chemical_property_columns)
@@ -364,13 +366,13 @@ class BaseDataset(ABC):
         # return np.hstack(
         #     (value[row['id1']], value[row['id2']]), dtype=np.float16)
 
-        def x_fnc(row, embedding_column):
+        def x_fnc(row, embedding_column,embeddings_after_pooling):
             if row['id1'] in self.embedding_dict[embedding_column]:
-                v1 = self.embedding_dict[embedding_column][row['id1']]
+                v1 = embeddings_after_pooling[embedding_column][row['id1']]
             else:
                 v1 = np.zeros(self.embedding_size)
             if row['id2'] in self.embedding_dict[embedding_column]:
-                v2 = self.embedding_dict[embedding_column][row['id2']]
+                v2 = embeddings_after_pooling[embedding_column][row['id2']]
             else:
                 v2 = np.zeros(self.embedding_size)
             return np.float16(np.hstack(
@@ -385,9 +387,10 @@ class BaseDataset(ABC):
 
         for embedding_column in self.embedding_columns:
             print(f"concat {embedding_column} embeddings")
+            embeddings_after_pooling = {k: self.embeddings_pooling_strategy.apply(v) for k,v in self.embedding_dict[embedding_column].items()}
             # column_embeddings_dict = embedding_values[embedding_column]
             self.ddis_df[embedding_column+'_embedding'] = self.ddis_df.apply(
-                x_fnc, args=(embedding_column,), axis=1)
+                x_fnc, args=(embedding_column,embeddings_after_pooling), axis=1)
 
         self.dataframe = self.ddis_df.copy()
         self.dataframe['class'] = list(classes)
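BaseDataset now pools each drug's sentence embeddings once per column before x_fnc concatenates the id1/id2 vectors. A hedged sketch of that step, assuming embedding_dict maps each embedding column to a dict of drug id → 2-D sentence-embedding array (the ids and shapes are invented):

import numpy as np
from ddi_fw.datasets import MeanPoolingStrategy

embedding_dict = {'description': {'DB00001': np.random.rand(4, 384),
                                  'DB00002': np.random.rand(2, 384)}}
strategy = MeanPoolingStrategy()

for column, per_drug in embedding_dict.items():
    # Same comprehension as the diff above: one pooled vector per drug id,
    # computed once per column instead of on every row of ddis_df.
    embeddings_after_pooling = {k: strategy.apply(v) for k, v in per_drug.items()}
    print(column, {k: v.shape for k, v in embeddings_after_pooling.items()})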
ddi_fw-0.0.44/src/ddi_fw/datasets/embedding_generator_new.py (new file)
@@ -0,0 +1,186 @@
+# !pip install -U sentence-transformers
+
+# from transformers import BertTokenizer,BertForPreTraining,BertModel
+# from sentence_transformers import SentenceTransformer, util
+import pandas as pd
+import numpy as np
+from nltk import sent_tokenize
+import torch
+from tqdm import tqdm
+
+
+from collections import defaultdict
+from functools import partial
+from abc import ABC, abstractmethod
+from transformers import AutoModel, AutoTokenizer
+from sentence_transformers import SentenceTransformer, util
+
+from typing import Any, Dict, List, Optional
+from langchain_core.embeddings import Embeddings
+from pydantic import BaseModel, ConfigDict, Field, SecretStr
+from langchain.embeddings import SentenceTransformerEmbeddings
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+import chromadb
+
+
+
+# def split_docs(documents, chunk_size=1000, chunk_overlap=20):
+#     text_splitter = RecursiveCharacterTextSplitter(
+#         chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+#     docs = text_splitter.split_documents(documents)
+#     return docs
+
+class PoolingStrategy():
+    def __init__(self):
+        pass
+
+    def apply(self, embeddings: List[List[float]]):
+        pass
+
+
+class MeanPoolingStrategy(PoolingStrategy):
+    def __init__(self):
+        pass
+
+    def apply(self, embeddings: List[List[float]]):
+        return np.mean(embeddings, axis=0)
+
+
+class SumPoolingStrategy(PoolingStrategy):
+    def __init__(self):
+        pass
+
+    def apply(self, embeddings: List[List[float]]):
+        return np.sum(embeddings, axis=0)
+
+
+class SentenceTransformerDecorator(BaseModel, Embeddings):
+    def __init__(self, model_name="all-MiniLM-L6-v2", **kwargs: Any):
+        self.embeddings = SentenceTransformerEmbeddings(model_name=model_name)
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        return self.embeddings.embed_documents(texts)
+
+    def embed_query(self, text: str) -> List[float]:
+        return self.embeddings.embed_query(text)
+
+
+class PretrainedEmbeddings(BaseModel, Embeddings):
+    def __init__(self, model_name):
+        self.mmodel_name = model_name
+        self.model = AutoModel.from_pretrained(model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.shape = self.model.get_input_embeddings().weight.shape
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        output_embeddings = []
+        texts = list(map(lambda x: x.replace("\n", " "), texts))
+        for text in texts:
+            input_ids = self.tokenizer.encode(
+                text, return_tensors='pt', padding=True)
+            output_embeddings.append(self.model(
+                input_ids).last_hidden_state.mean(dim=1))
+
+    def embed_query(self, text: str) -> List[float]:
+        return self.embed_documents([text])[0]
+
+
+class SBertEmbeddings(BaseModel, Embeddings):
+    def __init__(self, model_name):
+        self.model = SentenceTransformer(model_name)
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        return self.model.encode(texts)
+
+    def embed_query(self, text: str) -> List[float]:
+        return self.embed_documents([text])[0]
+
+# class EmbeddingGenerator(ABC):
+
+#     def __init__(self):
+#         self.shape = None
+
+#     @abstractmethod
+#     def generate(self, text):
+#         pass
+
+# # https://github.com/huggingface/transformers/issues/1791
+# class PretrainedEmbeddingGenerator(EmbeddingGenerator):
+#     def __init__(self, model_name, split_text=True):
+#         self.model_name = model_name
+#         self.model = AutoModel.from_pretrained(model_name)
+#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+#         self.shape = self.model.get_input_embeddings().weight.shape
+#         self.split_text = split_text
+
+#     def generate(self, text):
+#         if self.split_text:
+#             sentences = sent_tokenize(text)
+#             output_embeddings = None
+#             for sentence in sentences:
+#                 input_ids = self.tokenizer.encode(sentence, return_tensors='pt', padding=True)
+#                 if output_embeddings == None:
+#                     output_embeddings = self.model(input_ids).last_hidden_state.mean(dim=1)
+#                 else:
+#                     output_embeddings += self.model(input_ids).last_hidden_state.mean(dim=1)
+#             if output_embeddings == None:
+#                 output_embeddings = torch.empty((1,self.model.get_input_embeddings().weight.shape[1]))
+#         else:
+#             encoded_input = self.tokenizer(text, return_tensors='pt')
+#             input_ids = self.tokenizer.encode(text, add_special_tokens=True, max_length=self.tokenizer.model_max_length, return_tensors='pt')
+#             # input_ids = encoded_input.input_ids[:self.tokenizer.model_max_length]
+#             output_embeddings = self.model(input_ids)
+#             # output_embeddings = self.model(**encoded_input)
+#             # sentence embedding
+#             output_embeddings = output_embeddings.last_hidden_state.mean(dim=1)
+#         return torch.flatten(output_embeddings).detach().numpy()
+
+
+# class LLMEmbeddingGenerator(EmbeddingGenerator):
+#     pass
+
+
+# class SBertEmbeddingGenerator(PretrainedEmbeddingGenerator):
+#     def __init__(self, model_name, split_text=True):
+#         self.model = SentenceTransformer(model_name)
+#         self.shape = self.model._modules['0'].get_word_embedding_dimension()
+#         self.split_text = split_text
+
+#     def generate(self, text):
+#         if text == None or type(text) != str:
+#             embeddings = None
+#         else:
+#             if self.split_text:
+#                 sentences = sent_tokenize(text)
+#                 embeddings = self.model.encode(sentences)
+#             else:
+#                 embeddings = self.model.encode(text)
+#         return embeddings
+
+
+# # NOTE: infer the input size from the model
+# def create_embeddings_new(generator: EmbeddingGenerator, data, column, drop_column=True):
+#     column_embeddings_dict = defaultdict(lambda: np.zeros(generator.shape))
+#     for index, row in tqdm(data.iterrows()):
+#         # if index == 10:
+#         #     break
+#         text = data[column][index]
+#         embeddings = generator.generate(text)
+
+#         # TODO try supplying the embedding of the most similar drug
+#         # embedding check none type
+#         if embeddings is None or len(embeddings) == 0:
+#             sum_of_embeddings = np.zeros(generator.shape)
+#         else:
+#             sum_of_embeddings = np.sum(embeddings, axis=0)
+#         # column_embeddings_dict[row['id']] = sum_of_embeddings.reshape(1, -1) # 2d
+#         column_embeddings_dict[row['id']] = sum_of_embeddings
+#         # data.iloc[index][column+'_embedding']=sum_of_embeddings
+
+#     data[column+'_embedding'] = pd.Series(column_embeddings_dict.values())
+#     if (drop_column):
+#         data.drop([column], axis=1, inplace=True)
+#     # data[column+'_embedding'] = [column_embeddings_dict[row['name']] for index, row in data.iterrows()]
+#     return column_embeddings_dict
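The new classes implement langchain's Embeddings interface instead of the old generate() API. A minimal usage sketch of the wrapped interface (the sentences are invented; all-MiniLM-L6-v2, the decorator's default model, produces 384-dimensional vectors):

from langchain.embeddings import SentenceTransformerEmbeddings

embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
doc_vectors = embedder.embed_documents(["Drug A inhibits CYP3A4.",
                                        "Drug B is metabolized by CYP3A4."])
query_vector = embedder.embed_query("CYP3A4 interaction")
print(len(doc_vectors), len(doc_vectors[0]))  # 2 documents, 384 dims each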
{ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/experiments/tensorflow_helper.py
@@ -14,8 +14,8 @@ import time
 from mlflow.models import infer_signature
 from ddi_fw.experiments.evaluation_helper import evaluate
 
-import tf2onnx
-import onnx
+# import tf2onnx
+# import onnx
 
 import itertools
 import ddi_fw.utils as utils
@@ -45,7 +45,7 @@ class TFMultiModal:
 
     def predict(self, combinations: list = [], generate_combinations=False):
         self.prefix = utils.utc_time_as_string()
-        self.date = utils.utc_time_as_string_simple_format
+        self.date = utils.utc_time_as_string_simple_format()
         sum = np.zeros(
             (self.y_test_label.shape[0], self.y_test_label.shape[1]))
         single_results = dict()
@@ -165,12 +165,13 @@ class TFSingleModal:
                 signature=signature,
             )
             print(run.info.artifact_uri)
-            onnx_model, _ = tf2onnx.convert.from_keras(
-                best_model, input_signature=None, opset=13)
-            onnx.save(onnx_model, run.info.artifact_uri +
-                      '/model/model.onnx')
+            # todo tf2onnx not compatible with keras > 2.15
+            # onnx_model, _ = tf2onnx.convert.from_keras(
+            #     best_model, input_signature=None, opset=13)
+            # onnx.save(onnx_model, run.info.artifact_uri +
+            #           '/model/model.onnx')
             utils.compress_and_save_data(
-                metrics.__dict__, run.info.artifact_uri, f'{self.date}metrics.gzip')
+                metrics.__dict__, run.info.artifact_uri, f'{self.date}_metrics.gzip')
             # mlflow.log_dict(metrics.__dict__, "metrics.json")
 
             # Plot Precision-Recall curves for each class and micro-average
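The predict() fix above adds the call parentheses that were missing: without them self.date binds the function object rather than a timestamp, so the metrics filename would embed the function's repr. A stand-alone illustration (the helper below is a hypothetical stand-in for ddi_fw.utils.utc_time_as_string_simple_format):

from datetime import datetime, timezone

def utc_time_as_string_simple_format():
    # Hypothetical stand-in for the ddi_fw.utils helper of the same name.
    return datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')

date = utc_time_as_string_simple_format      # bug: binds the function object
print(f'{date}metrics.gzip')                 # '<function utc_time_as_...>metrics.gzip'
date = utc_time_as_string_simple_format()    # fix: binds a timestamp string
print(f'{date}_metrics.gzip')                # e.g. '20240101_120000_metrics.gzip'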
{ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw/experiments/test.py
@@ -56,4 +56,6 @@
 #           callbacks=[custom_callback])
 
 # loss, accuracy = model.evaluate(test_data, test_labels,callbacks=[custom_callback])
-# print('Test accuracy: %.2f' % (accuracy))
+# print('Test accuracy: %.2f' % (accuracy))
+
+from langchain.embeddings import SentenceTransformerEmbeddings
{ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddi_fw
-Version: 0.0.42
+Version: 0.0.44
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -20,7 +20,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
-Requires-Python: >=3.8
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: python-stopwatch
 Requires-Dist: importlib-resources
@@ -41,5 +41,3 @@ Requires-Dist: tqdm
 Requires-Dist: xmlschema
 Requires-Dist: zipp
 Requires-Dist: py7zr
-Requires-Dist: tf2onnx
-Requires-Dist: tensorflow==2.15.0
{ddi_fw-0.0.42 → ddi_fw-0.0.44}/src/ddi_fw.egg-info/requires.txt
@@ -17,5 +17,3 @@ tqdm
 xmlschema
 zipp
 py7zr
-tf2onnx
-tensorflow==2.15.0
@@ -1,105 +0,0 @@
1
- # !pip install -U sentence-transformers
2
-
3
- # from transformers import BertTokenizer,BertForPreTraining,BertModel
4
- # from sentence_transformers import SentenceTransformer, util
5
- import pandas as pd
6
- import numpy as np
7
- from nltk import sent_tokenize
8
- import torch
9
- from tqdm import tqdm
10
-
11
-
12
- from collections import defaultdict
13
- from functools import partial
14
- from abc import ABC, abstractmethod
15
- from transformers import AutoModel, AutoTokenizer
16
- from sentence_transformers import SentenceTransformer, util
17
-
18
-
19
- class EmbeddingGenerator(ABC):
20
-
21
- def __init__(self):
22
- self.shape = None
23
-
24
- @abstractmethod
25
- def generate(self, text):
26
- pass
27
-
28
- # https://github.com/huggingface/transformers/issues/1791
29
- class PretrainedEmbeddingGenerator(EmbeddingGenerator):
30
- def __init__(self, model_name, split_text=True):
31
- self.model_name = model_name
32
- self.model = AutoModel.from_pretrained(model_name)
33
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
34
- self.shape = self.model.get_input_embeddings().weight.shape
35
- self.split_text = split_text
36
-
37
- def generate(self, text):
38
- if self.split_text:
39
- sentences = sent_tokenize(text)
40
- output_embeddings = None
41
- for sentence in sentences:
42
- input_ids = self.tokenizer.encode(sentence, return_tensors='pt', padding=True)
43
- if output_embeddings == None:
44
- output_embeddings = self.model(input_ids).last_hidden_state.mean(dim=1)
45
- else:
46
- output_embeddings += self.model(input_ids).last_hidden_state.mean(dim=1)
47
- if output_embeddings == None:
48
- output_embeddings = torch.empty((1,self.model.get_input_embeddings().weight.shape[1]))
49
- else:
50
- encoded_input = self.tokenizer(text, return_tensors='pt')
51
- input_ids = self.tokenizer.encode(text, add_special_tokens=True, max_length=self.tokenizer.model_max_length, return_tensors='pt')
52
- # input_ids = encoded_input.input_ids[:self.tokenizer.model_max_length]
53
- output_embeddings = self.model(input_ids)
54
- # output_embeddings = self.model(**encoded_input)
55
- # sentence embedding
56
- output_embeddings = output_embeddings.last_hidden_state.mean(dim=1)
57
- return torch.flatten(output_embeddings).detach().numpy()
58
-
59
-
60
- class LLMEmbeddingGenerator(EmbeddingGenerator):
61
- pass
62
-
63
-
64
- class SBertEmbeddingGenerator(PretrainedEmbeddingGenerator):
65
- def __init__(self, model_name, split_text=True):
66
- self.model = SentenceTransformer(model_name)
67
- self.shape = self.model._modules['0'].get_word_embedding_dimension()
68
- self.split_text = split_text
69
-
70
- def generate(self, text):
71
- if text == None or type(text) != str:
72
- embeddings = None
73
- else:
74
- if self.split_text:
75
- sentences = sent_tokenize(text)
76
- embeddings = self.model.encode(sentences)
77
- else:
78
- embeddings = self.model.encode(text)
79
- return embeddings
80
-
81
-
82
- # NOT modelden input size'ı anlama,
83
- def create_embeddings_new(generator: EmbeddingGenerator, data, column, drop_column=True):
84
- column_embeddings_dict = defaultdict(lambda: np.zeros(generator.shape))
85
- for index, row in tqdm(data.iterrows()):
86
- # if index == 10:
87
- # break
88
- text = data[column][index]
89
- embeddings = generator.generate(text)
90
-
91
- # TODO benzer olan ilacın embedding değerini vererek dene
92
- # embedding check none type
93
- if embeddings is None or len(embeddings) == 0:
94
- sum_of_embeddings = np.zeros(generator.shape)
95
- else:
96
- sum_of_embeddings = np.sum(embeddings, axis=0)
97
- # column_embeddings_dict[row['id']] = sum_of_embeddings.reshape(1, -1) # 2d
98
- column_embeddings_dict[row['id']] = sum_of_embeddings
99
- # data.iloc[index][column+'_embedding']=sum_of_embeddings
100
-
101
- data[column+'_embedding'] = pd.Series(column_embeddings_dict.values())
102
- if (drop_column):
103
- data.drop([column], axis=1, inplace=True)
104
- # data[column+'_embedding'] = [column_embeddings_dict[row['name']] for index, row in data.iterrows()]
105
- return column_embeddings_dict