ddi-fw 0.0.148.tar.gz → 0.0.150.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/PKG-INFO +6 -3
  2. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/pyproject.toml +19 -4
  3. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/__init__.py +1 -1
  4. ddi_fw-0.0.150/src/ddi_fw/datasets/core.py +211 -0
  5. ddi_fw-0.0.150/src/ddi_fw/datasets/dataset_splitter.py +39 -0
  6. ddi_fw-0.0.150/src/ddi_fw/datasets/ddi_mdl/base.py +213 -0
  7. ddi_fw-0.0.150/src/ddi_fw/datasets/ddi_mdl/debug.log +1 -0
  8. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/embedding_generator.py +2 -1
  9. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/langchain/embeddings.py +1 -0
  10. ddi_fw-0.0.150/src/ddi_fw/ml/evaluation_helper.py +195 -0
  11. ddi_fw-0.0.150/src/ddi_fw/ml/ml_helper.py +187 -0
  12. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/ml/model_wrapper.py +2 -2
  13. ddi_fw-0.0.150/src/ddi_fw/ml/pytorch_wrapper.py +186 -0
  14. ddi_fw-0.0.150/src/ddi_fw/ml/tensorflow_wrapper.py +260 -0
  15. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/ner/ner.py +93 -39
  16. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +4 -2
  17. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/pipeline/multi_pipeline.py +2 -15
  18. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/pipeline/ner_pipeline.py +15 -6
  19. ddi_fw-0.0.150/src/ddi_fw/pipeline/pipeline.py +250 -0
  20. ddi_fw-0.0.148/src/ddi_fw/test/compress_json_test.py → ddi_fw-0.0.150/src/ddi_fw/utils/json_helper.py +1 -15
  21. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw.egg-info/PKG-INFO +6 -3
  22. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw.egg-info/SOURCES.txt +3 -12
  23. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw.egg-info/requires.txt +4 -1
  24. ddi_fw-0.0.148/src/ddi_fw/datasets/core.py +0 -405
  25. ddi_fw-0.0.148/src/ddi_fw/datasets/ddi_mdl/base.py +0 -149
  26. ddi_fw-0.0.148/src/ddi_fw/ml/evaluation_helper.py +0 -326
  27. ddi_fw-0.0.148/src/ddi_fw/ml/ml_helper.py +0 -143
  28. ddi_fw-0.0.148/src/ddi_fw/ml/pytorch_wrapper.py +0 -83
  29. ddi_fw-0.0.148/src/ddi_fw/ml/tensorflow_wrapper.py +0 -168
  30. ddi_fw-0.0.148/src/ddi_fw/pipeline/pipeline.py +0 -192
  31. ddi_fw-0.0.148/src/ddi_fw/test/__init__.py +0 -0
  32. ddi_fw-0.0.148/src/ddi_fw/test/basic_test.py +0 -15
  33. ddi_fw-0.0.148/src/ddi_fw/test/combination_test.py +0 -12
  34. ddi_fw-0.0.148/src/ddi_fw/test/date_test.py +0 -15
  35. ddi_fw-0.0.148/src/ddi_fw/test/idf_score.py +0 -54
  36. ddi_fw-0.0.148/src/ddi_fw/test/jaccard_similarity.py +0 -85
  37. ddi_fw-0.0.148/src/ddi_fw/test/mlfow_test.py +0 -165
  38. ddi_fw-0.0.148/src/ddi_fw/test/sklearn-tfidf.py +0 -16
  39. ddi_fw-0.0.148/src/ddi_fw/test/test.py +0 -93
  40. ddi_fw-0.0.148/src/ddi_fw/test/torch_cuda_test.py +0 -9
  41. ddi_fw-0.0.148/src/ddi_fw/test/type_guarding_test.py +0 -18
  42. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/README.md +0 -0
  43. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/setup.cfg +0 -0
  44. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/db_utils.py +0 -0
  45. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/data/event.db +0 -0
  46. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt +0 -0
  47. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_0.txt +0 -0
  48. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_1.txt +0 -0
  49. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_2.txt +0 -0
  50. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_3.txt +0 -0
  51. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_4.txt +0 -0
  52. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/train_indexes.txt +0 -0
  53. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_0.txt +0 -0
  54. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_1.txt +0 -0
  55. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_2.txt +0 -0
  56. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_3.txt +0 -0
  57. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_4.txt +0 -0
  58. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/test_indexes.txt +0 -0
  59. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_0.txt +0 -0
  60. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_1.txt +0 -0
  61. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_2.txt +0 -0
  62. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_3.txt +0 -0
  63. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_4.txt +0 -0
  64. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_indexes.txt +0 -0
  65. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_0.txt +0 -0
  66. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_1.txt +0 -0
  67. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_2.txt +0 -0
  68. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_3.txt +0 -0
  69. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_4.txt +0 -0
  70. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl/readme.md +0 -0
  71. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/base.py +0 -0
  72. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/data/event.db +0 -0
  73. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/test_indexes.txt +0 -0
  74. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_0.txt +0 -0
  75. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_1.txt +0 -0
  76. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_2.txt +0 -0
  77. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_3.txt +0 -0
  78. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_4.txt +0 -0
  79. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_indexes.txt +0 -0
  80. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_0.txt +0 -0
  81. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_1.txt +0 -0
  82. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_2.txt +0 -0
  83. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_3.txt +0 -0
  84. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_4.txt +0 -0
  85. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/feature_vector_generation.py +0 -0
  86. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/idf_helper.py +0 -0
  87. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/__init__.py +0 -0
  88. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/base.py +0 -0
  89. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/df_extraction_cleanxiaoyu50.csv +0 -0
  90. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/drug_information_del_noDDIxiaoyu50.csv +0 -0
  91. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/test_indexes.txt +0 -0
  92. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_0.txt +0 -0
  93. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_1.txt +0 -0
  94. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_2.txt +0 -0
  95. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_3.txt +0 -0
  96. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_4.txt +0 -0
  97. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_indexes.txt +0 -0
  98. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_0.txt +0 -0
  99. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_1.txt +0 -0
  100. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_2.txt +0 -0
  101. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_3.txt +0 -0
  102. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_4.txt +0 -0
  103. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/mdf_sa_ddi/mdf-sa-ddi.zip +0 -0
  104. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/setup_._py +0 -0
  105. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/drugbank/__init__.py +0 -0
  106. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/drugbank/drugbank.xsd +0 -0
  107. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/drugbank/drugbank_parser.py +0 -0
  108. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/drugbank/drugbank_processor.py +0 -0
  109. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/drugbank/drugbank_processor_org.py +0 -0
  110. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/drugbank/event_extractor.py +0 -0
  111. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/langchain/__init__.py +0 -0
  112. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
  113. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/langchain/storage.py +0 -0
  114. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/ml/__init__.py +0 -0
  115. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/ner/__init__.py +0 -0
  116. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/ner/mmlrestclient.py +0 -0
  117. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/pipeline/__init__.py +0 -0
  118. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/__init__.py +0 -0
  119. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/enums.py +0 -0
  120. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/kaggle.py +0 -0
  121. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/package_helper.py +0 -0
  122. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/py7zr_helper.py +0 -0
  123. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/utils.py +0 -0
  124. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/utils/zip_helper.py +0 -0
  125. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
  126. {ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw.egg-info/top_level.txt +0 -0
{ddi_fw-0.0.148 → ddi_fw-0.0.150}/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.148
+Version: 0.0.150
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -22,6 +22,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
+Requires-Dist: pydantic==2.10.6
 Requires-Dist: importlib-resources==6.4.5
 Requires-Dist: python-stopwatch==1.1.11
 Requires-Dist: lxml==5.3.0
@@ -35,7 +36,7 @@ Requires-Dist: rdkit==2023.3.3
 Requires-Dist: scikit-learn==1.5.2
 Requires-Dist: scipy==1.13.1
 Requires-Dist: accelerate>=0.33.0
-Requires-Dist: sentence-transformers>=3.0.1
+Requires-Dist: sentence-transformers<=3.3.1,>=3.0.1
 Requires-Dist: transformers>=4.42.4
 Requires-Dist: stanza==1.9.2
 Requires-Dist: tokenizers>=0.19.1
@@ -49,3 +50,5 @@ Requires-Dist: chromadb>=0.5.15
 Requires-Dist: langchain_community==0.3.3
 Requires-Dist: datasets==3.0.2
 Requires-Dist: unstructured==0.16.3
+Requires-Dist: tensorflow<2.18.0,>=2.17.0
+Requires-Dist: tf-keras==2.17.0
{ddi_fw-0.0.148 → ddi_fw-0.0.150}/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ddi_fw"
-version = "0.0.148"
+version = "0.0.150"
 description = "Do not use :)"
 readme = "README.md"
 authors = [
@@ -45,7 +45,8 @@ license = { file = "LICENSE" }
 
 requires-python = ">=3.10"
 dependencies = [
-"importlib-resources==6.4.5"
+"pydantic==2.10.6"
+,"importlib-resources==6.4.5"
 ,"python-stopwatch==1.1.11"
 ,"lxml==5.3.0"
 ,"matplotlib==3.8.0"
@@ -58,7 +59,7 @@ dependencies = [
 ,"scikit-learn==1.5.2"
 ,"scipy==1.13.1"
 ,"accelerate>=0.33.0"
-,"sentence-transformers>=3.0.1"
+,"sentence-transformers>=3.0.1,<=3.3.1"
 ,"transformers>=4.42.4"
 ,"stanza==1.9.2"
 ,"tokenizers>=0.19.1"
@@ -71,7 +72,9 @@ dependencies = [
 ,"chromadb>=0.5.15"
 ,"langchain_community==0.3.3"
 ,"datasets==3.0.2"
-,"unstructured==0.16.3"
+,"unstructured==0.16.3",
+"tensorflow>=2.17.0,<2.18.0"
+,"tf-keras==2.17.0"
 ]
 
 
@@ -85,3 +88,15 @@ where = ["src"] # list of folders that contain the packages (["."] by default)
 # See https://setuptools.pypa.io/en/latest/userguide/datafiles.html
 [tool.setuptools.package-data]
 "*" = ["*.*"]
+
+[tool.coverage.run]
+source = ["src"]
+
+# pyproject.toml
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = "-ra -q"
+
+testpaths = [
+    "tests"
+]
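
The pins worth noting here: pydantic becomes a hard dependency (2.10.6), sentence-transformers gains an upper bound (<=3.3.1), and TensorFlow is constrained to the 2.17 series alongside tf-keras==2.17.0 (the Keras 2 compatibility package matched to the same TF release). A quick way to confirm an installed environment satisfies the new pins; this check script is illustrative, not part of the package:

from importlib.metadata import version

# Illustrative check, not part of ddi_fw: verify the pins added in 0.0.150.
for dist, expected in [("pydantic", "2.10.6"), ("tf-keras", "2.17.0")]:
    assert version(dist) == expected, f"{dist}: {version(dist)} != {expected}"

tf_version = version("tensorflow")
assert tf_version.startswith("2.17."), f"tensorflow {tf_version} outside >=2.17,<2.18"
print("environment matches the 0.0.150 pins")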
{ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/__init__.py
@@ -5,7 +5,7 @@ from .mdf_sa_ddi.base import MDFSADDIDataset
 from .embedding_generator import create_embeddings
 from .idf_helper import IDF
 from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
-
+from .dataset_splitter import DatasetSplitter
 __all__ = ['BaseDataset','DDIMDLDataset','MDFSADDIDataset']
 
 
ddi_fw-0.0.150/src/ddi_fw/datasets/core.py (new file)
@@ -0,0 +1,211 @@
+import glob
+from typing import List, Optional, Type
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, Field, computed_field
+from ddi_fw.datasets.dataset_splitter import DatasetSplitter
+from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
+from ddi_fw.langchain.embeddings import PoolingStrategy
+from ddi_fw.utils.utils import create_folder_if_not_exists
+
+
+def stack(df_column):
+    return np.stack(df_column.values)
+
+
+def generate_vectors(df, columns):
+    vectorGenerator = VectorGenerator(df)
+    generated_vectors = vectorGenerator.generate_feature_vectors(
+        columns)
+    return generated_vectors
+
+
+def generate_sim_matrices_new(df, generated_vectors, columns, key_column="id"):
+    jaccard_sim_dict = {}
+    sim_matrix_gen = SimilarityMatrixGenerator()
+
+    for column in columns:
+        # key = '2D_'+column
+        key = column
+        jaccard_sim_dict[column] = sim_matrix_gen.create_jaccard_similarity_matrices(
+            generated_vectors[key])
+
+    similarity_matrices = {}
+    keys = df[key_column].to_list()
+    new_columns = {}
+    for idx in range(len(keys)):
+        new_columns[idx] = keys[idx]
+    for column in columns:
+        new_df = pd.DataFrame.from_dict(jaccard_sim_dict[column])
+        new_df = new_df.rename(index=new_columns, columns=new_columns)
+        similarity_matrices[column] = new_df
+    return similarity_matrices
+
+
+class BaseDataset(BaseModel):
+    dataset_name: str
+    index_path: str
+    dataset_splitter_type: Type[DatasetSplitter]
+    class_column: str = 'class'
+    dataframe: Optional[pd.DataFrame] = None
+    X_train: Optional[pd.DataFrame] = None
+    X_test: Optional[pd.DataFrame] = None
+    y_train: Optional[pd.Series] = None
+    y_test: Optional[pd.Series] = None
+    train_indexes: Optional[pd.Index] = None
+    test_indexes: Optional[pd.Index] = None
+    train_idx_arr: List | None = None
+    val_idx_arr: List | None = None
+    # train_idx_arr: Optional[List[np.ndarray]] = None
+    # val_idx_arr: Optional[List[np.ndarray]] = None
+    columns: List[str] = []
+
+    # feature_process: FeatureProcessor
+    # similarity_matrix_service: SimilarityMatrixService
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def produce_inputs(self):
+        items = []
+        if self.X_train is None or self.X_test is None:
+            raise Exception("There is no data to produce inputs")
+        y_train_label, y_test_label = stack(self.y_train), stack(self.y_test)
+
+        for column in self.columns:
+            train_data, test_data = stack(
+                self.X_train[column]), stack(self.X_test[column])
+            items.append([f'{column}', np.nan_to_num(train_data),
+                          y_train_label, np.nan_to_num(test_data), y_test_label])
+
+            # items.append([f'{column}_embedding', train_data,
+            #               y_train_label, test_data, y_test_label])
+        return items
+
+    @computed_field
+    @property
+    def dataset_splitter(self) -> DatasetSplitter:
+        return self.dataset_splitter_type()
+
+    def set_dataframe(self, dataframe: pd.DataFrame):
+        self.dataframe = dataframe
+
+    # @abstractmethod
+    def prep(self):
+        pass
+
+    def load(self):
+        if self.index_path is None:
+            raise Exception(
+                "There is no index path, please call split function")
+
+        try:
+            train_idx_all, test_idx_all, train_idx_arr, val_idx_arr = self.__get_indexes__(
+                self.index_path)
+        except FileNotFoundError as e:
+            raise FileNotFoundError(f"Index files not found: {e.filename}")
+
+        self.prep()
+
+        if self.dataframe is None:
+            raise Exception("There is no dataframe")
+
+        train = self.dataframe[self.dataframe.index.isin(train_idx_all)]
+        test = self.dataframe[self.dataframe.index.isin(test_idx_all)]
+
+        self.X_train = train.drop(self.class_column, axis=1)
+        self.y_train = train[self.class_column]
+        self.X_test = test.drop(self.class_column, axis=1)
+        self.y_test = test[self.class_column]
+
+        self.train_indexes = self.X_train.index
+        self.test_indexes = self.X_test.index
+        self.train_idx_arr = train_idx_arr
+        self.val_idx_arr = val_idx_arr
+
+        return self.X_train, self.X_test, self.y_train, self.y_test, self.X_train.index, self.X_test.index, train_idx_arr, val_idx_arr
+
+    def __get_indexes__(self, path):
+        train_index_path = path+'/train_indexes.txt'
+        test_index_path = path+'/test_indexes.txt'
+        train_fold_files = f'{path}/train_fold_*.txt'
+        val_fold_files = f'{path}/validation_fold_*.txt'
+        train_idx_arr = []
+        val_idx_arr = []
+        with open(train_index_path, 'r', encoding="utf8") as f:
+            train_idx_all = [int(r) for r in f.readlines()]
+        with open(test_index_path, 'r', encoding="utf8") as f:
+            test_idx_all = [int(r) for r in f.readlines()]
+
+        for filepath in glob.glob(train_fold_files):
+            with open(filepath, 'r', encoding="utf8") as f:
+                train_idx = [int(r) for r in f.readlines()]
+                train_idx_arr.append(train_idx)
+        for filepath in glob.glob(val_fold_files):
+            with open(filepath, 'r', encoding="utf8") as f:
+                val_idx = [int(r) for r in f.readlines()]
+                val_idx_arr.append(val_idx)
+        return train_idx_all, test_idx_all, train_idx_arr, val_idx_arr
+
+    def __save_indexes__(self, path, filename, indexes):
+        create_folder_if_not_exists(path)
+        file_path = path + '/'+filename
+        str_indexes = [str(index) for index in indexes]
+        with open(file_path, 'w') as f:
+            f.write('\n'.join(str_indexes))
+
+    def split_dataset(self, save_indexes: bool = False):
+        # TODO class type should be parametric
+
+        save_path = self.index_path
+        self.prep()
+
+        if self.dataframe is None:
+            raise Exception("There is no data")
+
+        X = self.dataframe.drop(self.class_column, axis=1)
+        y = self.dataframe[self.class_column]
+
+        X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset_splitter.split(
+            X=X, y=y)
+        self.X_train = X_train
+        self.X_test = X_test
+        self.y_train = y_train
+        self.y_test = y_test
+        self.train_indexes = X_train.index
+        self.test_indexes = X_test.index
+        self.train_idx_arr = train_idx_arr
+        self.val_idx_arr = val_idx_arr
+
+        if save_indexes:
+            # train_pairs = [row['id1'].join(',').row['id2'] for index, row in X_train.iterrows()]
+            self.__save_indexes__(
+                save_path, 'train_indexes.txt', self.train_indexes.values)
+            self.__save_indexes__(
+                save_path, 'test_indexes.txt', self.test_indexes.values)
+
+            for i, (train_idx, val_idx) in enumerate(zip(train_idx_arr, val_idx_arr)):
+                self.__save_indexes__(
+                    save_path, f'train_fold_{i}.txt', train_idx)
+                self.__save_indexes__(
+                    save_path, f'validation_fold_{i}.txt', val_idx)
+
+        # return X_train, X_test, y_train, y_test, folds
+
+
+class TextDatasetMixin(BaseDataset):
+    embedding_size: int
+    embedding_dict: dict
+    embeddings_pooling_strategy: PoolingStrategy | None = None
+
+    def process_text(self):
+        pass
+
+
+# class ImageDatasetMixin(BaseModel):
+#     image_size: tuple[int, int] = Field(default=(224, 224))
+#     augmentations: list[str] = Field(default_factory=list)
+
+#     def process_image_data(self):
+#         print(
+#             f"Processing image data with size {self.image_size} and augmentations {self.augmentations}...")
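
The new core.py replaces the old 405-line module with a pydantic-based BaseDataset: split_dataset() delegates to the configured DatasetSplitter and optionally persists the index files, while load() reads those files back, calls prep() to build self.dataframe, and slices out train/test plus the CV folds. A minimal sketch of that lifecycle follows; ToyDataset, its toy dataframe, and the /tmp path are hypothetical, used only to show the call order:

# A minimal lifecycle sketch, not from the package.
import numpy as np
import pandas as pd
from ddi_fw.datasets.core import BaseDataset
from ddi_fw.datasets.dataset_splitter import DatasetSplitter

class ToyDataset(BaseDataset):
    def prep(self):
        # Both split_dataset() and load() call prep() to populate self.dataframe.
        n = 100
        self.set_dataframe(pd.DataFrame({
            "feat": [np.ones(4) for _ in range(n)],          # feature vectors
            "class": [np.eye(2)[i % 2] for i in range(n)],   # one-hot labels
        }))

ds = ToyDataset(dataset_name="toy", index_path="/tmp/toy_indexes",
                dataset_splitter_type=DatasetSplitter, columns=["feat"])
ds.split_dataset(save_indexes=True)               # writes train/test and fold index files
X_train, X_test, y_train, y_test, *_ = ds.load()  # reads them back
print(len(X_train), len(X_test))                  # 80 20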
ddi_fw-0.0.150/src/ddi_fw/datasets/dataset_splitter.py (new file)
@@ -0,0 +1,39 @@
+from typing import List, Tuple
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, Field
+from sklearn.model_selection import StratifiedKFold, train_test_split
+
+
+class DatasetSplitter(BaseModel):
+    fold_size: int = Field(default=5, ge=2)
+    test_size: float = Field(default=0.2, ge=0.0, le=1.0)
+    shuffle: bool = True
+    random_state: int = Field(default=42)
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def split(self, X: pd.DataFrame, y: pd.Series) -> Tuple[
+            pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.Index, pd.Index, List[np.ndarray], List[np.ndarray]]:
+        print(
+            f"Splitting dataset into {self.fold_size} folds with shuffle={self.shuffle}...")
+        # TODO check it
+        if len(y.shape) == 1:
+            y = pd.Series(np.expand_dims(y.to_numpy(), axis=1).flatten())
+        stacked = np.vstack(tuple(y.to_numpy()))
+        stratify = np.argmax(stacked, axis=1)
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, shuffle=self.shuffle, test_size=self.test_size, stratify=stratify)
+
+        k_fold = StratifiedKFold(
+            n_splits=self.fold_size, shuffle=self.shuffle, random_state=self.random_state)
+        folds = k_fold.split(X_train, np.argmax(
+            np.vstack(y_train.to_numpy()), axis=1))
+        train_idx_arr = []
+        val_idx_arr = []
+        for i, (train_index, val_index) in enumerate(folds):
+            train_idx_arr.append(train_index)
+            val_idx_arr.append(val_index)
+
+        return X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr
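
DatasetSplitter assumes y is a Series of one-hot (binarized) label vectors: split() stacks them with np.vstack and argmaxes to derive the stratification labels, then runs train_test_split followed by StratifiedKFold. Note that random_state is applied only to the k-fold, not to the initial train/test split, so the latter varies between runs. A hedged usage sketch with synthetic, balanced labels:

# Hedged illustration of the expected input; the data here is synthetic.
import numpy as np
import pandas as pd
from ddi_fw.datasets.dataset_splitter import DatasetSplitter

X = pd.DataFrame({"a": range(100)})
y = pd.Series([np.eye(4)[i % 4] for i in range(100)])  # 4 balanced classes

splitter = DatasetSplitter(fold_size=5, test_size=0.2)
X_train, X_test, y_train, y_test, train_idx, test_idx, cv_train, cv_val = \
    splitter.split(X=X, y=y)
print(len(X_train), len(X_test), len(cv_train))  # 80 20 5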
ddi_fw-0.0.150/src/ddi_fw/datasets/ddi_mdl/base.py (new file)
@@ -0,0 +1,213 @@
+import glob
+import pathlib
+from typing import List, Optional, Tuple
+from ddi_fw.datasets.core import BaseDataset, TextDatasetMixin, generate_sim_matrices_new, generate_vectors
+from ddi_fw.datasets.dataset_splitter import DatasetSplitter
+from ddi_fw.datasets.db_utils import create_connection
+from ddi_fw.datasets.idf_helper import IDF
+from ddi_fw.utils.utils import create_folder_if_not_exists
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, Field, model_validator, root_validator
+from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
+from ddi_fw.langchain.embeddings import PoolingStrategy
+from abc import ABC, abstractmethod
+from sklearn.preprocessing import LabelBinarizer
+
+from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+
+# Constants for embedding, chemical properties, and NER columns
+LIST_OF_EMBEDDING_COLUMNS = [
+    'all_text', 'description', 'synthesis_reference', 'indication',
+    'pharmacodynamics', 'mechanism_of_action', 'toxicity', 'metabolism',
+    'absorption', 'half_life', 'protein_binding', 'route_of_elimination',
+    'volume_of_distribution', 'clearance'
+]
+
+LIST_OF_CHEMICAL_PROPERTY_COLUMNS = ['enzyme', 'target', 'pathway', 'smile']
+LIST_OF_NER_COLUMNS = ['tui', 'cui', 'entities']
+
+HERE = pathlib.Path(__file__).resolve().parent
+
+class DDIMDLDataset(TextDatasetMixin):
+    index_path: str = Field(default_factory=lambda: str(
+        pathlib.Path(__file__).resolve().parent.joinpath('indexes')))
+    # drugs_df: pd.DataFrame = Field(default_factory=pd.DataFrame)
+    # ddis_df: pd.DataFrame = Field(default_factory=pd.DataFrame)
+    drugs_df: Optional[pd.DataFrame] = None
+    ddis_df: Optional[pd.DataFrame] = None
+
+    chemical_property_columns: list[str] = Field(
+        default_factory=lambda: LIST_OF_CHEMICAL_PROPERTY_COLUMNS)
+    embedding_columns: list[str] = Field(default_factory=list)
+    ner_columns: list[str] = Field(default_factory=list)
+    ner_df: pd.DataFrame | None = None
+    tui_threshold: float | None = None
+    cui_threshold: float | None = None
+    entities_threshold: float | None = None
+
+
+    # @model_validator
+    def validate_columns(self, values):
+        if not set(values['chemical_property_columns']).issubset(LIST_OF_CHEMICAL_PROPERTY_COLUMNS):
+            raise ValueError("Invalid chemical property columns")
+        if not set(values['ner_columns']).issubset(LIST_OF_NER_COLUMNS):
+            raise ValueError("Invalid NER columns")
+        return values
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.class_column = 'event_category'
+        _db_path = HERE.joinpath('data/event.db')
+
+        self.__similarity_related_columns__ = []
+        self.__similarity_related_columns__.extend(self.chemical_property_columns)
+        self.__similarity_related_columns__.extend(self.ner_columns)
+        # TODO with resource
+        self._conn = create_connection(_db_path.absolute().as_posix())
+        self.load_drugs_and_events()
+
+    def load_drugs_and_events(self):
+        self.drugs_df = self.__select_all_drugs_as_dataframe__()
+        self.ddis_df = self.__select_all_events__()
+
+    def __select_all_drugs_as_dataframe__(self):
+        headers = ['index', 'id', 'name',
+                   'target', 'enzyme', 'pathway', 'smile']
+        if self._conn is None:
+            raise Exception("There is no connection")
+        cur = self._conn.cursor()
+        cur.execute(
+            '''SELECT "index", id, name, target, enzyme, pathway, smile FROM drug'''
+        )
+        rows = cur.fetchall()
+        df = pd.DataFrame(columns=headers, data=rows)
+
+        # Convert string fields to lists
+        for col in ['enzyme', 'target', 'pathway', 'smile']:
+            df[col] = df[col].apply(lambda x: x.split('|'))
+
+        return df
+
+    def __select_all_events__(self):
+        if self._conn is None:
+            raise Exception("There is no connection")
+        cur = self._conn.cursor()
+        cur.execute('''
+            SELECT ex."index", d1.id, d1.name, d2.id, d2.name, mechanism || ' ' || action
+            FROM extraction ex
+            JOIN drug d1 ON d1.name = ex.drugA
+            JOIN drug d2 ON d2.name = ex.drugB
+        ''')
+        rows = cur.fetchall()
+        headers = ["index", "id1", "name1", "id2", "name2", "event_category"]
+        return pd.DataFrame(columns=headers, data=rows)
+
+    def prep(self):
+        if self.drugs_df is None or self.ddis_df is None:
+            raise Exception("There is no data")
+
+        drug_ids = self.drugs_df['id'].to_list()
+
+        filtered_df = self.drugs_df
+        combined_df = filtered_df.copy()
+
+        if self.ner_df is not None and not self.ner_df.empty:
+            filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
+                drug_ids)]
+            filtered_ner_df = self.ner_df.copy()
+
+            # TODO: if the dataset in use lacks tui, cui, or entity information, the code below is needed to add those columns to it
+
+            # idf_calc = IDF(filtered_ner_df, [f for f in filtered_ner_df.keys()])
+            idf_calc = IDF(filtered_ner_df, self.ner_columns)
+            idf_calc.calculate()
+            idf_scores_df = idf_calc.to_dataframe()
+
+            # for key in filtered_ner_df.keys():
+            for key in self.ner_columns:
+                threshold = 0
+                if key.startswith('tui'):
+                    threshold = self.tui_threshold
+                if key.startswith('cui'):
+                    threshold = self.cui_threshold
+                if key.startswith('entities'):
+                    threshold = self.entities_threshold
+                combined_df[key] = filtered_ner_df[key]
+                valid_codes = idf_scores_df[idf_scores_df[key] > threshold].index
+
+                # print(f'{key}: valid code size = {len(valid_codes)}')
+                combined_df[key] = combined_df[key].apply(lambda items:
+                                                          [item for item in items if item in valid_codes])
+
+        moved_columns = ['id']
+        moved_columns.extend(self.__similarity_related_columns__)
+        chemical_properties_df = combined_df[moved_columns]
+
+        chemical_properties_df = chemical_properties_df.fillna("").apply(list)
+
+        # generate_vectors returns ndarrays inside a dictionary
+        generated_vectors = generate_vectors(chemical_properties_df, self.__similarity_related_columns__)
+
+        similarity_matrices = generate_sim_matrices_new(
+            chemical_properties_df, generated_vectors, self.__similarity_related_columns__, key_column="id")
+
+        event_categories = self.ddis_df['event_category']
+        labels = event_categories.tolist()
+        lb = LabelBinarizer()
+        lb.fit(labels)
+        classes = lb.transform(labels)
+
+        def similarity_lambda_fnc(row, value):
+            if row['id1'] in value:
+                return value[row['id1']]
+
+        def lambda_fnc(row: pd.Series, value) -> Optional[np.float16]:
+            if row['id1'] in value and row['id2'] in value:
+                return np.float16(np.hstack(
+                    (value[row['id1']], value[row['id2']])))
+            return None
+            # return np.hstack(
+            #     (value[row['id1']], value[row['id2']]), dtype=np.float16)
+
+        def x_fnc(row, embeddings_after_pooling):
+            if row['id1'] in embeddings_after_pooling:
+                v1 = embeddings_after_pooling[row['id1']]
+            else:
+                v1 = np.zeros(self.embedding_size)
+            if row['id2'] in embeddings_after_pooling:
+                v2 = embeddings_after_pooling[row['id2']]
+            else:
+                v2 = np.zeros(self.embedding_size)
+            return np.float16(np.hstack(
+                (v1, v2)))
+
+        for key, value in similarity_matrices.items():
+
+            print(f'sim matrix: {key}')
+            self.ddis_df[key] = self.ddis_df.apply(
+                lambda_fnc, args=(value,), axis=1)
+            self.columns.append(key)
+            print(self.ddis_df[key].head())
+
+        for embedding_column in self.embedding_columns:
+            print(f"concat {embedding_column} embeddings")
+            embeddings_after_pooling = {k: self.embeddings_pooling_strategy.apply(
+                v) for k, v in self.embedding_dict[embedding_column].items()}
+            # column_embeddings_dict = embedding_values[embedding_column]
+            self.ddis_df[embedding_column+'_embedding'] = self.ddis_df.apply(
+                x_fnc, args=(embeddings_after_pooling,), axis=1)
+            self.columns.append(embedding_column+'_embedding')
+
+        dataframe = self.ddis_df.copy()
+        if not isinstance(classes, (list, pd.Series, np.ndarray)):
+            raise TypeError("classes must be an iterable (list, Series, or ndarray)")
+
+        if len(classes) != len(dataframe):
+            raise ValueError("Length of classes must match the number of rows in the DataFrame")
+
+        dataframe['class'] = list(classes)
+        self.set_dataframe(dataframe)
+
+
+
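
DDIMDLDataset now opens the bundled event.db in __init__ and builds the Jaccard similarity matrices and binarized labels in prep(). Because it inherits TextDatasetMixin, embedding_size and embedding_dict must be supplied even when no embedding columns are used. A hedged instantiation sketch; the argument values are illustrative:

# Hedged usage sketch, not from the package's docs.
from ddi_fw.datasets.ddi_mdl.base import DDIMDLDataset
from ddi_fw.datasets.dataset_splitter import DatasetSplitter

dataset = DDIMDLDataset(
    dataset_name="ddi_mdl",
    dataset_splitter_type=DatasetSplitter,
    chemical_property_columns=["enzyme", "target", "pathway", "smile"],
    ner_columns=[],      # e.g. ["tui"] would also require passing a ner_df
    embedding_columns=[],
    embedding_size=0,    # required by TextDatasetMixin even when unused
    embedding_dict={},
)
# load() reads the packaged index files, runs prep() (similarity matrices,
# label binarization) and returns the train/test split plus the CV folds.
X_train, X_test, y_train, y_test, *rest = dataset.load()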
ddi_fw-0.0.150/src/ddi_fw/datasets/ddi_mdl/debug.log (new file)
@@ -0,0 +1 @@
+[0217/121135.683:ERROR:registration_protocol_win.cc(108)] CreateFile: Sistem belirtilen dosyayı bulamıyor. (0x2)

(The Turkish message is Windows' rendering of error 0x2, "The system cannot find the file specified"; the log file appears to have been packaged by accident.)
{ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/datasets/embedding_generator.py
@@ -58,7 +58,8 @@ def create_embeddings(model, data, column, drop_column=True):
         column_embeddings_dict[row['id']] = sum_of_embeddings
         # data.iloc[index][column+'_embedding']=sum_of_embeddings
 
-    data[column+'_embedding'] = pd.Series(column_embeddings_dict.values())
+    # data[column+'_embedding'] = pd.Series(column_embeddings_dict.values())
+    data[column+'_embedding'] = pd.Series(list(column_embeddings_dict.values()))
     if(drop_column):
         data.drop([column], axis = 1, inplace = True)
     # data[column+'_embedding'] = [column_embeddings_dict[row['name']] for index, row in data.iterrows()]
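
The embedding_generator fix wraps dict.values() in list() before building the Series: numpy coerces a dict-view to a 0-d object array, so pandas may fail to construct a 1-d Series from it directly (the exact behavior is pandas-version-dependent). A minimal, illustrative repro:

# Illustrative repro, not from the package.
import numpy as np
import pandas as pd

d = {"DB00001": np.ones(3), "DB00002": np.zeros(3)}
try:
    print(pd.Series(d.values()))       # old code path (version-dependent)
except Exception as e:
    print("old path failed:", type(e).__name__, e)
print(pd.Series(list(d.values())))     # fixed path: object Series of arrays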
{ddi_fw-0.0.148 → ddi_fw-0.0.150}/src/ddi_fw/langchain/embeddings.py
@@ -82,6 +82,7 @@ class PretrainedEmbeddings(Embeddings):
                 text, return_tensors='pt', padding=True)
             output_embeddings.append(self.model(
                 input_ids).last_hidden_state.mean(dim=1))
+        return output_embeddings
 
     def embed_query(self, text: str) -> List[float]:
         return self.embed_documents([text])[0]
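
This one-line change makes embed_documents actually return its accumulated embeddings; previously it fell off the end and returned None, so embed_query's self.embed_documents([text])[0] raised a TypeError. An illustrative reduction of the bug, not the package's class:

# Illustrative reduction of the bug fixed by the added return.
from typing import List

class Before:
    def embed_documents(self, texts: List[str]):
        output_embeddings = [[0.0] * 4 for _ in texts]  # stand-in for model output
        # bug: no return statement, so callers receive None

    def embed_query(self, text: str):
        return self.embed_documents([text])[0]

try:
    Before().embed_query("aspirin")
except TypeError as e:
    print(e)  # 'NoneType' object is not subscriptable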