biased-split 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. biased_split-0.1.0/.gitignore +228 -0
  2. biased_split-0.1.0/.python-version +1 -0
  3. biased_split-0.1.0/PKG-INFO +26 -0
  4. biased_split-0.1.0/README.md +1 -0
  5. biased_split-0.1.0/biased_split/__init__.py +29 -0
  6. biased_split-0.1.0/biased_split/activity_cliff.py +294 -0
  7. biased_split-0.1.0/biased_split/knn_failure.py +231 -0
  8. biased_split-0.1.0/biased_split/molecularnetwork.py +271 -0
  9. biased_split-0.1.0/biased_split/proxy_sorted.py +251 -0
  10. biased_split-0.1.0/biased_split/substructure_distance.py +185 -0
  11. biased_split-0.1.0/data/raw/target_CHEMBL1163125-1.IC50.csv +712 -0
  12. biased_split-0.1.0/data/raw/target_CHEMBL1163125-2.IC50.csv +232 -0
  13. biased_split-0.1.0/data/raw/target_CHEMBL1163125-3.IC50.csv +401 -0
  14. biased_split-0.1.0/data/raw/target_CHEMBL1163125-4.IC50.csv +287 -0
  15. biased_split-0.1.0/data/raw/target_CHEMBL1741186-1.IC50.csv +442 -0
  16. biased_split-0.1.0/data/raw/target_CHEMBL1741186-2.IC50.csv +296 -0
  17. biased_split-0.1.0/data/raw/target_CHEMBL1865-1.IC50.csv +720 -0
  18. biased_split-0.1.0/data/raw/target_CHEMBL1865-2.IC50.csv +352 -0
  19. biased_split-0.1.0/data/raw/target_CHEMBL1974-1.IC50.csv +517 -0
  20. biased_split-0.1.0/data/raw/target_CHEMBL1974-2.IC50.csv +204 -0
  21. biased_split-0.1.0/data/raw/target_CHEMBL203-1.IC50.csv +230 -0
  22. biased_split-0.1.0/data/raw/target_CHEMBL203-2.IC50.csv +831 -0
  23. biased_split-0.1.0/data/raw/target_CHEMBL2039-1.IC50.csv +1006 -0
  24. biased_split-0.1.0/data/raw/target_CHEMBL2039-2.IC50.csv +584 -0
  25. biased_split-0.1.0/data/raw/target_CHEMBL2039-3.IC50.csv +485 -0
  26. biased_split-0.1.0/data/raw/target_CHEMBL206-1.IC50.csv +490 -0
  27. biased_split-0.1.0/data/raw/target_CHEMBL2148-1.IC50.csv +738 -0
  28. biased_split-0.1.0/data/raw/target_CHEMBL2148-2.IC50.csv +208 -0
  29. biased_split-0.1.0/data/raw/target_CHEMBL220-1.IC50.csv +1583 -0
  30. biased_split-0.1.0/data/raw/target_CHEMBL222-1.IC50.csv +669 -0
  31. biased_split-0.1.0/data/raw/target_CHEMBL222-2.IC50.csv +215 -0
  32. biased_split-0.1.0/data/raw/target_CHEMBL228-1.IC50.csv +209 -0
  33. biased_split-0.1.0/data/raw/target_CHEMBL228-2.IC50.csv +906 -0
  34. biased_split-0.1.0/data/raw/target_CHEMBL240-1.IC50.csv +1468 -0
  35. biased_split-0.1.0/data/raw/target_CHEMBL240-2.IC50.csv +250 -0
  36. biased_split-0.1.0/data/raw/target_CHEMBL240-3.IC50.csv +671 -0
  37. biased_split-0.1.0/data/raw/target_CHEMBL2409-1.IC50.csv +1266 -0
  38. biased_split-0.1.0/data/raw/target_CHEMBL243-1.IC50.csv +1050 -0
  39. biased_split-0.1.0/data/raw/target_CHEMBL247-1.IC50.csv +2055 -0
  40. biased_split-0.1.0/data/raw/target_CHEMBL247-2.IC50.csv +221 -0
  41. biased_split-0.1.0/data/raw/target_CHEMBL260-1.IC50.csv +1514 -0
  42. biased_split-0.1.0/data/raw/target_CHEMBL260-2.IC50.csv +289 -0
  43. biased_split-0.1.0/data/raw/target_CHEMBL262-1.IC50.csv +870 -0
  44. biased_split-0.1.0/data/raw/target_CHEMBL279-1.IC50.csv +1752 -0
  45. biased_split-0.1.0/data/raw/target_CHEMBL284-1.IC50.csv +1076 -0
  46. biased_split-0.1.0/data/raw/target_CHEMBL284-2.IC50.csv +230 -0
  47. biased_split-0.1.0/data/raw/target_CHEMBL2971-1.IC50.csv +803 -0
  48. biased_split-0.1.0/data/raw/target_CHEMBL2971-2.IC50.csv +209 -0
  49. biased_split-0.1.0/data/raw/target_CHEMBL3105-1.IC50.csv +795 -0
  50. biased_split-0.1.0/data/raw/target_CHEMBL3130-1.IC50.csv +988 -0
  51. biased_split-0.1.0/data/raw/target_CHEMBL3229-1.IC50.csv +310 -0
  52. biased_split-0.1.0/data/raw/target_CHEMBL3229-2.IC50.csv +240 -0
  53. biased_split-0.1.0/data/raw/target_CHEMBL325-1.IC50.csv +1444 -0
  54. biased_split-0.1.0/data/raw/target_CHEMBL3267-1.IC50.csv +709 -0
  55. biased_split-0.1.0/data/raw/target_CHEMBL344-1.IC50.csv +391 -0
  56. biased_split-0.1.0/data/raw/target_CHEMBL3471-1.IC50.csv +1762 -0
  57. biased_split-0.1.0/data/raw/target_CHEMBL3717-1.IC50.csv +1371 -0
  58. biased_split-0.1.0/data/raw/target_CHEMBL4005-1.IC50.csv +1243 -0
  59. biased_split-0.1.0/data/raw/target_CHEMBL4015-1.IC50.csv +500 -0
  60. biased_split-0.1.0/data/raw/target_CHEMBL4015-2.IC50.csv +311 -0
  61. biased_split-0.1.0/data/raw/target_CHEMBL4078-1.IC50.csv +2420 -0
  62. biased_split-0.1.0/data/raw/target_CHEMBL4235-1.IC50.csv +626 -0
  63. biased_split-0.1.0/data/raw/target_CHEMBL4235-2.IC50.csv +244 -0
  64. biased_split-0.1.0/data/raw/target_CHEMBL4235-3.IC50.csv +380 -0
  65. biased_split-0.1.0/data/raw/target_CHEMBL4235-4.IC50.csv +253 -0
  66. biased_split-0.1.0/data/raw/target_CHEMBL4296-1.IC50.csv +919 -0
  67. biased_split-0.1.0/data/raw/target_CHEMBL4685-1.IC50.csv +569 -0
  68. biased_split-0.1.0/data/raw/target_CHEMBL4685-2.IC50.csv +218 -0
  69. biased_split-0.1.0/data/raw/target_CHEMBL4794-1.IC50.csv +234 -0
  70. biased_split-0.1.0/data/raw/target_CHEMBL4794-2.IC50.csv +284 -0
  71. biased_split-0.1.0/data/raw/target_CHEMBL4822-1.IC50.csv +1397 -0
  72. biased_split-0.1.0/data/raw/target_CHEMBL5145-1.IC50.csv +222 -0
  73. biased_split-0.1.0/data/raw/target_CHEMBL5145-2.IC50.csv +330 -0
  74. biased_split-0.1.0/data/raw/target_CHEMBL5251-1.IC50.csv +598 -0
  75. biased_split-0.1.0/data/raw/target_CHEMBL5251-2.IC50.csv +224 -0
  76. biased_split-0.1.0/data/raw/target_CHEMBL5763-1.IC50.csv +1824 -0
  77. biased_split-0.1.0/data/raw/target_CHEMBL5763-2.IC50.csv +224 -0
  78. biased_split-0.1.0/data/standardized/target_CHEMBL1163125-1.IC50.csv +688 -0
  79. biased_split-0.1.0/data/standardized/target_CHEMBL1163125-2.IC50.csv +215 -0
  80. biased_split-0.1.0/data/standardized/target_CHEMBL1163125-3.IC50.csv +363 -0
  81. biased_split-0.1.0/data/standardized/target_CHEMBL1163125-4.IC50.csv +278 -0
  82. biased_split-0.1.0/data/standardized/target_CHEMBL1741186-1.IC50.csv +387 -0
  83. biased_split-0.1.0/data/standardized/target_CHEMBL1741186-2.IC50.csv +286 -0
  84. biased_split-0.1.0/data/standardized/target_CHEMBL1865-1.IC50.csv +685 -0
  85. biased_split-0.1.0/data/standardized/target_CHEMBL1865-2.IC50.csv +337 -0
  86. biased_split-0.1.0/data/standardized/target_CHEMBL1974-1.IC50.csv +507 -0
  87. biased_split-0.1.0/data/standardized/target_CHEMBL1974-2.IC50.csv +202 -0
  88. biased_split-0.1.0/data/standardized/target_CHEMBL203-1.IC50.csv +227 -0
  89. biased_split-0.1.0/data/standardized/target_CHEMBL203-2.IC50.csv +778 -0
  90. biased_split-0.1.0/data/standardized/target_CHEMBL2039-1.IC50.csv +925 -0
  91. biased_split-0.1.0/data/standardized/target_CHEMBL2039-2.IC50.csv +461 -0
  92. biased_split-0.1.0/data/standardized/target_CHEMBL2039-3.IC50.csv +455 -0
  93. biased_split-0.1.0/data/standardized/target_CHEMBL206-1.IC50.csv +471 -0
  94. biased_split-0.1.0/data/standardized/target_CHEMBL2148-1.IC50.csv +703 -0
  95. biased_split-0.1.0/data/standardized/target_CHEMBL2148-2.IC50.csv +203 -0
  96. biased_split-0.1.0/data/standardized/target_CHEMBL220-1.IC50.csv +1199 -0
  97. biased_split-0.1.0/data/standardized/target_CHEMBL222-1.IC50.csv +599 -0
  98. biased_split-0.1.0/data/standardized/target_CHEMBL222-2.IC50.csv +195 -0
  99. biased_split-0.1.0/data/standardized/target_CHEMBL228-1.IC50.csv +196 -0
  100. biased_split-0.1.0/data/standardized/target_CHEMBL228-2.IC50.csv +813 -0
  101. biased_split-0.1.0/data/standardized/target_CHEMBL240-1.IC50.csv +1313 -0
  102. biased_split-0.1.0/data/standardized/target_CHEMBL240-2.IC50.csv +242 -0
  103. biased_split-0.1.0/data/standardized/target_CHEMBL240-3.IC50.csv +600 -0
  104. biased_split-0.1.0/data/standardized/target_CHEMBL2409-1.IC50.csv +1232 -0
  105. biased_split-0.1.0/data/standardized/target_CHEMBL243-1.IC50.csv +960 -0
  106. biased_split-0.1.0/data/standardized/target_CHEMBL247-1.IC50.csv +1847 -0
  107. biased_split-0.1.0/data/standardized/target_CHEMBL247-2.IC50.csv +208 -0
  108. biased_split-0.1.0/data/standardized/target_CHEMBL260-1.IC50.csv +1464 -0
  109. biased_split-0.1.0/data/standardized/target_CHEMBL260-2.IC50.csv +287 -0
  110. biased_split-0.1.0/data/standardized/target_CHEMBL262-1.IC50.csv +817 -0
  111. biased_split-0.1.0/data/standardized/target_CHEMBL279-1.IC50.csv +1660 -0
  112. biased_split-0.1.0/data/standardized/target_CHEMBL284-1.IC50.csv +985 -0
  113. biased_split-0.1.0/data/standardized/target_CHEMBL284-2.IC50.csv +223 -0
  114. biased_split-0.1.0/data/standardized/target_CHEMBL2971-1.IC50.csv +780 -0
  115. biased_split-0.1.0/data/standardized/target_CHEMBL2971-2.IC50.csv +207 -0
  116. biased_split-0.1.0/data/standardized/target_CHEMBL3105-1.IC50.csv +726 -0
  117. biased_split-0.1.0/data/standardized/target_CHEMBL3130-1.IC50.csv +932 -0
  118. biased_split-0.1.0/data/standardized/target_CHEMBL3229-1.IC50.csv +303 -0
  119. biased_split-0.1.0/data/standardized/target_CHEMBL3229-2.IC50.csv +206 -0
  120. biased_split-0.1.0/data/standardized/target_CHEMBL325-1.IC50.csv +1349 -0
  121. biased_split-0.1.0/data/standardized/target_CHEMBL3267-1.IC50.csv +648 -0
  122. biased_split-0.1.0/data/standardized/target_CHEMBL344-1.IC50.csv +384 -0
  123. biased_split-0.1.0/data/standardized/target_CHEMBL3471-1.IC50.csv +1539 -0
  124. biased_split-0.1.0/data/standardized/target_CHEMBL3717-1.IC50.csv +1325 -0
  125. biased_split-0.1.0/data/standardized/target_CHEMBL4005-1.IC50.csv +1217 -0
  126. biased_split-0.1.0/data/standardized/target_CHEMBL4015-1.IC50.csv +477 -0
  127. biased_split-0.1.0/data/standardized/target_CHEMBL4015-2.IC50.csv +310 -0
  128. biased_split-0.1.0/data/standardized/target_CHEMBL4078-1.IC50.csv +2007 -0
  129. biased_split-0.1.0/data/standardized/target_CHEMBL4235-1.IC50.csv +613 -0
  130. biased_split-0.1.0/data/standardized/target_CHEMBL4235-2.IC50.csv +225 -0
  131. biased_split-0.1.0/data/standardized/target_CHEMBL4235-3.IC50.csv +376 -0
  132. biased_split-0.1.0/data/standardized/target_CHEMBL4235-4.IC50.csv +215 -0
  133. biased_split-0.1.0/data/standardized/target_CHEMBL4296-1.IC50.csv +873 -0
  134. biased_split-0.1.0/data/standardized/target_CHEMBL4685-1.IC50.csv +538 -0
  135. biased_split-0.1.0/data/standardized/target_CHEMBL4685-2.IC50.csv +213 -0
  136. biased_split-0.1.0/data/standardized/target_CHEMBL4794-1.IC50.csv +227 -0
  137. biased_split-0.1.0/data/standardized/target_CHEMBL4794-2.IC50.csv +278 -0
  138. biased_split-0.1.0/data/standardized/target_CHEMBL4822-1.IC50.csv +1228 -0
  139. biased_split-0.1.0/data/standardized/target_CHEMBL5145-1.IC50.csv +210 -0
  140. biased_split-0.1.0/data/standardized/target_CHEMBL5145-2.IC50.csv +233 -0
  141. biased_split-0.1.0/data/standardized/target_CHEMBL5251-1.IC50.csv +577 -0
  142. biased_split-0.1.0/data/standardized/target_CHEMBL5251-2.IC50.csv +208 -0
  143. biased_split-0.1.0/data/standardized/target_CHEMBL5763-1.IC50.csv +1397 -0
  144. biased_split-0.1.0/data/standardized/target_CHEMBL5763-2.IC50.csv +216 -0
  145. biased_split-0.1.0/notebooks/00_Data_Source_and_Standardize.ipynb +243 -0
  146. biased_split-0.1.0/notebooks/01_Molecular_Network.ipynb +2454 -0
  147. biased_split-0.1.0/notebooks/02_Activity_Cliff_Split.ipynb +568 -0
  148. biased_split-0.1.0/notebooks/03_kNN_Failure_Split.ipynb +427 -0
  149. biased_split-0.1.0/notebooks/04_Substructure_Distance_Split.ipynb +364 -0
  150. biased_split-0.1.0/notebooks/05_Proxy_Sorted_Split.ipynb +360 -0
  151. biased_split-0.1.0/notebooks/activity_cliff.gif +0 -0
  152. biased_split-0.1.0/notebooks/knn_failure_sweep.gif +0 -0
  153. biased_split-0.1.0/notebooks/logp_sweep.gif +0 -0
  154. biased_split-0.1.0/notebooks/similarity_dist.gif +0 -0
  155. biased_split-0.1.0/pyproject.toml +37 -0
  156. biased_split-0.1.0/uv.lock +3135 -0
@@ -0,0 +1,228 @@
1
+ manuscript/
2
+
3
+ .DS_Store
4
+
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[codz]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py.cover
54
+ *.lcov
55
+ .hypothesis/
56
+ .pytest_cache/
57
+ cover/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+ db.sqlite3-journal
68
+
69
+ # Flask stuff:
70
+ instance/
71
+ .webassets-cache
72
+
73
+ # Scrapy stuff:
74
+ .scrapy
75
+
76
+ # Sphinx documentation
77
+ docs/_build/
78
+
79
+ # PyBuilder
80
+ .pybuilder/
81
+ target/
82
+
83
+ # Jupyter Notebook
84
+ .ipynb_checkpoints
85
+
86
+ # IPython
87
+ profile_default/
88
+ ipython_config.py
89
+
90
+ # pyenv
91
+ # For a library or package, you might want to ignore these files since the code is
92
+ # intended to run in multiple environments; otherwise, check them in:
93
+ # .python-version
94
+
95
+ # pipenv
96
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
97
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
98
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
99
+ # install all needed dependencies.
100
+ # Pipfile.lock
101
+
102
+ # UV
103
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
104
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
105
+ # commonly ignored for libraries.
106
+ # uv.lock
107
+
108
+ # poetry
109
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
110
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
111
+ # commonly ignored for libraries.
112
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
113
+ # poetry.lock
114
+ # poetry.toml
115
+
116
+ # pdm
117
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
118
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
119
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
120
+ # pdm.lock
121
+ # pdm.toml
122
+ .pdm-python
123
+ .pdm-build/
124
+
125
+ # pixi
126
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
127
+ # pixi.lock
128
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
129
+ # in the .venv directory. It is recommended not to include this directory in version control.
130
+ .pixi/*
131
+ !.pixi/config.toml
132
+
133
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
134
+ __pypackages__/
135
+
136
+ # Celery stuff
137
+ celerybeat-schedule*
138
+ celerybeat.pid
139
+
140
+ # Redis
141
+ *.rdb
142
+ *.aof
143
+ *.pid
144
+
145
+ # RabbitMQ
146
+ mnesia/
147
+ rabbitmq/
148
+ rabbitmq-data/
149
+
150
+ # ActiveMQ
151
+ activemq-data/
152
+
153
+ # SageMath parsed files
154
+ *.sage.py
155
+
156
+ # Environments
157
+ .env
158
+ .envrc
159
+ .venv
160
+ env/
161
+ venv/
162
+ ENV/
163
+ env.bak/
164
+ venv.bak/
165
+
166
+ # Spyder project settings
167
+ .spyderproject
168
+ .spyproject
169
+
170
+ # Rope project settings
171
+ .ropeproject
172
+
173
+ # mkdocs documentation
174
+ /site
175
+
176
+ # mypy
177
+ .mypy_cache/
178
+ .dmypy.json
179
+ dmypy.json
180
+
181
+ # Pyre type checker
182
+ .pyre/
183
+
184
+ # pytype static type analyzer
185
+ .pytype/
186
+
187
+ # Cython debug symbols
188
+ cython_debug/
189
+
190
+ # PyCharm
191
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
192
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
193
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
194
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
195
+ # .idea/
196
+
197
+ # Abstra
198
+ # Abstra is an AI-powered process automation framework.
199
+ # Ignore directories containing user credentials, local state, and settings.
200
+ # Learn more at https://abstra.io/docs
201
+ .abstra/
202
+
203
+ # Visual Studio Code
204
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
205
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
206
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
207
+ # you could uncomment the following to ignore the entire vscode folder
208
+ # .vscode/
209
+ # Temporary file for partial code execution
210
+ tempCodeRunnerFile.py
211
+
212
+ # Ruff stuff:
213
+ .ruff_cache/
214
+
215
+ # PyPI configuration file
216
+ .pypirc
217
+
218
+ # Marimo
219
+ marimo/_static/
220
+ marimo/_lsp/
221
+ __marimo__/
222
+
223
+ # Streamlit
224
+ .streamlit/secrets.toml
225
+
226
+
227
+ .DS_Store
228
+ */.DS_Store
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,26 @@
1
+ Metadata-Version: 2.4
2
+ Name: biased-split
3
+ Version: 0.1.0
4
+ Summary: Biased Data Splitting Method for Chemically Meaningful Model Validation
5
+ Requires-Python: >=3.13
6
+ Requires-Dist: matplotlib>=3.11.0
7
+ Requires-Dist: networkx>=3.6.1
8
+ Requires-Dist: numpy>=2.4.6
9
+ Requires-Dist: pandas>=3.0.3
10
+ Requires-Dist: pyarrow>=18.0
11
+ Requires-Dist: pygraphviz>=1.14
12
+ Requires-Dist: rdkit>=2026.3.3
13
+ Requires-Dist: scikit-learn>=1.9.0
14
+ Requires-Dist: scipy>=1.17.1
15
+ Requires-Dist: statsmodels>=0.14
16
+ Requires-Dist: xgboost>=2.0
17
+ Provides-Extra: benchmark
18
+ Requires-Dist: chemprop>=2.0; extra == 'benchmark'
19
+ Requires-Dist: lightning>=2.0; extra == 'benchmark'
20
+ Requires-Dist: torch>=2.0; extra == 'benchmark'
21
+ Provides-Extra: notebook
22
+ Requires-Dist: ipykernel>=7.3.0; extra == 'notebook'
23
+ Requires-Dist: notebook>=7.6.0; extra == 'notebook'
24
+ Description-Content-Type: text/markdown
25
+
26
+ # Chemically Meaningful Model Validation using Biased Data Splits
@@ -0,0 +1 @@
1
+ # Chemically Meaningful Model Validation using Biased Data Splits
@@ -0,0 +1,29 @@
1
+ """Biased Split for Chemically Meaningful Model Validation"""
2
+
3
+ from biased_split.activity_cliff import ActivityCliffSplitter
4
+ from biased_split.knn_failure import KNNFailureSplitter
5
+ from biased_split.substructure_distance import SubstructureDistanceSplitter
6
+ from biased_split.proxy_sorted import ProxySortedSplitter
7
+ from biased_split.molecularnetwork import (
8
+ smiles_to_ecfp4_bitvect,
9
+ smiles_to_ecfp4_np,
10
+ compute_similarity_matrix,
11
+ molecular_network_from_list,
12
+ df_to_ecfp4_molecular_network,
13
+ visualise_molnet,
14
+ visualise_molnet_split,
15
+ )
16
+
17
+ __all__ = [
18
+ "ActivityCliffSplitter",
19
+ "KNNFailureSplitter",
20
+ "SubstructureDistanceSplitter",
21
+ "ProxySortedSplitter",
22
+ "smiles_to_ecfp4_bitvect",
23
+ "smiles_to_ecfp4_np",
24
+ "compute_similarity_matrix",
25
+ "molecular_network_from_list",
26
+ "df_to_ecfp4_molecular_network",
27
+ "visualise_molnet",
28
+ "visualise_molnet_split",
29
+ ]
@@ -0,0 +1,294 @@
1
+ import os
2
+ import tempfile
3
+ from PIL import Image
4
+ import numpy as np
5
+
6
+ from biased_split.molecularnetwork import (
7
+ smiles_to_ecfp4_bitvect,
8
+ compute_similarity_matrix,
9
+ molecular_network_from_list,
10
+ visualise_molnet_split,
11
+ )
12
+
13
# Per-molecule assignment states used while a split is being constructed.
UNASSIGNED_NODE = 0
TRAIN_NODE = 1
TEST_NODE = 2


class ActivityCliffSplitter:
    """Build train/test splits whose test set is biased towards activity cliffs.

    Two molecules form a *cliff edge* when their similarity is at least
    ``similarity_threshold`` and their absolute activity difference is at
    least ``activity_threshold``. A test molecule "has a cliff partner" when
    at least one such edge connects it to a training molecule; the fraction
    of test molecules with a cliff partner is reported as the *effective
    bias* of a split.
    """

    def __init__(
        self,
        similarity_threshold,
        activity_threshold,
        test_fraction=0.2,  # fraction of the whole dataset that should end up in the test set
    ):
        self.similarity_threshold = similarity_threshold
        self.activity_threshold = activity_threshold
        self.test_fraction = test_fraction

    def split_for_intended_bias(
        self,
        smiless,
        activity_values,
        similarity_matrix,
        intended_bias,
        random_seed,
    ):
        """Create one train/test split aiming at ``intended_bias``.

        Args:
            smiless: Sequence with one entry per molecule; only its length is
                used here.
            activity_values: One activity value per molecule (any 1-D
                array-like convertible to float).
            similarity_matrix: Square ``(n, n)`` array-like of pairwise
                similarities, indexed like ``smiless``.
            intended_bias: Fraction in ``[0, 1]`` of the test set that we
                *try* to fill with cliff molecules. Depending on the dataset
                and thresholds this may not be reachable, which is why the
                *effective* bias is always computed and returned.
            random_seed: Seed for ``numpy.random.default_rng``; the same seed
                and inputs reproduce the same split.

        Returns:
            ``(train_indices, test_indices, effective_bias)`` where the index
            arrays are sorted integer arrays partitioning ``range(n)``.

        Raises:
            ValueError: If ``intended_bias`` is outside ``[0, 1]`` or the
                shapes of ``activity_values`` / ``similarity_matrix`` do not
                match the number of molecules.
        """
        if not (0.0 <= intended_bias <= 1.0):
            raise ValueError(f"intended_bias must be in [0, 1], got {intended_bias}")

        n_molecules = len(smiless)

        # Normalise inputs once so lists and other array-likes work with the
        # positional fancy indexing used further down.
        activity_values = np.asarray(activity_values, dtype=float)
        similarity_matrix = np.asarray(similarity_matrix)
        if activity_values.shape != (n_molecules,):
            raise ValueError(
                "activity_values must be 1-D with one value per molecule "
                f"(expected shape ({n_molecules},), got {activity_values.shape})"
            )
        if similarity_matrix.shape != (n_molecules, n_molecules):
            raise ValueError(
                "similarity_matrix must be square with one row per molecule "
                f"(expected shape ({n_molecules}, {n_molecules}), "
                f"got {similarity_matrix.shape})"
            )

        rng = np.random.default_rng(random_seed)
        # int() truncates (int(2.9) == 2), so both sizes are rounded down.
        target_test_size = int(self.test_fraction * n_molecules)
        n_cliff_test_molecules = int(intended_bias * target_test_size)

        # Each edge is (node index 1, node index 2, activity difference).
        cliff_edges = self.find_cliff_edges(
            similarity_matrix=similarity_matrix,
            activity_values=activity_values,
            similarity_threshold=self.similarity_threshold,
            activity_threshold=self.activity_threshold,
        )

        # Edges are processed in random order rather than sorted by the size
        # of the activity gap.
        rng.shuffle(cliff_edges)

        # Cliff degrees drive the heuristic that decides which endpoint of an
        # edge goes to TRAIN_NODE.
        cliff_degrees = self.compute_cliff_degrees(cliff_edges, n_molecules)

        # Assign cliff molecules by walking the shuffled cliff edges.
        assignment = self.walk_cliff_edges(
            cliff_edges=cliff_edges,
            cliff_degrees=cliff_degrees,
            n_molecules=n_molecules,
            n_cliff_test_target=n_cliff_test_molecules,
            rng=rng,
        )

        unassigned_indices = np.where(assignment == UNASSIGNED_NODE)[0]
        unassigned_non_cliff_indices = unassigned_indices[
            cliff_degrees[unassigned_indices] == 0
        ]
        unassigned_cliff_indices = unassigned_indices[
            cliff_degrees[unassigned_indices] > 0
        ]

        # Fill the rest of the test set randomly, preferring molecules that
        # take part in no cliff edge at all.
        n_random_fill = target_test_size - int((assignment == TEST_NODE).sum())

        if n_random_fill > 0:
            if len(unassigned_non_cliff_indices) >= n_random_fill:
                random_test_indices = rng.choice(
                    unassigned_non_cliff_indices, size=n_random_fill, replace=False
                )
            else:
                # Not enough non-cliff molecules: take all of them and top up
                # with still-unassigned cliff molecules (as many as exist).
                shortfall = n_random_fill - len(unassigned_non_cliff_indices)
                cliff_topup_indices = rng.choice(
                    unassigned_cliff_indices,
                    size=min(shortfall, len(unassigned_cliff_indices)),
                    replace=False,
                )
                random_test_indices = np.concatenate(
                    [unassigned_non_cliff_indices, cliff_topup_indices]
                )
            assignment[random_test_indices] = TEST_NODE

        # Everything that is still unassigned goes to training.
        assignment[assignment == UNASSIGNED_NODE] = TRAIN_NODE

        train_indices = np.where(assignment == TRAIN_NODE)[0]
        test_indices = np.where(assignment == TEST_NODE)[0]

        question_results = self.evaluate_cliff_question(
            test_indices=test_indices,
            train_indices=train_indices,
            similarity_matrix=similarity_matrix,
            activity_values=activity_values,
            similarity_threshold=self.similarity_threshold,
            activity_threshold=self.activity_threshold,
        )

        # The effective bias is measured on the final split, i.e. after the
        # random fill.
        effective_bias = self.effective_bias_from_question_results(question_results)
        return train_indices, test_indices, effective_bias

    def split(self, smiless, activity_values, intended_biases, n_repeats):
        """Yield splits for every intended bias and repeat.

        Fingerprints and the similarity matrix are computed once and reused.
        The repeat index doubles as the random seed, so repeat ``k`` uses
        seed ``k`` for every intended bias.

        Args:
            smiless: Sequence of SMILES strings.
            activity_values: One activity value per molecule.
            intended_biases: Iterable of intended bias values in ``[0, 1]``.
            n_repeats: Number of seeded repeats per intended bias.

        Yields:
            ``(train_indices, test_indices, effective_bias, intended_bias,
            repeat_index)`` tuples.
        """
        fps_bitvect = [smiles_to_ecfp4_bitvect(smiles) for smiles in smiless]
        similarity_matrix = compute_similarity_matrix(fps_bitvect)

        for intended_bias in intended_biases:
            for repeat_index in range(n_repeats):
                # Keyword arguments on purpose: activity_values and
                # similarity_matrix are adjacent parameters and were
                # previously passed positionally in swapped order.
                train_indices, test_indices, effective_bias = (
                    self.split_for_intended_bias(
                        smiless=smiless,
                        activity_values=activity_values,
                        similarity_matrix=similarity_matrix,
                        intended_bias=intended_bias,
                        random_seed=repeat_index,
                    )
                )
                yield train_indices, test_indices, effective_bias, intended_bias, repeat_index

    @staticmethod
    def effective_bias_from_question_results(question_results):
        """Return the mean of ``question_results``, or 0.0 when it is empty."""
        if question_results.size == 0:
            return 0.0
        return float(question_results.mean())

    @staticmethod
    def evaluate_cliff_question(
        test_indices,
        train_indices,
        similarity_matrix,
        activity_values,
        activity_threshold,
        similarity_threshold,
    ):
        """Flag, per test molecule, whether it has a cliff partner in train.

        Returns:
            Float array with one entry per test molecule: 1.0 if at least one
            training molecule is a cliff partner, else 0.0. An empty array is
            returned when there are no test molecules.
        """
        if len(test_indices) == 0:
            return np.array([])

        # Accept plain sequences as well as arrays.
        test_indices = np.asarray(test_indices, dtype=np.intp)
        train_indices = np.asarray(train_indices, dtype=np.intp)
        similarity_matrix = np.asarray(similarity_matrix)
        activity_values = np.asarray(activity_values, dtype=float)

        # similarity[i, j] = similarity between test molecule i and train molecule j
        similarity_test_vs_train = similarity_matrix[
            test_indices[:, None], train_indices
        ]

        # activity_diff[i, j] = |activity(test i) - activity(train j)|
        activity_diff_test_vs_train = np.abs(
            activity_values[test_indices][:, None] - activity_values[train_indices]
        )

        is_cliff_edge = (similarity_test_vs_train >= similarity_threshold) & (
            activity_diff_test_vs_train >= activity_threshold
        )

        # A test molecule counts if it has at least one cliff edge to any train molecule.
        test_molecule_has_cliff_partner = is_cliff_edge.any(axis=1)
        return test_molecule_has_cliff_partner.astype(float)

    @staticmethod
    def find_cliff_edges(
        similarity_matrix,
        activity_values,
        similarity_threshold,
        activity_threshold,
    ):
        """List all cliff edges among the molecules.

        Returns:
            List of ``(i, j, activity_difference)`` tuples with ``i < j``,
            ordered by ``i`` and then ``j``. Indices are Python ints and the
            difference is a Python float.
        """
        similarity_matrix = np.asarray(similarity_matrix)
        activity_values = np.asarray(activity_values, dtype=float)
        n = len(activity_values)
        cliff_edges = []

        # The matrix is symmetric, so only the upper triangle (j > i) is
        # scanned. Each row is handled with one vectorised comparison.
        for i in range(n - 1):
            activity_differences = np.abs(activity_values[i] - activity_values[i + 1:])
            # Same >= criteria as evaluate_cliff_question.
            is_cliff = (similarity_matrix[i, i + 1:] >= similarity_threshold) & (
                activity_differences >= activity_threshold
            )
            for offset in np.flatnonzero(is_cliff):
                cliff_edges.append(
                    (i, i + 1 + int(offset), float(activity_differences[offset]))
                )

        return cliff_edges

    @staticmethod
    def compute_cliff_degrees(
        cliff_edges,  # (node idx1, node idx2, activity_difference) tuples from find_cliff_edges
        n_molecules,
    ):
        """Count, for every molecule, the number of cliff edges it is part of."""
        degrees = np.zeros(n_molecules, dtype=int)
        for mol_a, mol_b, _ in cliff_edges:
            degrees[mol_a] += 1
            degrees[mol_b] += 1
        return degrees

    @staticmethod
    def walk_cliff_edges(
        cliff_edges, cliff_degrees, n_molecules, n_cliff_test_target, rng
    ):
        """Assign cliff molecules to train/test by walking the cliff edges.

        Edges are visited in the given order until ``n_cliff_test_target``
        molecules have been placed in the test set through this walk. ``rng``
        is a ``numpy.random.Generator`` used only to break ties, which keeps
        the random choice reproducible for a given seed.

        Returns:
            ``int8`` array of length ``n_molecules`` holding UNASSIGNED_NODE,
            TRAIN_NODE or TEST_NODE for every molecule.
        """
        # Every molecule starts out unassigned.
        assignment = np.full(n_molecules, UNASSIGNED_NODE, dtype=np.int8)
        n_cliff_test_placed = 0

        for mol_a, mol_b, _ in cliff_edges:
            # Stop once enough cliff molecules have been placed in test.
            if n_cliff_test_placed >= n_cliff_test_target:
                break

            status_a = assignment[mol_a]
            status_b = assignment[mol_b]

            if status_a == UNASSIGNED_NODE and status_b == UNASSIGNED_NODE:
                # The molecule with the higher cliff degree goes to train.
                if cliff_degrees[mol_a] > cliff_degrees[mol_b]:
                    train_molecule, test_molecule = mol_a, mol_b
                elif cliff_degrees[mol_b] > cliff_degrees[mol_a]:
                    train_molecule, test_molecule = mol_b, mol_a
                else:
                    # Equal cliff degree: pick at random.
                    if rng.random() < 0.5:
                        train_molecule, test_molecule = mol_a, mol_b
                    else:
                        train_molecule, test_molecule = mol_b, mol_a

                assignment[train_molecule] = TRAIN_NODE
                assignment[test_molecule] = TEST_NODE
                n_cliff_test_placed += 1

            elif status_a == TRAIN_NODE and status_b == UNASSIGNED_NODE:
                # Unassigned partner of a train molecule goes to test.
                assignment[mol_b] = TEST_NODE
                n_cliff_test_placed += 1

            elif status_b == TRAIN_NODE and status_a == UNASSIGNED_NODE:
                # Same as above with roles swapped.
                assignment[mol_a] = TEST_NODE
                n_cliff_test_placed += 1

            elif status_a == TEST_NODE and status_b == UNASSIGNED_NODE:
                # Unassigned partner of a test molecule goes to train.
                assignment[mol_b] = TRAIN_NODE

            elif status_b == TEST_NODE and status_a == UNASSIGNED_NODE:
                # Same as above with roles swapped.
                assignment[mol_a] = TRAIN_NODE

            # If both are already assigned, there is nothing to do for this edge.

        return assignment

    def visualise_splits(
        self,
        smiless,
        activity_values,
        intended_biases,
        n_repeats,
        output_path,
        duration=500,
    ):
        """Render every split from :meth:`split` as one frame of an animation.

        One image per split is written to a temporary directory and the
        frames are then saved together to ``output_path`` (``duration`` is
        forwarded to PIL as the per-frame duration; ``loop=0`` is passed so
        the animation repeats).

        Raises:
            ValueError: If no split was generated (empty ``intended_biases``
                or ``n_repeats <= 0``), since there is no frame to save.
        """
        # NOTE(review): the network is built by a helper from
        # biased_split.molecularnetwork; its exact contract is not visible here.
        G = molecular_network_from_list(
            smiless, activity_values, self.similarity_threshold, self.activity_threshold
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            paths = []
            for frame_index, (
                train_idx,
                test_idx,
                effective_bias,
                intended_bias,
                _,
            ) in enumerate(
                self.split(smiless, activity_values, intended_biases, n_repeats)
            ):
                p = os.path.join(tmpdir, f"frame_{frame_index:04d}.png")
                visualise_molnet_split(
                    G, train_idx, test_idx, effective_bias, intended_bias, filepath=p
                )
                paths.append(p)

            if not paths:
                raise ValueError(
                    "no splits were generated; intended_biases must be non-empty "
                    "and n_repeats must be positive"
                )

            # Image.open keeps the file handle open until the image is closed,
            # so close every frame before the temporary directory is removed.
            frames = []
            try:
                for p in paths:
                    frames.append(Image.open(p))
                frames[0].save(
                    output_path,
                    save_all=True,
                    append_images=frames[1:],
                    duration=duration,
                    loop=0,
                )
            finally:
                for frame in frames:
                    frame.close()