biased-split 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ """Biased Split for Chemically Meaningful Model Validation"""
2
+
3
+ from biased_split.activity_cliff import ActivityCliffSplitter
4
+ from biased_split.knn_failure import KNNFailureSplitter
5
+ from biased_split.substructure_distance import SubstructureDistanceSplitter
6
+ from biased_split.proxy_sorted import ProxySortedSplitter
7
+ from biased_split.molecularnetwork import (
8
+ smiles_to_ecfp4_bitvect,
9
+ smiles_to_ecfp4_np,
10
+ compute_similarity_matrix,
11
+ molecular_network_from_list,
12
+ df_to_ecfp4_molecular_network,
13
+ visualise_molnet,
14
+ visualise_molnet_split,
15
+ )
16
+
17
# Names re-exported as the public API of the ``biased_split`` package.
__all__ = [
    # Splitter classes
    "ActivityCliffSplitter",
    "KNNFailureSplitter",
    "SubstructureDistanceSplitter",
    "ProxySortedSplitter",
    # Helpers from biased_split.molecularnetwork
    "smiles_to_ecfp4_bitvect",
    "smiles_to_ecfp4_np",
    "compute_similarity_matrix",
    "molecular_network_from_list",
    "df_to_ecfp4_molecular_network",
    "visualise_molnet",
    "visualise_molnet_split",
]
@@ -0,0 +1,294 @@
1
+ import os
2
+ import tempfile
3
+ from PIL import Image
4
+ import numpy as np
5
+
6
+ from biased_split.molecularnetwork import (
7
+ smiles_to_ecfp4_bitvect,
8
+ compute_similarity_matrix,
9
+ molecular_network_from_list,
10
+ visualise_molnet_split,
11
+ )
12
+
13
# Per-molecule states stored in the assignment array while a split is built.
UNASSIGNED_NODE = 0
TRAIN_NODE = 1
TEST_NODE = 2


class ActivityCliffSplitter:
    """Train/test splitter that steers how many test molecules face an activity cliff.

    A cliff edge joins two molecules whose similarity is at least
    ``similarity_threshold`` and whose activities differ by at least
    ``activity_threshold``. For a requested ``intended_bias`` the splitter tries
    to place that fraction of the test set opposite a cliff partner in the
    training set, and always reports the fraction actually achieved (the
    *effective bias*).
    """

    def __init__(
        self,
        similarity_threshold,
        activity_threshold,
        test_fraction=0.2,
    ):
        """Store the cliff definition and the test-set size.

        Args:
            similarity_threshold: Minimum similarity for a pair to be a cliff.
            activity_threshold: Minimum absolute activity difference for a
                similar pair to be a cliff.
            test_fraction: Fraction of the whole dataset that goes to the test
                set (default 0.2).
        """
        self.similarity_threshold = similarity_threshold
        self.activity_threshold = activity_threshold
        self.test_fraction = test_fraction

    def split_for_intended_bias(
        self,
        smiless,
        activity_values,
        similarity_matrix,
        intended_bias,
        random_seed,
    ):
        """Build one split that aims for ``intended_bias`` cliff molecules in test.

        Args:
            smiless: Sequence of molecules; only its length is used here.
            activity_values: One activity per molecule (list or array).
            similarity_matrix: Square pairwise similarity matrix.
            intended_bias: Fraction of the test set we *try* to fill with cliff
                molecules. The dataset may not allow it, so the effective bias
                is always measured and returned.
            random_seed: Seed for ``numpy.random.default_rng``.

        Returns:
            ``(train_indices, test_indices, effective_bias)``.

        Raises:
            ValueError: If ``intended_bias`` is outside ``[0, 1]``.
        """
        if not (0.0 <= intended_bias <= 1.0):
            raise ValueError(f"intended_bias must be in [0, 1], got {intended_bias}")

        # Array form is required for the fancy indexing done during evaluation.
        activity_values = np.asarray(activity_values, dtype=float)

        rng = np.random.default_rng(random_seed)
        n_molecules = len(smiless)
        # int() truncates, so both sizes are rounded down.
        target_test_size = int(self.test_fraction * n_molecules)
        n_cliff_test_molecules = int(intended_bias * target_test_size)

        # Each edge is (node idx1, node idx2, activity difference).
        cliff_edges = self.find_cliff_edges(
            similarity_matrix=similarity_matrix,
            activity_values=activity_values,
            similarity_threshold=self.similarity_threshold,
            activity_threshold=self.activity_threshold,
        )

        # Edges are processed in random order rather than by activity gap.
        rng.shuffle(cliff_edges)

        # Cliff degrees drive the "higher degree goes to train" heuristic.
        cliff_degrees = self.compute_cliff_degrees(cliff_edges, n_molecules)

        assignment = self.walk_cliff_edges(
            cliff_edges=cliff_edges,
            cliff_degrees=cliff_degrees,
            n_molecules=n_molecules,
            n_cliff_test_target=n_cliff_test_molecules,
            rng=rng,
        )

        unassigned_indices = np.where(assignment == UNASSIGNED_NODE)[0]
        has_cliff = cliff_degrees[unassigned_indices] > 0
        unassigned_non_cliff_indices = unassigned_indices[~has_cliff]
        unassigned_cliff_indices = unassigned_indices[has_cliff]

        n_random_fill = target_test_size - int((assignment == TEST_NODE).sum())

        if n_random_fill > 0:
            if len(unassigned_non_cliff_indices) >= n_random_fill:
                random_test_indices = rng.choice(
                    unassigned_non_cliff_indices, size=n_random_fill, replace=False
                )
            else:
                # Not enough non-cliff molecules: take them all and top up
                # with unassigned cliff molecules.
                shortfall = n_random_fill - len(unassigned_non_cliff_indices)
                cliff_topup_indices = rng.choice(
                    unassigned_cliff_indices,
                    size=min(shortfall, len(unassigned_cliff_indices)),
                    replace=False,
                )
                random_test_indices = np.concatenate(
                    [unassigned_non_cliff_indices, cliff_topup_indices]
                )
            assignment[random_test_indices] = TEST_NODE

        # Everything still unassigned becomes training data.
        assignment[assignment == UNASSIGNED_NODE] = TRAIN_NODE

        train_indices = np.where(assignment == TRAIN_NODE)[0]
        test_indices = np.where(assignment == TEST_NODE)[0]

        question_results = self.evaluate_cliff_question(
            test_indices=test_indices,
            train_indices=train_indices,
            similarity_matrix=similarity_matrix,
            activity_values=activity_values,
            similarity_threshold=self.similarity_threshold,
            activity_threshold=self.activity_threshold,
        )

        # Effective bias is measured after the random fill.
        effective_bias = self.effective_bias_from_question_results(question_results)
        return train_indices, test_indices, effective_bias

    def split(self, smiless, activity_values, intended_biases, n_repeats):
        """Yield one split per (intended bias, repeat) combination.

        The repeat index doubles as the random seed of each split.

        Yields:
            ``(train_indices, test_indices, effective_bias, intended_bias,
            repeat_index)``.
        """
        fingerprints = [smiles_to_ecfp4_bitvect(smiles) for smiles in smiless]
        similarity_matrix = compute_similarity_matrix(fingerprints)

        for intended_bias in intended_biases:
            for repeat_index in range(n_repeats):
                # Keyword arguments: this class takes activity_values *before*
                # similarity_matrix, so a positional call is easy to get wrong.
                train_indices, test_indices, effective_bias = (
                    self.split_for_intended_bias(
                        smiless=smiless,
                        activity_values=activity_values,
                        similarity_matrix=similarity_matrix,
                        intended_bias=intended_bias,
                        random_seed=repeat_index,
                    )
                )
                yield train_indices, test_indices, effective_bias, intended_bias, repeat_index

    @staticmethod
    def effective_bias_from_question_results(question_results):
        """Return the mean of the per-test-molecule answers (0.0 if there are none)."""
        if question_results.size == 0:
            return 0.0
        return float(question_results.mean())

    @staticmethod
    def evaluate_cliff_question(
        test_indices,
        train_indices,
        similarity_matrix,
        activity_values,
        activity_threshold,
        similarity_threshold,
    ):
        """Answer, per test molecule, "does it have a cliff partner in train?".

        Returns:
            Float array aligned with ``test_indices`` holding 1.0 where the
            test molecule has at least one cliff edge to a training molecule
            and 0.0 otherwise; an empty array when there are no test molecules.
        """
        if len(test_indices) == 0:
            return np.array([])

        # Accept plain lists as well as arrays.
        test_indices = np.asarray(test_indices, dtype=np.intp)
        train_indices = np.asarray(train_indices, dtype=np.intp)
        similarity_matrix = np.asarray(similarity_matrix)
        activity_values = np.asarray(activity_values, dtype=float)

        # similarity[i, j] = similarity between test molecule i and train molecule j
        similarity_test_vs_train = similarity_matrix[
            test_indices[:, None], train_indices
        ]

        # activity_diff[i, j] = |activity(test i) - activity(train j)|
        activity_diff_test_vs_train = np.abs(
            activity_values[test_indices][:, None] - activity_values[train_indices]
        )

        is_cliff_edge = (similarity_test_vs_train >= similarity_threshold) & (
            activity_diff_test_vs_train >= activity_threshold
        )

        # One cliff edge to any training molecule is enough.
        return is_cliff_edge.any(axis=1).astype(float)

    @staticmethod
    def find_cliff_edges(
        similarity_matrix,
        activity_values,
        similarity_threshold,
        activity_threshold,
    ):
        """List all cliff edges ``(i, j, activity_difference)`` with ``i < j``.

        Only the upper triangle is scanned (the matrix is treated as
        symmetric). Edges are returned in row-major order.
        """
        similarities = np.asarray(similarity_matrix)
        activities = np.asarray(activity_values, dtype=float)
        n = len(activities)
        cliff_edges = []

        for i in range(n - 1):
            partner_similarities = similarities[i, i + 1 : n]
            activity_gaps = np.abs(activities[i + 1 :] - activities[i])
            # "not below threshold" (rather than ">=") keeps the original
            # treatment of NaN similarities.
            is_cliff = ~(partner_similarities < similarity_threshold) & (
                activity_gaps >= activity_threshold
            )
            for offset in np.flatnonzero(is_cliff):
                cliff_edges.append(
                    (i, i + 1 + int(offset), float(activity_gaps[offset]))
                )

        return cliff_edges

    @staticmethod
    def compute_cliff_degrees(
        cliff_edges,
        n_molecules,
    ):
        """Count, for every molecule, how many cliff edges touch it.

        Args:
            cliff_edges: Iterable of ``(node idx1, node idx2, activity_difference)``.
            n_molecules: Length of the returned integer array.
        """
        degrees = np.zeros(n_molecules, dtype=int)
        for mol_a, mol_b, _ in cliff_edges:
            degrees[mol_a] += 1
            degrees[mol_b] += 1
        return degrees

    @staticmethod
    def walk_cliff_edges(
        cliff_edges, cliff_degrees, n_molecules, n_cliff_test_target, rng
    ):
        """Assign cliff molecules to train/test by walking the edges in order.

        Walking stops once ``n_cliff_test_target`` molecules were placed in the
        test set. ``rng`` is only consulted to break cliff-degree ties, which
        keeps the walk reproducible for a given seed.

        Returns:
            ``int8`` array of length ``n_molecules`` holding node states.
        """
        assignment = np.full(n_molecules, UNASSIGNED_NODE, dtype=np.int8)
        n_cliff_test_placed = 0

        for mol_a, mol_b, _ in cliff_edges:
            if n_cliff_test_placed >= n_cliff_test_target:
                break

            status_a = assignment[mol_a]
            status_b = assignment[mol_b]

            if status_a == UNASSIGNED_NODE and status_b == UNASSIGNED_NODE:
                # The molecule with the higher cliff degree goes to train.
                if cliff_degrees[mol_a] > cliff_degrees[mol_b]:
                    train_molecule, test_molecule = mol_a, mol_b
                elif cliff_degrees[mol_b] > cliff_degrees[mol_a]:
                    train_molecule, test_molecule = mol_b, mol_a
                elif rng.random() < 0.5:
                    # Equal cliff degree: pick at random.
                    train_molecule, test_molecule = mol_a, mol_b
                else:
                    train_molecule, test_molecule = mol_b, mol_a

                assignment[train_molecule] = TRAIN_NODE
                assignment[test_molecule] = TEST_NODE
                n_cliff_test_placed += 1

            elif status_a == TRAIN_NODE and status_b == UNASSIGNED_NODE:
                # Unassigned partner of a train molecule goes to test.
                assignment[mol_b] = TEST_NODE
                n_cliff_test_placed += 1

            elif status_b == TRAIN_NODE and status_a == UNASSIGNED_NODE:
                assignment[mol_a] = TEST_NODE
                n_cliff_test_placed += 1

            elif status_a == TEST_NODE and status_b == UNASSIGNED_NODE:
                # Unassigned partner of a test molecule goes to train.
                assignment[mol_b] = TRAIN_NODE

            elif status_b == TEST_NODE and status_a == UNASSIGNED_NODE:
                assignment[mol_a] = TRAIN_NODE

            # Both already assigned: nothing to do for this edge.

        return assignment

    def visualise_splits(
        self,
        smiless,
        activity_values,
        intended_biases,
        n_repeats,
        output_path,
        duration=500,
    ):
        """Render every split as one frame and save all frames to ``output_path``.

        Frames are written as PNG files into a temporary directory and then
        combined with Pillow (``save_all=True``, endless loop).

        Args:
            duration: Passed to Pillow as the per-frame ``duration``.

        Raises:
            ValueError: If no split is produced (empty ``intended_biases`` or
                ``n_repeats < 1``).
        """
        G = molecular_network_from_list(
            smiless, activity_values, self.similarity_threshold, self.activity_threshold
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            paths = []
            splits = self.split(smiless, activity_values, intended_biases, n_repeats)
            for frame_index, (
                train_idx,
                test_idx,
                effective_bias,
                intended_bias,
                _,
            ) in enumerate(splits):
                p = os.path.join(tmpdir, f"frame_{frame_index:04d}.png")
                visualise_molnet_split(
                    G, train_idx, test_idx, effective_bias, intended_bias, filepath=p
                )
                paths.append(p)

            if not paths:
                raise ValueError(
                    "no frames to save: intended_biases is empty or n_repeats < 1"
                )

            frames = []
            try:
                for p in paths:
                    frames.append(Image.open(p))
                frames[0].save(
                    output_path,
                    save_all=True,
                    append_images=frames[1:],
                    duration=duration,
                    loop=0,
                )
            finally:
                # Release the file handles before the temporary directory is
                # removed (open handles block deletion on some platforms).
                for frame in frames:
                    frame.close()
@@ -0,0 +1,231 @@
1
+ import os
2
+ import tempfile
3
+
4
+ import numpy as np
5
+ from PIL import Image
6
+
7
+ from biased_split.molecularnetwork import (
8
+ smiles_to_ecfp4_bitvect,
9
+ compute_similarity_matrix,
10
+ molecular_network_from_list,
11
+ visualise_molnet_split,
12
+ )
13
+
14
# Per-molecule states stored in the assignment array while a split is built.
UNASSIGNED_NODE = 0
TRAIN_NODE = 1
TEST_NODE = 2


class KNNFailureSplitter:
    """Train/test splitter that steers how many test molecules are "kNN failures".

    A molecule is a failure candidate when it has at least ``n_neighbors``
    neighbours with similarity ``>= similarity_threshold`` and the mean
    activity of its ``n_neighbors`` most similar neighbours differs from its
    own activity by at least ``activity_threshold``.
    """

    def __init__(
        self, similarity_threshold, activity_threshold, n_neighbors, test_fraction=0.2
    ):
        """Store thresholds, neighbourhood size and the test-set fraction."""
        self.similarity_threshold = similarity_threshold
        self.activity_threshold = activity_threshold
        self.n_neighbors = n_neighbors
        self.test_fraction = test_fraction

    def split_for_intended_bias(
        self, smiless, similarity_matrix, activity_values, intended_bias, random_seed
    ):
        """Build one split that aims for ``intended_bias`` failure molecules in test.

        Args:
            smiless: Sequence of molecules; only its length is used here.
            similarity_matrix: Square pairwise similarity matrix.
            activity_values: One activity per molecule (list or array).
            intended_bias: Targeted fraction of the test set, in ``[0, 1]``.
            random_seed: Seed for ``numpy.random.default_rng``.

        Returns:
            ``(train_indices, test_indices, effective_bias)``.

        Raises:
            ValueError: If ``intended_bias`` is outside ``[0, 1]``.
        """
        if not (0.0 <= intended_bias <= 1.0):
            raise ValueError(f"intended_bias must be in [0, 1], got {intended_bias}")

        # Coerce once so edge finding and evaluation see the same array types.
        activity_values = np.asarray(activity_values, dtype=float)
        similarity_matrix = np.asarray(similarity_matrix)

        rng = np.random.default_rng(random_seed)
        n_molecules = len(smiless)
        target_test_size = int(self.test_fraction * n_molecules)
        n_failure_test_target = int(intended_bias * target_test_size)

        failure_n_edges = self.find_failure_n_edges(
            similarity_matrix,
            activity_values,
            self.similarity_threshold,
            self.activity_threshold,
            self.n_neighbors,
        )
        # Process the candidates in a seed-dependent random order.
        shuffled_order = rng.permutation(len(failure_n_edges))
        failure_n_edges = [failure_n_edges[i] for i in shuffled_order]

        assignment = self.walk_failure_n_edges(
            failure_n_edges, n_molecules, n_failure_test_target
        )

        candidate_set = {molecule_index for molecule_index, _ in failure_n_edges}
        is_candidate_mask = np.zeros(n_molecules, dtype=bool)
        if candidate_set:
            is_candidate_mask[list(candidate_set)] = True

        unassigned_indices = np.where(assignment == UNASSIGNED_NODE)[0]
        unassigned_is_candidate = is_candidate_mask[unassigned_indices]
        unassigned_non_candidate_indices = unassigned_indices[~unassigned_is_candidate]
        unassigned_candidate_indices = unassigned_indices[unassigned_is_candidate]

        n_random_fill = target_test_size - int((assignment == TEST_NODE).sum())
        if n_random_fill > 0:
            if len(unassigned_non_candidate_indices) >= n_random_fill:
                random_test_indices = rng.choice(
                    unassigned_non_candidate_indices, size=n_random_fill, replace=False
                )
            else:
                # Not enough non-candidates: take them all and top up with
                # unassigned candidates.
                shortfall = n_random_fill - len(unassigned_non_candidate_indices)
                candidate_topup_indices = rng.choice(
                    unassigned_candidate_indices,
                    size=min(shortfall, len(unassigned_candidate_indices)),
                    replace=False,
                )
                random_test_indices = np.concatenate(
                    [unassigned_non_candidate_indices, candidate_topup_indices]
                )
            assignment[random_test_indices] = TEST_NODE

        # Everything still unassigned becomes training data.
        assignment[assignment == UNASSIGNED_NODE] = TRAIN_NODE

        train_indices = np.where(assignment == TRAIN_NODE)[0]
        test_indices = np.where(assignment == TEST_NODE)[0]

        question_results = self.evaluate_knn_failure_question(
            test_indices,
            train_indices,
            activity_values,
            similarity_matrix,
            self.similarity_threshold,
            self.activity_threshold,
            self.n_neighbors,
        )
        effective_bias = self.effective_bias_from_question_results(question_results)

        return train_indices, test_indices, effective_bias

    def split(self, smiless, activity_values, intended_biases, n_repeats):
        """Yield one split per (intended bias, repeat) combination.

        The repeat index doubles as the random seed of each split.

        Yields:
            ``(train_indices, test_indices, effective_bias, intended_bias,
            repeat_index)``.
        """
        fps_bitvect = [smiles_to_ecfp4_bitvect(s) for s in smiless]
        similarity_matrix = compute_similarity_matrix(fps_bitvect)

        for intended_bias in intended_biases:
            for repeat_index in range(n_repeats):
                train_indices, test_indices, effective_bias = (
                    self.split_for_intended_bias(
                        smiless=smiless,
                        similarity_matrix=similarity_matrix,
                        activity_values=activity_values,
                        intended_bias=intended_bias,
                        random_seed=repeat_index,
                    )
                )
                yield train_indices, test_indices, effective_bias, intended_bias, repeat_index

    @staticmethod
    def find_failure_n_edges(
        similarity_matrix,
        activity_values,
        similarity_threshold,
        activity_threshold,
        n_neighbors,
    ):
        """Find failure candidates together with their nearest neighbours.

        Returns:
            List of ``(molecule_index, neighbour_indices)`` where
            ``neighbour_indices`` is a tuple of the ``n_neighbors`` most
            similar qualifying molecules.
        """
        similarity_matrix = np.asarray(similarity_matrix)
        activity_values = np.asarray(activity_values, dtype=float)

        n_molecules = len(activity_values)
        n_edges = []
        for molecule_index in range(n_molecules):
            similarities = similarity_matrix[molecule_index].copy()
            # A molecule must never count as its own neighbour.
            similarities[molecule_index] = -1.0
            qualifying = np.where(similarities >= similarity_threshold)[0]
            if len(qualifying) < n_neighbors:
                continue
            # Qualifying neighbours sorted by descending similarity, top k kept.
            top_k = qualifying[np.argsort(similarities[qualifying])[::-1][:n_neighbors]]
            consensus = float(activity_values[top_k].mean())
            disagreement = abs(consensus - float(activity_values[molecule_index]))
            if disagreement >= activity_threshold:
                n_edges.append((int(molecule_index), tuple(int(n) for n in top_k)))
        return n_edges

    @staticmethod
    def walk_failure_n_edges(failure_n_edges, n_molecules, n_failure_test_target):
        """Place failure candidates in test and their neighbours in train.

        A candidate is skipped when it was already assigned to train or when
        one of its neighbours is already a test molecule. Walking stops once
        ``n_failure_test_target`` candidates were placed.

        Returns:
            ``int8`` array of length ``n_molecules`` holding node states.
        """
        assignment = np.full(n_molecules, UNASSIGNED_NODE, dtype=np.int8)
        n_failures_placed = 0
        for molecule_index, neighbor_indices in failure_n_edges:
            if n_failures_placed >= n_failure_test_target:
                break
            if assignment[molecule_index] == TRAIN_NODE:
                continue
            if any(assignment[n] == TEST_NODE for n in neighbor_indices):
                continue
            assignment[molecule_index] = TEST_NODE
            for neighbor_index in neighbor_indices:
                assignment[neighbor_index] = TRAIN_NODE
            n_failures_placed += 1
        return assignment

    @staticmethod
    def evaluate_knn_failure_question(
        test_indices,
        train_indices,
        activity_values,
        similarity_matrix,
        similarity_threshold,
        activity_threshold,
        n_neighbors,
    ):
        """Answer, per test molecule, "do its nearest train neighbours mispredict it?".

        Returns:
            Float array aligned with ``test_indices``: 1.0 for a failure,
            0.0 for no failure, NaN when the molecule has fewer than
            ``n_neighbors`` qualifying training neighbours (not evaluable).
        """
        results = np.full(len(test_indices), np.nan, dtype=float)
        if len(test_indices) == 0 or len(train_indices) == 0:
            return results

        # Accept plain lists as well as arrays.
        train_indices = np.asarray(train_indices, dtype=np.intp)
        activity_values = np.asarray(activity_values, dtype=float)
        similarity_matrix = np.asarray(similarity_matrix)

        for position, test_idx in enumerate(test_indices):
            similarities_to_train = similarity_matrix[test_idx][train_indices]
            qualifying = np.where(similarities_to_train >= similarity_threshold)[0]
            if len(qualifying) < n_neighbors:
                continue
            top_k_positions = qualifying[
                np.argsort(similarities_to_train[qualifying])[::-1][:n_neighbors]
            ]
            # Map positions within train_indices back to dataset indices.
            top_k_train_indices = train_indices[top_k_positions]
            consensus = float(activity_values[top_k_train_indices].mean())
            disagreement = abs(consensus - float(activity_values[test_idx]))
            results[position] = 1.0 if disagreement >= activity_threshold else 0.0
        return results

    @staticmethod
    def effective_bias_from_question_results(question_results):
        """Return the mean over evaluable (non-NaN) answers, or 0.0 if there are none."""
        if question_results.size == 0:
            return 0.0
        evaluable = question_results[~np.isnan(question_results)]
        if evaluable.size == 0:
            return 0.0
        return float(evaluable.mean())

    def visualise_splits(
        self,
        smiless,
        activity_values,
        intended_biases,
        n_repeats,
        output_path,
        duration=500,
    ):
        """Render every split as one frame and save all frames to ``output_path``.

        Frames are written as PNG files into a temporary directory and then
        combined with Pillow (``save_all=True``, endless loop).

        Args:
            duration: Passed to Pillow as the per-frame ``duration``.

        Raises:
            ValueError: If no split is produced (empty ``intended_biases`` or
                ``n_repeats < 1``).
        """
        G = molecular_network_from_list(
            smiless, activity_values, self.similarity_threshold, self.activity_threshold
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            paths = []
            splits = self.split(smiless, activity_values, intended_biases, n_repeats)
            for frame_index, (
                train_idx,
                test_idx,
                effective_bias,
                intended_bias,
                _,
            ) in enumerate(splits):
                p = os.path.join(tmpdir, f"frame_{frame_index:04d}.png")
                visualise_molnet_split(
                    G, train_idx, test_idx, effective_bias, intended_bias, filepath=p
                )
                paths.append(p)

            if not paths:
                raise ValueError(
                    "no frames to save: intended_biases is empty or n_repeats < 1"
                )

            frames = []
            try:
                for p in paths:
                    frames.append(Image.open(p))
                frames[0].save(
                    output_path,
                    save_all=True,
                    append_images=frames[1:],
                    duration=duration,
                    loop=0,
                )
            finally:
                # Release the file handles before the temporary directory is
                # removed (open handles block deletion on some platforms).
                for frame in frames:
                    frame.close()
@@ -0,0 +1,271 @@
1
+ """Implementation of Molecular Network"""
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import networkx as nx
6
+ import matplotlib.pyplot as plt
7
+ from matplotlib.colors import LinearSegmentedColormap
8
+ from matplotlib.lines import Line2D
9
+ from matplotlib.ticker import MaxNLocator
10
+ from rdkit import Chem
11
+ from rdkit.Chem import rdFingerprintGenerator
12
+ from rdkit.DataStructs.cDataStructs import BulkTanimotoSimilarity, BulkTverskySimilarity
13
+
14
# Module-wide Morgan fingerprint generator (radius 2, 2048 bits) shared by the
# ECFP4 helper functions below.
mfpgen = rdFingerprintGenerator.GetMorganGenerator(fpSize=2048, radius=2)
15
+
16
+
17
def smiles_to_ecfp4_bitvect(smi):
    """Return the fingerprint of ``smi`` produced by ``mfpgen.GetFingerprint``.

    Args:
        smi: SMILES string of the molecule.

    Raises:
        ValueError: If RDKit cannot parse ``smi`` (``MolFromSmiles`` returned
            ``None``), instead of failing later inside the generator.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        raise ValueError(f"could not parse SMILES: {smi!r}")
    return mfpgen.GetFingerprint(mol)
19
+
20
+
21
def smiles_to_ecfp4_np(smi):
    """Return the fingerprint of ``smi`` produced by ``mfpgen.GetFingerprintAsNumPy``.

    Args:
        smi: SMILES string of the molecule.

    Raises:
        ValueError: If RDKit cannot parse ``smi`` (``MolFromSmiles`` returned
            ``None``), instead of failing later inside the generator.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        raise ValueError(f"could not parse SMILES: {smi!r}")
    return mfpgen.GetFingerprintAsNumPy(mol)
23
+
24
+
25
def compute_similarity_matrix(fps_bitvect, method="tanimoto", alpha=1, beta=0):
    """Compute a symmetric pairwise similarity matrix for a list of fingerprints.

    Args:
        fps_bitvect: Sequence of fingerprints accepted by RDKit's bulk
            similarity functions.
        method: ``"tanimoto"`` or ``"tversky"``. For ``"tversky"`` both
            directions are computed (weights ``alpha, beta`` and the swapped
            ``beta, alpha``) and the element-wise maximum is stored, which
            makes the matrix symmetric.
        alpha: First Tversky weight (only used for ``"tversky"``).
        beta: Second Tversky weight (only used for ``"tversky"``).

    Returns:
        ``float32`` array of shape ``(n, n)`` with ones on the diagonal.

    Raises:
        ValueError: If ``method`` is not one of the supported names. Without
            this check an unknown method would silently yield the identity
            matrix.
    """
    if method not in ("tanimoto", "tversky"):
        raise ValueError(
            f"unknown similarity method {method!r}; expected 'tanimoto' or 'tversky'"
        )

    n = len(fps_bitvect)
    sim_matrix = np.eye(n, dtype=np.float32)

    for i in range(n - 1):
        target_fp = fps_bitvect[i]
        query_fps = fps_bitvect[i + 1 :]

        if method == "tanimoto":
            sims = BulkTanimotoSimilarity(target_fp, query_fps)
        else:
            # Swapping the weights gives the opposite direction; keep the
            # larger of the two values per pair.
            sims_ab = BulkTverskySimilarity(target_fp, query_fps, alpha, beta)
            sims_ba = BulkTverskySimilarity(target_fp, query_fps, beta, alpha)
            sims = np.maximum(sims_ab, sims_ba)

        # Only the upper triangle is computed; mirror it into the lower one.
        sim_matrix[i, i + 1 :] = sims
        sim_matrix[i + 1 :, i] = sims

    return sim_matrix
56
+
57
+
58
def molecular_network_from_list(
    smiless,
    activities,
    similarity_threshold,
    activity_threshold,
    similarity_method="tanimoto",
):
    """Build a similarity graph from parallel lists of SMILES and activities.

    Nodes are numbered by position and carry ``smiles`` and ``activity``
    attributes. An edge is kept for every pair whose similarity is not below
    ``similarity_threshold``; the similarity becomes the edge weight. The
    thresholds and the similarity settings are stored in ``G.graph``.
    """
    fingerprints = [smiles_to_ecfp4_bitvect(one_smiles) for one_smiles in smiless]
    similarities = compute_similarity_matrix(fingerprints, method=similarity_method)

    # Upper triangle only (no self loops, each pair once); weak pairs are zeroed
    # so that they do not become edges.
    adjacency = np.triu(similarities, k=1)
    too_dissimilar = adjacency < similarity_threshold
    adjacency[too_dissimilar] = 0
    G = nx.from_numpy_array(adjacency)

    attributes_by_node = {}
    for node, (one_smiles, activity) in enumerate(zip(smiless, activities)):
        attributes_by_node[node] = {"smiles": one_smiles, "activity": activity}
    nx.set_node_attributes(G, attributes_by_node)

    G.graph.update(
        {
            "activity_label": "activity",
            "activity_threshold": activity_threshold,
            "similarity_threshold": similarity_threshold,
            "similarity_fp": "2048bit ECFP4",
            "similarity_distance": similarity_method,
        }
    )
    return G
83
+
84
+
85
def df_to_ecfp4_molecular_network(
    df,
    smiles_col,
    activity_col,
    similarity_threshold,
    activity_threshold,
    similarity_method="tanimoto",
):
    """Build a similarity graph from two columns of a DataFrame.

    Nodes are numbered by row position and carry ``smiles`` and ``activity``
    attributes. An edge is kept for every pair whose similarity is not below
    ``similarity_threshold``; the similarity becomes the edge weight.

    Args:
        df: DataFrame holding the molecules.
        smiles_col: Name of the column with SMILES strings.
        activity_col: Name of the activity column; also used as the
            ``activity_label`` of the graph.
        similarity_threshold: Minimum similarity for an edge.
        activity_threshold: Stored in ``G.graph`` for later visualisation.
        similarity_method: Forwarded to ``compute_similarity_matrix``.
    """
    fps_bitvect = df[smiles_col].map(smiles_to_ecfp4_bitvect).tolist()
    sim_matrix = compute_similarity_matrix(fps_bitvect, similarity_method)

    # Upper triangle only (no self loops, each pair once); weak pairs are zeroed
    # so that they do not become edges.
    adj_matrix = np.triu(sim_matrix, k=1)
    adj_matrix[adj_matrix < similarity_threshold] = 0
    G = nx.from_numpy_array(adj_matrix)

    node_attrs = {
        n: {"smiles": smi, "activity": act}
        for n, (smi, act) in enumerate(zip(df[smiles_col], df[activity_col]))
    }
    nx.set_node_attributes(G, node_attrs)
    G.graph["activity_label"] = activity_col
    G.graph["activity_threshold"] = activity_threshold
    G.graph["similarity_threshold"] = similarity_threshold
    G.graph["similarity_fp"] = "2048bit ECFP4"
    # Report the method that was actually used. Capitalising keeps the previous
    # "Tanimoto" label for the default method.
    G.graph["similarity_distance"] = similarity_method.capitalize()
    return G
111
+
112
+
113
def visualise_molnet(G, filepath=None):
    """Draw the molecular network, colouring nodes by activity.

    Edges whose end points differ in activity by at least
    ``G.graph["activity_threshold"]`` are drawn in red; all other edges are
    grey, darker for a larger edge weight.

    Args:
        G: Graph with ``activity`` node attributes, ``weight`` edge attributes
            and the metadata stored in ``G.graph`` by the network builders.
        filepath: If given, the figure is also saved there. The figure is
            shown in either case.
    """
    fig, ax = plt.subplots(figsize=(12, 9))

    pos = nx.nx_agraph.graphviz_layout(G, prog="sfdp")

    edge_colors = []
    for u, v in G.edges():
        activity_gap = abs(G.nodes[u]["activity"] - G.nodes[v]["activity"])
        # ">=" matches the cliff definition used by the splitters, so a gap
        # exactly at the threshold is highlighted as well.
        if activity_gap >= G.graph["activity_threshold"]:
            edge_colors.append((1, 0, 0, 1))
        else:
            w = G.edges[u, v]["weight"]
            edge_colors.append((1 - w, 1 - w, 1 - w, 0.6))

    node_colors = [G.nodes[n]["activity"] for n in G.nodes()]

    nx.draw_networkx_edges(G, pos, edge_color=edge_colors, width=0.8, ax=ax)
    nodes = nx.draw_networkx_nodes(
        G,
        pos,
        node_color=node_colors,
        cmap=plt.cm.Greys,
        node_size=40,
        linewidths=0,
        ax=ax,
    )

    cbar = fig.colorbar(nodes, ax=ax)
    cbar.set_label(G.graph["activity_label"])
    ax.axis("off")
    plt.title(
        f"Molecular Network with {G.number_of_nodes()} molecules & {G.number_of_edges()} edges made using \n"
        f"Similarity Threshold of {G.graph['similarity_threshold']} over {G.graph['similarity_fp']} fingerprints "
        f"using {G.graph['similarity_distance']} Similarity"
    )
    if filepath:
        plt.tight_layout()
        plt.savefig(filepath)
    plt.show()
152
+
153
+
154
def visualise_molnet_split(
    G, train_idx, test_idx, effective_bias, intended_bias, filepath=None, cliff=True
):
    """Draw the molecular network with the train/test split highlighted.

    Nodes are coloured by activity; test nodes get a coloured outline. When
    ``cliff`` is true, edges whose end points differ in activity by at least
    ``G.graph["activity_threshold"]`` are drawn in the cliff colour.

    Args:
        G: Graph with ``activity`` node attributes, ``weight`` edge attributes
            and the metadata stored in ``G.graph`` by the network builders.
        train_idx: Positions (into ``list(G.nodes())``) of training molecules.
        test_idx: Positions of test molecules.
        effective_bias: Achieved bias, shown in the caption.
        intended_bias: Requested bias, shown in the caption.
        filepath: If given, save the figure there and close it; otherwise show it.
        cliff: Whether to highlight activity-cliff edges.
    """
    CLIFF = (0.65, 0.15, 0.20)
    TEST = (0.20, 0.40, 0.60)

    # The layout is cached on the graph so repeated calls (e.g. animation
    # frames) reuse the same node positions.
    if "_pos" not in G.graph:
        G.graph["_pos"] = nx.nx_agraph.graphviz_layout(
            G, prog="sfdp", args="-Goverlap=false -GK=1.5"
        )
    pos = G.graph["_pos"]

    fig, ax = plt.subplots(figsize=(10, 9))

    edge_colors = []
    for u, v in G.edges():
        # ">=" matches the cliff definition used by the splitters, so a gap
        # exactly at the threshold is highlighted as well.
        is_cliff_edge = (
            cliff
            and abs(G.nodes[u]["activity"] - G.nodes[v]["activity"])
            >= G.graph["activity_threshold"]
        )
        if is_cliff_edge:
            edge_colors.append(CLIFF + (0.9,))
        else:
            w = G.edges[u, v]["weight"]
            edge_colors.append((1 - w, 1 - w, 1 - w, 0.4))
    nx.draw_networkx_edges(G, pos, edge_color=edge_colors, width=0.5, ax=ax)

    nodes_array = np.array(list(G.nodes()))
    train_nodes = nodes_array[train_idx]
    test_nodes = nodes_array[test_idx]
    activities = np.array([G.nodes[n]["activity"] for n in G.nodes()])
    vmin, vmax = activities.min(), activities.max()
    # Truncated grey scale: skips the lightest part of the "Greys" colormap.
    cmap = LinearSegmentedColormap.from_list(
        "greys_trunc", plt.cm.Greys(np.linspace(0.35, 0.95, 256))
    )

    # Both node groups share one colour scale so that colours are comparable.
    shared_node_style = dict(cmap=cmap, vmin=vmin, vmax=vmax, node_size=22, ax=ax)
    nx.draw_networkx_nodes(
        G,
        pos,
        nodelist=train_nodes,
        node_color=[G.nodes[n]["activity"] for n in train_nodes],
        linewidths=0,
        **shared_node_style,
    )
    nodes = nx.draw_networkx_nodes(
        G,
        pos,
        nodelist=test_nodes,
        node_color=[G.nodes[n]["activity"] for n in test_nodes],
        linewidths=0.9,
        edgecolors=TEST,
        **shared_node_style,
    )

    cbar = fig.colorbar(nodes, ax=ax, fraction=0.018, pad=0.02, aspect=40)
    cbar.set_label(G.graph["activity_label"], fontsize=9, labelpad=6)
    cbar.ax.tick_params(labelsize=8, length=2)
    cbar.locator = MaxNLocator(nbins=4)
    cbar.update_ticks()

    handles = [
        Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor="0.5",
            markersize=6,
            label="train",
        ),
        Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor="0.5",
            markeredgecolor=TEST,
            markeredgewidth=0.9,
            markersize=6,
            label="test",
        ),
    ]
    if cliff:
        handles.append(
            Line2D([0], [0], color=CLIFF, linewidth=1.2, label="activity cliff"),
        )
    ax.legend(
        handles=handles,
        loc="upper center",
        bbox_to_anchor=(0.5, 1.02),
        frameon=False,
        ncol=3,
        fontsize=9,
        handletextpad=1.0,
        columnspacing=1.0,
    )

    caption = (
        f"{G.number_of_nodes()} molecules, {G.number_of_edges()} edges. "
        f"{G.graph['similarity_fp']}, {G.graph['similarity_distance']} ≥ {G.graph['similarity_threshold']}. "
        f"intended bias {intended_bias:.2f}, effective bias {effective_bias:.2f}"
    )
    fig.text(0.5, 0.045, caption, ha="center", fontsize=8, color="0.4")
    ax.axis("off")

    if filepath:
        plt.savefig(filepath, dpi=200, bbox_inches="tight")
        plt.close(fig)
    else:
        plt.show()
@@ -0,0 +1,251 @@
1
+ import os
2
+ import tempfile
3
+
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ from matplotlib.lines import Line2D
7
+ from matplotlib.patches import Patch
8
+ from PIL import Image
9
+ from scipy.stats import gaussian_kde
10
+
11
# States a molecule can have in the assignment array while a split is built
# (0 = unassigned, 1 = train, 2 = test).
UNASSIGNED_NODE, TRAIN_NODE, TEST_NODE = range(3)
14
+
15
+
16
def visualise_proxy_split(
    proxy_values,
    train_idx,
    test_idx,
    ideal_range_min,
    ideal_range_max,
    effective_bias,
    intended_bias,
    proxy_label="proxy",
    x_range=None,
    filepath=None,
):
    """Plot the proxy-value densities of the train and test sets.

    Both densities are Gaussian kernel density estimates evaluated on a
    500-point grid; the ideal range is shown as a shaded band.

    Args:
        proxy_values: One proxy value per molecule.
        train_idx: Indices of training molecules.
        test_idx: Indices of test molecules.
        ideal_range_min: Lower edge of the shaded ideal range.
        ideal_range_max: Upper edge of the shaded ideal range.
        effective_bias: Achieved bias, shown in the caption.
        intended_bias: Requested bias, shown in the caption.
        proxy_label: Label of the x axis.
        x_range: ``(min, max)`` of the x axis; defaults to the data range
            padded by 5 % on each side.
        filepath: If given, save the figure there and close it; otherwise show it.
    """
    TEST = (0.20, 0.40, 0.60)
    TRAIN = (0.5, 0.5, 0.5)
    IDEAL = (0.65, 0.15, 0.20)

    proxy_values = np.asarray(proxy_values, dtype=float)

    fig, ax = plt.subplots(figsize=(10, 5))

    train_values = proxy_values[train_idx]
    test_values = proxy_values[test_idx]

    if x_range is None:
        x_min, x_max = float(proxy_values.min()), float(proxy_values.max())
        pad = (x_max - x_min) * 0.05
        x_range = (x_min - pad, x_max + pad)

    x = np.linspace(x_range[0], x_range[1], 500)

    def density_on_grid(values):
        # gaussian_kde cannot be fitted to fewer than two points or to points
        # without any spread; draw a flat zero density in that case instead of
        # aborting the whole figure.
        if len(values) < 2 or np.ptp(values) == 0:
            return np.zeros_like(x)
        return gaussian_kde(values)(x)

    train_density = density_on_grid(train_values)
    test_density = density_on_grid(test_values)

    ax.axvspan(ideal_range_min, ideal_range_max, color=IDEAL, alpha=0.10, linewidth=0)
    ax.fill_between(x, train_density, color=TRAIN, alpha=0.35, linewidth=0)
    ax.fill_between(x, test_density, color=TEST, alpha=0.45, linewidth=0)
    ax.plot(x, train_density, color=TRAIN, linewidth=1)
    ax.plot(x, test_density, color=TEST, linewidth=1)

    ax.set_xlabel(proxy_label)
    ax.set_ylabel("density")
    ax.set_xlim(x_range)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    handles = [
        Line2D([0], [0], color=TRAIN, linewidth=2, label="train"),
        Line2D([0], [0], color=TEST, linewidth=2, label="test"),
        Patch(facecolor=IDEAL, alpha=0.30, label="ideal range"),
    ]
    ax.legend(
        handles=handles,
        loc="upper center",
        bbox_to_anchor=(0.5, 1.05),
        frameon=False,
        ncol=3,
        fontsize=9,
    )

    caption = (
        f"{len(proxy_values)} molecules ({len(train_idx)} train, {len(test_idx)} test). "
        f"ideal range [{ideal_range_min}, {ideal_range_max}]. "
        f"intended bias {intended_bias:.2f}, effective bias {effective_bias:.2f}"
    )
    fig.text(0.5, 0.00, caption, ha="center", fontsize=8, color="0.4")

    if filepath:
        plt.savefig(filepath, dpi=200, bbox_inches="tight")
        plt.close(fig)
    else:
        plt.show()
86
+
87
+
88
class ProxySortedSplitter:
    """Train/test splitter that steers how many test molecules lie in an ideal proxy range.

    ``proxy_function`` maps a SMILES string to a number. A molecule is "in
    range" when that number lies in ``[ideal_range_min, ideal_range_max]``
    (both ends inclusive).
    """

    def __init__(
        self, proxy_function, ideal_range_min, ideal_range_max, test_fraction=0.2
    ):
        """Store the proxy function, the ideal range and the test-set fraction."""
        self.proxy_function = proxy_function
        self.ideal_range_min = ideal_range_min
        self.ideal_range_max = ideal_range_max
        self.test_fraction = test_fraction

    def split_for_intended_bias(
        self, smiless, proxy_values, activity_values, intended_bias, random_seed
    ):
        """Build one split that aims for ``intended_bias`` in-range molecules in test.

        Args:
            smiless: Sequence of molecules; only its length is used here.
            proxy_values: One proxy value per molecule (list or array).
            activity_values: Not used by this splitter; kept so all splitters
                share a comparable signature.
            intended_bias: Targeted fraction of the test set, in ``[0, 1]``.
            random_seed: Seed for ``numpy.random.default_rng``.

        Returns:
            ``(train_indices, test_indices, effective_bias)``.

        Raises:
            ValueError: If ``intended_bias`` is outside ``[0, 1]``.
        """
        if not (0.0 <= intended_bias <= 1.0):
            raise ValueError(f"intended_bias must be in [0, 1], got {intended_bias}")

        # Array form is required for the element-wise range comparison.
        proxy_values = np.asarray(proxy_values, dtype=float)

        rng = np.random.default_rng(random_seed)
        n_molecules = len(smiless)
        target_test_size = int(self.test_fraction * n_molecules)
        n_in_range_test_target = int(intended_bias * target_test_size)

        in_range_mask = self.find_in_range_mask(
            proxy_values, self.ideal_range_min, self.ideal_range_max
        )

        assignment = self.walk_in_range_molecules(
            in_range_mask, n_molecules, n_in_range_test_target, rng
        )

        unassigned_indices = np.where(assignment == UNASSIGNED_NODE)[0]
        unassigned_in_range = in_range_mask[unassigned_indices]
        unassigned_out_of_range_indices = unassigned_indices[~unassigned_in_range]
        unassigned_in_range_indices = unassigned_indices[unassigned_in_range]

        n_random_fill = target_test_size - int((assignment == TEST_NODE).sum())
        if n_random_fill > 0:
            if len(unassigned_out_of_range_indices) >= n_random_fill:
                random_test_indices = rng.choice(
                    unassigned_out_of_range_indices, size=n_random_fill, replace=False
                )
            else:
                # Not enough out-of-range molecules: take them all and top up
                # with unassigned in-range molecules.
                shortfall = n_random_fill - len(unassigned_out_of_range_indices)
                in_range_topup_indices = rng.choice(
                    unassigned_in_range_indices,
                    size=min(shortfall, len(unassigned_in_range_indices)),
                    replace=False,
                )
                random_test_indices = np.concatenate(
                    [unassigned_out_of_range_indices, in_range_topup_indices]
                )
            assignment[random_test_indices] = TEST_NODE

        # Everything still unassigned becomes training data.
        assignment[assignment == UNASSIGNED_NODE] = TRAIN_NODE

        train_indices = np.where(assignment == TRAIN_NODE)[0]
        test_indices = np.where(assignment == TEST_NODE)[0]

        question_results = self.evaluate_proxy_question(
            test_indices, proxy_values, self.ideal_range_min, self.ideal_range_max
        )
        effective_bias = self.effective_bias_from_question_results(question_results)

        return train_indices, test_indices, effective_bias

    def split(self, smiless, activity_values, intended_biases, n_repeats):
        """Yield one split per (intended bias, repeat) combination.

        Proxy values are computed once; the repeat index doubles as the random
        seed of each split.

        Yields:
            ``(train_indices, test_indices, effective_bias, intended_bias,
            repeat_index)``.
        """
        proxy_values = np.array([self.proxy_function(s) for s in smiless], dtype=float)
        for intended_bias in intended_biases:
            for repeat_index in range(n_repeats):
                train_indices, test_indices, effective_bias = (
                    self.split_for_intended_bias(
                        smiless,
                        proxy_values,
                        activity_values,
                        intended_bias,
                        repeat_index,
                    )
                )
                yield train_indices, test_indices, effective_bias, intended_bias, repeat_index

    @staticmethod
    def find_in_range_mask(proxy_values, ideal_range_min, ideal_range_max):
        """Return a boolean array marking values inside the inclusive ideal range."""
        proxy_values = np.asarray(proxy_values)
        return (proxy_values >= ideal_range_min) & (proxy_values <= ideal_range_max)

    @staticmethod
    def walk_in_range_molecules(
        in_range_mask, n_molecules, n_in_range_test_target, rng
    ):
        """Randomly place up to ``n_in_range_test_target`` in-range molecules in test.

        Returns:
            ``int8`` array of length ``n_molecules`` holding node states; all
            molecules not selected here stay unassigned.
        """
        assignment = np.full(n_molecules, UNASSIGNED_NODE, dtype=np.int8)
        in_range_indices = np.where(in_range_mask)[0]
        if n_in_range_test_target == 0 or len(in_range_indices) == 0:
            return assignment
        n_to_place = min(n_in_range_test_target, len(in_range_indices))
        selected = rng.choice(in_range_indices, size=n_to_place, replace=False)
        assignment[selected] = TEST_NODE
        return assignment

    @staticmethod
    def evaluate_proxy_question(
        test_indices, proxy_values, ideal_range_min, ideal_range_max
    ):
        """Answer, per test molecule, "is its proxy value inside the ideal range?".

        Returns:
            Float array aligned with ``test_indices`` (1.0 in range, 0.0
            otherwise); an empty array when there are no test molecules.
        """
        if len(test_indices) == 0:
            return np.array([], dtype=float)
        test_proxy = np.asarray(proxy_values)[test_indices]
        in_range = (test_proxy >= ideal_range_min) & (test_proxy <= ideal_range_max)
        return in_range.astype(float)

    @staticmethod
    def effective_bias_from_question_results(question_results):
        """Return the mean of the per-test-molecule answers (0.0 if there are none)."""
        if question_results.size == 0:
            return 0.0
        return float(question_results.mean())

    def visualise_splits(
        self,
        smiless,
        activity_values,
        intended_biases,
        n_repeats,
        output_path,
        duration=500,
        proxy_label="proxy",
    ):
        """Render every split as one frame and save all frames to ``output_path``.

        All frames share one x range derived from the full set of proxy
        values. Frames are written as PNG files into a temporary directory and
        then combined with Pillow (``save_all=True``, endless loop).

        Args:
            duration: Passed to Pillow as the per-frame ``duration``.
            proxy_label: Label of the x axis of every frame.

        Raises:
            ValueError: If no split is produced (empty ``intended_biases`` or
                ``n_repeats < 1``).
        """
        proxy_values = np.array([self.proxy_function(s) for s in smiless], dtype=float)
        x_min, x_max = float(proxy_values.min()), float(proxy_values.max())
        pad = (x_max - x_min) * 0.05
        x_range = (x_min - pad, x_max + pad)

        with tempfile.TemporaryDirectory() as tmpdir:
            paths = []
            for intended_bias in intended_biases:
                for repeat_index in range(n_repeats):
                    train_idx, test_idx, effective_bias = self.split_for_intended_bias(
                        smiless,
                        proxy_values,
                        activity_values,
                        intended_bias,
                        repeat_index,
                    )
                    # The running frame number equals the number of frames
                    # written so far.
                    p = os.path.join(tmpdir, f"frame_{len(paths):04d}.png")
                    visualise_proxy_split(
                        proxy_values,
                        train_idx,
                        test_idx,
                        self.ideal_range_min,
                        self.ideal_range_max,
                        effective_bias,
                        intended_bias,
                        proxy_label=proxy_label,
                        x_range=x_range,
                        filepath=p,
                    )
                    paths.append(p)

            if not paths:
                raise ValueError(
                    "no frames to save: intended_biases is empty or n_repeats < 1"
                )

            frames = []
            try:
                for p in paths:
                    frames.append(Image.open(p))
                frames[0].save(
                    output_path,
                    save_all=True,
                    append_images=frames[1:],
                    duration=duration,
                    loop=0,
                )
            finally:
                # Release the file handles before the temporary directory is
                # removed (open handles block deletion on some platforms).
                for frame in frames:
                    frame.close()
@@ -0,0 +1,185 @@
1
+ import os
2
+ import tempfile
3
+
4
+ import numpy as np
5
+ import networkx as nx
6
+ from PIL import Image
7
+
8
+ from biased_split.molecularnetwork import (
9
+ smiles_to_ecfp4_bitvect,
10
+ compute_similarity_matrix,
11
+ visualise_molnet_split,
12
+ )
13
+
14
# Integer labels written into the per-molecule `assignment` array while a split is built.
+ UNASSIGNED_NODE = 0  # initial label: molecule not yet placed in either subset
15
+ TRAIN_NODE = 1  # molecule placed in the training subset
16
+ TEST_NODE = 2  # molecule placed in the test subset
17
+
18
+
19
class SubstructureDistanceSplitter:
    """Train/test splitter biased towards structurally isolated test molecules.

    Molecules are linked whenever their pairwise similarity reaches
    ``similarity_threshold``. Whole connected components of that graph are
    moved into the test set until a budget derived from the intended bias is
    used up; the rest of the test set is filled at random.
    """

    def __init__(self, similarity_threshold, test_fraction=0.2):
        """Store the split configuration.

        Args:
            similarity_threshold: Similarity at or above which two molecules
                are connected in the similarity graph.
            test_fraction: Fraction of the dataset that should end up in the
                test set.
        """
        self.similarity_threshold = similarity_threshold
        self.test_fraction = test_fraction

    def split_for_intended_bias(
        self, smiless, tversky_matrix, activity_values, intended_bias, random_seed
    ):
        """Build one split for a given intended bias and random seed.

        Args:
            smiless: Sequence of SMILES strings; only its length is used here.
            tversky_matrix: Pairwise similarity matrix indexed by molecule
                (assumed square with one row/column per molecule — confirm
                against ``compute_similarity_matrix``).
            activity_values: Unused by this method; kept for a signature
                parallel to the other entry points.
            intended_bias: Fraction in ``[0, 1]`` of the target test size
                that should be filled with whole similarity components.
            random_seed: Seed for ``numpy.random.default_rng``.

        Returns:
            Tuple ``(train_indices, test_indices, effective_bias)`` where the
            index arrays come from ``numpy.where`` and ``effective_bias`` is
            the fraction of test molecules with no training neighbour at or
            above the similarity threshold.

        Raises:
            ValueError: If ``intended_bias`` lies outside ``[0, 1]``.
        """
        if not (0.0 <= intended_bias <= 1.0):
            raise ValueError(f"intended_bias must be in [0, 1], got {intended_bias}")

        rng = np.random.default_rng(random_seed)
        n_molecules = len(smiless)
        target_test_size = int(self.test_fraction * n_molecules)
        n_isolated_test_target = int(intended_bias * target_test_size)

        components = self.find_components(tversky_matrix, self.similarity_threshold)
        assignment = self.walk_components(
            components, n_molecules, n_isolated_test_target, rng
        )

        # Top the test set up to its target size with randomly chosen
        # molecules that no component placed there.
        unassigned_indices = np.where(assignment == UNASSIGNED_NODE)[0]
        n_random_fill = target_test_size - int((assignment == TEST_NODE).sum())
        if n_random_fill > 0 and len(unassigned_indices) > 0:
            n_to_sample = min(n_random_fill, len(unassigned_indices))
            random_test_indices = rng.choice(
                unassigned_indices, size=n_to_sample, replace=False
            )
            assignment[random_test_indices] = TEST_NODE

        # Everything still unassigned becomes training data.
        assignment[assignment == UNASSIGNED_NODE] = TRAIN_NODE

        train_indices = np.where(assignment == TRAIN_NODE)[0]
        test_indices = np.where(assignment == TEST_NODE)[0]

        question_results = self.evaluate_substructure_question(
            test_indices, train_indices, tversky_matrix, self.similarity_threshold
        )
        effective_bias = self.effective_bias_from_question_results(question_results)

        return train_indices, test_indices, effective_bias

    def split(self, smiless, activity_values, intended_biases, n_repeats):
        """Yield splits for every intended bias and repeat.

        The similarity matrix is computed once from the SMILES and reused;
        the repeat index doubles as the random seed of each split.

        Yields:
            Tuples ``(train_indices, test_indices, effective_bias,
            intended_bias, repeat_index)``.
        """
        fps_bitvect = [smiles_to_ecfp4_bitvect(s) for s in smiless]
        tversky_matrix = compute_similarity_matrix(fps_bitvect, method="tversky")
        for intended_bias in intended_biases:
            for repeat_index in range(n_repeats):
                train_indices, test_indices, effective_bias = (
                    self.split_for_intended_bias(
                        smiless,
                        tversky_matrix,
                        activity_values,
                        intended_bias,
                        repeat_index,
                    )
                )
                yield train_indices, test_indices, effective_bias, intended_bias, repeat_index

    @staticmethod
    def _similarity_graph(tversky_matrix, similarity_threshold):
        """Return the graph linking molecule pairs whose similarity reaches the threshold."""
        # np.triu returns a new array, so the caller's matrix is not modified.
        # Only the strict upper triangle is considered: each pair once, no
        # self-loops.
        adj_matrix = np.triu(tversky_matrix, k=1)
        adj_matrix[adj_matrix < similarity_threshold] = 0
        return nx.from_numpy_array(adj_matrix)

    @staticmethod
    def find_components(tversky_matrix, similarity_threshold):
        """Return the connected components of the similarity graph, largest first.

        Returns:
            List of sets of molecule indices, sorted by descending size.
        """
        similarity_graph = SubstructureDistanceSplitter._similarity_graph(
            tversky_matrix, similarity_threshold
        )
        return sorted(nx.connected_components(similarity_graph), key=len, reverse=True)

    @staticmethod
    def walk_components(components, n_molecules, n_isolated_test_target, rng):
        """Greedily move whole components into the test set within a budget.

        While any unused component fits in the remaining budget, one of the
        largest fitting components is picked (ties broken with ``rng``), all
        its molecules are labelled ``TEST_NODE`` and its size is subtracted
        from the budget.

        Args:
            components: Iterable of collections of molecule indices.
            n_molecules: Length of the assignment array to create.
            n_isolated_test_target: Maximum number of molecules to place.
            rng: ``numpy.random.Generator`` used for tie-breaking.

        Returns:
            ``int8`` array of length ``n_molecules`` holding ``TEST_NODE``
            for placed molecules and ``UNASSIGNED_NODE`` elsewhere.
        """
        assignment = np.full(n_molecules, UNASSIGNED_NODE, dtype=np.int8)
        remaining_budget = n_isolated_test_target
        unused_components = list(components)
        while True:
            fitting = [c for c in unused_components if len(c) <= remaining_budget]
            if not fitting:
                break
            max_size = max(len(c) for c in fitting)
            largest = [c for c in fitting if len(c) == max_size]
            chosen = largest[int(rng.integers(len(largest)))]
            for molecule_index in chosen:
                assignment[molecule_index] = TEST_NODE
            unused_components.remove(chosen)
            remaining_budget -= len(chosen)
        return assignment

    @staticmethod
    def evaluate_substructure_question(
        test_indices, train_indices, tversky_matrix, similarity_threshold
    ):
        """Flag test molecules that have no sufficiently similar training molecule.

        Returns:
            Float array with one entry per test index: 1.0 when the highest
            similarity (row = test molecule, column = training molecule) is
            below ``similarity_threshold``, otherwise 0.0. Empty when there
            are no test indices; all ones when there are no training indices.
        """
        if len(test_indices) == 0:
            return np.array([], dtype=float)
        if len(train_indices) == 0:
            # With nothing to compare against, every test molecule is isolated.
            return np.ones(len(test_indices), dtype=float)
        similarity_test_vs_train = tversky_matrix[np.ix_(test_indices, train_indices)]
        max_train_similarity = similarity_test_vs_train.max(axis=1)
        is_isolated = max_train_similarity < similarity_threshold
        return is_isolated.astype(float)

    @staticmethod
    def effective_bias_from_question_results(question_results):
        """Return the mean of the question results, or 0.0 for an empty array."""
        if question_results.size == 0:
            return 0.0
        return float(question_results.mean())

    @staticmethod
    def build_visualization_network(
        smiless, activity_values, tversky_matrix, similarity_threshold
    ):
        """Build the annotated similarity graph consumed by the plotting helper.

        Nodes carry ``smiles`` and ``activity`` attributes; graph-level
        metadata describes how the similarity was obtained.

        Returns:
            A networkx graph with one node per row of ``tversky_matrix``.
        """
        G = SubstructureDistanceSplitter._similarity_graph(
            tversky_matrix, similarity_threshold
        )
        node_attrs = {
            n: {"smiles": smi, "activity": act}
            for n, (smi, act) in enumerate(zip(smiless, activity_values))
        }
        nx.set_node_attributes(G, node_attrs)
        G.graph["activity_label"] = "activity"
        # NOTE(review): an infinite threshold presumably disables any
        # activity-difference highlighting in the plotting helper — confirm.
        G.graph["activity_threshold"] = np.inf
        G.graph["similarity_threshold"] = similarity_threshold
        G.graph["similarity_fp"] = "2048bit ECFP4"
        G.graph["similarity_distance"] = "tversky"
        return G

    def visualise_splits(
        self,
        smiless,
        activity_values,
        intended_biases,
        n_repeats,
        output_path,
        duration=500,
    ):
        """Render one frame per (intended bias, repeat) split and save an animation.

        Each split is drawn onto the similarity network with
        ``visualise_molnet_split`` into a temporary PNG; the PNGs are then
        combined into one multi-frame image written to ``output_path``.

        Args:
            smiless: Sequence of SMILES strings.
            activity_values: Per-molecule activity values (node attributes).
            intended_biases: Iterable of intended bias values to visualise.
            n_repeats: Number of repeats (seeds ``0 .. n_repeats - 1``) per bias.
            output_path: Destination handed to Pillow's ``Image.save``.
            duration: Forwarded to Pillow as the ``duration`` save option.

        Raises:
            ValueError: If ``intended_biases`` is empty or ``n_repeats < 1``,
                i.e. no frame would be produced.
        """
        # Materialise once so generators work and emptiness can be checked
        # before the expensive fingerprint / similarity computation.
        intended_biases = list(intended_biases)
        if not intended_biases or n_repeats < 1:
            raise ValueError(
                "visualise_splits needs at least one intended bias and "
                "n_repeats >= 1 to produce a frame"
            )

        fps_bitvect = [smiles_to_ecfp4_bitvect(s) for s in smiless]
        tversky_matrix = compute_similarity_matrix(fps_bitvect, method="tversky")
        G = self.build_visualization_network(
            smiless, activity_values, tversky_matrix, self.similarity_threshold
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            paths = []
            for intended_bias in intended_biases:
                for repeat_index in range(n_repeats):
                    train_idx, test_idx, effective_bias = self.split_for_intended_bias(
                        smiless,
                        tversky_matrix,
                        activity_values,
                        intended_bias,
                        repeat_index,
                    )
                    p = os.path.join(tmpdir, f"frame_{len(paths):04d}.png")
                    visualise_molnet_split(
                        G,
                        train_idx,
                        test_idx,
                        effective_bias,
                        intended_bias,
                        filepath=p,
                        cliff=False,
                    )
                    paths.append(p)

            # Image.open keeps the file open; close every frame explicitly so
            # the temporary directory can be removed cleanly afterwards.
            frames = []
            try:
                for p in paths:
                    frames.append(Image.open(p))
                frames[0].save(
                    output_path,
                    save_all=True,
                    append_images=frames[1:],
                    duration=duration,
                    loop=0,
                )
            finally:
                for frame in frames:
                    frame.close()
@@ -0,0 +1,26 @@
1
+ Metadata-Version: 2.4
2
+ Name: biased-split
3
+ Version: 0.1.0
4
+ Summary: Biased Data Splitting Method for Chemically Meaningful Model Validation
5
+ Requires-Python: >=3.13
6
+ Requires-Dist: matplotlib>=3.11.0
7
+ Requires-Dist: networkx>=3.6.1
8
+ Requires-Dist: numpy>=2.4.6
9
+ Requires-Dist: pandas>=3.0.3
10
+ Requires-Dist: pyarrow>=18.0
11
+ Requires-Dist: pygraphviz>=1.14
12
+ Requires-Dist: rdkit>=2026.3.3
13
+ Requires-Dist: scikit-learn>=1.9.0
14
+ Requires-Dist: scipy>=1.17.1
15
+ Requires-Dist: statsmodels>=0.14
16
+ Requires-Dist: xgboost>=2.0
17
+ Provides-Extra: benchmark
18
+ Requires-Dist: chemprop>=2.0; extra == 'benchmark'
19
+ Requires-Dist: lightning>=2.0; extra == 'benchmark'
20
+ Requires-Dist: torch>=2.0; extra == 'benchmark'
21
+ Provides-Extra: notebook
22
+ Requires-Dist: ipykernel>=7.3.0; extra == 'notebook'
23
+ Requires-Dist: notebook>=7.6.0; extra == 'notebook'
24
+ Description-Content-Type: text/markdown
25
+
26
+ # Chemically Meaningful Model Validation using Biased Data Splits
@@ -0,0 +1,9 @@
1
+ biased_split/__init__.py,sha256=M8pdc583gVtikoF8LyT39zdtLK_INAARXoJgQZTGjhE,912
2
+ biased_split/activity_cliff.py,sha256=2kMoE0DR8Sljj8FAC6qtg9RwtU99DYw_7UacK6RH_OM,11091
3
+ biased_split/knn_failure.py,sha256=2ObfaM2_mAdZ5LU7E0sec7j8PUPwoeK-Tx7M88c936c,8828
4
+ biased_split/molecularnetwork.py,sha256=m3wt129wtMR1kDZ8u9yvkepYS3fXeq8MEzTIGmahZDY,8225
5
+ biased_split/proxy_sorted.py,sha256=v11GJE2g2PiNxJcil4m0m8Id-PFV5QDKVHz8_vXy2CQ,9012
6
+ biased_split/substructure_distance.py,sha256=81iT8r7-sQahZ3MpGwIWZTxVIFtxhuQuElYweHGnnEI,7251
7
+ biased_split-0.1.0.dist-info/METADATA,sha256=YXALQfioGmI8sZM_5xn2TPOimuMaxuoMUo_y8Hq2e3Q,913
8
+ biased_split-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
+ biased_split-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any