PyPI - biased-split - Versions diffs - 0.1.0__py3-none-any.whl - Mend

biased-split 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

biased_split/__init__.py +29 -0
biased_split/activity_cliff.py +294 -0
biased_split/knn_failure.py +231 -0
biased_split/molecularnetwork.py +271 -0
biased_split/proxy_sorted.py +251 -0
biased_split/substructure_distance.py +185 -0
biased_split-0.1.0.dist-info/METADATA +26 -0
biased_split-0.1.0.dist-info/RECORD +9 -0
biased_split-0.1.0.dist-info/WHEEL +4 -0

biased_split/__init__.py ADDED Viewed

@@ -0,0 +1,29 @@
+"""Biased Split for Chemically Meaningful Model Validation"""
+from biased_split.activity_cliff import ActivityCliffSplitter
+from biased_split.knn_failure import KNNFailureSplitter
+from biased_split.substructure_distance import SubstructureDistanceSplitter
+from biased_split.proxy_sorted import ProxySortedSplitter
+from biased_split.molecularnetwork import (
+    smiles_to_ecfp4_bitvect,
+    smiles_to_ecfp4_np,
+    compute_similarity_matrix,
+    molecular_network_from_list,
+    df_to_ecfp4_molecular_network,
+    visualise_molnet,
+    visualise_molnet_split,
+)
+# Public API of the package: the four splitter classes plus the
+# molecular-network helpers imported from biased_split.molecularnetwork above.
+__all__ = [
+    "ActivityCliffSplitter",
+    "KNNFailureSplitter",
+    "SubstructureDistanceSplitter",
+    "ProxySortedSplitter",
+    "smiles_to_ecfp4_bitvect",
+    "smiles_to_ecfp4_np",
+    "compute_similarity_matrix",
+    "molecular_network_from_list",
+    "df_to_ecfp4_molecular_network",
+    "visualise_molnet",
+    "visualise_molnet_split",
+]

biased_split/activity_cliff.py ADDED Viewed

@@ -0,0 +1,294 @@
+import os
+import tempfile
+from PIL import Image
+import numpy as np
+from biased_split.molecularnetwork import (
+    smiles_to_ecfp4_bitvect,
+    compute_similarity_matrix,
+    molecular_network_from_list,
+    visualise_molnet_split,
+)
+# State codes stored in the per-molecule ``assignment`` array built by
+# ActivityCliffSplitter.walk_cliff_edges.
+UNASSIGNED_NODE = 0  # molecule not placed in either set yet
+TRAIN_NODE = 1  # molecule placed in the training set
+TEST_NODE = 2  # molecule placed in the test set
+class ActivityCliffSplitter:
+    """Train/test splitter that steers activity-cliff pairs across the split boundary.
+    A cliff edge joins two molecules whose similarity is at least
+    ``similarity_threshold`` and whose activities differ by at least
+    ``activity_threshold``. The *bias* of a split is the fraction of test
+    molecules that have at least one cliff edge to a training molecule.
+    """
+    def __init__(
+        self,
+        similarity_threshold,
+        activity_threshold,
+        test_fraction=0.2,  # fraction of the whole dataset targeted for the test set
+    ):
+        """Store the two cliff thresholds and the targeted test-set fraction."""
+        self.similarity_threshold = similarity_threshold
+        self.activity_threshold = activity_threshold
+        self.test_fraction = test_fraction
+    def split_for_intended_bias(
+        self,
+        smiless,
+        activity_values,
+        similarity_matrix,
+        intended_bias,
+        random_seed,
+    ):
+        """Build one train/test split that tries to reach ``intended_bias``.
+        Args:
+            smiless: sequence of molecules; only its length is used here.
+            activity_values: one activity per molecule (any array-like).
+            similarity_matrix: square pairwise similarity matrix.
+            intended_bias: fraction in [0, 1] of the target test size that we
+                *try* to fill with cliff molecules. Depending on dataset and
+                parameters this may not be reachable, so the *effective* bias
+                is always measured and returned.
+            random_seed: seed for ``np.random.default_rng``.
+        Returns:
+            ``(train_indices, test_indices, effective_bias)``.
+        Raises:
+            ValueError: if ``intended_bias`` is outside [0, 1].
+        """
+        if not (0.0 <= intended_bias <= 1.0):
+            raise ValueError(f"intended_bias must be in [0, 1], got {intended_bias}")
+        # The evaluation step fancy-indexes these, so plain lists must be
+        # converted to arrays once up front.
+        activity_values = np.asarray(activity_values, dtype=float)
+        similarity_matrix = np.asarray(similarity_matrix)
+        rng = np.random.default_rng(random_seed)
+        n_molecules = len(smiless)
+        # int() truncates (int(2.9) == 2), i.e. it acts as a floor here.
+        target_test_size = int(self.test_fraction * n_molecules)
+        n_cliff_test_molecules = int(intended_bias * target_test_size)
+        # Each edge is (node idx1, node idx2, activity difference).
+        cliff_edges = self.find_cliff_edges(
+            similarity_matrix=similarity_matrix,
+            activity_values=activity_values,
+            similarity_threshold=self.similarity_threshold,
+            activity_threshold=self.activity_threshold,
+        )
+        # Edges are visited in random order rather than sorted by activity gap.
+        rng.shuffle(cliff_edges)
+        # Cliff degrees drive the "higher degree goes to train" heuristic.
+        cliff_degrees = self.compute_cliff_degrees(cliff_edges, n_molecules)
+        assignment = self.walk_cliff_edges(
+            cliff_edges=cliff_edges,
+            cliff_degrees=cliff_degrees,
+            n_molecules=n_molecules,
+            n_cliff_test_target=n_cliff_test_molecules,
+            rng=rng,
+        )
+        unassigned_indices = np.where(assignment == UNASSIGNED_NODE)[0]
+        unassigned_non_cliff_indices = unassigned_indices[
+            cliff_degrees[unassigned_indices] == 0
+        ]
+        unassigned_cliff_indices = unassigned_indices[
+            cliff_degrees[unassigned_indices] > 0
+        ]
+        # Fill the rest of the test set at random, preferring molecules that
+        # take part in no cliff edge; fall back to cliff molecules only when
+        # there are not enough of the former.
+        n_random_fill = target_test_size - int((assignment == TEST_NODE).sum())
+        if n_random_fill > 0:
+            if len(unassigned_non_cliff_indices) >= n_random_fill:
+                random_test_indices = rng.choice(
+                    unassigned_non_cliff_indices, size=n_random_fill, replace=False
+                )
+            else:
+                shortfall = n_random_fill - len(unassigned_non_cliff_indices)
+                cliff_topup_indices = rng.choice(
+                    unassigned_cliff_indices,
+                    size=min(shortfall, len(unassigned_cliff_indices)),
+                    replace=False,
+                )
+                random_test_indices = np.concatenate(
+                    [unassigned_non_cliff_indices, cliff_topup_indices]
+                )
+            assignment[random_test_indices] = TEST_NODE
+        # Everything still unassigned goes to training.
+        assignment[assignment == UNASSIGNED_NODE] = TRAIN_NODE
+        train_indices = np.where(assignment == TRAIN_NODE)[0]
+        test_indices = np.where(assignment == TEST_NODE)[0]
+        question_results = self.evaluate_cliff_question(
+            test_indices=test_indices,
+            train_indices=train_indices,
+            similarity_matrix=similarity_matrix,
+            activity_values=activity_values,
+            similarity_threshold=self.similarity_threshold,
+            activity_threshold=self.activity_threshold,
+        )
+        # The effective bias is measured on the final split, after the random fill.
+        effective_bias = self.effective_bias_from_question_results(question_results)
+        return train_indices, test_indices, effective_bias
+    def split(self, smiless, activity_values, intended_biases, n_repeats):
+        """Yield ``n_repeats`` splits for every value in ``intended_biases``.
+        The similarity matrix is computed once from the SMILES and reused.
+        ``repeat_index`` doubles as the random seed of each split.
+        Yields:
+            ``(train_indices, test_indices, effective_bias, intended_bias, repeat_index)``.
+        """
+        fps_bitvect = [smiles_to_ecfp4_bitvect(smiles) for smiles in smiless]
+        similarity_matrix = compute_similarity_matrix(fps_bitvect)
+        for intended_bias in intended_biases:
+            for repeat_index in range(n_repeats):
+                # Keyword arguments on purpose: the positional order of
+                # activity_values / similarity_matrix is easy to swap.
+                train_indices, test_indices, effective_bias = (
+                    self.split_for_intended_bias(
+                        smiless=smiless,
+                        activity_values=activity_values,
+                        similarity_matrix=similarity_matrix,
+                        intended_bias=intended_bias,
+                        random_seed=repeat_index,
+                    )
+                )
+                yield train_indices, test_indices, effective_bias, intended_bias, repeat_index
+    @staticmethod
+    def effective_bias_from_question_results(question_results):
+        """Return the mean of the per-test-molecule results, or 0.0 if there are none."""
+        if question_results.size == 0:
+            return 0.0
+        return float(question_results.mean())
+    @staticmethod
+    def evaluate_cliff_question(
+        test_indices,
+        train_indices,
+        similarity_matrix,
+        activity_values,
+        activity_threshold,
+        similarity_threshold,
+    ):
+        """Flag each test molecule that has a cliff edge to any train molecule.
+        Returns:
+            Float array aligned with ``test_indices``: 1.0 where the molecule
+            has at least one cliff partner in train, else 0.0. Empty when
+            ``test_indices`` is empty.
+        """
+        if len(test_indices) == 0:
+            return np.array([])
+        test_indices = np.asarray(test_indices, dtype=np.intp)
+        train_indices = np.asarray(train_indices, dtype=np.intp)
+        activity_values = np.asarray(activity_values, dtype=float)
+        similarity_matrix = np.asarray(similarity_matrix)
+        # similarity[i, j] = similarity between test molecule i and train molecule j
+        similarity_test_vs_train = similarity_matrix[
+            test_indices[:, None], train_indices
+        ]
+        # activity_diff[i, j] = |activity(test i) - activity(train j)|
+        activity_diff_test_vs_train = np.abs(
+            activity_values[test_indices][:, None] - activity_values[train_indices]
+        )
+        is_cliff_edge = (similarity_test_vs_train >= similarity_threshold) & (
+            activity_diff_test_vs_train >= activity_threshold
+        )
+        # A test molecule counts if it has at least one cliff edge to any train molecule.
+        test_molecule_has_cliff_partner = is_cliff_edge.any(axis=1)
+        return test_molecule_has_cliff_partner.astype(float)
+    @staticmethod
+    def find_cliff_edges(
+        similarity_matrix,
+        activity_values,
+        similarity_threshold,
+        activity_threshold,
+    ):
+        """List every cliff edge ``(i, j, activity_difference)`` with ``i < j``.
+        Edges are returned in row-major order (ascending ``i``, then ascending
+        ``j``); only the upper triangle is scanned because the matrix is symmetric.
+        """
+        activities = np.asarray(activity_values, dtype=float)
+        similarities = np.asarray(similarity_matrix)
+        n = len(activities)
+        cliff_edges = []
+        for i in range(n - 1):
+            # Compare molecule i against all later molecules in one vector step.
+            differences = np.abs(activities[i] - activities[i + 1 :])
+            is_cliff = (similarities[i, i + 1 :] >= similarity_threshold) & (
+                differences >= activity_threshold
+            )
+            for offset in np.nonzero(is_cliff)[0]:
+                cliff_edges.append(
+                    (i, int(i + 1 + offset), float(differences[offset]))
+                )
+        return cliff_edges
+    @staticmethod
+    def compute_cliff_degrees(
+        cliff_edges,  # (node idx1, node idx2, activity_difference) tuples
+        n_molecules,
+    ):
+        """Count, for every molecule, how many cliff edges it takes part in."""
+        degrees = np.zeros(n_molecules, dtype=int)
+        for mol_a, mol_b, _ in cliff_edges:
+            degrees[mol_a] += 1
+            degrees[mol_b] += 1
+        return degrees
+    @staticmethod
+    def walk_cliff_edges(
+        cliff_edges, cliff_degrees, n_molecules, n_cliff_test_target, rng
+    ):
+        """Assign cliff molecules to train/test by walking the edges in order.
+        Stops as soon as ``n_cliff_test_target`` molecules have been placed in
+        test. ``rng`` is only consulted to break ties between two unassigned
+        molecules of equal cliff degree, which keeps the walk reproducible.
+        Returns:
+            int8 array of length ``n_molecules`` holding UNASSIGNED_NODE,
+            TRAIN_NODE or TEST_NODE per molecule.
+        """
+        assignment = np.full(n_molecules, UNASSIGNED_NODE, dtype=np.int8)
+        n_cliff_test_placed = 0
+        for mol_a, mol_b, _ in cliff_edges:
+            if n_cliff_test_placed >= n_cliff_test_target:
+                break
+            status_a = assignment[mol_a]
+            status_b = assignment[mol_b]
+            if status_a == UNASSIGNED_NODE and status_b == UNASSIGNED_NODE:
+                # Higher cliff-degree molecule goes to train.
+                if cliff_degrees[mol_a] > cliff_degrees[mol_b]:
+                    train_molecule, test_molecule = mol_a, mol_b
+                elif cliff_degrees[mol_b] > cliff_degrees[mol_a]:
+                    train_molecule, test_molecule = mol_b, mol_a
+                else:
+                    # Equal cliff degree: pick at random.
+                    if rng.random() < 0.5:
+                        train_molecule, test_molecule = mol_a, mol_b
+                    else:
+                        train_molecule, test_molecule = mol_b, mol_a
+                assignment[train_molecule] = TRAIN_NODE
+                assignment[test_molecule] = TEST_NODE
+                n_cliff_test_placed += 1
+            elif status_a == TRAIN_NODE and status_b == UNASSIGNED_NODE:
+                # Unassigned partner of a train molecule goes to test.
+                assignment[mol_b] = TEST_NODE
+                n_cliff_test_placed += 1
+            elif status_b == TRAIN_NODE and status_a == UNASSIGNED_NODE:
+                # Same as above with roles swapped.
+                assignment[mol_a] = TEST_NODE
+                n_cliff_test_placed += 1
+            elif status_a == TEST_NODE and status_b == UNASSIGNED_NODE:
+                # Unassigned partner of a test molecule goes to train.
+                assignment[mol_b] = TRAIN_NODE
+            elif status_b == TEST_NODE and status_a == UNASSIGNED_NODE:
+                # Same as above with roles swapped.
+                assignment[mol_a] = TRAIN_NODE
+            # If both are already assigned, there is nothing to do for this edge.
+        return assignment
+    def visualise_splits(
+        self,
+        smiless,
+        activity_values,
+        intended_biases,
+        n_repeats,
+        output_path,
+        duration=500,
+    ):
+        """Render every split from ``split`` as one frame and save them as one animation.
+        Frames are drawn into a temporary directory and combined into
+        ``output_path`` with ``duration`` per frame, looping forever.
+        Raises:
+            ValueError: if ``split`` produced no split, so there is nothing to save.
+        """
+        G = molecular_network_from_list(
+            smiless, activity_values, self.similarity_threshold, self.activity_threshold
+        )
+        with tempfile.TemporaryDirectory() as tmpdir:
+            paths = []
+            for frame_index, (
+                train_idx,
+                test_idx,
+                effective_bias,
+                intended_bias,
+                _,
+            ) in enumerate(
+                self.split(smiless, activity_values, intended_biases, n_repeats)
+            ):
+                p = os.path.join(tmpdir, f"frame_{frame_index:04d}.png")
+                visualise_molnet_split(
+                    G, train_idx, test_idx, effective_bias, intended_bias, filepath=p
+                )
+                paths.append(p)
+            if not paths:
+                raise ValueError(
+                    "no splits were generated; intended_biases and n_repeats must be non-empty"
+                )
+            frames = [Image.open(p) for p in paths]
+            try:
+                frames[0].save(
+                    output_path,
+                    save_all=True,
+                    append_images=frames[1:],
+                    duration=duration,
+                    loop=0,
+                )
+            finally:
+                # Release the file handles before the temporary directory is removed.
+                for frame in frames:
+                    frame.close()

biased_split/knn_failure.py ADDED Viewed

@@ -0,0 +1,231 @@
+import os
+import tempfile
+import numpy as np
+from PIL import Image
+from biased_split.molecularnetwork import (
+    smiles_to_ecfp4_bitvect,
+    compute_similarity_matrix,
+    molecular_network_from_list,
+    visualise_molnet_split,
+)
+# State codes stored in the per-molecule ``assignment`` array built by
+# KNNFailureSplitter.walk_failure_n_edges.
+UNASSIGNED_NODE = 0  # molecule not placed in either set yet
+TRAIN_NODE = 1  # molecule placed in the training set
+TEST_NODE = 2  # molecule placed in the test set
+class KNNFailureSplitter:
+    """Train/test splitter that enriches the test set in k-NN "failure" molecules.
+    A molecule is a failure candidate when it has at least ``n_neighbors``
+    neighbours with similarity >= ``similarity_threshold`` and the mean
+    activity of its ``n_neighbors`` most similar ones differs from its own
+    activity by at least ``activity_threshold``.
+    """
+    def __init__(
+        self, similarity_threshold, activity_threshold, n_neighbors, test_fraction=0.2
+    ):
+        """Store the thresholds, the neighbourhood size and the targeted test fraction."""
+        self.similarity_threshold = similarity_threshold
+        self.activity_threshold = activity_threshold
+        self.n_neighbors = n_neighbors
+        self.test_fraction = test_fraction
+    def split_for_intended_bias(
+        self, smiless, similarity_matrix, activity_values, intended_bias, random_seed
+    ):
+        """Build one train/test split that tries to reach ``intended_bias``.
+        Args:
+            smiless: sequence of molecules; only its length is used here.
+            similarity_matrix: square pairwise similarity matrix.
+            activity_values: one activity per molecule (any array-like).
+            intended_bias: fraction in [0, 1] of the target test size that we
+                try to fill with failure candidates.
+            random_seed: seed for ``np.random.default_rng``.
+        Returns:
+            ``(train_indices, test_indices, effective_bias)``.
+        Raises:
+            ValueError: if ``intended_bias`` is outside [0, 1].
+        """
+        if not (0.0 <= intended_bias <= 1.0):
+            raise ValueError(f"intended_bias must be in [0, 1], got {intended_bias}")
+        # Both the candidate search and the evaluation fancy-index these, so
+        # plain lists must be converted to arrays once up front.
+        activity_values = np.asarray(activity_values, dtype=float)
+        similarity_matrix = np.asarray(similarity_matrix)
+        rng = np.random.default_rng(random_seed)
+        n_molecules = len(smiless)
+        # int() truncates, i.e. it acts as a floor here.
+        target_test_size = int(self.test_fraction * n_molecules)
+        n_failure_test_target = int(intended_bias * target_test_size)
+        failure_n_edges = self.find_failure_n_edges(
+            similarity_matrix,
+            activity_values,
+            self.similarity_threshold,
+            self.activity_threshold,
+            self.n_neighbors,
+        )
+        # Visit the candidates in a seeded random order.
+        shuffled_order = rng.permutation(len(failure_n_edges))
+        failure_n_edges = [failure_n_edges[i] for i in shuffled_order]
+        assignment = self.walk_failure_n_edges(
+            failure_n_edges, n_molecules, n_failure_test_target
+        )
+        candidate_set = {molecule_index for molecule_index, _ in failure_n_edges}
+        is_candidate_mask = np.zeros(n_molecules, dtype=bool)
+        if candidate_set:
+            is_candidate_mask[list(candidate_set)] = True
+        unassigned_indices = np.where(assignment == UNASSIGNED_NODE)[0]
+        unassigned_non_candidate_indices = unassigned_indices[
+            ~is_candidate_mask[unassigned_indices]
+        ]
+        unassigned_candidate_indices = unassigned_indices[
+            is_candidate_mask[unassigned_indices]
+        ]
+        # Fill the rest of the test set at random, preferring non-candidates;
+        # fall back to candidates only when there are not enough of the former.
+        n_random_fill = target_test_size - int((assignment == TEST_NODE).sum())
+        if n_random_fill > 0:
+            if len(unassigned_non_candidate_indices) >= n_random_fill:
+                random_test_indices = rng.choice(
+                    unassigned_non_candidate_indices, size=n_random_fill, replace=False
+                )
+            else:
+                shortfall = n_random_fill - len(unassigned_non_candidate_indices)
+                candidate_topup_indices = rng.choice(
+                    unassigned_candidate_indices,
+                    size=min(shortfall, len(unassigned_candidate_indices)),
+                    replace=False,
+                )
+                random_test_indices = np.concatenate(
+                    [unassigned_non_candidate_indices, candidate_topup_indices]
+                )
+            assignment[random_test_indices] = TEST_NODE
+        # Everything still unassigned goes to training.
+        assignment[assignment == UNASSIGNED_NODE] = TRAIN_NODE
+        train_indices = np.where(assignment == TRAIN_NODE)[0]
+        test_indices = np.where(assignment == TEST_NODE)[0]
+        question_results = self.evaluate_knn_failure_question(
+            test_indices,
+            train_indices,
+            activity_values,
+            similarity_matrix,
+            self.similarity_threshold,
+            self.activity_threshold,
+            self.n_neighbors,
+        )
+        effective_bias = self.effective_bias_from_question_results(question_results)
+        return train_indices, test_indices, effective_bias
+    def split(self, smiless, activity_values, intended_biases, n_repeats):
+        """Yield ``n_repeats`` splits for every value in ``intended_biases``.
+        The similarity matrix is computed once from the SMILES and reused.
+        ``repeat_index`` doubles as the random seed of each split.
+        Yields:
+            ``(train_indices, test_indices, effective_bias, intended_bias, repeat_index)``.
+        """
+        fps_bitvect = [smiles_to_ecfp4_bitvect(s) for s in smiless]
+        similarity_matrix = compute_similarity_matrix(fps_bitvect)
+        for intended_bias in intended_biases:
+            for repeat_index in range(n_repeats):
+                train_indices, test_indices, effective_bias = (
+                    self.split_for_intended_bias(
+                        smiless=smiless,
+                        similarity_matrix=similarity_matrix,
+                        activity_values=activity_values,
+                        intended_bias=intended_bias,
+                        random_seed=repeat_index,
+                    )
+                )
+                yield train_indices, test_indices, effective_bias, intended_bias, repeat_index
+    @staticmethod
+    def find_failure_n_edges(
+        similarity_matrix,
+        activity_values,
+        similarity_threshold,
+        activity_threshold,
+        n_neighbors,
+    ):
+        """List the failure candidates as ``(molecule_index, neighbour_indices)``.
+        ``neighbour_indices`` is a tuple with the ``n_neighbors`` most similar
+        molecules among those reaching ``similarity_threshold``. Molecules with
+        fewer qualifying neighbours are skipped.
+        """
+        activity_values = np.asarray(activity_values, dtype=float)
+        similarity_matrix = np.asarray(similarity_matrix)
+        n_molecules = len(activity_values)
+        n_edges = []
+        for molecule_index in range(n_molecules):
+            similarities = similarity_matrix[molecule_index].copy()
+            # A molecule must never count as its own neighbour.
+            similarities[molecule_index] = -1.0
+            qualifying = np.where(similarities >= similarity_threshold)[0]
+            if len(qualifying) < n_neighbors:
+                continue
+            top_k = qualifying[np.argsort(similarities[qualifying])[::-1][:n_neighbors]]
+            consensus = float(activity_values[top_k].mean())
+            disagreement = abs(consensus - float(activity_values[molecule_index]))
+            if disagreement >= activity_threshold:
+                n_edges.append((int(molecule_index), tuple(int(n) for n in top_k)))
+        return n_edges
+    @staticmethod
+    def walk_failure_n_edges(failure_n_edges, n_molecules, n_failure_test_target):
+        """Assign candidates to test and their neighbours to train, in order.
+        A candidate is skipped when it is already in train or when one of its
+        neighbours is already in test. The walk stops once
+        ``n_failure_test_target`` candidates have been placed.
+        Returns:
+            int8 array of length ``n_molecules`` holding UNASSIGNED_NODE,
+            TRAIN_NODE or TEST_NODE per molecule.
+        """
+        assignment = np.full(n_molecules, UNASSIGNED_NODE, dtype=np.int8)
+        n_failures_placed = 0
+        for molecule_index, neighbor_indices in failure_n_edges:
+            if n_failures_placed >= n_failure_test_target:
+                break
+            if assignment[molecule_index] == TRAIN_NODE:
+                continue
+            if any(assignment[n] == TEST_NODE for n in neighbor_indices):
+                continue
+            assignment[molecule_index] = TEST_NODE
+            for neighbor_index in neighbor_indices:
+                assignment[neighbor_index] = TRAIN_NODE
+            n_failures_placed += 1
+        return assignment
+    @staticmethod
+    def evaluate_knn_failure_question(
+        test_indices,
+        train_indices,
+        activity_values,
+        similarity_matrix,
+        similarity_threshold,
+        activity_threshold,
+        n_neighbors,
+    ):
+        """Check, per test molecule, whether its train neighbours mispredict it.
+        Returns:
+            Float array aligned with ``test_indices``: 1.0 when the mean
+            activity of the ``n_neighbors`` most similar qualifying train
+            molecules is off by at least ``activity_threshold``, 0.0 when it
+            is not, and NaN when fewer than ``n_neighbors`` train molecules
+            reach ``similarity_threshold`` (not evaluable).
+        """
+        results = np.full(len(test_indices), np.nan, dtype=float)
+        if len(test_indices) == 0 or len(train_indices) == 0:
+            return results
+        train_indices = np.asarray(train_indices, dtype=np.intp)
+        activity_values = np.asarray(activity_values, dtype=float)
+        similarity_matrix = np.asarray(similarity_matrix)
+        for position, test_idx in enumerate(test_indices):
+            similarities_to_train = similarity_matrix[test_idx][train_indices]
+            qualifying = np.where(similarities_to_train >= similarity_threshold)[0]
+            if len(qualifying) < n_neighbors:
+                continue
+            top_k_positions = qualifying[
+                np.argsort(similarities_to_train[qualifying])[::-1][:n_neighbors]
+            ]
+            top_k_train_indices = train_indices[top_k_positions]
+            consensus = float(activity_values[top_k_train_indices].mean())
+            disagreement = abs(consensus - float(activity_values[test_idx]))
+            results[position] = 1.0 if disagreement >= activity_threshold else 0.0
+        return results
+    @staticmethod
+    def effective_bias_from_question_results(question_results):
+        """Return the mean over the evaluable (non-NaN) results, or 0.0 if there are none."""
+        if question_results.size == 0:
+            return 0.0
+        evaluable = question_results[~np.isnan(question_results)]
+        if evaluable.size == 0:
+            return 0.0
+        return float(evaluable.mean())
+    def visualise_splits(
+        self,
+        smiless,
+        activity_values,
+        intended_biases,
+        n_repeats,
+        output_path,
+        duration=500,
+    ):
+        """Render every split from ``split`` as one frame and save them as one animation.
+        Frames are drawn into a temporary directory and combined into
+        ``output_path`` with ``duration`` per frame, looping forever.
+        Raises:
+            ValueError: if ``split`` produced no split, so there is nothing to save.
+        """
+        G = molecular_network_from_list(
+            smiless, activity_values, self.similarity_threshold, self.activity_threshold
+        )
+        with tempfile.TemporaryDirectory() as tmpdir:
+            paths = []
+            for frame_index, (
+                train_idx,
+                test_idx,
+                effective_bias,
+                intended_bias,
+                _,
+            ) in enumerate(
+                self.split(smiless, activity_values, intended_biases, n_repeats)
+            ):
+                p = os.path.join(tmpdir, f"frame_{frame_index:04d}.png")
+                visualise_molnet_split(
+                    G, train_idx, test_idx, effective_bias, intended_bias, filepath=p
+                )
+                paths.append(p)
+            if not paths:
+                raise ValueError(
+                    "no splits were generated; intended_biases and n_repeats must be non-empty"
+                )
+            frames = [Image.open(p) for p in paths]
+            try:
+                frames[0].save(
+                    output_path,
+                    save_all=True,
+                    append_images=frames[1:],
+                    duration=duration,
+                    loop=0,
+                )
+            finally:
+                # Release the file handles before the temporary directory is removed.
+                for frame in frames:
+                    frame.close()

biased_split/molecularnetwork.py ADDED Viewed

@@ -0,0 +1,271 @@
+"""Implementation of Molecular Network"""
+import numpy as np
+import pandas as pd
+import networkx as nx
+import matplotlib.pyplot as plt
+from matplotlib.colors import LinearSegmentedColormap
+from matplotlib.lines import Line2D
+from matplotlib.ticker import MaxNLocator
+from rdkit import Chem
+from rdkit.Chem import rdFingerprintGenerator
+from rdkit.DataStructs.cDataStructs import BulkTanimotoSimilarity, BulkTverskySimilarity
+# Module-level Morgan fingerprint generator (radius 2, 2048 bits) shared by
+# smiles_to_ecfp4_bitvect and smiles_to_ecfp4_np.
+mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
+def smiles_to_ecfp4_bitvect(smi):
+    """Return the fingerprint of *smi* produced by ``mfpgen.GetFingerprint``.
+    Raises:
+        ValueError: if RDKit cannot parse *smi* (``Chem.MolFromSmiles`` returned None).
+    """
+    mol = Chem.MolFromSmiles(smi)
+    if mol is None:
+        # Fail with the offending input instead of an opaque error from the generator.
+        raise ValueError(f"could not parse SMILES: {smi!r}")
+    return mfpgen.GetFingerprint(mol)
+def smiles_to_ecfp4_np(smi):
+    """Return the fingerprint of *smi* produced by ``mfpgen.GetFingerprintAsNumPy``.
+    Raises:
+        ValueError: if RDKit cannot parse *smi* (``Chem.MolFromSmiles`` returned None).
+    """
+    mol = Chem.MolFromSmiles(smi)
+    if mol is None:
+        # Fail with the offending input instead of an opaque error from the generator.
+        raise ValueError(f"could not parse SMILES: {smi!r}")
+    return mfpgen.GetFingerprintAsNumPy(mol)
+def compute_similarity_matrix(fps_bitvect, method="tanimoto"):
+    """Return the symmetric pairwise similarity matrix of the fingerprints.
+    Args:
+        fps_bitvect: sequence of RDKit bit-vector fingerprints.
+        method: ``"tanimoto"`` or ``"tversky"``. For Tversky the larger of the
+            two asymmetric scores (weights 1/0 and 0/1) is stored, which keeps
+            the matrix symmetric.
+    Returns:
+        float32 array of shape ``(n, n)`` with ones on the diagonal.
+    Raises:
+        ValueError: if ``method`` is not one of the supported names.
+    """
+    if method not in ("tanimoto", "tversky"):
+        # Previously an unknown method silently produced an identity matrix.
+        raise ValueError(
+            f"method must be 'tanimoto' or 'tversky', got {method!r}"
+        )
+    alpha = 1
+    beta = 0
+    n = len(fps_bitvect)
+    sim_matrix = np.eye(n, dtype=np.float32)
+    for i in range(n - 1):
+        target_fp = fps_bitvect[i]
+        query_fps = fps_bitvect[i + 1 :]
+        if method == "tanimoto":
+            sims = BulkTanimotoSimilarity(target_fp, query_fps)
+        else:
+            # Tv(A, B) with the standard alpha, beta ...
+            sims_ab = BulkTverskySimilarity(target_fp, query_fps, alpha, beta)
+            # ... and Tv(B, A), obtained by swapping alpha and beta.
+            sims_ba = BulkTverskySimilarity(target_fp, query_fps, beta, alpha)
+            # Element-wise maximum of the two directions.
+            sims = np.maximum(sims_ab, sims_ba)
+        # Only the upper triangle is computed; mirror it into the lower one.
+        sim_matrix[i, i + 1 :] = sims
+        sim_matrix[i + 1 :, i] = sims
+    return sim_matrix
+def molecular_network_from_list(
+    smiless,
+    activities,
+    similarity_threshold,
+    activity_threshold,
+    similarity_method="tanimoto",
+):
+    """Build a similarity network from parallel lists of SMILES and activities.
+    Nodes are numbered by list position and carry ``smiles`` and ``activity``
+    attributes. Two molecules are linked when their similarity reaches
+    ``similarity_threshold``; the similarity is the edge weight. The
+    thresholds and fingerprint settings are recorded in ``G.graph``.
+    """
+    fingerprints = list(map(smiles_to_ecfp4_bitvect, smiless))
+    similarities = compute_similarity_matrix(fingerprints, method=similarity_method)
+    # Keep the strict upper triangle only, then drop sub-threshold pairs.
+    weights = np.triu(similarities, k=1)
+    weights[weights < similarity_threshold] = 0
+    network = nx.from_numpy_array(weights)
+    attributes_by_node = {}
+    for node, (smi, act) in enumerate(zip(smiless, activities)):
+        attributes_by_node[node] = {"smiles": smi, "activity": act}
+    nx.set_node_attributes(network, attributes_by_node)
+    network.graph.update(
+        {
+            "activity_label": "activity",
+            "activity_threshold": activity_threshold,
+            "similarity_threshold": similarity_threshold,
+            "similarity_fp": "2048bit ECFP4",
+            "similarity_distance": similarity_method,
+        }
+    )
+    return network
+def df_to_ecfp4_molecular_network(
+    df,
+    smiles_col,
+    activity_col,
+    similarity_threshold,
+    activity_threshold,
+    similarity_method="tanimoto",
+):
+    """Build a similarity network from the SMILES and activity columns of ``df``.
+    Nodes are numbered by row position and carry ``smiles`` and ``activity``
+    attributes. Two molecules are linked when their similarity reaches
+    ``similarity_threshold``; the similarity is the edge weight. The
+    thresholds, the activity column name and the similarity settings are
+    recorded in ``G.graph``.
+    """
+    fps_bitvect = df[smiles_col].map(smiles_to_ecfp4_bitvect).tolist()
+    sim_matrix = compute_similarity_matrix(fps_bitvect, method=similarity_method)
+    # Keep the strict upper triangle only, then drop sub-threshold pairs.
+    adj_matrix = np.triu(sim_matrix, k=1)
+    adj_matrix[adj_matrix < similarity_threshold] = 0
+    G = nx.from_numpy_array(adj_matrix)
+    node_attrs = {
+        n: {"smiles": smi, "activity": act}
+        for n, (smi, act) in enumerate(zip(df[smiles_col], df[activity_col]))
+    }
+    nx.set_node_attributes(G, node_attrs)
+    G.graph["activity_label"] = activity_col
+    G.graph["activity_threshold"] = activity_threshold
+    G.graph["similarity_threshold"] = similarity_threshold
+    G.graph["similarity_fp"] = "2048bit ECFP4"
+    # Report the measure that was actually used; the default still reads
+    # "Tanimoto", exactly as before.
+    G.graph["similarity_distance"] = similarity_method.capitalize()
+    return G
+def visualise_molnet(G, filepath=None):
+    """Draw the molecular network ``G`` and show it, optionally saving it first.
+    Nodes are shaded by their ``activity`` attribute. Edges whose endpoint
+    activities differ by at least ``G.graph["activity_threshold"]`` (the same
+    ``>=`` rule the splitters use for a cliff) are drawn red; all other edges
+    are drawn in a grey that darkens with the edge weight.
+    Args:
+        G: graph carrying the node and graph attributes set by the network builders.
+        filepath: when given, the figure is also written to this path.
+    """
+    fig, ax = plt.subplots(figsize=(12, 9))
+    pos = nx.nx_agraph.graphviz_layout(G, prog="sfdp")
+    edge_colors = []
+    for u, v in G.edges():
+        if (
+            abs(G.nodes[u]["activity"] - G.nodes[v]["activity"])
+            >= G.graph["activity_threshold"]
+        ):
+            edge_colors.append((1, 0, 0, 1))
+        else:
+            w = G.edges[u, v]["weight"]
+            edge_colors.append((1 - w, 1 - w, 1 - w, 0.6))
+    node_colors = [G.nodes[n]["activity"] for n in G.nodes()]
+    nx.draw_networkx_edges(G, pos, edge_color=edge_colors, width=0.8, ax=ax)
+    nodes = nx.draw_networkx_nodes(
+        G,
+        pos,
+        node_color=node_colors,
+        cmap=plt.cm.Greys,
+        node_size=40,
+        linewidths=0,
+        ax=ax,
+    )
+    cbar = fig.colorbar(nodes, ax=ax)
+    cbar.set_label(G.graph["activity_label"])
+    ax.axis("off")
+    plt.title(
+        f"Molecular Network with {G.number_of_nodes()} molecules & {G.number_of_edges()} edges made using \nSimilarity Threshold of {G.graph['similarity_threshold']} over {G.graph['similarity_fp']} fingerprints using {G.graph['similarity_distance']} Similarity"
+    )
+    if filepath:
+        plt.tight_layout()
+        plt.savefig(filepath)
+    plt.show()
+def visualise_molnet_split(
+    G, train_idx, test_idx, effective_bias, intended_bias, filepath=None, cliff=True
+):
+    """Draw ``G`` with train and test molecules marked, for one split.
+    Args:
+        G: graph carrying the node and graph attributes set by the network
+            builders. The layout is computed once and cached in
+            ``G.graph["_pos"]`` so that successive frames line up.
+        train_idx: positions (into ``list(G.nodes())``) of the train molecules.
+        test_idx: positions of the test molecules; these get a coloured outline.
+        effective_bias: measured bias, printed in the caption.
+        intended_bias: requested bias, printed in the caption.
+        filepath: when given, the figure is saved there and closed; otherwise
+            it is shown.
+        cliff: when true, edges whose endpoint activities differ by at least
+            ``G.graph["activity_threshold"]`` (the same ``>=`` rule the
+            splitters use for a cliff) are highlighted and listed in the legend.
+    """
+    CLIFF = (0.65, 0.15, 0.20)
+    TEST = (0.20, 0.40, 0.60)
+    if "_pos" not in G.graph:
+        G.graph["_pos"] = nx.nx_agraph.graphviz_layout(
+            G, prog="sfdp", args="-Goverlap=false -GK=1.5"
+        )
+    pos = G.graph["_pos"]
+    fig, ax = plt.subplots(figsize=(10, 9))
+    edge_colors = []
+    for u, v in G.edges():
+        is_cliff_edge = (
+            cliff
+            and abs(G.nodes[u]["activity"] - G.nodes[v]["activity"])
+            >= G.graph["activity_threshold"]
+        )
+        if is_cliff_edge:
+            edge_colors.append(CLIFF + (0.9,))
+        else:
+            # Non-cliff edges: grey that darkens with the edge weight.
+            w = G.edges[u, v]["weight"]
+            edge_colors.append((1 - w, 1 - w, 1 - w, 0.4))
+    nx.draw_networkx_edges(G, pos, edge_color=edge_colors, width=0.5, ax=ax)
+    nodes_array = np.array(list(G.nodes()))
+    train_nodes = nodes_array[train_idx]
+    test_nodes = nodes_array[test_idx]
+    # Shared colour limits so train and test nodes use one activity scale.
+    activities = np.array([G.nodes[n]["activity"] for n in G.nodes()])
+    vmin, vmax = activities.min(), activities.max()
+    # Truncated grey map: the lightest greys are left out of the node colours.
+    cmap = LinearSegmentedColormap.from_list(
+        "greys_trunc", plt.cm.Greys(np.linspace(0.35, 0.95, 256))
+    )
+    nx.draw_networkx_nodes(
+        G,
+        pos,
+        nodelist=train_nodes,
+        node_color=[G.nodes[n]["activity"] for n in train_nodes],
+        cmap=cmap,
+        vmin=vmin,
+        vmax=vmax,
+        node_size=22,
+        linewidths=0,
+        ax=ax,
+    )
+    nodes = nx.draw_networkx_nodes(
+        G,
+        pos,
+        nodelist=test_nodes,
+        node_color=[G.nodes[n]["activity"] for n in test_nodes],
+        cmap=cmap,
+        vmin=vmin,
+        vmax=vmax,
+        node_size=22,
+        linewidths=0.9,
+        edgecolors=TEST,
+        ax=ax,
+    )
+    cbar = fig.colorbar(nodes, ax=ax, fraction=0.018, pad=0.02, aspect=40)
+    cbar.set_label(G.graph["activity_label"], fontsize=9, labelpad=6)
+    cbar.ax.tick_params(labelsize=8, length=2)
+    cbar.locator = MaxNLocator(nbins=4)
+    cbar.update_ticks()
+    # Proxy artists for the legend.
+    handles = [
+        Line2D(
+            [0],
+            [0],
+            marker="o",
+            color="w",
+            markerfacecolor="0.5",
+            markersize=6,
+            label="train",
+        ),
+        Line2D(
+            [0],
+            [0],
+            marker="o",
+            color="w",
+            markerfacecolor="0.5",
+            markeredgecolor=TEST,
+            markeredgewidth=0.9,
+            markersize=6,
+            label="test",
+        ),
+    ]
+    if cliff:
+        handles.append(
+            Line2D([0], [0], color=CLIFF, linewidth=1.2, label="activity cliff"),
+        )
+    ax.legend(
+        handles=handles,
+        loc="upper center",
+        bbox_to_anchor=(0.5, 1.02),
+        frameon=False,
+        ncol=3,
+        fontsize=9,
+        handletextpad=1.0,
+        columnspacing=1.0,
+    )
+    caption = (
+        f"{G.number_of_nodes()} molecules, {G.number_of_edges()} edges. "
+        f"{G.graph['similarity_fp']}, {G.graph['similarity_distance']} ≥ {G.graph['similarity_threshold']}. "
+        f"intended bias {intended_bias:.2f}, effective bias {effective_bias:.2f}"
+    )
+    fig.text(0.5, 0.045, caption, ha="center", fontsize=8, color="0.4")
+    ax.axis("off")
+    if filepath:
+        plt.savefig(filepath, dpi=200, bbox_inches="tight")
+        # Close the figure so a long series of frames does not pile up in memory.
+        plt.close(fig)
+    else:
+        plt.show()

biased_split/proxy_sorted.py ADDED Viewed

@@ -0,0 +1,251 @@
+import os
+import tempfile
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.lines import Line2D
+from matplotlib.patches import Patch
+from PIL import Image
+from scipy.stats import gaussian_kde
+# Per-molecule state codes written into the int8 ``assignment`` array while a
+# split is being built: not yet placed, placed in train, placed in test.
+UNASSIGNED_NODE = 0
+TRAIN_NODE = 1
+TEST_NODE = 2
+def visualise_proxy_split(
+    proxy_values,
+    train_idx,
+    test_idx,
+    ideal_range_min,
+    ideal_range_max,
+    effective_bias,
+    intended_bias,
+    proxy_label="proxy",
+    x_range=None,
+    filepath=None,
+):
+    TEST = (0.20, 0.40, 0.60)
+    TRAIN = (0.5, 0.5, 0.5)
+    IDEAL = (0.65, 0.15, 0.20)
+    fig, ax = plt.subplots(figsize=(10, 5))
+    train_values = proxy_values[train_idx]
+    test_values = proxy_values[test_idx]
+    if x_range is None:
+        x_min, x_max = float(proxy_values.min()), float(proxy_values.max())
+        pad = (x_max - x_min) * 0.05
+        x_range = (x_min - pad, x_max + pad)
+    x = np.linspace(x_range[0], x_range[1], 500)
+    train_kde = gaussian_kde(train_values)
+    test_kde = gaussian_kde(test_values)
+    train_density = train_kde(x)
+    test_density = test_kde(x)
+    ax.axvspan(ideal_range_min, ideal_range_max, color=IDEAL, alpha=0.10, linewidth=0)
+    ax.fill_between(x, train_density, color=TRAIN, alpha=0.35, linewidth=0)
+    ax.fill_between(x, test_density, color=TEST, alpha=0.45, linewidth=0)
+    ax.plot(x, train_density, color=TRAIN, linewidth=1)
+    ax.plot(x, test_density, color=TEST, linewidth=1)
+    ax.set_xlabel(proxy_label)
+    ax.set_ylabel("density")
+    ax.set_xlim(x_range)
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+    handles = [
+        Line2D([0], [0], color=TRAIN, linewidth=2, label="train"),
+        Line2D([0], [0], color=TEST, linewidth=2, label="test"),
+        Patch(facecolor=IDEAL, alpha=0.30, label="ideal range"),
+    ]
+    ax.legend(
+        handles=handles,
+        loc="upper center",
+        bbox_to_anchor=(0.5, 1.05),
+        frameon=False,
+        ncol=3,
+        fontsize=9,
+    )
+    caption = (
+        f"{len(proxy_values)} molecules ({len(train_idx)} train, {len(test_idx)} test). "
+        f"ideal range [{ideal_range_min}, {ideal_range_max}]. "
+        f"intended bias {intended_bias:.2f}, effective bias {effective_bias:.2f}"
+    )
+    fig.text(0.5, 0.00, caption, ha="center", fontsize=8, color="0.4")
+    if filepath:
+        plt.savefig(filepath, dpi=200, bbox_inches="tight")
+        plt.close(fig)
+    else:
+        plt.show()
+class ProxySortedSplitter:
+    def __init__(
+        self, proxy_function, ideal_range_min, ideal_range_max, test_fraction=0.2
+    ):
+        self.proxy_function = proxy_function
+        self.ideal_range_min = ideal_range_min
+        self.ideal_range_max = ideal_range_max
+        self.test_fraction = test_fraction
+    def split_for_intended_bias(
+        self, smiless, proxy_values, activity_values, intended_bias, random_seed
+    ):
+        if not (0.0 <= intended_bias <= 1.0):
+            raise ValueError(f"intended_bias must be in [0, 1], got {intended_bias}")
+        rng = np.random.default_rng(random_seed)
+        n_molecules = len(smiless)
+        target_test_size = int(self.test_fraction * n_molecules)
+        n_in_range_test_target = int(intended_bias * target_test_size)
+        in_range_mask = self.find_in_range_mask(
+            proxy_values, self.ideal_range_min, self.ideal_range_max
+        )
+        assignment = self.walk_in_range_molecules(
+            in_range_mask, n_molecules, n_in_range_test_target, rng
+        )
+        unassigned_indices = np.where(assignment == UNASSIGNED_NODE)[0]
+        unassigned_out_of_range_indices = unassigned_indices[
+            ~in_range_mask[unassigned_indices]
+        ]
+        unassigned_in_range_indices = unassigned_indices[
+            in_range_mask[unassigned_indices]
+        ]
+        n_random_fill = target_test_size - int((assignment == TEST_NODE).sum())
+        if n_random_fill > 0:
+            if len(unassigned_out_of_range_indices) >= n_random_fill:
+                random_test_indices = rng.choice(
+                    unassigned_out_of_range_indices, size=n_random_fill, replace=False
+                )
+            else:
+                shortfall = n_random_fill - len(unassigned_out_of_range_indices)
+                in_range_topup_indices = rng.choice(
+                    unassigned_in_range_indices,
+                    size=min(shortfall, len(unassigned_in_range_indices)),
+                    replace=False,
+                )
+                random_test_indices = np.concatenate(
+                    [unassigned_out_of_range_indices, in_range_topup_indices]
+                )
+            assignment[random_test_indices] = TEST_NODE
+        assignment[assignment == UNASSIGNED_NODE] = TRAIN_NODE
+        train_indices = np.where(assignment == TRAIN_NODE)[0]
+        test_indices = np.where(assignment == TEST_NODE)[0]
+        question_results = self.evaluate_proxy_question(
+            test_indices, proxy_values, self.ideal_range_min, self.ideal_range_max
+        )
+        effective_bias = self.effective_bias_from_question_results(question_results)
+        return train_indices, test_indices, effective_bias
+    def split(self, smiless, activity_values, intended_biases, n_repeats):
+        proxy_values = np.array([self.proxy_function(s) for s in smiless], dtype=float)
+        for intended_bias in intended_biases:
+            for repeat_index in range(n_repeats):
+                train_indices, test_indices, effective_bias = (
+                    self.split_for_intended_bias(
+                        smiless,
+                        proxy_values,
+                        activity_values,
+                        intended_bias,
+                        repeat_index,
+                    )
+                )
+                yield train_indices, test_indices, effective_bias, intended_bias, repeat_index
+    @staticmethod
+    def find_in_range_mask(proxy_values, ideal_range_min, ideal_range_max):
+        return (proxy_values >= ideal_range_min) & (proxy_values <= ideal_range_max)
+    @staticmethod
+    def walk_in_range_molecules(
+        in_range_mask, n_molecules, n_in_range_test_target, rng
+    ):
+        assignment = np.full(n_molecules, UNASSIGNED_NODE, dtype=np.int8)
+        in_range_indices = np.where(in_range_mask)[0]
+        if n_in_range_test_target == 0 or len(in_range_indices) == 0:
+            return assignment
+        n_to_place = min(n_in_range_test_target, len(in_range_indices))
+        selected = rng.choice(in_range_indices, size=n_to_place, replace=False)
+        assignment[selected] = TEST_NODE
+        return assignment
+    @staticmethod
+    def evaluate_proxy_question(
+        test_indices, proxy_values, ideal_range_min, ideal_range_max
+    ):
+        if len(test_indices) == 0:
+            return np.array([], dtype=float)
+        test_proxy = proxy_values[test_indices]
+        in_range = (test_proxy >= ideal_range_min) & (test_proxy <= ideal_range_max)
+        return in_range.astype(float)
+    @staticmethod
+    def effective_bias_from_question_results(question_results):
+        if question_results.size == 0:
+            return 0.0
+        return float(question_results.mean())
+    def visualise_splits(
+        self,
+        smiless,
+        activity_values,
+        intended_biases,
+        n_repeats,
+        output_path,
+        duration=500,
+        proxy_label="proxy",
+    ):
+        proxy_values = np.array([self.proxy_function(s) for s in smiless], dtype=float)
+        x_min, x_max = float(proxy_values.min()), float(proxy_values.max())
+        pad = (x_max - x_min) * 0.05
+        x_range = (x_min - pad, x_max + pad)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            paths = []
+            frame_index = 0
+            for intended_bias in intended_biases:
+                for repeat_index in range(n_repeats):
+                    train_idx, test_idx, effective_bias = self.split_for_intended_bias(
+                        smiless,
+                        proxy_values,
+                        activity_values,
+                        intended_bias,
+                        repeat_index,
+                    )
+                    p = os.path.join(tmpdir, f"frame_{frame_index:04d}.png")
+                    visualise_proxy_split(
+                        proxy_values,
+                        train_idx,
+                        test_idx,
+                        self.ideal_range_min,
+                        self.ideal_range_max,
+                        effective_bias,
+                        intended_bias,
+                        proxy_label=proxy_label,
+                        x_range=x_range,
+                        filepath=p,
+                    )
+                    paths.append(p)
+                    frame_index += 1
+            frames = [Image.open(p) for p in paths]
+            frames[0].save(
+                output_path,
+                save_all=True,
+                append_images=frames[1:],
+                duration=duration,
+                loop=0,
+            )

biased_split/substructure_distance.py ADDED Viewed

@@ -0,0 +1,185 @@
+import os
+import tempfile
+import numpy as np
+import networkx as nx
+from PIL import Image
+from biased_split.molecularnetwork import (
+    smiles_to_ecfp4_bitvect,
+    compute_similarity_matrix,
+    visualise_molnet_split,
+)
+# Per-molecule state codes written into the int8 ``assignment`` array while a
+# split is being built: not yet placed, placed in train, placed in test.
+UNASSIGNED_NODE = 0
+TRAIN_NODE = 1
+TEST_NODE = 2
+class SubstructureDistanceSplitter:
+    def __init__(self, similarity_threshold, test_fraction=0.2):
+        self.similarity_threshold = similarity_threshold
+        self.test_fraction = test_fraction
+    def split_for_intended_bias(
+        self, smiless, tversky_matrix, activity_values, intended_bias, random_seed
+    ):
+        if not (0.0 <= intended_bias <= 1.0):
+            raise ValueError(f"intended_bias must be in [0, 1], got {intended_bias}")
+        rng = np.random.default_rng(random_seed)
+        n_molecules = len(smiless)
+        target_test_size = int(self.test_fraction * n_molecules)
+        n_isolated_test_target = int(intended_bias * target_test_size)
+        components = self.find_components(tversky_matrix, self.similarity_threshold)
+        assignment = self.walk_components(
+            components, n_molecules, n_isolated_test_target, rng
+        )
+        unassigned_indices = np.where(assignment == UNASSIGNED_NODE)[0]
+        n_random_fill = target_test_size - int((assignment == TEST_NODE).sum())
+        if n_random_fill > 0 and len(unassigned_indices) > 0:
+            n_to_sample = min(n_random_fill, len(unassigned_indices))
+            random_test_indices = rng.choice(
+                unassigned_indices, size=n_to_sample, replace=False
+            )
+            assignment[random_test_indices] = TEST_NODE
+        assignment[assignment == UNASSIGNED_NODE] = TRAIN_NODE
+        train_indices = np.where(assignment == TRAIN_NODE)[0]
+        test_indices = np.where(assignment == TEST_NODE)[0]
+        question_results = self.evaluate_substructure_question(
+            test_indices, train_indices, tversky_matrix, self.similarity_threshold
+        )
+        effective_bias = self.effective_bias_from_question_results(question_results)
+        return train_indices, test_indices, effective_bias
+    def split(self, smiless, activity_values, intended_biases, n_repeats):
+        fps_bitvect = [smiles_to_ecfp4_bitvect(s) for s in smiless]
+        tversky_matrix = compute_similarity_matrix(fps_bitvect, method="tversky")
+        for intended_bias in intended_biases:
+            for repeat_index in range(n_repeats):
+                train_indices, test_indices, effective_bias = (
+                    self.split_for_intended_bias(
+                        smiless,
+                        tversky_matrix,
+                        activity_values,
+                        intended_bias,
+                        repeat_index,
+                    )
+                )
+                yield train_indices, test_indices, effective_bias, intended_bias, repeat_index
+    @staticmethod
+    def find_components(tversky_matrix, similarity_threshold):
+        adj_matrix = np.triu(tversky_matrix, k=1)
+        adj_matrix[adj_matrix < similarity_threshold] = 0
+        similarity_graph = nx.from_numpy_array(adj_matrix)
+        return sorted(nx.connected_components(similarity_graph), key=len, reverse=True)
+    @staticmethod
+    def walk_components(components, n_molecules, n_isolated_test_target, rng):
+        assignment = np.full(n_molecules, UNASSIGNED_NODE, dtype=np.int8)
+        remaining_budget = n_isolated_test_target
+        unused_components = list(components)
+        while True:
+            fitting = [c for c in unused_components if len(c) <= remaining_budget]
+            if not fitting:
+                break
+            max_size = max(len(c) for c in fitting)
+            largest = [c for c in fitting if len(c) == max_size]
+            chosen = largest[int(rng.integers(len(largest)))]
+            for molecule_index in chosen:
+                assignment[molecule_index] = TEST_NODE
+            unused_components.remove(chosen)
+            remaining_budget -= len(chosen)
+        return assignment
+    @staticmethod
+    def evaluate_substructure_question(
+        test_indices, train_indices, tversky_matrix, similarity_threshold
+    ):
+        if len(test_indices) == 0:
+            return np.array([], dtype=float)
+        if len(train_indices) == 0:
+            return np.ones(len(test_indices), dtype=float)
+        similarity_test_vs_train = tversky_matrix[np.ix_(test_indices, train_indices)]
+        max_train_similarity = similarity_test_vs_train.max(axis=1)
+        is_isolated = max_train_similarity < similarity_threshold
+        return is_isolated.astype(float)
+    @staticmethod
+    def effective_bias_from_question_results(question_results):
+        if question_results.size == 0:
+            return 0.0
+        return float(question_results.mean())
+    @staticmethod
+    def build_visualization_network(
+        smiless, activity_values, tversky_matrix, similarity_threshold
+    ):
+        adj_matrix = np.triu(tversky_matrix, k=1)
+        adj_matrix[adj_matrix < similarity_threshold] = 0
+        G = nx.from_numpy_array(adj_matrix)
+        node_attrs = {
+            n: {"smiles": smi, "activity": act}
+            for n, (smi, act) in enumerate(zip(smiless, activity_values))
+        }
+        nx.set_node_attributes(G, node_attrs)
+        G.graph["activity_label"] = "activity"
+        G.graph["activity_threshold"] = np.inf
+        G.graph["similarity_threshold"] = similarity_threshold
+        G.graph["similarity_fp"] = "2048bit ECFP4"
+        G.graph["similarity_distance"] = "tversky"
+        return G
+    def visualise_splits(
+        self,
+        smiless,
+        activity_values,
+        intended_biases,
+        n_repeats,
+        output_path,
+        duration=500,
+    ):
+        fps_bitvect = [smiles_to_ecfp4_bitvect(s) for s in smiless]
+        tversky_matrix = compute_similarity_matrix(fps_bitvect, method="tversky")
+        G = self.build_visualization_network(
+            smiless, activity_values, tversky_matrix, self.similarity_threshold
+        )
+        with tempfile.TemporaryDirectory() as tmpdir:
+            paths = []
+            frame_index = 0
+            for intended_bias in intended_biases:
+                for repeat_index in range(n_repeats):
+                    train_idx, test_idx, effective_bias = self.split_for_intended_bias(
+                        smiless,
+                        tversky_matrix,
+                        activity_values,
+                        intended_bias,
+                        repeat_index,
+                    )
+                    p = os.path.join(tmpdir, f"frame_{frame_index:04d}.png")
+                    visualise_molnet_split(
+                        G,
+                        train_idx,
+                        test_idx,
+                        effective_bias,
+                        intended_bias,
+                        filepath=p,
+                        cliff=False,
+                    )
+                    paths.append(p)
+                    frame_index += 1
+            frames = [Image.open(p) for p in paths]
+            frames[0].save(
+                output_path,
+                save_all=True,
+                append_images=frames[1:],
+                duration=duration,
+                loop=0,
+            )

biased_split-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,26 @@
+Metadata-Version: 2.4
+Name: biased-split
+Version: 0.1.0
+Summary: Biased Data Splitting Method for Chemically Meaningful Model Validation
+Requires-Python: >=3.13
+Requires-Dist: matplotlib>=3.11.0
+Requires-Dist: networkx>=3.6.1
+Requires-Dist: numpy>=2.4.6
+Requires-Dist: pandas>=3.0.3
+Requires-Dist: pyarrow>=18.0
+Requires-Dist: pygraphviz>=1.14
+Requires-Dist: rdkit>=2026.3.3
+Requires-Dist: scikit-learn>=1.9.0
+Requires-Dist: scipy>=1.17.1
+Requires-Dist: statsmodels>=0.14
+Requires-Dist: xgboost>=2.0
+Provides-Extra: benchmark
+Requires-Dist: chemprop>=2.0; extra == 'benchmark'
+Requires-Dist: lightning>=2.0; extra == 'benchmark'
+Requires-Dist: torch>=2.0; extra == 'benchmark'
+Provides-Extra: notebook
+Requires-Dist: ipykernel>=7.3.0; extra == 'notebook'
+Requires-Dist: notebook>=7.6.0; extra == 'notebook'
+Description-Content-Type: text/markdown
+# Chemically Meaningful Model Validation using Biased Data Splits

biased_split-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+biased_split/__init__.py,sha256=M8pdc583gVtikoF8LyT39zdtLK_INAARXoJgQZTGjhE,912
+biased_split/activity_cliff.py,sha256=2kMoE0DR8Sljj8FAC6qtg9RwtU99DYw_7UacK6RH_OM,11091
+biased_split/knn_failure.py,sha256=2ObfaM2_mAdZ5LU7E0sec7j8PUPwoeK-Tx7M88c936c,8828
+biased_split/molecularnetwork.py,sha256=m3wt129wtMR1kDZ8u9yvkepYS3fXeq8MEzTIGmahZDY,8225
+biased_split/proxy_sorted.py,sha256=v11GJE2g2PiNxJcil4m0m8Id-PFV5QDKVHz8_vXy2CQ,9012
+biased_split/substructure_distance.py,sha256=81iT8r7-sQahZ3MpGwIWZTxVIFtxhuQuElYweHGnnEI,7251
+biased_split-0.1.0.dist-info/METADATA,sha256=YXALQfioGmI8sZM_5xn2TPOimuMaxuoMUo_y8Hq2e3Q,913
+biased_split-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+biased_split-0.1.0.dist-info/RECORD,,

biased_split-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any