PyPI - tyche-tools - Versions diffs - 0.1.0__py3-none-any.whl - Mend

tyche-tools 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

tyche_tools/__init__.py +9 -0
tyche_tools/_features.py +175 -0
tyche_tools/_network.py +250 -0
tyche_tools/_utils.py +51 -0
tyche_tools/median.py +241 -0
tyche_tools/optimize.py +709 -0
tyche_tools/subspace.py +292 -0
tyche_tools-0.1.0.dist-info/METADATA +17 -0
tyche_tools-0.1.0.dist-info/RECORD +12 -0
tyche_tools-0.1.0.dist-info/WHEEL +5 -0
tyche_tools-0.1.0.dist-info/licenses/LICENSE +21 -0
tyche_tools-0.1.0.dist-info/top_level.txt +1 -0

tyche_tools/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+from tyche_tools.median import get_median_mols
+from tyche_tools.subspace import get_local_chemical_subspace
+from tyche_tools.optimize import optimize_molecules
+__all__ = [
+    "get_median_mols",
+    "get_local_chemical_subspace",
+    "optimize_molecules",
+]

tyche_tools/_features.py ADDED Viewed

@@ -0,0 +1,175 @@
+"""Molecular feature extraction for the optimizer's neural network classifier.
+Computes a 51-dimensional property vector for a given SMILES string. Features
+include atom-count ratios, RDKit descriptors, bond type ratios, and ring
+statistics.
+"""
+import inspect
+from collections import OrderedDict
+import numpy as np
+from rdkit import Chem, RDLogger
+from rdkit.Chem import Descriptors
+RDLogger.DisableLog('rdApp.*')
+# Names of the rdkit.Chem.Descriptors functions used as classifier features.
+# get_mol_info picks them out of inspect.getmembers(), which returns members
+# sorted by name, so the order of this list does not set the feature order.
+_DESCRIPTOR_NAMES = [
+    "RingCount", "HallKierAlpha", "BalabanJ",
+    "NumAliphaticCarbocycles", "NumAliphaticHeterocycles", "NumAliphaticRings",
+    "NumAromaticCarbocycles", "NumAromaticHeterocycles", "NumAromaticRings",
+    "NumHAcceptors", "NumHDonors", "NumHeteroatoms",
+    "NumRadicalElectrons", "NumSaturatedCarbocycles", "NumSaturatedHeterocycles",
+    "NumSaturatedRings", "NumValenceElectrons",
+]
+# SMARTS query: any atom, a single non-ring bond, any atom. This is the
+# module's definition of a "rotatable bond"; it is compiled once at import.
+_ROTATABLE_BOND_SMARTS = Chem.MolFromSmarts('*-&!@*')
+def _get_rot_bonds_posn(mol):
+    """Return atom-index pairs for all rotatable bonds in mol."""
+    return mol.GetSubstructMatches(_ROTATABLE_BOND_SMARTS)
+def _get_bond_indices(mol, rot):
+    """Convert rotatable bond atom pairs to bond indices."""
+    return [mol.GetBondBetweenAtoms(r[0], r[1]).GetIdx() for r in rot]
+def _obtain_rings(smi):
+    """Return a list of ring SMILES fragments from the input molecule.
+    Fragments the molecule on rotatable bonds and retains the pieces that
+    contain ring closures. Returns ``(None, None)`` for molecules with no
+    rotatable bonds (e.g. purely cyclic structures).
+    """
+    mol = Chem.MolFromSmiles(smi)
+    rot = _get_rot_bonds_posn(mol)
+    if len(rot) == 0:
+        return None, None
+    bond_idx = _get_bond_indices(mol, rot)
+    new_mol = Chem.FragmentOnBonds(mol, bond_idx, addDummies=False)
+    new_smi = Chem.MolToSmiles(new_mol)
+    return [s for s in new_smi.split('.') if '1' in s and Chem.MolFromSmiles(s) is not None]
+def _count_atoms(mol, atomic_num):
+    """Count atoms of a given atomic number in mol."""
+    pat = Chem.MolFromSmarts(f'[#{atomic_num}]')
+    return len(mol.GetSubstructMatches(pat))
+def _get_num_bond_types(mol):
+    """Return [single, double, triple, aromatic] bond counts as fractions of total."""
+    from rdkit.Chem import rdchem
+    counts = {
+        rdchem.BondType.SINGLE: 0,
+        rdchem.BondType.DOUBLE: 0,
+        rdchem.BondType.TRIPLE: 0,
+        rdchem.BondType.AROMATIC: 0,
+    }
+    total = 0
+    for bond in mol.GetBonds():
+        total += 1
+        bt = bond.GetBondType()
+        if bt in counts:
+            counts[bt] += 1
+    if total == 0:
+        return [0.0, 0.0, 0.0, 0.0]
+    return [counts[t] / total for t in [
+        rdchem.BondType.SINGLE, rdchem.BondType.DOUBLE,
+        rdchem.BondType.TRIPLE, rdchem.BondType.AROMATIC,
+    ]]
+def _count_conseq_double(mol):
+    """Count consecutive double bonds in mol."""
+    from rdkit.Chem import rdchem
+    prev = None
+    count = 0
+    for bond in mol.GetBonds():
+        curr = bond.GetBondType()
+        if prev == curr == rdchem.BondType.DOUBLE:
+            count += 1
+        prev = curr
+    return count
+def _size_ring_counter(ring_ls):
+    """Return a 19-element vector: [consecutive doubles in rings, ring counts by size 3–20].
+    Returns all zeros when ``ring_ls`` is ``(None, None)`` (no rotatable bonds).
+    """
+    if ring_ls == (None, None):
+        return [0] * 19
+    ring_mols = [Chem.MolFromSmiles(s) for s in ring_ls]
+    conseq = sum(_count_conseq_double(m) for m in ring_mols)
+    size_counts = [
+        sum(1 for m in ring_mols if m.GetNumAtoms() == sz)
+        for sz in range(3, 21)
+    ]
+    return [conseq] + size_counts
+def get_mol_info(smi):
+    """Compute a 51-dimensional molecular feature vector for classifier training.
+    Features (in order):
+    - 8 atom-count ratios relative to carbon (atoms, H, N, S, O, Cl, Br, F)
+    - 17 RDKit descriptor values
+    - 4 bond-type fractions (single, double, triple, aromatic)
+    - 2 ring summary features (ring count, triple bonds in rings)
+    - 19 ring-size histogram features
+    Parameters
+    ----------
+    smi : str
+        Valid SMILES string.
+    Returns
+    -------
+    numpy.ndarray of shape (51,)
+    """
+    mol = Chem.MolFromSmiles(smi)
+    num_atoms = mol.GetNumAtoms()
+    num_hydro = Chem.AddHs(mol).GetNumAtoms() - num_atoms
+    num_carbon = _count_atoms(mol, 6) or 0.0001  # avoid division by zero
+    basic_props = [
+        num_atoms / num_carbon,
+        num_hydro / num_carbon,
+        _count_atoms(mol, 7) / num_carbon,   # N
+        _count_atoms(mol, 16) / num_carbon,  # S
+        _count_atoms(mol, 8) / num_carbon,   # O
+        _count_atoms(mol, 17) / num_carbon,  # Cl
+        _count_atoms(mol, 35) / num_carbon,  # Br
+        _count_atoms(mol, 9) / num_carbon,   # F
+    ]
+    # 17 RDKit descriptors — selected by name from Descriptors module
+    calc_props = OrderedDict(inspect.getmembers(Descriptors, inspect.isfunction))
+    for key in list(calc_props.keys()):
+        if key.startswith('_') or key not in _DESCRIPTOR_NAMES:
+            del calc_props[key]
+    rdkit_features = []
+    for key, fn in calc_props.items():
+        try:
+            rdkit_features.append(fn(mol))
+        except Exception:
+            rdkit_features.append(0.0)
+    bond_info = _get_num_bond_types(mol)
+    ring_ls = _obtain_rings(smi)
+    num_triple_in_rings = 0
+    if ring_ls and ring_ls != (None, None) and len(ring_ls) > 0:
+        for item in ring_ls:
+            num_triple_in_rings += item.count('#')
+        bond_info.append(len(ring_ls))
+    else:
+        bond_info.append(0)
+    bond_info.append(num_triple_in_rings)
+    bond_info += _size_ring_counter(ring_ls)
+    bond_info.append(_count_conseq_double(mol))
+    return np.array(rdkit_features + basic_props + bond_info)

tyche_tools/_network.py ADDED Viewed

@@ -0,0 +1,250 @@
+"""Neural network classifier for guided molecular exploration.
+Trains a small MLP on previously evaluated (SMILES, fitness) pairs and uses it
+to predict which newly generated molecules are likely to score highly. This
+biases the exploration phase toward promising chemical space without requiring
+a full fitness evaluation for every candidate.
+Requires PyTorch. If PyTorch is not installed, import of this module will
+raise an ImportError; the optimizer falls back to random sampling in that case.
+"""
+import copy
+import multiprocessing
+from typing import List
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, TensorDataset
+from tyche_tools._features import get_mol_info
+# ── Feature extraction ─────────────────────────────────────────────────────────
+def _get_mol_feature(smi: str) -> np.ndarray:
+    return np.array(get_mol_info(smi))
+def obtain_features(smi_list: List[str], num_workers: int = 1) -> np.ndarray:
+    """Compute the 51-dim feature matrix for a list of SMILES.
+    Parameters
+    ----------
+    smi_list : list of str
+    num_workers : int
+        Number of parallel worker processes.
+    Returns
+    -------
+    numpy.ndarray of shape (N, 51)
+    """
+    if num_workers == 1:
+        return np.array([_get_mol_feature(s) for s in smi_list])
+    with multiprocessing.Pool(num_workers) as pool:
+        return np.array(pool.map(_get_mol_feature, smi_list))
+# ── Model architecture ─────────────────────────────────────────────────────────
+class MLP(nn.Module):
+    """Multi-layer perceptron with sigmoid activations throughout.
+    Parameters
+    ----------
+    h_sizes : list of int
+        Hidden layer widths.
+    n_input : int
+        Input dimensionality (51 for the default molecular features).
+    n_output : int
+        Output dimensionality (1 for binary classification).
+    """
+    def __init__(self, h_sizes: List[int], n_input: int, n_output: int):
+        super().__init__()
+        self.hidden = nn.ModuleList([nn.Linear(n_input, h_sizes[0])])
+        for i in range(len(h_sizes) - 1):
+            self.hidden.append(nn.Linear(h_sizes[i], h_sizes[i + 1]))
+        self.predict = nn.Linear(h_sizes[-1], n_output)
+    def forward(self, x):
+        for layer in self.hidden:
+            x = torch.sigmoid(layer(x))
+        return torch.sigmoid(self.predict(x))
+# ── Training utilities ─────────────────────────────────────────────────────────
+class _EarlyStopping:
+    """Monitor validation loss and restore best weights when improvement stalls."""
+    def __init__(self, patience: int = 500, min_delta: float = 1e-7):
+        self.patience = patience
+        self.min_delta = min_delta
+        self.best_val = np.inf
+        self.best_weights = None
+        self.best_epoch = 0
+        self.checkpoint = 0
+    def step(self, net, epoch: int, val_loss: float) -> bool:
+        """Update state. Returns True when training should stop."""
+        if val_loss + self.min_delta < self.best_val:
+            self.best_val = val_loss
+            self.best_weights = copy.deepcopy(net.state_dict())
+            self.best_epoch = epoch
+            self.checkpoint = 0
+        else:
+            self.checkpoint += 1
+        return self.checkpoint > self.patience
+    def restore_best(self, net) -> nn.Module:
+        print(f'        Early stopping at epoch {self.best_epoch}, val loss {self.best_val:.6f}')
+        net.load_state_dict(self.best_weights)
+        return net
+def _get_device(use_gpu: bool) -> str:
+    if use_gpu and torch.cuda.is_available():
+        return 'cuda'
+    if use_gpu:
+        print('No GPU available, defaulting to CPU.')
+    return 'cpu'
+def _train_valid_split(data_x, data_y, train_ratio=0.8, seed=30624700):
+    """Deterministic 80/20 train-validation split."""
+    n = data_x.shape[0]
+    train_n = int(np.floor(n * train_ratio))
+    idx = np.random.RandomState(seed=seed).permutation(n)
+    return (
+        data_x[idx[:train_n]], data_y[idx[:train_n]],
+        data_x[idx[train_n:]], data_y[idx[train_n:]],
+    )
+def _do_training(data_x, data_y, net, optimizer, loss_fn, steps=20000, batch_size=1024, device='cpu'):
+    """Train net for up to steps epochs with early stopping on validation loss."""
+    train_x, train_y, valid_x, valid_y = _train_valid_split(data_x, data_y)
+    train_x = torch.tensor(train_x, device=device, dtype=torch.float)
+    train_y = torch.tensor(train_y, device=device, dtype=torch.float)
+    valid_x = torch.tensor(valid_x, device=device, dtype=torch.float)
+    valid_y = torch.tensor(valid_y, device=device, dtype=torch.float)
+    loader = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size, shuffle=True)
+    valid_loader = DataLoader(TensorDataset(valid_x, valid_y), batch_size=batch_size)
+    early_stop = _EarlyStopping(patience=500, min_delta=1e-7)
+    net.train()
+    for epoch in range(steps):
+        for x, y in loader:
+            pred = net(x)
+            loss = loss_fn(pred, y)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+        val_loss = 0.0
+        net.eval()
+        with torch.no_grad():
+            for x, y in valid_loader:
+                val_loss += loss_fn(net(x), y).item()
+        val_loss /= len(valid_loader)
+        net.train()
+        if epoch % 1000 == 0:
+            print(f'        Epoch {epoch}: train loss {loss.item():.6f}, val loss {val_loss:.6f}')
+        if early_stop.step(net, epoch, val_loss):
+            net = early_stop.restore_best(net)
+            break
+    return net
+# ── Public API ─────────────────────────────────────────────────────────────────
+def create_and_train_network(
+    smi_list: List[str],
+    targets: List[float],
+    n_hidden: List[int] = None,
+    use_gpu: bool = True,
+    num_workers: int = 1,
+) -> MLP:
+    """Featurize SMILES, build a binary MLP, and train it.
+    Labels are 1 for molecules at or above the 80th fitness percentile, 0 otherwise.
+    The trained model predicts which unseen molecules are likely to score highly.
+    Parameters
+    ----------
+    smi_list : list of str
+        SMILES of all previously evaluated molecules.
+    targets : list of float
+        Fitness values corresponding to each SMILES in smi_list.
+    n_hidden : list of int, default [100, 10]
+        Hidden layer widths of the MLP.
+    use_gpu : bool
+        Use CUDA if available.
+    num_workers : int
+        Parallel workers for feature extraction.
+    Returns
+    -------
+    MLP
+        Trained PyTorch model.
+    """
+    if n_hidden is None:
+        n_hidden = [100, 10]
+    dataset_x = obtain_features(smi_list, num_workers=num_workers)
+    threshold = np.percentile(targets, 80)
+    dataset_y = np.expand_dims(
+        [1.0 if t >= threshold else 0.0 for t in targets], axis=-1
+    )
+    device = _get_device(use_gpu)
+    net = MLP(n_hidden, dataset_x.shape[-1], dataset_y.shape[-1]).to(device)
+    optimizer = torch.optim.Adam(net.parameters(), lr=0.001, weight_decay=1e-4)
+    loss_fn = nn.BCELoss()
+    net = _do_training(
+        dataset_x, dataset_y, net, optimizer, loss_fn,
+        steps=20000, batch_size=1024, device=device,
+    )
+    return net
+def obtain_model_pred(
+    smi_list: List[str],
+    net: MLP,
+    use_gpu: bool = True,
+    num_workers: int = 1,
+    batch_size: int = 1024,
+) -> np.ndarray:
+    """Return classifier predictions for a list of SMILES.
+    Parameters
+    ----------
+    smi_list : list of str
+    net : MLP
+        Trained model from ``create_and_train_network``.
+    use_gpu : bool
+    num_workers : int
+    batch_size : int
+    Returns
+    -------
+    numpy.ndarray of shape (N, 1)
+        Predicted probability of belonging to the high-fitness class.
+    """
+    device = _get_device(use_gpu)
+    data_x = obtain_features(smi_list, num_workers=num_workers)
+    data_x = torch.tensor(data_x, device=device, dtype=torch.float)
+    loader = DataLoader(TensorDataset(data_x), batch_size=batch_size)
+    net.eval()
+    predictions = []
+    with torch.no_grad():
+        for (x,) in loader:
+            predictions.append(net(x).detach().cpu().numpy())
+    return np.concatenate(predictions, axis=0)

tyche_tools/_utils.py ADDED Viewed

@@ -0,0 +1,51 @@
+from rdkit import RDLogger
+from rdkit.Chem import MolFromSmiles as smi2mol
+from rdkit.Chem import MolToSmiles as mol2smi
+RDLogger.DisableLog('rdApp.*')
+def get_selfie_chars(selfie):
+    """Split a SELFIES string into a list of its tokens.
+    Parameters
+    ----------
+    selfie : str
+        A valid SELFIES string.
+    Returns
+    -------
+    list of str
+    Examples
+    --------
+    >>> get_selfie_chars('[C][=C][C][=C][C][=C][Ring1][Branch1_1]')
+    ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[Branch1_1]']
+    """
+    chars = []
+    while selfie:
+        chars.append(selfie[selfie.find('['):selfie.find(']') + 1])
+        selfie = selfie[selfie.find(']') + 1:]
+    return chars
+def sanitize_smiles(smi):
+    """Return a canonical SMILES representation of the input string.
+    Parameters
+    ----------
+    smi : str
+    Returns
+    -------
+    mol : rdkit.Chem.rdchem.Mol or None
+    smi_canon : str or None
+        Canonical, non-isomeric SMILES string.
+    success : bool
+    """
+    try:
+        mol = smi2mol(smi, sanitize=True)
+        smi_canon = mol2smi(mol, isomericSmiles=False, canonical=True)
+        return mol, smi_canon, True
+    except Exception:
+        return None, None, False