molcraft 0.1.0a16__py3-none-any.whl → 0.1.0a17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of molcraft might be problematic.
- molcraft/__init__.py +1 -2
- molcraft/applications/chromatography.py +0 -0
- molcraft/applications/proteomics.py +47 -92
- molcraft/chem.py +17 -22
- molcraft/datasets.py +6 -6
- molcraft/descriptors.py +14 -0
- molcraft/features.py +50 -58
- molcraft/featurizers.py +257 -487
- molcraft/layers.py +1 -1
- molcraft/models.py +2 -0
- molcraft/records.py +24 -15
- {molcraft-0.1.0a16.dist-info → molcraft-0.1.0a17.dist-info}/METADATA +13 -12
- molcraft-0.1.0a17.dist-info/RECORD +21 -0
- molcraft/conformers.py +0 -151
- molcraft-0.1.0a16.dist-info/RECORD +0 -21
- {molcraft-0.1.0a16.dist-info → molcraft-0.1.0a17.dist-info}/WHEEL +0 -0
- {molcraft-0.1.0a16.dist-info → molcraft-0.1.0a17.dist-info}/licenses/LICENSE +0 -0
- {molcraft-0.1.0a16.dist-info → molcraft-0.1.0a17.dist-info}/top_level.txt +0 -0
molcraft/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-__version__ = '0.1.0a16'
+__version__ = '0.1.0a17'
 
 import os
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@@ -6,7 +6,6 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 from molcraft import chem
 from molcraft import features
 from molcraft import descriptors
-from molcraft import conformers
 from molcraft import featurizers
 from molcraft import layers
 from molcraft import models
molcraft/applications/chromatography.py
File without changes
molcraft/applications/proteomics.py
CHANGED
@@ -3,7 +3,6 @@ import keras
 import numpy as np
 import tensorflow as tf
 import tensorflow_text as tf_text
-import json
 
 from molcraft import featurizers
 from molcraft import tensors
@@ -46,7 +45,22 @@ default_residues: dict[str, str] = {
     "Y": "N[C@@H](Cc1ccc(O)cc1)C(=O)O",
 }
 
-
+def register_residues(residues: dict[str, str]) -> None:
+    # TODO: Implement functions that check if residue has N- or C-terminal mod
+    # if C-terminal mod, no need to enforce concatenatable perm.
+    # if N-terminal mod, enforce only 'C(=O)O'
+    # if normal mod, enforce concatenateable perm ('N[C@@H]' and 'C(=O)O)).
+    for residue, smiles in residues.items():
+        if residue.startswith('P'):
+            smiles.startswith('N'), f'Incorrect SMILES permutation for {residue}.'
+        elif not residue.startswith('['):
+            smiles.startswith('N[C@@H]'), f'Incorrect SMILES permutation for {residue}.'
+        if len(residue) > 1 and not residue[1] == "-":
+            assert smiles.endswith('C(=O)O'), f'Incorrect SMILES permutation for {residue}.'
+        registered_residues[residue] = smiles
+        registered_residues[residue + '*'] = smiles.strip('O')
+
+
 class Peptide(chem.Mol):
 
     @classmethod
@@ -72,40 +86,51 @@ class ResidueEmbedding(keras.layers.Layer):
         self,
         featurizer: featurizers.MolGraphFeaturizer,
         embedder: models.GraphModel,
+        residues: dict[str, str] | None = None,
         **kwargs
     ) -> None:
-        residues = kwargs.pop('_residues', None)
         super().__init__(**kwargs)
         if residues is None:
-            residues =
-        self.
+            residues = {}
+        self._residue_dict = {**default_residues, **residues}
         self.embedder = embedder
         self.featurizer = featurizer
+        self.embedding_dim = self.embedder.output.shape[-1]
         self.ragged_split = SequenceSplitter(pad=False)
        self.split = SequenceSplitter(pad=True)
+        self.use_cached_embeddings = tf.Variable(False)
         self.supports_masking = True
 
-
-
-
-
+    @property
+    def residues(self) -> dict[str, str]:
+        return self._residue_dict
+
+    @residues.setter
+    def residues(self, residues: dict[str, str]) -> None:
+        self._residue_dict = residues
         num_residues = len(residues)
-
+        residue_keys = sorted(residues.keys())
+        oov_value = np.where(np.array(residue_keys) == "G")[0][0]
         self.mapping = tf.lookup.StaticHashTable(
             tf.lookup.KeyValueTensorInitializer(
-                keys=
+                keys=residue_keys,
                 values=range(num_residues)
             ),
-            default_value
+            default_value=oov_value,
         )
-        self.graph = tf.stack([
+        self.graph = tf.stack([
+            self.featurizer(residues[residue]) for residue in residue_keys
+        ], axis=0)
         self.cached_embeddings = tf.Variable(
-            initial_value=tf.zeros((num_residues, embedding_dim))
+            initial_value=tf.zeros((num_residues, self.embedding_dim))
         )
-
+        _ = self.cache_and_get_embeddings()
+
+    def build(self, input_shape) -> None:
+        self.residues = self._residue_dict
         super().build(input_shape)
 
-    def call(self, sequences, training=None) ->
+    def call(self, sequences: tf.Tensor, training: bool = None) -> tf.Tensor:
         if training is False:
             self.use_cached_embeddings.assign(True)
         else:
@@ -113,17 +138,16 @@ class ResidueEmbedding(keras.layers.Layer):
         embeddings = tf.cond(
             pred=self.use_cached_embeddings,
             true_fn=lambda: self.cached_embeddings,
-            false_fn=lambda: self.
+            false_fn=lambda: self.cache_and_get_embeddings(),
         )
         sequences = self.ragged_split(sequences)
         sequences = keras.ops.concatenate([
             tf.strings.join([sequences[:, :-1], '*']), sequences[:, -1:]
         ], axis=1)
         indices = self.mapping.lookup(sequences)
-        indices = keras.ops.where(indices == -1, self.oov_index, indices)
         return tf.gather(embeddings, indices).to_tensor()
 
-    def
+    def cache_and_get_embeddings(self) -> tf.Tensor:
         embeddings = self.embedder(self.graph)
         self.cached_embeddings.assign(embeddings)
         return embeddings
@@ -139,9 +163,9 @@ class ResidueEmbedding(keras.layers.Layer):
     def get_config(self) -> dict:
         config = super().get_config()
         config.update({
-            '_residues': self._residues,
             'featurizer': keras.saving.serialize_keras_object(self.featurizer),
-            'embedder': keras.saving.serialize_keras_object(self.embedder)
+            'embedder': keras.saving.serialize_keras_object(self.embedder),
+            'residues': self._residue_dict,
         })
         return config
 
@@ -153,87 +177,18 @@ class ResidueEmbedding(keras.layers.Layer):
 
 
 @keras.saving.register_keras_serializable(package='proteomics')
-class SequenceSplitter(keras.layers.Layer):
+class SequenceSplitter(keras.layers.Layer):
 
     def __init__(self, pad: bool, **kwargs):
         super().__init__(**kwargs)
         self.pad = pad
 
-    def call(self, inputs):
+    def call(self, inputs: tf.Tensor) -> tf.Tensor | tf.RaggedTensor:
         inputs = tf_text.regex_split(inputs, residue_pattern, residue_pattern)
         if self.pad:
             inputs = inputs.to_tensor()
         return inputs
 
 
-def interpret(model: keras.models.Model, sequence: list[str]) -> tensors.GraphTensor:
-
-    if not tf.is_tensor(sequence):
-        sequence = keras.ops.convert_to_tensor(sequence)
-
-    # Find embedding layer
-    for layer in model.layers:
-        if isinstance(layer, ResidueEmbedding):
-            break
-
-    # Use embedding layer to convert the sequence to a graph
-    residues = layer.ragged_split(sequence)
-    residues = keras.ops.concatenate([
-        tf.strings.join([residues[:, :-1], '*']), residues[:, -1:]
-    ], axis=1)
-    indices = layer.mapping.lookup(residues)
-    graph = tf.concat([
-        layer.graph[residue_ids] for residue_ids in indices
-    ], axis=0)
-
-    # Define layer which reshapes data into sequences of residue embeddings
-    num_residues = indices.row_lengths()
-    to_sequence = (
-        lambda x: tf.RaggedTensor.from_row_lengths(x, num_residues).to_tensor()
-    )
-    reshape = keras.layers.Lambda(to_sequence)
-
-    # Obtain the embedder part of the original model
-    embedder = layer.embedder
-    # Obtain the remaining part of the original model
-    predictor = keras.models.Model(embedder.output, model.output)
-    # Obtain an 'interpretable model', based on the original model
-    inputs = layers.Input(graph.spec)
-    x = inputs
-    for layer in embedder.layers:  # Loop over layers to expose them
-        x = layer(x)
-    x = reshape(x)
-    outputs = predictor(x)
-    interpretable_model = models.GraphModel(inputs, outputs)
-
-    # Interpret original model through the 'interpretable model'
-    graph = models.interpret(interpretable_model, graph)
-    del interpretable_model
-
-    # Update 'size' field with new sizes corresponding to peptides for convenience
-    # Allows the user to obtain n:th peptide graph using indexing: nth_peptide = graph[n]
-    peptide_indices = range(len(num_residues))
-    peptide_indicator = keras.ops.repeat(peptide_indices, num_residues)
-    residue_sizes = graph.context['size']
-    peptide_sizes = keras.ops.segment_sum(residue_sizes, peptide_indicator)
-    return graph.update({'context': {'size': peptide_sizes, 'sequence': sequence}})
-
-
-def register_residues(residues: dict[str, str]) -> None:
-    # TODO: Implement functions that check if residue has N- or C-terminal mod
-    # if C-terminal mod, no need to enforce concatenatable perm.
-    # if N-terminal mod, enforce only 'C(=O)O'
-    # if normal mod, enforce concatenateable perm ('N[C@@H]' and 'C(=O)O)).
-    for residue, smiles in residues.items():
-        if residue.startswith('P'):
-            smiles.startswith('N'), f'Incorrect SMILES permutation for {residue}.'
-        elif not residue.startswith('['):
-            smiles.startswith('N[C@@H]'), f'Incorrect SMILES permutation for {residue}.'
-        if len(residue) > 1 and not residue[1] == "-":
-            assert smiles.endswith('C(=O)O'), f'Incorrect SMILES permutation for {residue}.'
-        registered_residues[residue] = smiles
-        registered_residues[residue + '*'] = smiles.strip('O')
-
-
 registered_residues: dict[str, str] = {}
 register_residues(default_residues)
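Note: the `register_residues` helper moved above the class definitions and `ResidueEmbedding` now accepts a residue mapping directly via its new `residues` keyword argument. A minimal usage sketch follows; the residue key 'Z' and its SMILES are hypothetical and not part of the package:

from molcraft.applications import proteomics

# Hypothetical single-letter residue; key and SMILES are illustrative only.
# Non-terminal residues are expected in the concatenable 'N[C@@H]...C(=O)O' form
# that register_residues checks for.
proteomics.register_residues({'Z': 'N[C@@H](CC(=O)NCCO)C(=O)O'})

# Both the residue and its non-terminal '*' variant (trailing hydroxyl oxygen
# stripped by smiles.strip('O')) end up in the registry.
assert 'Z' in proteomics.registered_residues
assert 'Z*' in proteomics.registered_residues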
molcraft/chem.py
CHANGED
@@ -19,8 +19,6 @@ class Mol(Chem.Mol):
     @classmethod
     def from_encoding(cls, encoding: str, explicit_hs: bool = False, **kwargs) -> 'Mol':
         rdkit_mol = get_mol(encoding, **kwargs)
-        if not rdkit_mol:
-            return None
         if explicit_hs:
             rdkit_mol = Chem.AddHs(rdkit_mol)
         rdkit_mol.__class__ = cls
@@ -102,21 +100,13 @@ class Mol(Chem.Mol):
 
     def get_conformer(self, index: int = 0) -> 'Conformer':
         if self.num_conformers == 0:
-            warnings.warn(
-                'Molecule has no conformer. To embed conformer(s), invoke the `embed` method, '
-                'and optionally followed by `minimize()` to perform force field minimization.',
-                stacklevel=2
-            )
+            warnings.warn('Molecule has no conformer.')
             return None
         return Conformer.cast(self.GetConformer(index))
 
     def get_conformers(self) -> list['Conformer']:
         if self.num_conformers == 0:
-            warnings.warn(
-                'Molecule has no conformers. To embed conformers, invoke the `embed` method, '
-                'and optionally followed by `minimize()` to perform force field minimization.',
-                stacklevel=2
-            )
+            warnings.warn('Molecule has no conformer.')
             return []
         return [Conformer.cast(x) for x in self.GetConformers()]
 
@@ -222,11 +212,10 @@ def get_mol(
     else:
         mol = Chem.MolFromSmiles(encoding, sanitize=False)
     if mol is not None:
-
-
-
-
-        )
+        mol = sanitize_mol(mol, strict, assign_stereo_chemistry)
+        if mol is not None:
+            return mol
+    raise ValueError(f'Could not obtain `chem.Mol` from {encoding}.')
 
 def get_adjacency_matrix(
     mol: Chem.Mol,
@@ -402,8 +391,9 @@ def embed_conformers(
     mol: Mol,
     num_conformers: int,
     method: str = 'ETKDGv3',
+    random_seed: int | None = None,
     **kwargs
-) ->
+) -> Mol:
     available_embedding_methods = {
         'ETDG': rdDistGeom.ETDG(),
         'ETKDG': rdDistGeom.ETKDG(),
@@ -423,6 +413,9 @@ def embed_conformers(
     for key, value in kwargs.items():
         setattr(embedding_method, key, value)
 
+    if random_seed is not None:
+        embedding_method.randomSeed = random_seed
+
     success = rdDistGeom.EmbedMultipleConfs(
         mol, numConfs=num_conformers, params=embedding_method
     )
@@ -440,6 +433,8 @@ def embed_conformers(
         fallback_embedding_method.useRandomCoords = True
         fallback_embedding_method.maxAttempts = max_attempts
         fallback_embedding_method.clearConfs = False
+        if random_seed is not None:
+            fallback_embedding_method.randomSeed = random_seed
         success = rdDistGeom.EmbedMultipleConfs(
             mol, numConfs=(num_conformers - num_successes), params=fallback_embedding_method
         )
@@ -459,7 +454,7 @@ def optimize_conformers(
     num_threads: bool = 1,
     ignore_interfragment_interactions: bool = True,
     vdw_threshold: float = 10.0,
-):
+) -> Mol:
     available_force_field_methods = [
         'MMFF', 'MMFF94', 'MMFF94s', 'UFF'
     ]
@@ -502,7 +497,7 @@ def prune_conformers(
     keep: int = 1,
     threshold: float = 0.0,
     energy_force_field: str = 'UFF',
-):
+) -> Mol:
     if mol.num_conformers == 0:
         warnings.warn(
             'Molecule has no conformers. To embed conformers, invoke the `embed` method, '
@@ -539,7 +534,7 @@ def _uff_optimize_conformers(
     vdw_threshold: float = 10.0,
     ignore_interfragment_interactions: bool = True,
     **kwargs,
-) ->
+) -> tuple[list[float], list[bool]]:
     """Universal Force Field Minimization.
     """
     results = rdForceFieldHelpers.UFFOptimizeMoleculeConfs(
@@ -560,7 +555,7 @@ def _mmff_optimize_conformers(
     variant: str = 'MMFF94',
     ignore_interfragment_interactions: bool = True,
     **kwargs,
-) ->
+) -> tuple[list[float], list[bool]]:
     """Merck Molecular Force Field Minimization.
     """
     if not rdForceFieldHelpers.MMFFHasAllMoleculeParams(mol):
molcraft/datasets.py
CHANGED
@@ -11,7 +11,7 @@ def split(
     test_size: float | None = None,
     groups: str | np.ndarray = None,
     shuffle: bool = False,
-
+    random_seed: int | None = None,
 ) -> tuple[np.ndarray | pd.DataFrame, ...]:
     """Splits the dataset into subsets.
 
@@ -28,7 +28,7 @@ def split(
         The groups to perform the splitting on.
     shuffle:
         Whether the dataset should be shuffled prior to splitting.
-
+    random_seed:
         The random state/seed. Only applicable if shuffling.
     """
     if not isinstance(data, (pd.DataFrame, np.ndarray)):
@@ -69,7 +69,7 @@ def split(
         train_size += remainder
 
     if shuffle:
-        np.random.seed(
+        np.random.seed(random_seed)
         np.random.shuffle(indices)
 
     train_mask = np.isin(groups, indices[:train_size])
@@ -84,7 +84,7 @@ def cv_split(
     num_splits: int = 10,
     groups: str | np.ndarray = None,
     shuffle: bool = False,
-
+    random_seed: int | None = None,
 ) -> typing.Iterator[
     tuple[np.ndarray | pd.DataFrame, np.ndarray | pd.DataFrame]
 ]:
@@ -99,7 +99,7 @@ def cv_split(
         The groups to perform the splitting on.
     shuffle:
         Whether the dataset should be shuffled prior to splitting.
-
+    random_seed:
         The random state/seed. Only applicable if shuffling.
     """
     if not isinstance(data, (pd.DataFrame, np.ndarray)):
@@ -119,7 +119,7 @@ def cv_split(
             f'the data size or the number of groups ({size}).'
         )
     if shuffle:
-        np.random.seed(
+        np.random.seed(random_seed)
         np.random.shuffle(indices)
 
     indices_splits = np.array_split(indices, num_splits)
molcraft/descriptors.py
CHANGED
@@ -91,3 +91,17 @@ class NumRings(Descriptor):
     def call(self, mol: chem.Mol) -> np.ndarray:
         return rdMolDescriptors.CalcNumRings(mol)
 
+
+@keras.saving.register_keras_serializable(package='molcraft')
+class AtomCount(Descriptor):
+
+    def __init__(self, atom_type: str, **kwargs):
+        super().__init__(**kwargs)
+        self.atom_type = atom_type
+
+    def call(self, mol: chem.Mol) -> np.ndarray:
+        count = 0
+        for atom in mol.atoms:
+            if atom.GetSymbol() == self.atom_type:
+                count += 1
+        return count
molcraft/features.py
CHANGED
@@ -41,11 +41,7 @@ class Feature(abc.ABC):
 
     def __call__(self, mol: chem.Mol) -> np.ndarray:
         if not isinstance(mol, chem.Mol):
-            raise ValueError(
-                f'Input to {self.name} needs to be a `chem.Mol`, which '
-                'implements two properties that should be iterated over '
-                'to compute features: `atoms` and `bonds`.'
-            )
+            raise TypeError(f'Input to {self.name} must be a `chem.Mol` instance.')
         features = self.call(mol)
         if len(features) != mol.num_atoms and len(features) != mol.num_bonds:
             raise ValueError(
@@ -119,59 +115,6 @@ class Feature(abc.ABC):
         return np.asarray([value], dtype=self.dtype)
 
 
-@keras.saving.register_keras_serializable(package='molcraft')
-class EdgeFeature(Feature):
-
-    def __call__(self, mol: chem.Mol) -> np.ndarray:
-        if not isinstance(mol, chem.Mol):
-            raise ValueError(
-                f'Input to {self.name} needs to be a `chem.Mol`, which '
-                'implements two properties that should be iterated over '
-                'to compute features: `atoms` and `bonds`.'
-            )
-        features = self.call(mol)
-        if len(features) != int(mol.num_atoms**2):
-            raise ValueError(
-                f'The number of features computed by {self.name} does not '
-                'match the number of node pairs in the `chem.Mol` object. '
-                f'Make sure the list of items returned by {self.name}(input) '
-                'correspond to node/atom pairs: '
-                '[(0, 0), (0, 1), ..., (0, N), (1, 0), ... (N, N)], '
-                'where N denotes the number of nodes/atoms.'
-            )
-        func = (
-            self._featurize_categorical if self.vocab else
-            self._featurize_floating
-        )
-        return np.asarray([func(x) for x in features], dtype=self.dtype)
-
-
-@keras.saving.register_keras_serializable(package='molcraft')
-class Distance(EdgeFeature):
-
-    def __init__(
-        self,
-        max_distance: int = None,
-        allow_oov: int = True,
-        encode_oov: bool = True,
-        **kwargs,
-    ) -> None:
-        vocab = kwargs.pop('vocab', None)
-        if not vocab:
-            if max_distance is None:
-                max_distance = 20
-            vocab = list(range(max_distance + 1))
-        super().__init__(
-            vocab=vocab,
-            allow_oov=allow_oov,
-            encode_oov=encode_oov,
-            **kwargs
-        )
-
-    def call(self, mol: chem.Mol) -> list[int]:
-        return [int(x) for x in chem.get_distances(mol).reshape(-1)]
-
-
 @keras.saving.register_keras_serializable(package='molcraft')
 class AtomType(Feature):
     def call(self, mol: chem.Mol) -> list[int, float, str]:
@@ -340,6 +283,55 @@ class IsRotatable(Feature):
         return chem.rotatable_bonds(mol)
 
 
+@keras.saving.register_keras_serializable(package='molcraft')
+class PairFeature(Feature):
+
+    def __call__(self, mol: chem.Mol) -> np.ndarray:
+        if not isinstance(mol, chem.Mol):
+            raise TypeError(f'Input to {self.name} must be a `chem.Mol` instance.')
+        features = self.call(mol)
+        if len(features) != int(mol.num_atoms**2):
+            raise ValueError(
+                f'The number of features computed by {self.name} does not '
+                'match the number of node/atom pairs in the `chem.Mol` object. '
+                f'Make sure the list of items returned by {self.name}(input) '
+                'correspond to node/atom pairs: '
+                '[(0, 0), (0, 1), ..., (0, N), (1, 0), ... (N, N)], '
+                'where N denotes the number of nodes/atoms.'
+            )
+        func = (
+            self._featurize_categorical if self.vocab else
+            self._featurize_floating
+        )
+        return np.asarray([func(x) for x in features], dtype=self.dtype)
+
+
+@keras.saving.register_keras_serializable(package='molcraft')
+class PairDistance(PairFeature):
+
+    def __init__(
+        self,
+        max_distance: int = None,
+        allow_oov: int = True,
+        encode_oov: bool = True,
+        **kwargs,
+    ) -> None:
+        vocab = kwargs.pop('vocab', None)
+        if not vocab:
+            if max_distance is None:
+                max_distance = 10
+            vocab = list(range(max_distance + 1))
+        super().__init__(
+            vocab=vocab,
+            allow_oov=allow_oov,
+            encode_oov=encode_oov,
+            **kwargs
+        )
+
+    def call(self, mol: chem.Mol) -> list[int]:
+        return [int(x) for x in chem.get_distances(mol).reshape(-1)]
+
+
 default_vocabulary = {
     'AtomType': [
         '*', 'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na',