PyPI - molcraft - Versions diffs - 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl - Mend

molcraft 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of molcraft might be problematic. Click here for more details.

Files changed (14) hide show

molcraft/__init__.py +2 -1
molcraft/datasets.py +123 -0
molcraft/experimental/peptides.py +28 -67
molcraft/features.py +5 -3
molcraft/featurizers.py +68 -27
molcraft/layers.py +1299 -647
molcraft/models.py +35 -5
molcraft/tensors.py +33 -12
{molcraft-0.1.0a1.dist-info → molcraft-0.1.0a3.dist-info}/METADATA +68 -1
molcraft-0.1.0a3.dist-info/RECORD +20 -0
molcraft-0.1.0a1.dist-info/RECORD +0 -19
{molcraft-0.1.0a1.dist-info → molcraft-0.1.0a3.dist-info}/WHEEL +0 -0
{molcraft-0.1.0a1.dist-info → molcraft-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
{molcraft-0.1.0a1.dist-info → molcraft-0.1.0a3.dist-info}/top_level.txt +0 -0

molcraft/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-__version__ = '0.1.0a1'
+__version__ = '0.1.0a3'
 import os
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@@ -14,3 +14,4 @@ from molcraft import ops
 from molcraft import records
 from molcraft import tensors
 from molcraft import callbacks
+from molcraft import datasets

molcraft/datasets.py ADDED Viewed

@@ -0,0 +1,123 @@
+import numpy as np
+import pandas as pd
+def split(
+    data: pd.DataFrame | np.ndarray,
+    train_size: float | None = None,
+    validation_size: float | None = None,
+    test_size: float = 0.1,
+    shuffle: bool = False,
+    random_state: int | None = None,
+) -> pd.DataFrame | np.ndarray:
+    """Splits dataset into subsets.
+    Args:
+        data:
+            A pd.DataFrame or np.ndarray object.
+        train_size:
+            Optional train size, as a fraction (`float`) or size (`int`).
+        validation_size:
+            Optional validation size, as a fraction (`float`) or size (`int`).
+        test_size:
+            Required test size, as a fraction (`float`) or size (`int`).
+        shuffle:
+            Whether the dataset should be shuffled prior to splitting.
+        random_state:
+            The random state (or seed). Only applicable if shuffling.
+    """
+    if not isinstance(data, (pd.DataFrame, np.ndarray, list)):
+        raise ValueError(
+            '`data` needs to be a pd.DataFrame, np.ndarray or a list. '
+            f'Found {type(data)}.'
+        )
+    size = len(data)
+    if test_size is None:
+        raise ValueError('`test_size` is required.')
+    elif test_size <= 0:
+        raise ValueError(
+            f'Test size needs to be positive. Found: {test_size}. '
+            'Either specify a positive `float` (fraction) or '
+            'a positive `int` (size).'
+        )
+    if train_size is not None and train_size <= 0:
+        raise ValueError(
+            f'Train size needs to be None or positive. Found: {train_size}. '
+            'Either specify `None`, a positive `float` (fraction) or '
+            'a positive `int` (size).'
+        )
+    if validation_size is not None and validation_size <= 0:
+        raise ValueError(
+            f'Validation size needs to be None or positive. Found: {validation_size}. '
+            'Either specify `None`, a positive `float` (fraction) or '
+            'a positive `int` (size).'
+        )
+    if isinstance(test_size, float):
+        test_size = int(size * test_size)
+    if validation_size and isinstance(validation_size, float):
+        validation_size = int(size * validation_size)
+    elif not validation_size:
+        validation_size = 0
+    if train_size and isinstance(train_size, float):
+        train_size = int(size * train_size)
+    elif not train_size:
+        train_size = 0
+    if not train_size:
+        train_size = size - test_size
+        if not validation_size:
+            train_size -= validation_size
+    remainder = size - (train_size + validation_size + test_size)
+    if remainder < 0:
+        raise ValueError(
+            'Sizes of data subsets add up to more than the size of the original data set: '
+            f'{size} < ({train_size} + {validation_size} + {test_size})'
+        )
+    if test_size <= 0:
+        raise ValueError(
+            f'Test size needs to be greater than 0. Found: {test_size}.'
+        )
+    if train_size <= 0:
+        raise ValueError(
+            f'Train size needs to be greater than 0. Found: {train_size}.'
+        )
+    train_size += remainder
+    if isinstance(data, pd.DataFrame):
+        if shuffle:
+            data = data.sample(
+                frac=1.0, replace=False, random_state=random_state
+            )
+        train_data = data.iloc[:train_size]
+        test_data = data.iloc[-test_size:]
+        if not validation_size:
+            return train_data, test_data
+        validation_data = data.iloc[train_size:-test_size]
+        return train_data, validation_data, test_data
+    if not isinstance(data, np.ndarray):
+        data = np.asarray(data)
+    np.random.seed(random_state)
+    random_indices = np.arange(size)
+    np.random.shuffle(random_indices)
+    data = data[random_indices]
+    train_data = data[:train_size]
+    test_data = data[-test_size:]
+    if not validation_size:
+        return train_data, test_data
+    validation_data = data[train_size:-test_size]
+    return train_data, validation_data, test_data

molcraft/experimental/peptides.py CHANGED Viewed

@@ -9,75 +9,36 @@ from molcraft import chem
 from molcraft import features
 from molcraft import featurizers
 from molcraft import tensors
+from molcraft import descriptors
-class PeptideGraphFeaturizer(featurizers.MolGraphFeaturizer):
-    def __init__(
-        self,
-        atom_features: list[features.Feature] | str | None = None,
-        bond_features: list[features.Feature] | str | None = None,
-        super_atom_feature: features.Feature | bool = None,
-        radius: int | float | None = None,
-        self_loops: bool = False,
-        include_hs: bool = False,
-        feature_dtype: str = 'float32',
-        index_dtype: str = 'int32',
-    ) -> None:
-        if super_atom_feature is None:
-            super_atom_feature = AminoAcidType()
-        super().__init__(
-            atom_features=atom_features,
-            bond_features=bond_features,
-            super_atom_feature=super_atom_feature,
-            radius=radius,
-            self_loops=self_loops,
-            include_hs=include_hs,
-            feature_dtype=feature_dtype,
-            index_dtype=index_dtype
-        )
+def Graph(
+    inputs,
+    atom_features: list[features.Feature] | str | None = 'auto',
+    bond_features: list[features.Feature] | str | None = 'auto',
+    super_atom: bool = True,
+    radius: int | float | None = None,
+    self_loops: bool = False,
+    include_hs: bool = False,
+    **kwargs,
+):
+    featurizer = featurizers.MolGraphFeaturizer(
+        atom_features=atom_features,
+        bond_features=bond_features,
+        molecule_features=[AminoAcidType()],
+        super_atom=super_atom,
+        radius=radius,
+        self_loops=self_loops,
+        include_hs=include_hs,
+        **kwargs,
+    )
-    def to_index(self, sequence: str):
-        pass
-    def static(self, inputs):
-        # TODO: Make sure it is an ordered sequence
-        inputs = [
-            features.residues[x] for x in ['G'] + inputs
-        ]
-        mols = [
-            chem.Mol.from_encoding(x, explicit_hs=self.include_hs) for x in inputs
-        ]
-        mols = [
-            mol for mol in mols if mol is not None
-        ]
-        if not mols:
-            return None
-        tensor_list: list[tensors.GraphTensor] = [super().call(mol) for mol in mols]
-        tensor: tensors.GraphTensor = tf.stack(tensor_list, axis=0)
-        return tensor
-    def call(self, inputs: str | tuple) -> tensors.GraphTensor:
-        args = []
-        if isinstance(inputs, (list, tuple, np.ndarray)):
-            inputs, *args = inputs
-        inputs = [
-            features.residues[x] for x in chem.sequence_split(inputs)
-        ]
-        tensor_list: list[tensors.GraphTensor] = [super().call(x) for x in inputs]
-        tensor: tensors.GraphTensor = tf.stack(tensor_list, axis=0)
-        tensor = tensor._merge()
-        context = {
-            k: v for (k, v) in zip(['label', 'weight'], args)
-        }
-        tensor = tensor.update(
-            {
-                'context': context
-            }
-        )
+    inputs = [
+        residues[x] for x in ['G'] + inputs
+    ]
+    tensor_list = [featurizer(x) for x in inputs]
+    return tf.stack(tensor_list, axis=0)
-        return tensor
 def GraphLookup(graph: tensors.GraphTensor) -> 'GraphLookupLayer':
     lookup = GraphLookupLayer()
@@ -203,7 +164,7 @@ class Gather(keras.layers.Layer):
 @keras.saving.register_keras_serializable(package='molcraft')
-class AminoAcidType(features.Feature):
+class AminoAcidType(descriptors.Descriptor):
     def __init__(self, vocab=None, **kwargs):
         vocab = [
@@ -217,7 +178,7 @@ class AminoAcidType(features.Feature):
         if not residue:
             raise KeyError(f'Could not find {mol.canonical_smiles} in `residues_reverse`.')
         mol = chem.remove_hs(mol)
-        return [_extract_residue_type(residues_reverse[mol.canonical_smiles])]
+        return _extract_residue_type(residues_reverse[mol.canonical_smiles])
 def sequence_split(sequence: str):
     patterns = [

molcraft/features.py CHANGED Viewed

@@ -155,9 +155,11 @@ class Distance(EdgeFeature):
         encode_oov: bool = True,
         **kwargs,
     ) -> None:
-        if max_distance is None:
-            max_distance = 20
-        vocab = list(range(max_distance + 1))
+        vocab = kwargs.pop('vocab', None)
+        if not vocab:
+            if max_distance is None:
+                max_distance = 20
+            vocab = list(range(max_distance + 1))
         super().__init__(
             vocab=vocab,
             allow_oov=allow_oov,

molcraft/featurizers.py CHANGED Viewed

@@ -200,12 +200,13 @@ class MolGraphFeaturizer(Featurizer):
         self.feature_dtype = 'float32'
         self.index_dtype = 'int32'
-    def call(self, x: str | typing.Tuple) -> tensors.GraphTensor:
-        if isinstance(x, (tuple, list, np.ndarray)):
-            x, *args = x
+    def call(self, inputs: str | tuple) -> tensors.GraphTensor:
+        if isinstance(inputs, (tuple, list, np.ndarray)):
+            x, *context = inputs
+            if len(context) and isinstance(context[0], dict):
+                context = copy.deepcopy(context[0])
         else:
-            args = []
+            x, context = inputs, None
         mol = chem.Mol.from_encoding(x, explicit_hs=self.include_hs)
@@ -220,14 +221,30 @@ class MolGraphFeaturizer(Featurizer):
         bond_feature = self.bond_features(mol)
         context_feature = self.context_feature(mol)
         molecule_size = self.num_atoms(mol)
-        context, node, edge = {}, {}, {}
-        for field, value in zip(['size', 'label', 'weight'], [molecule_size] + args):
-            context[field] = value
+        if isinstance(context, dict):
+            if 'x' in context:
+                context['feature'] = context.pop('x')
+            if 'y' in context:
+                context['label'] = context.pop('y')
+            if 'sample_weight' in context:
+                context['weight'] = context.pop('sample_weight')
+            context = {
+                **{'size': molecule_size},
+                **context
+            }
+        elif isinstance(context, list):
+            context = {
+                **{'size': molecule_size},
+                **{key: value for (key, value) in zip(['label', 'weight'], context)}
+            }
+        else:
+            context = {'size': molecule_size}
         if context_feature is not None:
             context['feature'] = context_feature
+        node = {}
         node['feature'] = atom_feature
         if bond_feature is not None and (self.radius > 1 or self.self_loops):
@@ -239,6 +256,7 @@ class MolGraphFeaturizer(Featurizer):
                 [bond_feature, zero_bond_feature], axis=0
             )
+        edge = {}
         if self.radius == 1:
             edge['source'], edge['target'] = mol.adjacency(
                 fill='full', sparse=True, self_loops=self.self_loops, dtype=self.index_dtype
@@ -384,6 +402,7 @@ class MolGraphFeaturizer(Featurizer):
         return cls(**config)
+@keras.saving.register_keras_serializable(package='molcraft')
 class MolGraphFeaturizer3D(MolGraphFeaturizer):
     """Molecular 3d-graph featurizer.
@@ -494,19 +513,25 @@ class MolGraphFeaturizer3D(MolGraphFeaturizer):
         self.embed_conformer = self.conformer_generator is not None
         self.radius = float(radius) if radius else None
-    def call(self, x: str | typing.Tuple) -> tensors.GraphTensor:
+    def call(self, inputs: str | tuple) -> tensors.GraphTensor:
-        if isinstance(x, (tuple, list, np.ndarray)):
-            x, *args = x
+        if isinstance(inputs, (tuple, list, np.ndarray)):
+            x, *context = inputs
+            if len(context) and isinstance(context[0], dict):
+                context = copy.deepcopy(context[0])
         else:
-            args = []
+            x, context = inputs, None
         explicit_hs = (self.include_hs or self.embed_conformer)
         mol = chem.Mol.from_encoding(x, explicit_hs=explicit_hs)
         if mol is None:
+            warn(
+                f'Could not obtain `chem.Mol` from {x}. '
+                'Proceeding without it.'
+            )
             return None
         if self.embed_conformer:
             mol = self.conformer_generator(mol)
             if not self.include_hs:
@@ -519,21 +544,38 @@ class MolGraphFeaturizer3D(MolGraphFeaturizer):
                 'of the `Featurizer` or input a 3D representation of the molecule. '
             )
-        context, node, edge = {}, {}, {}
-        context['size'] = self.num_atoms(mol) + int(self.super_atom)
-        for field, value in zip(['label', 'weight'], args):
-            context[field] = value
+        context_feature = self.context_feature(mol)
+        molecule_size = self.num_atoms(mol) + int(self.super_atom)
+        if isinstance(context, dict):
+            if 'x' in context:
+                context['feature'] = context.pop('x')
+            if 'y' in context:
+                context['label'] = context.pop('y')
+            if 'sample_weight' in context:
+                context['weight'] = context.pop('sample_weight')
+            context = {
+                **{'size': molecule_size},
+                **context
+            }
+        elif isinstance(context, list):
+            context = {
+                **{'size': molecule_size},
+                **{key: value for (key, value) in zip(['label', 'weight'], context)}
+            }
+        else:
+            context = {'size': molecule_size}
+        if context_feature is not None:
+            context['feature'] = context_feature
+        node = {}
         node['feature'] = self.atom_features(mol)
         if self._bond_features:
             edge_feature = self.bond_features(mol)
-        context_feature = self.context_feature(mol)
-        if context_feature is not None:
-            context['feature'] = context_feature
+        edge = {}
         mols = chem._split_mol_by_confs(mol)
         tensor_list = []
         for i, mol in enumerate(mols):
@@ -563,11 +605,10 @@ class MolGraphFeaturizer3D(MolGraphFeaturizer):
                 node_conformer['coordinate'] = np.concatenate(
                     [node_conformer['coordinate'], conformer.centroid[None]], axis=0
                 )
             tensor_list.append(
                 tensors.GraphTensor(context, node_conformer, edge_conformer)
             )
         return tensor_list
     def stack(self, outputs):
@@ -587,7 +628,7 @@ class MolGraphFeaturizer3D(MolGraphFeaturizer):
         config['conformer_generator'] = keras.saving.deserialize_keras_object(
             config['conformer_generator']
         )
-        return super().from_config(**config)
+        return super().from_config(config)
 def save_featurizer(

molcraft 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

Potentially problematic release.

molcraft 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl