molcraft 0.1.0a13__tar.gz → 0.1.0a15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of molcraft might be problematic.
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/PKG-INFO +1 -1
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/__init__.py +1 -1
- molcraft-0.1.0a15/molcraft/apps/qsrr.py +47 -0
- molcraft-0.1.0a15/molcraft/datasets.py +131 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/featurizers.py +1 -1
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/models.py +7 -8
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/ops.py +11 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft.egg-info/PKG-INFO +1 -1
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft.egg-info/SOURCES.txt +1 -0
- molcraft-0.1.0a13/molcraft/datasets.py +0 -123
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/LICENSE +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/README.md +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/apps/__init__.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/apps/peptides.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/callbacks.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/chem.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/conformers.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/descriptors.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/features.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/layers.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/losses.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/records.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/tensors.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft.egg-info/dependency_links.txt +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft.egg-info/requires.txt +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft.egg-info/top_level.txt +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/pyproject.toml +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/setup.cfg +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/tests/test_chem.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/tests/test_featurizers.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/tests/test_layers.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/tests/test_losses.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/tests/test_models.py +0 -0
- {molcraft-0.1.0a13 → molcraft-0.1.0a15}/tests/test_tensors.py +0 -0
molcraft-0.1.0a15/molcraft/apps/qsrr.py (new file)
@@ -0,0 +1,47 @@
+import molcraft
+import keras
+
+@keras.saving.register_keras_serializable(package='molcraft')
+class AuxiliaryFeatureInjection(molcraft.layers.GraphLayer):
+
+    def __init__(
+        self,
+        field: str = 'auxiliary_feature',
+        depth: int = 2,
+        drop: bool = True,
+        activation: str | None = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.field = field
+        self.depth = depth
+        self.drop = drop
+        self.activation = keras.activations.get(activation)
+
+    def build(self, spec: molcraft.tensors.GraphTensor.Spec) -> None:
+        units = spec.node['feature'].shape[1]
+        for i in range(self.depth):
+            setattr(
+                self, f'dense_{i}', self.get_dense(units, activation=self.activation)
+            )
+
+    def propagate(self, tensor: molcraft.tensors.GraphTensor) -> molcraft.tensors.GraphTensor:
+        x = tensor.context[self.field]
+        if self.drop:
+            tensor = tensor.update({'context': {self.field: None}})
+        for i in range(self.depth):
+            x = getattr(self, f'dense_{i}')(x)
+        node_feature = molcraft.ops.scatter_add(
+            tensor.node['feature'], tensor.node['super'], x
+        )
+        return tensor.update({'node': {'feature': node_feature}})
+
+    def get_config(self) -> dict:
+        config = super().get_config()
+        config.update({
+            'field': self.field,
+            'depth': self.depth,
+            'drop': self.drop,
+            'activation': keras.activations.serialize(self.activation)
+        })
+        return config
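For orientation, a minimal NumPy sketch (not part of the release; shapes and values are made up) of what the `scatter_add` call above does: the projected auxiliary vector of each subgraph is added onto the feature row of that subgraph's super node, leaving every other node feature untouched.

    import numpy as np

    # Hypothetical stand-in for tensor.node['feature']: 5 nodes, 3 features.
    node_feature = np.zeros((5, 3), dtype=np.float32)
    # Hypothetical stand-in for tensor.node['super']: the super-node row
    # of each of the two subgraphs in the batch.
    super_index = np.array([0, 3])
    # Projected auxiliary features, one row per subgraph.
    x = np.array([[1., 1., 1.], [2., 2., 2.]], dtype=np.float32)

    injected = node_feature.copy()
    injected[super_index] += x  # only rows 0 and 3 change
    print(injected)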
molcraft-0.1.0a15/molcraft/datasets.py (new file)
@@ -0,0 +1,131 @@
+import numpy as np
+import pandas as pd
+import typing
+
+
+def split(
+    data: pd.DataFrame | np.ndarray,
+    *,
+    train_size: float | None = None,
+    validation_size: float | None = None,
+    test_size: float | None = None,
+    groups: str | np.ndarray | None = None,
+    shuffle: bool = False,
+    random_state: int | None = None,
+) -> tuple[np.ndarray | pd.DataFrame, ...]:
+    """Splits the dataset into subsets.
+
+    Args:
+        data:
+            A pd.DataFrame or np.ndarray object.
+        train_size:
+            The size of the train set.
+        validation_size:
+            The size of the validation set.
+        test_size:
+            The size of the test set.
+        groups:
+            The groups to perform the splitting on.
+        shuffle:
+            Whether the dataset should be shuffled prior to splitting.
+        random_state:
+            The random state/seed. Only applicable if shuffling.
+    """
+    if not isinstance(data, (pd.DataFrame, np.ndarray)):
+        raise ValueError(f'Unsupported `data` type ({type(data)}).')
+
+    if isinstance(groups, str):
+        groups = data[groups].values
+    elif groups is None:
+        groups = np.arange(len(data))
+
+    indices = np.unique(groups)
+    size = len(indices)
+
+    if not train_size and not test_size:
+        raise ValueError(
+            'Found both `train_size` and `test_size` to be `None`, '
+            'specify at least one of them.'
+        )
+    if isinstance(test_size, float):
+        test_size = int(size * test_size)
+    if isinstance(train_size, float):
+        train_size = int(size * train_size)
+    if isinstance(validation_size, float):
+        validation_size = int(size * validation_size)
+    elif not validation_size:
+        validation_size = 0
+
+    if not train_size:
+        train_size = (size - test_size - validation_size)
+    if not test_size:
+        test_size = (size - train_size - validation_size)
+
+    remainder = size - (train_size + validation_size + test_size)
+    if remainder < 0:
+        raise ValueError(
+            'Subset sizes add up to more than the data size.'
+        )
+    train_size += remainder
+
+    if shuffle:
+        np.random.seed(random_state)
+        np.random.shuffle(indices)
+
+    train_mask = np.isin(groups, indices[:train_size])
+    test_mask = np.isin(groups, indices[-test_size:])
+    if not validation_size:
+        return data[train_mask], data[test_mask]
+    validation_mask = np.isin(groups, indices[train_size:-test_size])
+    return data[train_mask], data[validation_mask], data[test_mask]
+
+def cv_split(
+    data: pd.DataFrame | np.ndarray,
+    num_splits: int = 10,
+    groups: str | np.ndarray | None = None,
+    shuffle: bool = False,
+    random_state: int | None = None,
+) -> typing.Iterator[
+    tuple[np.ndarray | pd.DataFrame, np.ndarray | pd.DataFrame]
+]:
+    """Splits the dataset into cross-validation folds.
+
+    Args:
+        data:
+            A pd.DataFrame or np.ndarray object.
+        num_splits:
+            The number of cross-validation folds.
+        groups:
+            The groups to perform the splitting on.
+        shuffle:
+            Whether the dataset should be shuffled prior to splitting.
+        random_state:
+            The random state/seed. Only applicable if shuffling.
+    """
+    if not isinstance(data, (pd.DataFrame, np.ndarray)):
+        raise ValueError(f'Unsupported `data` type ({type(data)}).')
+
+    if isinstance(groups, str):
+        groups = data[groups].values
+    elif groups is None:
+        groups = np.arange(len(data))
+
+    indices = np.unique(groups)
+    size = len(indices)
+
+    if num_splits > size:
+        raise ValueError(
+            f'`num_splits` ({num_splits}) must not be greater than '
+            f'the data size or the number of groups ({size}).'
+        )
+    if shuffle:
+        np.random.seed(random_state)
+        np.random.shuffle(indices)
+
+    indices_splits = np.array_split(indices, num_splits)
+
+    for k in range(num_splits):
+        test_indices = indices_splits[k]
+        test_mask = np.isin(groups, test_indices)
+        train_mask = ~test_mask
+        yield data[train_mask], data[test_mask]
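A usage sketch of the new group-aware API (illustrative only; the frame, SMILES strings and `group` column are made up). When `groups` is given, the fractions apply to the unique group values, and all rows sharing a group land in the same subset:

    import pandas as pd
    from molcraft import datasets

    # Toy frame: 10 rows spread over 5 groups.
    df = pd.DataFrame({
        'smiles': ['C', 'CC', 'CCC', 'CCCC', 'CCCCC',
                   'CCCCCC', 'CCCCCCC', 'CCCCCCCC', 'CCCCCCCCC', 'CCCCCCCCCC'],
        'group':  [0, 0, 1, 1, 2, 2, 3, 3, 4, 4],
    })

    # Group-aware 3-way split: 3/1/1 of the 5 groups.
    train, val, test = datasets.split(
        df, train_size=0.6, validation_size=0.2, test_size=0.2,
        groups='group', shuffle=True, random_state=42,
    )

    # Group-aware cross-validation: here each fold holds out one group.
    for train_fold, test_fold in datasets.cv_split(df, num_splits=5, groups='group'):
        print(len(train_fold), len(test_fold))  # 8 2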
{molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/featurizers.py
@@ -169,7 +169,7 @@ class MolGraphFeaturizer(Featurizer):
         if default_atom_features:
             atom_features = [features.AtomType()]
             if not self.include_hs:
-                atom_features.append(features.
+                atom_features.append(features.NumHydrogens())
             atom_features.append(features.Degree())
         if not isinstance(self, MolGraphFeaturizer3D):
             default_bond_features = (
{molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/models.py
@@ -250,7 +250,7 @@ class GraphModel(layers.GraphLayer, keras.models.Model):
             val_size = int(val_split * x.num_subgraphs)
             x_val = _make_dataset(x[-val_size:], batch_size)
             x = x[:-val_size]
-        x = _make_dataset(x, batch_size)
+        x = _make_dataset(x, batch_size, shuffle=kwargs.get('shuffle', True))
         return super().fit(x, validation_data=x_val, **kwargs)

     def evaluate(self, x: tensors.GraphTensor | tf.data.Dataset, **kwargs):
@@ -397,7 +397,7 @@ class GraphModel(layers.GraphLayer, keras.models.Model):
             raise ValueError(
                 'Could not extract output. `Readout` layer not found.'
             )
-        return self.__class__(inputs, outputs, name=f'{self.name}
+        return self.__class__(inputs, outputs, name=f'{self.name}_backbone')

     def head(self) -> functional.Functional:
         if not isinstance(self, FunctionalGraphModel):
@@ -561,9 +561,8 @@ def _functional_init_arguments(args, kwargs):
         or ("inputs" in kwargs and "outputs" in kwargs)
     )

-def _make_dataset(x: tensors.GraphTensor, batch_size: int):
-
-
-    .
-
-    )
+def _make_dataset(x: tensors.GraphTensor, batch_size: int, shuffle: bool = False):
+    ds = tf.data.Dataset.from_tensor_slices(x)
+    if shuffle:
+        ds = ds.shuffle(buffer_size=ds.cardinality())
+    return ds.batch(batch_size).prefetch(-1)
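The net effect is that `fit` now shuffles the training pipeline by default while validation data stays in order. A standalone sketch of the same tf.data pipeline (illustrative, not the release code): a buffer as large as the dataset gives a full uniform shuffle, and `prefetch(-1)` is shorthand for `prefetch(tf.data.AUTOTUNE)`.

    import tensorflow as tf

    def make_dataset(slices, batch_size: int, shuffle: bool = False) -> tf.data.Dataset:
        ds = tf.data.Dataset.from_tensor_slices(slices)
        if shuffle:
            # cardinality() equals the number of elements, so the shuffle
            # buffer covers the whole dataset.
            ds = ds.shuffle(buffer_size=ds.cardinality())
        return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)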
{molcraft-0.1.0a13 → molcraft-0.1.0a15}/molcraft/ops.py
@@ -4,6 +4,7 @@ import tensorflow as tf
 from keras import backend


+@keras.saving.register_keras_serializable(package='molcraft')
 def gather(
     node_feature: tf.Tensor,
     edge: tf.Tensor
@@ -16,6 +17,7 @@ def gather(
     edge = keras.ops.expand_dims(edge, axis=-1)
     return keras.ops.take_along_axis(node_feature, edge, axis=0)

+@keras.saving.register_keras_serializable(package='molcraft')
 def aggregate(
     node_feature: tf.Tensor,
     edge: tf.Tensor,
@@ -30,6 +32,7 @@ def aggregate(
         node_feature, edge, num_nodes, sorted=False
     )

+@keras.saving.register_keras_serializable(package='molcraft')
 def propagate(
     node_feature: tf.Tensor,
     edge_source: tf.Tensor,
@@ -49,6 +52,7 @@ def propagate(

     return aggregate(node_feature, edge_target, num_nodes)

+@keras.saving.register_keras_serializable(package='molcraft')
 def scatter_update(
     inputs: tf.Tensor,
     indices: tf.Tensor,
@@ -62,6 +66,7 @@ def scatter_update(
     indices = keras.ops.expand_dims(indices, axis=-1)
     return keras.ops.scatter_update(inputs, indices, updates)

+@keras.saving.register_keras_serializable(package='molcraft')
 def scatter_add(
     inputs: tf.Tensor,
     indices: tf.Tensor,
@@ -78,6 +83,7 @@ def scatter_add(
     updates = scatter_update(keras.ops.zeros_like(inputs), indices, updates)
     return inputs + updates

+@keras.saving.register_keras_serializable(package='molcraft')
 def edge_softmax(
     score: tf.Tensor,
     edge_target: tf.Tensor
@@ -98,6 +104,7 @@ def edge_softmax(
     denominator = gather(denominator, edge_target)
     return numerator / denominator

+@keras.saving.register_keras_serializable(package='molcraft')
 def edge_weight(
     edge: tf.Tensor,
     edge_weight: tf.Tensor,
@@ -108,6 +115,7 @@ def edge_weight(
     edge_weight = keras.ops.expand_dims(edge_weight, axis=-1)
     return edge * edge_weight

+@keras.saving.register_keras_serializable(package='molcraft')
 def segment_mean(
     data: tf.Tensor,
     segment_ids: tf.Tensor,
@@ -142,6 +150,7 @@ def segment_mean(
     )
     return x / sizes[:, None]

+@keras.saving.register_keras_serializable(package='molcraft')
 def gaussian(
     x: tf.Tensor,
     mean: tf.Tensor,
@@ -155,6 +164,7 @@ def gaussian(
     a = (2 * np.pi) ** 0.5
     return keras.ops.exp(-0.5 * (((x - mean) / std) ** 2)) / (a * std)

+@keras.saving.register_keras_serializable(package='molcraft')
 def euclidean_distance(
     x1: tf.Tensor,
     x2: tf.Tensor,
@@ -169,6 +179,7 @@ def euclidean_distance(
         )
     )

+@keras.saving.register_keras_serializable(package='molcraft')
 def displacement(
     x1: tf.Tensor,
     x2: tf.Tensor,
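For context on why every public op gains this decorator: Keras 3 can only rebuild objects that are referenced by name in a saved model config if those names are registered. A minimal, self-contained illustration (the `demo` package name and `square` function are made up, not part of molcraft):

    import keras

    @keras.saving.register_keras_serializable(package='demo')
    def square(x):
        return x * x

    config = keras.saving.serialize_keras_object(square)
    restored = keras.saving.deserialize_keras_object(config)  # resolved via 'demo>square'
    assert restored(3) == 9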
molcraft-0.1.0a13/molcraft/datasets.py (removed; replaced by the new molcraft/datasets.py above)
@@ -1,123 +0,0 @@
-import numpy as np
-import pandas as pd
-
-
-def split(
-    data: pd.DataFrame | np.ndarray,
-    train_size: float | None = None,
-    validation_size: float | None = None,
-    test_size: float = 0.1,
-    shuffle: bool = False,
-    random_state: int | None = None,
-) -> pd.DataFrame | np.ndarray:
-    """Splits dataset into subsets.
-
-    Args:
-        data:
-            A pd.DataFrame or np.ndarray object.
-        train_size:
-            Optional train size, as a fraction (`float`) or size (`int`).
-        validation_size:
-            Optional validation size, as a fraction (`float`) or size (`int`).
-        test_size:
-            Required test size, as a fraction (`float`) or size (`int`).
-        shuffle:
-            Whether the dataset should be shuffled prior to splitting.
-        random_state:
-            The random state (or seed). Only applicable if shuffling.
-    """
-
-    if not isinstance(data, (pd.DataFrame, np.ndarray, list)):
-        raise ValueError(
-            '`data` needs to be a pd.DataFrame, np.ndarray or a list. '
-            f'Found {type(data)}.'
-        )
-
-    size = len(data)
-
-    if test_size is None:
-        raise ValueError('`test_size` is required.')
-    elif test_size <= 0:
-        raise ValueError(
-            f'Test size needs to be positive. Found: {test_size}. '
-            'Either specify a positive `float` (fraction) or '
-            'a positive `int` (size).'
-        )
-    if train_size is not None and train_size <= 0:
-        raise ValueError(
-            f'Train size needs to be None or positive. Found: {train_size}. '
-            'Either specify `None`, a positive `float` (fraction) or '
-            'a positive `int` (size).'
-        )
-    if validation_size is not None and validation_size <= 0:
-        raise ValueError(
-            f'Validation size needs to be None or positive. Found: {validation_size}. '
-            'Either specify `None`, a positive `float` (fraction) or '
-            'a positive `int` (size).'
-        )
-
-    if isinstance(test_size, float):
-        test_size = int(size * test_size)
-    if validation_size and isinstance(validation_size, float):
-        validation_size = int(size * validation_size)
-    elif not validation_size:
-        validation_size = 0
-
-    if train_size and isinstance(train_size, float):
-        train_size = int(size * train_size)
-    elif not train_size:
-        train_size = 0
-
-    if not train_size:
-        train_size = size - test_size
-        if not validation_size:
-            train_size -= validation_size
-
-    remainder = size - (train_size + validation_size + test_size)
-
-    if remainder < 0:
-        raise ValueError(
-            'Sizes of data subsets add up to more than the size of the original data set: '
-            f'{size} < ({train_size} + {validation_size} + {test_size})'
-        )
-    if test_size <= 0:
-        raise ValueError(
-            f'Test size needs to be greater than 0. Found: {test_size}.'
-        )
-    if train_size <= 0:
-        raise ValueError(
-            f'Train size needs to be greater than 0. Found: {train_size}.'
-        )
-
-    train_size += remainder
-
-    if isinstance(data, pd.DataFrame):
-        if shuffle:
-            data = data.sample(
-                frac=1.0, replace=False, random_state=random_state
-            )
-        train_data = data.iloc[:train_size]
-        test_data = data.iloc[-test_size:]
-        if not validation_size:
-            return train_data, test_data
-        validation_data = data.iloc[train_size:-test_size]
-        return train_data, validation_data, test_data
-
-    if not isinstance(data, np.ndarray):
-        data = np.asarray(data)
-
-    np.random.seed(random_state)
-
-    random_indices = np.arange(size)
-    np.random.shuffle(random_indices)
-    data = data[random_indices]
-
-    train_data = data[:train_size]
-    test_data = data[-test_size:]
-    if not validation_size:
-        return train_data, test_data
-    validation_data = data[train_size:-test_size]
-    return train_data, validation_data, test_data
-
-
-

All other files are unchanged between 0.1.0a13 and 0.1.0a15.