PyPI - molcraft - Versions diffs - 0.1.0a12__py3-none-any.whl → 0.1.0a14__py3-none-any.whl - Mend

molcraft 0.1.0a12__py3-none-any.whl → 0.1.0a14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of molcraft might be problematic. Click here for more details.

Files changed (14) hide show

molcraft/__init__.py +1 -1
molcraft/chem.py +4 -4
molcraft/datasets.py +88 -80
molcraft/descriptors.py +5 -3
molcraft/features.py +23 -18
molcraft/featurizers.py +3 -3
molcraft/models.py +6 -7
molcraft/ops.py +16 -0
{molcraft-0.1.0a12.dist-info → molcraft-0.1.0a14.dist-info}/METADATA +1 -1
molcraft-0.1.0a14.dist-info/RECORD +21 -0
molcraft-0.1.0a12.dist-info/RECORD +0 -21
{molcraft-0.1.0a12.dist-info → molcraft-0.1.0a14.dist-info}/WHEEL +0 -0
{molcraft-0.1.0a12.dist-info → molcraft-0.1.0a14.dist-info}/licenses/LICENSE +0 -0
{molcraft-0.1.0a12.dist-info → molcraft-0.1.0a14.dist-info}/top_level.txt +0 -0

molcraft/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-__version__ = '0.1.0a12'
+__version__ = '0.1.0a14'
 import os
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

molcraft/chem.py CHANGED Viewed

@@ -331,20 +331,20 @@ def get_shortest_paths(
 def get_periodic_table():
     return Chem.GetPeriodicTable()
-def gasteiger_charges(mol: 'Mol') -> list[float]:
+def partial_charges(mol: 'Mol') -> list[float]:
     rdPartialCharges.ComputeGasteigerCharges(mol)
     return [atom.GetDoubleProp("_GasteigerCharge") for atom in mol.atoms]
 def logp_contributions(mol: 'Mol') -> list[float]:
     return [i[0] for i in rdMolDescriptors._CalcCrippenContribs(mol)]
-def molar_refractivity_contribution(mol: 'Mol') -> list[float]:
+def molar_refractivity_contributions(mol: 'Mol') -> list[float]:
     return [i[1] for i in rdMolDescriptors._CalcCrippenContribs(mol)]
-def tpsa_contribution(mol: 'Mol') -> list[float]:
+def total_polar_surface_area_contributions(mol: 'Mol') -> list[float]:
     return list(rdMolDescriptors._CalcTPSAContribs(mol))
-def asa_contribution(mol: 'Mol') -> list[float]:
+def accessible_surface_area_contributions(mol: 'Mol') -> list[float]:
     return list(rdMolDescriptors._CalcLabuteASAContribs(mol)[0])
 def hydrogen_acceptors(mol: 'Mol') -> list[bool]:

molcraft/datasets.py CHANGED Viewed

@@ -1,123 +1,131 @@
 import numpy as np
 import pandas as pd
+import typing
 def split(
     data: pd.DataFrame | np.ndarray,
+    *,
     train_size: float | None = None,
     validation_size: float | None = None,
-    test_size: float = 0.1,
+    test_size: float | None = None,
+    groups: str | np.ndarray = None,
     shuffle: bool = False,
     random_state: int | None = None,
-) -> pd.DataFrame | np.ndarray:
-    """Splits dataset into subsets.
+) -> tuple[np.ndarray | pd.DataFrame, ...]:
+    """Splits the dataset into subsets.
     Args:
         data:
             A pd.DataFrame or np.ndarray object.
         train_size:
-            Optional train size, as a fraction (`float`) or size (`int`).
+            The size of the train set.
         validation_size:
-            Optional validation size, as a fraction (`float`) or size (`int`).
+            The size of the validation set.
         test_size:
-            Required test size, as a fraction (`float`) or size (`int`).
+            The size of the test set.
+        groups:
+            The groups to perform the splitting on.
         shuffle:
             Whether the dataset should be shuffled prior to splitting.
         random_state:
-            The random state (or seed). Only applicable if shuffling.
+            The random state/seed. Only applicable if shuffling.
     """
+    if not isinstance(data, (pd.DataFrame, np.ndarray)):
+        raise ValueError(f'Unsupported `data` type ({type(data)}).')
-    if not isinstance(data, (pd.DataFrame, np.ndarray, list)):
-        raise ValueError(
-            '`data` needs to be a pd.DataFrame, np.ndarray or a list. '
-            f'Found {type(data)}.'
-        )
-    size = len(data)
+    if isinstance(groups, str):
+        groups = data[groups].values
+    elif groups is None:
+        groups = np.arange(len(data))
-    if test_size is None:
-        raise ValueError('`test_size` is required.')
-    elif test_size <= 0:
-        raise ValueError(
-            f'Test size needs to be positive. Found: {test_size}. '
-            'Either specify a positive `float` (fraction) or '
-            'a positive `int` (size).'
-        )
-    if train_size is not None and train_size <= 0:
-        raise ValueError(
-            f'Train size needs to be None or positive. Found: {train_size}. '
-            'Either specify `None`, a positive `float` (fraction) or '
-            'a positive `int` (size).'
-        )
-    if validation_size is not None and validation_size <= 0:
+    indices = np.unique(groups)
+    size = len(indices)
+    if not train_size and not test_size:
         raise ValueError(
-            f'Validation size needs to be None or positive. Found: {validation_size}. '
-            'Either specify `None`, a positive `float` (fraction) or '
-            'a positive `int` (size).'
+            f'Found both `train_size` and `test_size` to be `None`, '
+            f'specify at least one of them.'
         )
     if isinstance(test_size, float):
         test_size = int(size * test_size)
-    if validation_size and isinstance(validation_size, float):
+    if isinstance(train_size, float):
+        train_size = int(size * train_size)
+    if isinstance(validation_size, float):
         validation_size = int(size * validation_size)
     elif not validation_size:
         validation_size = 0
-    if train_size and isinstance(train_size, float):
-        train_size = int(size * train_size)
-    elif not train_size:
-        train_size = 0
     if not train_size:
-        train_size = size - test_size
-        if not validation_size:
-            train_size -= validation_size
+        train_size = (size - test_size - validation_size)
+    if not test_size:
+        test_size = (size - train_size - validation_size)
     remainder = size - (train_size + validation_size + test_size)
     if remainder < 0:
         raise ValueError(
-            'Sizes of data subsets add up to more than the size of the original data set: '
-            f'{size} < ({train_size} + {validation_size} + {test_size})'
+            f'subset sizes added up to more than the data size.'
         )
-    if test_size <= 0:
-        raise ValueError(
-            f'Test size needs to be greater than 0. Found: {test_size}.'
-        )
-    if train_size <= 0:
-        raise ValueError(
-            f'Train size needs to be greater than 0. Found: {train_size}.'
-        )
     train_size += remainder
-    if isinstance(data, pd.DataFrame):
-        if shuffle:
-            data = data.sample(
-                frac=1.0, replace=False, random_state=random_state
-            )
-        train_data = data.iloc[:train_size]
-        test_data = data.iloc[-test_size:]
-        if not validation_size:
-            return train_data, test_data
-        validation_data = data.iloc[train_size:-test_size]
-        return train_data, validation_data, test_data
-    if not isinstance(data, np.ndarray):
-        data = np.asarray(data)
-    np.random.seed(random_state)
-    random_indices = np.arange(size)
-    np.random.shuffle(random_indices)
-    data = data[random_indices]
+    if shuffle:
+        np.random.seed(random_state)
+        np.random.shuffle(indices)
-    train_data = data[:train_size]
-    test_data = data[-test_size:]
+    train_mask = np.isin(groups, indices[:train_size])
+    test_mask = np.isin(groups, indices[-test_size:])
     if not validation_size:
-        return train_data, test_data
-    validation_data = data[train_size:-test_size]
-    return train_data, validation_data, test_data
+        return data[train_mask], data[test_mask]
+    validation_mask = np.isin(groups, indices[train_size:-test_size])
+    return data[train_mask], data[validation_mask], data[test_mask]
+def cv_split(
+    data: pd.DataFrame | np.ndarray,
+    num_splits: int = 10,
+    groups: str | np.ndarray = None,
+    shuffle: bool = False,
+    random_state: int | None = None,
+) -> typing.Iterator[
+        tuple[np.ndarray | pd.DataFrame, np.ndarray | pd.DataFrame]
+    ]:
+    """Splits the dataset into cross-validation folds.
+    Args:
+        data:
+            A pd.DataFrame or np.ndarray object.
+        num_splits:
+            The number of cross-validation folds.
+        groups:
+            The groups to perform the splitting on.
+        shuffle:
+            Whether the dataset should be shuffled prior to splitting.
+        random_state:
+            The random state/seed. Only applicable if shuffling.
+    """
+    if not isinstance(data, (pd.DataFrame, np.ndarray)):
+        raise ValueError(f'Unsupported `data` type ({type(data)}).')
+    if isinstance(groups, str):
+        groups = data[groups].values
+    elif groups is None:
+        groups = np.arange(len(data))
+    indices = np.unique(groups)
+    size = len(indices)
+    if num_splits > size:
+        raise ValueError(
+            f'`num_splits` ({num_splits}) must not be greater than'
+            f'the data size or the number of groups ({size}).'
+        )
+    if shuffle:
+        np.random.seed(random_state)
+        np.random.shuffle(indices)
+    indices_splits = np.array_split(indices, num_splits)
+    for k in range(num_splits):
+        test_indices = indices_splits[k]
+        test_mask = np.isin(groups, test_indices)
+        train_mask = ~test_mask
+        yield data[train_mask], data[test_mask]

molcraft/descriptors.py CHANGED Viewed

@@ -37,19 +37,21 @@ class MolWeight(Descriptor):
 @keras.saving.register_keras_serializable(package='molcraft')
-class TPSA(Descriptor):
+class TotalPolarSurfaceArea(Descriptor):
     def call(self, mol: chem.Mol) -> np.ndarray:
         return rdMolDescriptors.CalcTPSA(mol)
 @keras.saving.register_keras_serializable(package='molcraft')
-class CrippenLogP(Descriptor):
+class LogP(Descriptor):
+    """Crippen logP."""
     def call(self, mol: chem.Mol) -> np.ndarray:
         return rdMolDescriptors.CalcCrippenDescriptors(mol)[0]
 @keras.saving.register_keras_serializable(package='molcraft')
-class CrippenMolarRefractivity(Descriptor):
+class MolarRefractivity(Descriptor):
+    """Crippen molar refractivity."""
     def call(self, mol: chem.Mol) -> np.ndarray:
         return rdMolDescriptors.CalcCrippenDescriptors(mol)[1]

molcraft/features.py CHANGED Viewed

@@ -276,37 +276,42 @@ class IsHydrogenAcceptor(Feature):
 class IsInRing(Feature):
     def call(self, mol: chem.Mol) -> list[int, float, str]:
         return [atom.IsInRing() for atom in mol.atoms]
 @keras.saving.register_keras_serializable(package='molcraft')
-class CrippenLogPContribution(Feature):
-    def call(self, mol: chem.Mol) -> list[int, float, str]:
-        return chem.logp_contributions(mol)
+class PartialCharge(Feature):
+    """Gasteiger partial charge."""
+    def call(self, mol: chem.Mol) -> list[int, float, str]:
+        return chem.partial_charges(mol)
 @keras.saving.register_keras_serializable(package='molcraft')
-class CrippenMolarRefractivityContribution(Feature):
+class TotalPolarSurfaceAreaContribution(Feature):
+    """Total polar surface area (TPSA) contribution."""
     def call(self, mol: chem.Mol) -> list[int, float, str]:
-        return chem.molar_refractivity_contribution(mol)
+        return chem.total_polar_surface_area_contributions(mol)
-@keras.saving.register_keras_serializable(package='molcraft')
-class TPSAContribution(Feature):
-    def call(self, mol: chem.Mol) -> list[int, float, str]:
-        return chem.tpsa_contribution(mol)
 @keras.saving.register_keras_serializable(package='molcraft')
-class LabuteASAContribution(Feature):
+class AccessibleSurfaceAreaContribution(Feature):
+    """Labute accessible surface area (ASA) contribution."""
     def call(self, mol: chem.Mol) -> list[int, float, str]:
-        return chem.asa_contribution(mol)
+        return chem.accessible_surface_area_contributions(mol)
 @keras.saving.register_keras_serializable(package='molcraft')
-class GasteigerCharge(Feature):
-    def call(self, mol: chem.Mol) -> list[int, float, str]:
-        return chem.gasteiger_charges(mol)
+class LogPContribution(Feature):
+    """Crippen logP contribution."""
+    def call(self, mol: chem.Mol) -> list[int, float, str]:
+        return chem.logp_contributions(mol)
+@keras.saving.register_keras_serializable(package='molcraft')
+class MolarRefractivityContribution(Feature):
+    """Crippen molar refractivity contribution."""
+    def call(self, mol: chem.Mol) -> list[int, float, str]:
+        return chem.molar_refractivity_contributions(mol)
 @keras.saving.register_keras_serializable(package='molcraft')
 class BondType(Feature):

molcraft/featurizers.py CHANGED Viewed

@@ -192,9 +192,9 @@ class MolGraphFeaturizer(Featurizer):
         if default_molecule_features:
             molecule_features = [
                 descriptors.MolWeight(),
-                descriptors.TPSA(),
-                descriptors.CrippenLogP(),
-                descriptors.CrippenMolarRefractivity(),
+                descriptors.TotalPolarSurfaceArea(),
+                descriptors.LogP(),
+                descriptors.MolarRefractivity(),
                 descriptors.NumHeavyAtoms(),
                 descriptors.NumHeteroatoms(),
                 descriptors.NumHydrogenDonors(),

molcraft/models.py CHANGED Viewed

@@ -250,7 +250,7 @@ class GraphModel(layers.GraphLayer, keras.models.Model):
                 val_size = int(val_split * x.num_subgraphs)
                 x_val = _make_dataset(x[-val_size:], batch_size)
                 x = x[:-val_size]
-            x = _make_dataset(x, batch_size)
+            x = _make_dataset(x, batch_size, shuffle=kwargs.get('shuffle', True))
         return super().fit(x, validation_data=x_val, **kwargs)
     def evaluate(self, x: tensors.GraphTensor | tf.data.Dataset, **kwargs):
@@ -561,9 +561,8 @@ def _functional_init_arguments(args, kwargs):
         or ("inputs" in kwargs and "outputs" in kwargs)
     )
-def _make_dataset(x: tensors.GraphTensor, batch_size: int):
-    return (
-        tf.data.Dataset.from_tensor_slices(x)
-        .batch(batch_size)
-        .prefetch(-1)
-    )
+def _make_dataset(x: tensors.GraphTensor, batch_size: int, shuffle: bool = False):
+    ds = tf.data.Dataset.from_tensor_slices(x)
+    if shuffle:
+        ds = ds.shuffle(buffer_size=ds.cardinality())
+    return ds.batch(batch_size).prefetch(-1)

molcraft/ops.py CHANGED Viewed

@@ -62,6 +62,22 @@ def scatter_update(
         indices = keras.ops.expand_dims(indices, axis=-1)
     return keras.ops.scatter_update(inputs, indices, updates)
+def scatter_add(
+    inputs: tf.Tensor,
+    indices: tf.Tensor,
+    updates: tf.Tensor,
+) -> tf.Tensor:
+    if indices.dtype == tf.bool:
+        indices = keras.ops.stack(keras.ops.where(indices), axis=-1)
+    expected_rank = len(keras.ops.shape(inputs))
+    current_rank = len(keras.ops.shape(indices))
+    for _ in range(expected_rank - current_rank):
+        indices = keras.ops.expand_dims(indices, axis=-1)
+    if backend.backend() == 'tensorflow':
+        return tf.tensor_scatter_nd_add(inputs, indices, updates)
+    updates = scatter_update(keras.ops.zeros_like(inputs), indices, updates)
+    return inputs + updates
 def edge_softmax(
     score: tf.Tensor,
     edge_target: tf.Tensor

{molcraft-0.1.0a12.dist-info → molcraft-0.1.0a14.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: molcraft
-Version: 0.1.0a12
+Version: 0.1.0a14
 Summary: Graph Neural Networks for Molecular Machine Learning
 Author-email: Alexander Kensert <alexander.kensert@gmail.com>
 License: MIT License

molcraft-0.1.0a14.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,21 @@
+molcraft/__init__.py,sha256=lReyUDRgBySoe9LPZzlwv1N_x9unwr6nHxIU70u3mLU,464
+molcraft/callbacks.py,sha256=x5HnkZhqcFRrW6xdApt_jZ4X08A-0fxcnFKfdmRKa0c,3571
+molcraft/chem.py,sha256=--4AdZV0TCj_cf5i-TRidNJGSFyab1ksUEMjmDi7zaM,21837
+molcraft/conformers.py,sha256=K6ZtiSUNDN_fwqGP9JrPcwALLFFvlMlF_XejEJH3Sr4,4205
+molcraft/datasets.py,sha256=QKHi9SUBKvJvdkRFmRQNowhrnu35pQqtujuLatOK8bE,4151
+molcraft/descriptors.py,sha256=jJpT0XWu3Tx_bxnwk1rENySRkaM8cMDMaDIjG8KKvtg,3097
+molcraft/features.py,sha256=GwOecLCNUIuGfbIVzsAJH4LikkzWMKj5IT7zSgGTttU,13846
+molcraft/featurizers.py,sha256=QiyNEFRJdMcKZM-gJGHU6Soy300RWDtLeYw0QEkFG20,27129
+molcraft/layers.py,sha256=cUpo9dqqNEnc7rNf-Dze8adFhOkTV5F9IhHOKs13OUI,60134
+molcraft/losses.py,sha256=qnS2yC5g-O3n_zVea9MR6TNiFraW2yqRgePOisoUP4A,1065
+molcraft/models.py,sha256=h9cRAdCOU-_UAxROC9Utuz4AR4HfFE9QqJ4geLYlynE,21878
+molcraft/ops.py,sha256=TaAD26V-b7eSNKFKswWt9IExSgIBOmLqwlPPcdpt8wk,5496
+molcraft/records.py,sha256=MbvYkcCunbAmpy_MWXmQ9WBGi2WvwxFUlwQSPKPvSSk,5534
+molcraft/tensors.py,sha256=EOUKx496KUZsjA1zA2ABc7tU_TW3Jv7AXDsug_QsLbA,22407
+molcraft/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+molcraft/apps/peptides.py,sha256=N5wJDGDIDRbmOmxin_dTY-odLqb0avAX9FU22U6x6c0,14576
+molcraft-0.1.0a14.dist-info/licenses/LICENSE,sha256=sbVeqlrtZ0V63uYhZGL5dCxUm8rBAOqe2avyA1zIQNk,1074
+molcraft-0.1.0a14.dist-info/METADATA,sha256=1Op3VxuV9hkciALrrOXx2KnGShFI5a9n_XbhT-oPpKI,3893
+molcraft-0.1.0a14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+molcraft-0.1.0a14.dist-info/top_level.txt,sha256=dENV6MfOceshM6MQCgJlcN1ojZkiCL9B4F7XyUge3QM,9
+molcraft-0.1.0a14.dist-info/RECORD,,

molcraft-0.1.0a12.dist-info/RECORD DELETED Viewed

@@ -1,21 +0,0 @@
-molcraft/__init__.py,sha256=exZr4HcSy0uUnFlh9cshJrs0MBDP-pXT2MqKjq0a2BY,464
-molcraft/callbacks.py,sha256=x5HnkZhqcFRrW6xdApt_jZ4X08A-0fxcnFKfdmRKa0c,3571
-molcraft/chem.py,sha256=JARpv4IgFBtuNia0FLW_VF_DdmaA6e-_eZgH9dFAykA,21796
-molcraft/conformers.py,sha256=K6ZtiSUNDN_fwqGP9JrPcwALLFFvlMlF_XejEJH3Sr4,4205
-molcraft/datasets.py,sha256=rFgXTC1ZheLhfgQgcCspP_wEE54a33PIneH7OplbS-8,4047
-molcraft/descriptors.py,sha256=W8GLuDpc38RtwmreNsPOcn-PRvMjTfVng9ksJwcrVyM,3032
-molcraft/features.py,sha256=FpvT_9zk9EiOhvrk6OA5eEvUAYalquF7V6IvpiEJCns,13559
-molcraft/featurizers.py,sha256=1xyJ2JroFBHzcheRZ8v9P3bYBIaoiY-WCBdbbqXK4co,27126
-molcraft/layers.py,sha256=cUpo9dqqNEnc7rNf-Dze8adFhOkTV5F9IhHOKs13OUI,60134
-molcraft/losses.py,sha256=qnS2yC5g-O3n_zVea9MR6TNiFraW2yqRgePOisoUP4A,1065
-molcraft/models.py,sha256=0x74B4WsaZgmUrHmpX9YNr9QXqd1rR3QF_ygyegHoXU,21770
-molcraft/ops.py,sha256=PVxKfY_XbWCyntiSnmpyeBb-coFGT_VNNP9QzmeUwC0,4870
-molcraft/records.py,sha256=MbvYkcCunbAmpy_MWXmQ9WBGi2WvwxFUlwQSPKPvSSk,5534
-molcraft/tensors.py,sha256=EOUKx496KUZsjA1zA2ABc7tU_TW3Jv7AXDsug_QsLbA,22407
-molcraft/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-molcraft/apps/peptides.py,sha256=N5wJDGDIDRbmOmxin_dTY-odLqb0avAX9FU22U6x6c0,14576
-molcraft-0.1.0a12.dist-info/licenses/LICENSE,sha256=sbVeqlrtZ0V63uYhZGL5dCxUm8rBAOqe2avyA1zIQNk,1074
-molcraft-0.1.0a12.dist-info/METADATA,sha256=zMjHudRgekPvWDmQdtV2pW9tyapaYqkntWZ4k3u9X_g,3893
-molcraft-0.1.0a12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-molcraft-0.1.0a12.dist-info/top_level.txt,sha256=dENV6MfOceshM6MQCgJlcN1ojZkiCL9B4F7XyUge3QM,9
-molcraft-0.1.0a12.dist-info/RECORD,,

{molcraft-0.1.0a12.dist-info → molcraft-0.1.0a14.dist-info}/WHEEL RENAMED Viewed

File without changes

{molcraft-0.1.0a12.dist-info → molcraft-0.1.0a14.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{molcraft-0.1.0a12.dist-info → molcraft-0.1.0a14.dist-info}/top_level.txt RENAMED Viewed

File without changes

molcraft 0.1.0a12__py3-none-any.whl → 0.1.0a14__py3-none-any.whl

Potentially problematic release.

molcraft 0.1.0a12__py3-none-any.whl → 0.1.0a14__py3-none-any.whl