PyPI - molcraft - Versions diffs - 0.1.0a16__py3-none-any.whl → 0.1.0a18__py3-none-any.whl - Mend

molcraft 0.1.0a16__py3-none-any.whl → 0.1.0a18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of molcraft might be problematic. Click here for more details.

Files changed (18) hide show

molcraft/__init__.py +4 -3
molcraft/applications/chromatography.py +0 -0
molcraft/applications/proteomics.py +141 -106
molcraft/chem.py +17 -22
molcraft/datasets.py +6 -6
molcraft/descriptors.py +14 -0
molcraft/features.py +50 -58
molcraft/featurizers.py +257 -487
molcraft/layers.py +95 -40
molcraft/models.py +2 -0
molcraft/records.py +24 -15
{molcraft-0.1.0a16.dist-info → molcraft-0.1.0a18.dist-info}/METADATA +13 -12
molcraft-0.1.0a18.dist-info/RECORD +21 -0
molcraft/conformers.py +0 -151
molcraft-0.1.0a16.dist-info/RECORD +0 -21
{molcraft-0.1.0a16.dist-info → molcraft-0.1.0a18.dist-info}/WHEEL +0 -0
{molcraft-0.1.0a16.dist-info → molcraft-0.1.0a18.dist-info}/licenses/LICENSE +0 -0
{molcraft-0.1.0a16.dist-info → molcraft-0.1.0a18.dist-info}/top_level.txt +0 -0

molcraft/featurizers.py CHANGED Viewed

@@ -14,63 +14,47 @@ from pathlib import Path
 from molcraft import tensors
 from molcraft import features
 from molcraft import chem
-from molcraft import conformers
 from molcraft import descriptors
 @keras.saving.register_keras_serializable(package='molcraft')
-class Featurizer(abc.ABC):
+class GraphFeaturizer(abc.ABC):
-    """Base class for featurizers.
+    """Base graph featurizer.
     """
     @abc.abstractmethod
-    def call(
-        self,
-        x: tensors.GraphTensor
-    ) -> tensors.GraphTensor | list[tensors.GraphTensor]:
+    def call(self, x: str | chem.Mol | tuple) -> tensors.GraphTensor:
         pass
-    @abc.abstractmethod
-    def stack(
-        self,
-        call_outputs: list[tensors.GraphTensor]
-    ) -> tensors.GraphTensor:
-        pass
     def get_config(self) -> dict:
         return {}
     @classmethod
-    def from_config(cls, config: dict) -> 'Featurizer':
+    def from_config(cls, config: dict) -> 'GraphFeaturizer':
         return cls(**config)
     def save(self, filepath: str | Path, *args, **kwargs) -> None:
-        save_featurizer(
-            self, filepath, *args, **kwargs
-        )
+        save_featurizer(self, filepath, *args, **kwargs)
     @staticmethod
-    def load(filepath: str | Path, *args, **kwargs) -> 'Featurizer':
-        return load_featurizer(
-            filepath, *args, **kwargs
-        )
+    def load(filepath: str | Path, *args, **kwargs) -> 'GraphFeaturizer':
+        return load_featurizer(filepath, *args, **kwargs)
     def __call__(
         self,
-        inputs: str | tuple | list | np.ndarray | pd.DataFrame | pd.Series,
+        inputs: str | chem.Mol | tuple | typing.Iterable,
         *,
         multiprocessing: bool = False,
         processes: int | None = None,
         device: str = '/cpu:0',
-        **kwargs
     ) -> tensors.GraphTensor:
         if isinstance(inputs, (str, tuple)):
             return self.call(inputs)
         if isinstance(inputs, (pd.DataFrame, pd.Series)):
-            inputs = inputs.values
-        if isinstance(inputs, np.ndarray):
-            inputs = list(inputs)
+            inputs = inputs.values.tolist()
+        elif isinstance(inputs, np.ndarray):
+            inputs = inputs.tolist()
         if not multiprocessing:
             outputs = [self.call(x) for x in inputs]
         else:
@@ -78,19 +62,19 @@ class Featurizer(abc.ABC):
                 with mp.Pool(processes) as pool:
                     outputs = pool.map(func=self.call, iterable=inputs)
         outputs = [x for x in outputs if x is not None]
-        return self.stack(outputs)
+        if tensors.is_scalar(outputs[0]):
+            return tf.stack(outputs, axis=0)
+        return tf.concat(outputs, axis=0)
 @keras.saving.register_keras_serializable(package='molcraft')
-class MolGraphFeaturizer(Featurizer):
+class MolGraphFeaturizer(GraphFeaturizer):
     """Molecular graph featurizer.
     Converts SMILES or InChI strings to a molecular graph.
     The molecular graph may encode a single molecule or a batch of molecules.
-    In either case, it is a single graph, with each molecule corresponding to
-    a subgraph within the graph.
     Example:
@@ -99,10 +83,14 @@ class MolGraphFeaturizer(Featurizer):
     >>> featurizer = molcraft.featurizers.MolGraphFeaturizer(
     ...     atom_features=[
     ...         molcraft.features.AtomType(),
-    ...         molcraft.features.TotalNumHs(),
+    ...         molcraft.features.NumHydrogens(),
     ...         molcraft.features.Degree(),
     ...     ],
-    ...     radius=1
+    ...     bond_features=[
+    ...         molcraft.features.BondType(),
+    ...     ],
+    ...     super_node=False,
+    ...     self_loops=False,
     ... )
     >>>
     >>> graph = featurizer(["N[C@@H](C)C(=O)O", "N[C@@H](CS)C(=O)O"])
@@ -112,7 +100,7 @@ class MolGraphFeaturizer(Featurizer):
             'size': <tf.Tensor: shape=[2], dtype=int32>
         },
         node={
-            'feature': <tf.Tensor: shape=[13, 133], dtype=float32>
+            'feature': <tf.Tensor: shape=[13, 129], dtype=float32>
         },
         edge={
             'source': <tf.Tensor: shape=[22], dtype=int32>,
@@ -123,73 +111,46 @@ class MolGraphFeaturizer(Featurizer):
     Args:
         atom_features:
-            A list of `features.Feature` encoding the nodes of the molecular graph.
+            A list of `features.Feature` encoded as the node features.
         bond_features:
-            A list of `features.Feature` encoding the edges of the molecular graph.
+            A list of `features.Feature` encoded as the edge features.
         molecule_features:
-            A `features.Feature` encoding the molecule (or `context`) of the graph.
-            If `contextual_super_atom` is set to `True`, then this feature will be
-            embedded, via `NodeEmbedding`, as a super node in the molecular graph.
-        super_atom:
-            A boolean specifying whether super atoms exist and should be embedded
-            via `NodeEmbedding`.
-        radius:
-            An integer specifying how many bond lengths should be considered as an
-            edge. The default is None (or 1), which specifies that only bonds should
-            be considered an edge.
+            A list of `descriptors.Descriptor` encoded as the context feature.
+        super_node:
+            A boolean specifying whether to include a super node.
         self_loops:
-            A boolean specifying whether self loops exist. If True, this means that
-            each node (atom) has an edge (bond) to itself.
-        include_hs:
+            A boolean specifying whether self loops exist.
+        include_hydrogens:
             A boolean specifying whether hydrogens should be encoded as nodes.
     """
     def __init__(
         self,
-        atom_features: list[features.Feature] | str | None = 'auto',
+        atom_features: list[features.Feature] | str = 'auto',
         bond_features: list[features.Feature] | str | None = 'auto',
-        molecule_features: features.Feature | str | None = None,
-        super_atom: bool = False,
-        radius: int | float | None = None,
+        molecule_features: list[descriptors.Descriptor] | str | None = None,
+        super_node: bool = False,
         self_loops: bool = False,
-        include_hs: bool = False,
-        **kwargs,
+        include_hydrogens: bool = False,
     ) -> None:
-        if molecule_features is None:
-            molecule_features = kwargs.pop('mol_features', None)
-        self.radius = int(max(radius or 1, 1))
-        self.include_hs = include_hs
-        self.self_loops = self_loops
-        self.super_atom = super_atom
-        default_atom_features = (
+        use_default_atom_features = (
             atom_features == 'auto' or atom_features == 'default'
         )
-        if default_atom_features:
-            atom_features = [features.AtomType()]
-            if not self.include_hs:
-                atom_features.append(features.NumHydrogens())
-            atom_features.append(features.Degree())
-        if not isinstance(self, MolGraphFeaturizer3D):
-            default_bond_features = (
-                bond_features == 'auto' or bond_features == 'default'
-            )
-            if default_bond_features or self.radius > 1:
-                vocab = ['zero', 'single', 'double', 'triple', 'aromatic']
-                bond_features = [
-                    features.BondType(vocab)
-                ]
-                if not default_bond_features and self.radius > 1:
-                    warnings.warn(
-                        'Replacing user-specified bond features with default bond features, '
-                        'as `radius`>1. When `radius`>1, only bond types are considered.',
-                        stacklevel=2
-                    )
-        default_molecule_features = (
+        if use_default_atom_features:
+            atom_features = [features.AtomType(), features.Degree()]
+            if not include_hydrogens:
+                atom_features += [features.NumHydrogens()]
+        use_default_bond_features = (
+            bond_features == 'auto' or bond_features == 'default'
+        )
+        if use_default_bond_features:
+            bond_features = [features.BondType()]
+        use_default_molecule_features = (
             molecule_features == 'auto' or molecule_features == 'default'
         )
-        if default_molecule_features:
+        if use_default_molecule_features:
             molecule_features = [
                 descriptors.MolWeight(),
                 descriptors.TotalPolarSurfaceArea(),
@@ -202,182 +163,65 @@ class MolGraphFeaturizer(Featurizer):
                 descriptors.NumRotatableBonds(),
                 descriptors.NumRings(),
             ]
         self._atom_features = atom_features
         self._bond_features = bond_features
         self._molecule_features = molecule_features
-        self.feature_dtype = 'float32'
-        self.index_dtype = 'int32'
+        self._include_hydrogens = include_hydrogens
+        self._self_loops = self_loops
+        self._super_node = super_node
     def call(self, inputs: str | tuple) -> tensors.GraphTensor:
-        if isinstance(inputs, (tuple, list, np.ndarray)):
-            x, *context = inputs
-            if len(context) and isinstance(context[0], dict):
-                context = copy.deepcopy(context[0])
-        else:
-            x, context = inputs, None
+        if isinstance(inputs, str):
+            inputs = (inputs,)
-        mol = chem.Mol.from_encoding(x, explicit_hs=self.include_hs)
+        inputs, *context_inputs = inputs
-        if mol is None:
-            warnings.warn(
-                f'Could not obtain `chem.Mol` from {x}. '
-                'Returning `None` (proceeding without it).',
-                stacklevel=2
-            )
-            return None
-        atom_feature = self.atom_features(mol)
-        bond_feature = self.bond_features(mol)
-        molecule_feature = self.molecule_feature(mol)
-        molecule_size = self.num_atoms(mol)
-        if isinstance(context, dict):
-            if 'x' in context:
-                context['feature'] = context.pop('x')
-            if 'y' in context:
-                context['label'] = context.pop('y')
-            if 'sample_weight' in context:
-                context['weight'] = context.pop('sample_weight')
-            context = {
-                **{'size': molecule_size},
-                **context
-            }
-        elif isinstance(context, list):
-            context = {
-                **{'size': molecule_size},
-                **{key: value for (key, value) in zip(['label', 'weight'], context)}
-            }
-        else:
-            context = {'size': molecule_size}
-        if molecule_feature is not None:
-            if 'feature' in context:
-                warnings.warn(
-                    'Found both inputted and computed context feature. '
-                    'Overwriting inputted context feature with computed '
-                    'context feature (based on `molecule_features`).',
-                    stacklevel=2
-                )
-            context['feature'] = molecule_feature
-        node = {}
-        node['feature'] = atom_feature
+        mol = chem.Mol.from_encoding(inputs, explicit_hs=self._include_hydrogens)
+        data = {'context': {}, 'node': {}, 'edge': {}}
-        edge = {}
-        if self.radius == 1:
-            edge['source'], edge['target'] = mol.adjacency(
-                fill='full', sparse=True, self_loops=self.self_loops, dtype=self.index_dtype
-            )
-            if self.self_loops:
-                bond_feature = np.pad(bond_feature, [(0, 1), (0, 0)])
-            if bond_feature is not None:
-                bond_indices = []
-                for atom_i, atom_j in zip(edge['source'], edge['target']):
-                    if atom_i == atom_j:
-                        bond_indices.append(-1)
-                    else:
-                        bond_indices.append(
-                            mol.get_bond_between_atoms(atom_i, atom_j).index
-                        )
-                edge['feature'] = bond_feature[bond_indices]
-        else:
-            paths = chem.get_shortest_paths(
-                mol, radius=self.radius, self_loops=self.self_loops
-            )
-            edge['source'] = np.asarray(
-                [path[0] for path in paths], dtype=self.index_dtype
-            )
-            edge['target'] = np.asarray(
-                [path[-1] for path in paths], dtype=self.index_dtype
+        data['context']['size'] = np.asarray(mol.num_atoms)
+        if len(context_inputs) == 1:
+            data['context']['label'] = np.asarray(context_inputs[0])
+        elif len(context_inputs) == 2:
+            data['context']['label'] = np.asarray(context_inputs[0])
+            data['context']['weight'] = np.asarray(context_inputs[1])
+        if self._molecule_features is not None:
+            data['context']['feature'] = np.concatenate(
+                [f(mol) for f in self._molecule_features], axis=-1
             )
-            if bond_feature is not None:
-                zero_bond_feature = np.array(
-                    [[1., 0., 0., 0., 0.]], dtype=bond_feature.dtype
-                )
-                bond_feature = np.concatenate(
-                    [bond_feature, zero_bond_feature], axis=0
-                )
-                edge['feature'] = self._expand_bond_features(
-                    mol, paths, bond_feature,
-                )
-        if self.super_atom:
-            node, edge = self._add_super_atom(node, edge)
-            context['size'] += 1
-        return tensors.GraphTensor(context, node, edge)
-    def stack(self, outputs):
-        if tensors.is_scalar(outputs[0]):
-            return tf.stack(outputs, axis=0)
-        return tf.concat(outputs, axis=0)
-    def atom_features(self, mol: chem.Mol) -> np.ndarray:
-        atom_feature: np.ndarray = np.concatenate(
+        data['node']['feature'] = np.concatenate(
             [f(mol) for f in self._atom_features], axis=-1
         )
-        return atom_feature.astype(self.feature_dtype)
-    def bond_features(self, mol: chem.Mol) -> np.ndarray:
-        if self._bond_features is None:
-            return None
-        bond_feature: np.ndarray = np.concatenate(
-            [f(mol) for f in self._bond_features], axis=-1
-        )
-        return bond_feature.astype(self.feature_dtype)
-    def molecule_feature(self, mol: chem.Mol) -> np.ndarray:
-        if self._molecule_features is None:
-            return None
-        molecule_feature: np.ndarray = np.concatenate(
-            [f(mol) for f in self._molecule_features], axis=-1
+        data['edge']['source'], data['edge']['target'] = mol.adjacency(
+            fill='full', sparse=True, self_loops=self._self_loops
         )
-        return molecule_feature.astype(self.feature_dtype)
+        if self._bond_features is not None:
+            bond_features = np.concatenate(
+                [f(mol) for f in self._bond_features], axis=-1
+            )
+            if self._self_loops:
+                bond_features = np.pad(bond_features, [(0, 1), (0, 0)])
-    def num_atoms(self, mol: chem.Mol) -> np.ndarray:
-        return np.asarray(mol.num_atoms, dtype=self.index_dtype)
-    def num_bonds(self, mol: chem.Mol) -> np.ndarray:
-        return np.asarray(mol.num_bonds, dtype=self.index_dtype)
-    def _expand_bond_features(
-        self,
-        mol: chem.Mol,
-        paths: list[list[int]],
-        bond_feature: np.ndarray,
-    ) -> np.ndarray:
-        def bond_feature_lookup(path):
-            path_bond_indices = [
-                mol.get_bond_between_atoms(path[i], path[i + 1]).index
-                for i in range(len(path) - 1)
+            bond_indices = [
+                mol.get_bond_between_atoms(i, j).index if (i != j) else -1
+                for (i, j) in zip(data['edge']['source'], data['edge']['target'])
             ]
-            padding = [-1] * (self.radius - len(path) + 1)
-            path_bond_indices += padding
-            return bond_feature[path_bond_indices].reshape(-1)
-        edge_feature = np.asarray(
-            [
-                bond_feature_lookup(path) for path in paths
-            ],
-            dtype=self.feature_dtype
-        ).reshape((-1, bond_feature.shape[-1] * self.radius))
-        return edge_feature
-    def _add_super_atom(
-        self,
-        node: dict[str, np.ndarray],
-        edge: dict[str, np.ndarray],
-    ) -> tuple[dict[str, np.ndarray]]:
-        num_super_nodes = 1
-        num_nodes = node['feature'].shape[0]
-        node = _add_super_nodes(node, num_super_nodes)
-        edge = _add_super_edges(
-            edge, num_nodes, num_super_nodes, self.feature_dtype, self.index_dtype, self.self_loops
-        )
-        return node, edge
+            data['edge']['feature'] = bond_features[bond_indices]
+        if self._super_node:
+            data = _add_super_node(data)
+        return tensors.GraphTensor(**_convert_dtypes(data))
     def get_config(self):
         config = super().get_config()
         config.update({
@@ -390,10 +234,9 @@ class MolGraphFeaturizer(Featurizer):
             'molecule_features': keras.saving.serialize_keras_object(
                 self._molecule_features
             ),
-            'super_atom': self.super_atom,
-            'radius': self.radius,
-            'self_loops': self.self_loops,
-            'include_hs': self.include_hs,
+            'super_node': self._super_node,
+            'self_loops': self._self_loops,
+            'include_hydrogens': self._include_hydrogens,
         })
         return config
@@ -414,15 +257,11 @@ class MolGraphFeaturizer(Featurizer):
 @keras.saving.register_keras_serializable(package='molcraft')
 class MolGraphFeaturizer3D(MolGraphFeaturizer):
-    """Molecular 3d-graph featurizer.
+    """3D Molecular graph featurizer.
-    Converts SMILES or InChI strings to a molecular graph in 3d space.
-    Namely, in addition to the information encoded in a standard molecular
-    graph, cartesian coordinates are also included.
+    Converts SMILES or InChI strings to a 3d molecular graph.
     The molecular graph may encode a single molecule or a batch of molecules.
-    In either case, it is a single graph, with each molecule corresponding to
-    a subgraph within the graph.
     Example:
@@ -431,226 +270,166 @@ class MolGraphFeaturizer3D(MolGraphFeaturizer):
     >>> featurizer = molcraft.featurizers.MolGraphFeaturizer3D(
     ...     atom_features=[
     ...         molcraft.features.AtomType(),
-    ...         molcraft.features.TotalNumHs(),
+    ...         molcraft.features.NumHydrogens(),
     ...         molcraft.features.Degree(),
     ...     ],
-    ...     radius=5.0
+    ...     radius=5.0,
+    ...     random_seed=42,
     ... )
     >>>
     >>> graph = featurizer(["N[C@@H](C)C(=O)O", "N[C@@H](CS)C(=O)O"])
     >>> graph
     GraphTensor(
         context={
-            'size': <tf.Tensor: shape=[20], dtype=int32>
+            'size': <tf.Tensor: shape=[2], dtype=int32>
         },
         node={
-            'feature': <tf.Tensor: shape=[130, 133], dtype=float32>,
-            'coordinate': <tf.Tensor: shape=[130, 3], dtype=float32>
+            'feature': <tf.Tensor: shape=[13, 129], dtype=float32>,
+            'coordinate': <tf.Tensor: shape=[13, 3], dtype=float32>
         },
         edge={
-            'source': <tf.Tensor: shape=[714], dtype=int32>,
-            'target': <tf.Tensor: shape=[714], dtype=int32>,
-            'feature': <tf.Tensor: shape=[714, 23], dtype=float32>
+            'source': <tf.Tensor: shape=[72], dtype=int32>,
+            'target': <tf.Tensor: shape=[72], dtype=int32>,
+            'feature': <tf.Tensor: shape=[72, 12], dtype=float32>
         }
     )
     Args:
         atom_features:
-            A list of `features.Feature` encoding the nodes of the molecular graph.
-        bond_features:
-            A list of `features.Feature` encoding the edges of the molecular graph.
+            A list of `features.Feature` encoded as the node features.
+        pair_features:
+            A list of `features.PairFeature` encoded as the edge features.
         molecule_features:
-            A `features.Feature` encoding the molecule (or `context`) of the graph.
-            If `contextual_super_atom` is set to `True`, then this feature will be
-            embedded, via `NodeEmbedding`, as a super node in the molecular graph.
-        conformer_generator:
-            A `conformers.ConformerGenerator` which produces conformers. If `auto`
-            a `conformers.ConformerEmbedder` will be used. If None, it is assumed
-            that the molecule already has conformer(s).
-        super_atom:
-            A boolean specifying whether super atoms exist and should be embedded
-            via `NodeEmbedding`.
-        radius:
-            A float specifying, for each atom, the maximum distance (in angstroms)
-            that another atom should be within to be considered an edge. Default
-            is set to 6.0 as this should cover most interactions. This parameter
-            can be though of as the receptive field. If None, the radius will be
-            infinite so all the receptive field will be the entire space (graph).
+            A list of `descriptors.Descriptor` encoded as the context feature.
+        super_node:
+            A boolean specifying whether to include a super node.
         self_loops:
-            A boolean specifying whether self loops exist. If True, this means that
-            each node (atom) has an edge (bond) to itself.
-        include_hs:
+            A boolean specifying whether self loops exist.
+        include_hydrogens:
             A boolean specifying whether hydrogens should be encoded as nodes.
+        radius:
+            A floating point value specifying maximum edge length.
+        random_seed:
+            An integer specifying the random seed for the conformer generation.
     """
     def __init__(
         self,
-        atom_features: list[features.Feature] | str | None = 'auto',
-        bond_features: list[features.Feature] | str | None = 'auto',
-        molecule_features: features.Feature | str = None,
-        conformer_generator: conformers.ConformerProcessor | str | None = 'auto',
-        super_atom: bool = False,
-        radius: int | float | None = 6.0,
+        atom_features: list[features.Feature] | str = 'auto',
+        pair_features: list[features.PairFeature] | str = 'auto',
+        molecule_features: features.Feature | str | None = None,
+        super_node: bool = False,
         self_loops: bool = False,
-        include_hs: bool = False,
-        **kwargs,
+        include_hydrogens: bool = False,
+        radius: int | float | None = 6.0,
+        random_seed: int | None = None,
     ) -> None:
-        if bond_features == 'auto':
-            bond_features = [
-                features.Distance()
-            ]
         super().__init__(
             atom_features=atom_features,
-            bond_features=bond_features,
+            bond_features=None,
             molecule_features=molecule_features,
-            super_atom=super_atom,
-            radius=radius,
+            super_node=super_node,
             self_loops=self_loops,
-            include_hs=include_hs,
-            **kwargs,
+            include_hydrogens=include_hydrogens,
         )
-        if conformer_generator == 'auto':
-            conformer_generator = conformers.ConformerGenerator(
-                steps=[
-                    conformers.ConformerEmbedder(
-                        method='ETKDGv3',
-                        num_conformers=5
-                    ),
-                ]
-            )
-        self.conformer_generator = conformer_generator
-        self.embed_conformer = self.conformer_generator is not None
-        self.radius = float(radius) if radius else None
+        use_default_pair_features = (
+            pair_features == 'auto' or pair_features == 'default'
+        )
+        if use_default_pair_features:
+            pair_features = [features.PairDistance()]
+        self._pair_features = pair_features
+        self._radius = float(radius) if radius else None
+        self._random_seed = random_seed
     def call(self, inputs: str | tuple) -> tensors.GraphTensor:
-        if isinstance(inputs, (tuple, list, np.ndarray)):
-            x, *context = inputs
-            if len(context) and isinstance(context[0], dict):
-                context = copy.deepcopy(context[0])
-        else:
-            x, context = inputs, None
+        if isinstance(inputs, str):
+            inputs = [inputs]
-        explicit_hs = (self.include_hs or self.embed_conformer)
-        mol = chem.Mol.from_encoding(x, explicit_hs=explicit_hs)
-        if mol is None:
-            warnings.warn(
-                f'Could not obtain `chem.Mol` from {x}. '
-                'Proceeding without it.',
-                stacklevel=2
-            )
-            return None
+        inputs, *context_inputs = inputs
-        if self.embed_conformer:
-            mol = self.conformer_generator(mol)
-            if not self.include_hs:
-                mol = chem.remove_hs(mol)
+        mol = chem.Mol.from_encoding(inputs, explicit_hs=True)
         if mol.num_conformers == 0:
-            raise ValueError(
-                'Cannot featurize a molecule without conformer(s). '
-                'Make sure to pass a `ConformerGenerator` to the constructor '
-                'of the `Featurizer` or input a 3D representation of the molecule. '
+            mol = chem.embed_conformers(
+                mol, num_conformers=1, random_seed=self._random_seed
             )
-        molecule_feature = self.molecule_feature(mol)
-        molecule_size = self.num_atoms(mol) + int(self.super_atom)
-        molecule_size = molecule_size.astype(self.index_dtype)
-        if isinstance(context, dict):
-            if 'x' in context:
-                context['feature'] = context.pop('x')
-            if 'y' in context:
-                context['label'] = context.pop('y')
-            if 'sample_weight' in context:
-                context['weight'] = context.pop('sample_weight')
-            context = {
-                **{'size': molecule_size},
-                **context
-            }
-        elif isinstance(context, list):
-            context = {
-                **{'size': molecule_size},
-                **{key: value for (key, value) in zip(['label', 'weight'], context)}
-            }
-        else:
-            context = {'size': molecule_size}
-        if molecule_feature is not None:
-            if 'feature' in context:
-                warnings.warn(
-                    'Found both inputted and computed context feature. '
-                    'Overwriting inputted context feature with computed '
-                    'context feature (based on `molecule_features`).',
-                    stacklevel=2
-                )
-            context['feature'] = molecule_feature
-        node = {}
-        node['feature'] = self.atom_features(mol)
-        if self._bond_features:
-            edge_feature = self.bond_features(mol)
-        edge = {}
-        mols = chem.unpack_conformers(mol)
-        tensor_list = []
-        for i, mol in enumerate(mols):
-            node_conformer = copy.deepcopy(node)
-            edge_conformer = copy.deepcopy(edge)
-            conformer = mol.get_conformer()
-            adjacency_matrix = conformer.adjacency(
-                fill='full',
-                radius=self.radius,
-                sparse=False,
-                self_loops=self.self_loops,
-                dtype=bool
+        if not self._include_hydrogens:
+            mol = chem.remove_hs(mol)
+        data = {'context': {}, 'node': {}, 'edge': {}}
+        data['context']['size'] = np.asarray(mol.num_atoms)
+        if len(context_inputs) == 1:
+            data['context']['label'] = np.asarray(context_inputs[0])
+        elif len(context_inputs) == 2:
+            data['context']['label'] = np.asarray(context_inputs[0])
+            data['context']['weight'] = np.asarray(context_inputs[1])
+        if self._molecule_features is not None:
+            data['context']['feature'] = np.concatenate(
+                [f(mol) for f in self._molecule_features], axis=-1
             )
-            edge_conformer['source'], edge_conformer['target'] = np.where(adjacency_matrix)
-            edge_conformer['source'] = edge_conformer['source'].astype(self.index_dtype)
-            edge_conformer['target'] = edge_conformer['target'].astype(self.index_dtype)
-            node_conformer['coordinate'] = conformer.coordinates.astype(self.feature_dtype)
-            if self._bond_features:
-                edge_feature_keep = adjacency_matrix.reshape(-1)
-                edge_conformer['feature'] = edge_feature[edge_feature_keep]
-            if self.super_atom:
-                node_conformer, edge_conformer = self._add_super_atom(
-                    node_conformer, edge_conformer
-                )
-                node_conformer['coordinate'] = np.concatenate(
-                    [node_conformer['coordinate'], conformer.centroid[None]], axis=0
-                ).astype(self.feature_dtype)
-            tensor_list.append(
-                tensors.GraphTensor(context, node_conformer, edge_conformer)
+        conformer = mol.get_conformer()
+        data['node']['feature'] = np.concatenate(
+            [f(mol) for f in self._atom_features], axis=-1
+        )
+        data['node']['coordinate'] = conformer.coordinates
+        adjacency_matrix = conformer.adjacency(
+            fill='full', radius=self._radius, sparse=False, self_loops=self._self_loops,
+        )
+        data['edge']['source'], data['edge']['target'] = np.where(adjacency_matrix)
+        if self._pair_features is not None:
+            pair_features = np.concatenate(
+                [f(mol) for f in self._pair_features], axis=-1
+            )
+            pair_keep = adjacency_matrix.reshape(-1).astype(bool)
+            data['edge']['feature'] = pair_features[pair_keep]
+        if self._super_node:
+            data = _add_super_node(data)
+            data['node']['coordinate'] = np.concatenate(
+                [data['node']['coordinate'], conformer.centroid[None]], axis=0
             )
-        return tensor_list
+        return tensors.GraphTensor(**_convert_dtypes(data))
-    def stack(self, outputs):
-        # Flatten list of lists (due to multiple conformers per molecule)
-        outputs = [x for xs in outputs for x in xs]
-        return super().stack(outputs)
+    @property
+    def random_seed(self) -> int | None:
+        return self._random_seed
+    @random_seed.setter
+    def random_seed(self, value: int) -> None:
+        self._random_seed = value
     def get_config(self):
         config = super().get_config()
-        config['conformer_generator'] = keras.saving.serialize_keras_object(
-            self.conformer_generator
+        config['radius'] = self._radius
+        config['pair_features'] = keras.saving.serialize_keras_object(
+            self._pair_features
         )
+        config['random_seed'] = self._random_seed
         return config
     @classmethod
     def from_config(cls, config: dict):
-        config['conformer_generator'] = keras.saving.deserialize_keras_object(
-            config['conformer_generator']
+        config['pair_features'] = keras.saving.deserialize_keras_object(
+            config['pair_features']
         )
         return super().from_config(config)
 def save_featurizer(
-    featurizer: Featurizer,
+    featurizer: GraphFeaturizer,
     filepath: str | Path,
     overwrite: bool = True,
     **kwargs
@@ -658,8 +437,8 @@ def save_featurizer(
     filepath = Path(filepath)
     if filepath.suffix != '.json':
         raise ValueError(
-            'Invalid `filepath` extension for saving a `Featurizer`. '
-            'A `Featurizer` should be saved as a JSON file.'
+            'Invalid `filepath` extension for saving a `GraphFeaturizer`. '
+            'A `GraphFeaturizer` should be saved as a JSON file.'
         )
     if not filepath.parent.exists():
         filepath.parent.mkdir(parents=True, exist_ok=True)
@@ -672,12 +451,12 @@ def save_featurizer(
 def load_featurizer(
     filepath: str | Path,
     **kwargs
-) -> Featurizer:
+) -> GraphFeaturizer:
     filepath = Path(filepath)
     if filepath.suffix != '.json':
         raise ValueError(
-            'Invalid `filepath` extension for loading a `Featurizer`. '
-            'A `Featurizer` should be saved as a JSON file.'
+            'Invalid `filepath` extension for loading a `GraphFeaturizer`. '
+            'A `GraphFeaturizer` should be saved as a JSON file.'
         )
     if not filepath.exists():
         return
@@ -685,69 +464,60 @@ def load_featurizer(
         config = json.load(f)
     return keras.saving.deserialize_keras_object(config)
-def _add_super_nodes(
-    node: dict[str, np.ndarray],
-    num_super_nodes: int = 1,
-) -> dict[str, np.ndarray]:
-    node = copy.deepcopy(node)
-    node['super'] = np.array(
-        [False] * len(node['feature']) + [True] * num_super_nodes,
-        dtype=bool
-    )
-    super_node_feature = np.zeros(
-        [num_super_nodes, node['feature'].shape[-1]],
-        dtype=node['feature'].dtype
-    )
-    node['feature'] = np.concatenate([node['feature'], super_node_feature])
-    return node
-def _add_super_edges(
-    edge: dict[str, np.ndarray],
-    num_nodes: int,
-    num_super_nodes: int,
-    feature_dtype: str,
-    index_dtype: str,
-    self_loops: bool,
-) -> dict[str, np.ndarray]:
-    edge = copy.deepcopy(edge)
-    super_node_indices = np.arange(num_super_nodes) + num_nodes
-    if self_loops:
-        edge['source'] = np.concatenate([edge['source'], super_node_indices])
-        edge['target'] = np.concatenate([edge['target'], super_node_indices])
-    super_node_indices = np.repeat(super_node_indices, [num_nodes])
-    node_indices = (
-        np.tile(np.arange(num_nodes), [num_super_nodes])
+def _add_super_node(
+    data: dict[str, dict[str, np.ndarray]]
+) -> dict[str, dict[str, np.ndarray]]:
+    """Appends one super node, connected to every existing node, to `data`.
+
+    The nested dict is modified in place and also returned. The context
+    'size' is incremented by one, a zero feature row is appended for the
+    super node, and edges are appended in both directions between the super
+    node and each original node. Boolean 'super' masks are written for nodes
+    and edges; edge features (if present) are zero-padded for the new edges.
+    """
+    data['context']['size'] += 1
+    num_nodes = data['node']['feature'].shape[0]
+    num_edges = data['edge']['source'].shape[0]
+    # The super node takes the next free index after the original nodes.
+    super_node_index = num_nodes
+    # Self loops are inferred from the existing edges: if any edge has
+    # source == target, the super node gets a self loop as well.
+    add_self_loops = np.any(
+        data['edge']['source'] == data['edge']['target']
     )
-    edge['source'] = np.concatenate(
-        [edge['source'], node_indices, super_node_indices]
-    ).astype(index_dtype)
-    edge['target'] = np.concatenate(
-        [edge['target'], super_node_indices, node_indices]
-    ).astype(index_dtype)
-    if 'feature' in edge:
-        num_edges = int(edge['feature'].shape[0])
-        num_super_edges = int(num_super_nodes * num_nodes * 2)
-        if self_loops:
-            num_super_edges += num_super_nodes
-        edge['super'] = np.asarray(
-            ([False] * num_edges + [True] * num_super_edges),
-            dtype=bool
+    if add_self_loops:
+        data['edge']['source'] = np.append(
+            data['edge']['source'], super_node_index
         )
-        edge['feature'] = np.concatenate(
-            [
-                edge['feature'],
-                np.zeros(
-                    shape=(num_super_edges, edge['feature'].shape[-1]),
-                    dtype=edge['feature'].dtype
-                )
-            ]
+        data['edge']['target'] = np.append(
+            data['edge']['target'], super_node_index
         )
-    return edge
+    # One extra (zero-padded) feature row for the super node.
+    data['node']['feature'] = np.pad(data['node']['feature'], [(0, 1), (0, 0)])
+    data['node']['super'] = np.asarray([False] * num_nodes + [True])
+    node_indices = list(range(num_nodes))
+    super_node_indices = [super_node_index] * num_nodes
-MolFeaturizer = MolGraphFeaturizer
-MolFeaturizer3D = MolGraphFeaturizer3D
+    # `+` below is list concatenation: first node -> super node edges,
+    # then super node -> node edges.
+    data['edge']['source'] = np.append(
+        data['edge']['source'], node_indices + super_node_indices
+    )
+    data['edge']['target'] = np.append(
+        data['edge']['target'], super_node_indices + node_indices
+    )
+    # Everything appended after the original `num_edges` (including the
+    # optional super-node self loop) counts as a super edge.
+    total_num_edges = data['edge']['source'].shape[0]
+    num_super_edges = (total_num_edges - num_edges)
+    data['edge']['super'] = np.asarray(
+        [False] * num_edges + [True] * num_super_edges
+    )
+    if 'feature' in data['edge']:
+        # Super edges get zero-padded feature rows appended at the end.
+        data['edge']['feature'] = np.pad(
+            data['edge']['feature'], [(0, num_super_edges), (0, 0)]
+        )
+    return data
+def _convert_dtypes(
+    data: dict[str, dict[str, np.ndarray]]
+) -> dict[str, dict[str, np.ndarray]]:
+    """Casts the arrays of a nested dict to 32-bit dtypes, in place.
+
+    Arrays stored under the inner keys 'source', 'target' and 'size' are
+    always cast to `np.int32`. Any other integer array is cast to `np.int32`
+    and any floating-point array to `np.float32`. Arrays of other dtypes
+    (e.g. boolean) are left untouched.
+
+    Args:
+        data: Two-level dict mapping an outer key to a dict of arrays.
+
+    Returns:
+        The same `data` object, with its arrays replaced by the cast ones.
+    """
+    index_keys = ('source', 'target', 'size')
+    for inner_dict in data.values():
+        # Only existing keys are reassigned (no insertions or deletions),
+        # so updating the dict while iterating over it is safe.
+        for inner_key, inner_value in inner_dict.items():
+            if (
+                inner_key in index_keys
+                or np.issubdtype(inner_value.dtype, np.integer)
+            ):
+                inner_dict[inner_key] = inner_value.astype(np.int32)
+            elif np.issubdtype(inner_value.dtype, np.floating):
+                inner_dict[inner_key] = inner_value.astype(np.float32)
+    return data

molcraft 0.1.0a16__py3-none-any.whl → 0.1.0a18__py3-none-any.whl

Potentially problematic release.

molcraft 0.1.0a16py3-none-any.whl → 0.1.0a18py3-none-any.whl