PyPI - aidsorb - Versions diffs - 0.0.0__py3-none-any.whl - Mend

aidsorb 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

aidsorb/__init__.py +34 -0
aidsorb/_cli.py +46 -0
aidsorb/_internal.py +73 -0
aidsorb/data.py +455 -0
aidsorb/datamodules.py +262 -0
aidsorb/litmodels.py +179 -0
aidsorb/models.py +124 -0
aidsorb/modules.py +403 -0
aidsorb/pkg_data/README.md +1 -0
aidsorb/pkg_data/periodic_table.csv +119 -0
aidsorb/transforms.py +262 -0
aidsorb/utils.py +203 -0
aidsorb/visualize.py +174 -0
aidsorb-0.0.0.dist-info/LICENSE +674 -0
aidsorb-0.0.0.dist-info/METADATA +120 -0
aidsorb-0.0.0.dist-info/RECORD +19 -0
aidsorb-0.0.0.dist-info/WHEEL +5 -0
aidsorb-0.0.0.dist-info/entry_points.txt +3 -0
aidsorb-0.0.0.dist-info/top_level.txt +1 -0

aidsorb/__init__.py ADDED Viewed

@@ -0,0 +1,34 @@
+# This file is part of AIdsorb.
+# Copyright (C) 2024 Antonios P. Sarikas
+# AIdsorb is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+r"""
+**AIdsorb** is a :fa:`python; fa-fade` Python package for **deep learning on
+molecular point clouds**.
+.. admonition:: AIdsorb adopts the following conventions
+    * A ``pcd`` is represented as a :class:`numpy.ndarray` of shape ``(N, 3+C)``.
+    * A molecular ``pcd`` is represented as a :class:`numpy.ndarray` of shape ``(N, 4+C)``
+      where ``N`` is the number of atoms, ``pcd[:, :3]`` are the **atomic
+      coordinates**, ``pcd[:, 3]`` are the **atomic numbers** and ``pcd[:, 4:]``
+      any **additional features**. If ``C == 0``, then the only features are the
+      atomic numbers.
+"""
+# Package metadata exposed as module-level dunder attributes.
+__author__ = 'Antonios P. Sarikas'
+__copyright__ = 'Copyright (c) 2024 Antonios P. Sarikas'
+# SPDX license identifier; it must not carry surrounding whitespace.
+__license__ = 'GPL-3.0-only'

aidsorb/_cli.py ADDED Viewed

@@ -0,0 +1,46 @@
+# This file is part of AIdsorb.
+# Copyright (C) 2024 Antonios P. Sarikas
+# AIdsorb is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+r"""
+This module provides helper functions for the CLI.
+"""
+def lightning_cli():
+    r"""
+    Run the command line interface for the deep learning part.
+
+    The CLI is created by ``LightningCLI`` with
+    :class:`~aidsorb.litmodels.PointLit` as the model class and
+    :class:`~aidsorb.datamodules.PCDDataModule` as the datamodule class.
+    """
+    # Everything is imported inside the function, so these packages are loaded
+    # only when this entry point is actually invoked.
+    from lightning.pytorch import cli
+    from . datamodules import PCDDataModule
+    from . litmodels import PointLit
+    cli.LightningCLI(model_class=PointLit, datamodule_class=PCDDataModule)
+def aidsorb_fire():
+    r"""
+    Run the command line interface for molecular point clouds.
+
+    Three commands are exposed through ``fire``: ``visualize``, ``create`` and
+    ``prepare``.
+    """
+    # Everything is imported inside the function, so these packages are loaded
+    # only when this entry point is actually invoked.
+    import fire
+    from . visualize import draw_pcd_from_file
+    from . utils import pcd_from_dir
+    from . data import prepare_data
+    # Maps the name of each command to the function implementing it.
+    commands = {
+        'visualize': draw_pcd_from_file,
+        'create': pcd_from_dir,
+        'prepare': prepare_data,
+        }
+    fire.Fire(commands)

aidsorb/_internal.py ADDED Viewed

@@ -0,0 +1,73 @@
+# This file is part of AIdsorb.
+# Copyright (C) 2024 Antonios P. Sarikas
+# AIdsorb is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+r"""
+This module provides helper functions and data for use in other modules.
+"""
+from importlib.resources import files
+import pandas as pd
+def _check_shape(array):
+    r"""
+    Validate that ``array`` can be treated as a point cloud.
+
+    A point cloud must be 2-dimensional with at least 3 columns, i.e. it must
+    have shape ``(N, 3+C)``.
+
+    Parameters
+    ----------
+    array
+        Object exposing ``ndim`` and ``shape``.
+
+    Raises
+    ------
+    ValueError
+        If ``array`` is not 2-dimensional or has fewer than 3 columns.
+    """
+    # The number of columns is inspected only when the input is 2D, so that
+    # lower dimensional input never triggers an ``IndexError``.
+    if array.ndim == 2 and array.shape[1] >= 3:
+        return
+    message = (
+            'Expecting array of shape (N, 3+C) '
+            f'but got array of shape {array.shape}!'
+            )
+    raise ValueError(message)
+def _check_shape_vis(array):
+    r"""
+    Validate that ``array`` can be treated as a molecular point cloud.
+
+    A molecular point cloud must be 2-dimensional with at least 4 columns, i.e.
+    it must have shape ``(N, 4+C)``.
+
+    Parameters
+    ----------
+    array
+        Object exposing ``ndim`` and ``shape``.
+
+    Raises
+    ------
+    ValueError
+        If ``array`` is not 2-dimensional or has fewer than 4 columns.
+    """
+    # The number of columns is inspected only when the input is 2D, so that
+    # lower dimensional input never triggers an ``IndexError``.
+    if array.ndim == 2 and array.shape[1] >= 4:
+        return
+    message = (
+            'Expecting array of shape (N, 4+C) '
+            f'but got array of shape {array.shape}!'
+            )
+    raise ValueError(message)
+# Default value for controlling randomness. Used as the default ``seed`` of
+# functions that need reproducible random behavior.
+_SEED = 1
+# Enable copy-on-write for pandas when this module is imported.
+# This will be the default on Pandas 3.0
+pd.options.mode.copy_on_write = True
+# Load the periodic table shipped as package data under ``aidsorb.pkg_data``.
+# The resulting DataFrame is indexed by the ``atomic_number`` column.
+with files('aidsorb.pkg_data').joinpath('periodic_table.csv').open() as fhand:
+    _ptable = pd.read_csv(fhand, index_col='atomic_number')

aidsorb/data.py ADDED Viewed

@@ -0,0 +1,455 @@
+# This file is part of AIdsorb.
+# Copyright (C) 2024 Antonios P. Sarikas
+# AIdsorb is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+r"""
+This module provides helper functions and classes for creating datasets and
+handling point clouds of variable sizes.
+"""
+import os
+import json
+from pathlib import Path
+from typing import Sequence
+import numpy as np
+import torch
+from torch.utils.data import random_split, Dataset
+from torch.nn.utils.rnn import pad_sequence
+from . _internal import _SEED, pd
+def prepare_data(source: str, split_ratio: Sequence=(0.8, 0.1, 0.1), seed: int=_SEED):
+    r"""
+    Split a source of point clouds in train, validation and test sets.
+
+    Each ``.json`` file that is created, stores the names of the point clouds
+    that will be used for *training*, *validation* and *testing*.
+
+    .. warning::
+        * No directory is created by :func:`prepare_data`. All ``.json`` files
+          are stored under the directory containing ``source``.
+        * Splitting doesn't support stratification. If your dataset is small and
+          you want to perform classification, consider using
+          `train_test_split`_.
+
+    .. _train_test_split: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
+
+    Parameters
+    ----------
+    source : str
+        Absolute or relative path to the file holding the point clouds.
+    split_ratio : sequence, default=(0.8, 0.1, 0.1)
+        The sizes or fractions of splits to be produced.
+
+        * ``split_ratio[0] == train``.
+        * ``split_ratio[1] == validation``.
+        * ``split_ratio[2] == test``.
+    seed : int, default=1
+        Controls the randomness of the ``rng`` used for splitting.
+
+    Examples
+    --------
+    Before the split::
+
+        pcd_data
+        └──source.npz
+
+    >>> prepare_data('path/to/pcd_data/source.npz')  # doctest: +SKIP
+
+    After the split::
+
+        pcd_data
+        ├──source.npz
+        ├──train.json
+        ├──validation.json
+        └──test.json
+    """
+    rng = torch.Generator().manual_seed(seed)
+    out_dir = Path(source).parent
+    # Only the names of the point clouds are needed. The archive is closed
+    # as soon as they are read, so that no file handle is leaked.
+    with np.load(source) as pcds:
+        pcd_names = pcds.files
+    train, val, test = random_split(pcd_names, split_ratio, generator=rng)
+    for split, mode in zip((train, val, test), ('train', 'validation', 'test')):
+        with open(out_dir / f'{mode}.json', 'w', encoding='utf-8') as fhand:
+            json.dump(list(split), fhand, indent=4)
+    print('\033[32mData preparation completed!\033[0m')
+def get_names(filename):
+    r"""
+    Read the names stored in a ``.json`` file.
+
+    Parameters
+    ----------
+    filename : str
+        The name of the ``.json`` file to read.
+
+    Returns
+    -------
+    names : list
+        The deserialized content of ``filename``.
+    """
+    with open(filename, 'r') as json_file:
+        return json.load(json_file)
+def upsample_pcd(pcd, size):
+    r"""
+    Upsample ``pcd`` to a new ``size`` by sampling with replacement from ``pcd``.
+
+    The original points are kept in their original order and the sampled
+    points are appended after them.
+
+    Parameters
+    ----------
+    pcd : tensor of shape (N, C)
+        The original point cloud of size ``N``.
+    size : int
+        The size of the new point cloud. Must be at least ``N``.
+
+    Returns
+    -------
+    new_pcd : tensor of shape (size, C).
+
+    Raises
+    ------
+    ValueError
+        If ``size < len(pcd)``.
+
+    Examples
+    --------
+    >>> pcd = torch.tensor([[2, 4, 5, 6]])
+    >>> upsample_pcd(pcd, 3)
+    tensor([[2, 4, 5, 6],
+            [2, 4, 5, 6],
+            [2, 4, 5, 6]])
+
+    >>> # New points must be from pcd.
+    >>> pcd = torch.randn(10, 4)
+    >>> new_pcd = upsample_pcd(pcd, 20)
+    >>> (new_pcd[-1] == pcd).all(1).any()  # Check for last point.
+    tensor(True)
+
+    >>> # No upsampling.
+    >>> pcd = torch.randn(100, 4)
+    >>> new_pcd = upsample_pcd(pcd, len(pcd))
+    >>> torch.equal(pcd, new_pcd)
+    True
+    """
+    n_samples = size - len(pcd)
+    # Fail with an explicit message instead of letting the sampler complain
+    # about a negative number of samples.
+    if n_samples < 0:
+        raise ValueError(
+                f'size must be >= len(pcd) but got size={size} '
+                f'and len(pcd)={len(pcd)}!'
+                )
+    # Sampling relies on NumPy's global random state.
+    indices = torch.from_numpy(np.random.choice(len(pcd), n_samples, replace=True))
+    new_points = pcd[indices]
+    return torch.cat((pcd, new_points))
+def pad_pcds(pcds, channels_first=True, mode='upsample'):
+    r"""
+    Pad a sequence of variable size point clouds.
+
+    Each point cloud must have shape ``(N_i, C)``.
+
+    Parameters
+    ----------
+    pcds : sequence of tensors
+    mode : {'zeropad', 'upsample'}, default='upsample'
+    channels_first : bool, default=True
+
+    Returns
+    -------
+    batch : tensor of shape (B, T, C) or (B, C, T)
+         If ``channels_first=False``, then ``batch`` has shape ``(B, T, C)``,
+         where  ``B == len(pcds)`` is the batch size and ``T`` is the size of
+         the largest point cloud in ``pcds``. Otherwise, ``(B, C, T)``.
+
+    Raises
+    ------
+    ValueError
+        If ``mode`` is neither ``'zeropad'`` nor ``'upsample'``.
+
+    See Also
+    --------
+    :func:`upsample_pcd` : For a description of ``'upsample'`` mode.
+    :func:`torch.nn.utils.rnn.pad_sequence` : For a description of ``'zeropad'`` mode.
+
+    Examples
+    --------
+    >>> x1 = torch.tensor([[1, 2, 3, 4]])
+    >>> x2 = torch.tensor([[2, 5, 3, 8], [0, 2, 8, 9]])
+
+    >>> batch = pad_pcds((x1, x2), channels_first=False)
+    >>> batch
+    tensor([[[1, 2, 3, 4],
+             [1, 2, 3, 4]],
+    <BLANKLINE>
+            [[2, 5, 3, 8],
+             [0, 2, 8, 9]]])
+
+    >>> batch = pad_pcds((x1, x2), channels_first=True)
+    >>> batch
+    tensor([[[1, 1],
+             [2, 2],
+             [3, 3],
+             [4, 4]],
+    <BLANKLINE>
+            [[2, 0],
+             [5, 2],
+             [3, 8],
+             [8, 9]]])
+
+    >>> batch = pad_pcds((x1, x2), channels_first=False, mode='zeropad')
+    >>> batch
+    tensor([[[1, 2, 3, 4],
+             [0, 0, 0, 0]],
+    <BLANKLINE>
+            [[2, 5, 3, 8],
+             [0, 2, 8, 9]]])
+
+    >>> batch = pad_pcds((x1, x2), channels_first=True, mode='zeropad')
+    >>> batch
+    tensor([[[1, 0],
+             [2, 0],
+             [3, 0],
+             [4, 0]],
+    <BLANKLINE>
+            [[2, 0],
+             [5, 2],
+             [3, 8],
+             [8, 9]]])
+    """
+    if mode == 'zeropad':
+        batch = pad_sequence(pcds, batch_first=True, padding_value=0)
+    elif mode == 'upsample':
+        max_len = max(len(p) for p in pcds)
+        # Point clouds that already have the maximum size are used as is.
+        new_pcds = [upsample_pcd(p, max_len) if len(p) < max_len else p for p in pcds]
+        batch = torch.stack(new_pcds)
+    else:
+        # Without this branch ``batch`` would be unbound for an unknown mode.
+        raise ValueError(
+                f"mode must be 'zeropad' or 'upsample' but got {mode!r}!"
+                )
+    # Shape (B, n_points, C).
+    if channels_first:
+        batch = batch.transpose(1, 2)  # Shape (B, C, n_points).
+    return batch
+class Collator:
+    r"""
+    Callable that collates a sequence of samples into a ``batch``.
+
+    The point clouds are padded with :func:`pad_pcds` so they can form a batch,
+    and the labels are stacked.
+
+    .. rubric:: Shapes
+
+    * Input: sequence of samples
+        Each sample is a tuple of tensors ``(pcd, label)``, where
+        ``pcd`` has shape ``(N_i, C)`` and ``label`` has shape
+        ``(n_outputs,)`` or ``()``.
+    * Output: tuple of length 2
+        * ``batch[0] == x`` with shape ``(B, C, T)`` if ``channels_first=True``,
+          otherwise ``(B, T, C)``. ``B`` is the batch size and ``T`` is the size
+          of the largest point cloud in the sequence.
+        * ``batch[1] == y`` with shape ``(B, n_outputs)`` or ``(B,)``.
+
+    .. tip::
+        Use an instance of this class as ``collate_fn`` with
+        ``channels_first=True``, if your model is :class:`~aidsorb.models.PointNet`.
+
+    .. todo::
+        Add functionality for collating only point clouds (useful when the
+        dataset is unlabeled).
+
+    Parameters
+    ----------
+    channels_first : bool, default=True
+    mode : {'zeropad', 'upsample'}, default='upsample'
+
+    See Also
+    --------
+    :func:`pad_pcds` : For a description of the parameters.
+    :func:`upsample_pcd` : For a description of the parameters.
+
+    Examples
+    --------
+    >>> sample1 = (torch.tensor([[1, 4, 5, 2]]), torch.tensor([1., 2.]))
+    >>> sample2 = (torch.tensor([[0, 4, 0, 2], [2, 4, 1, 8]]), torch.tensor([7., 3.]))
+
+    >>> collate_fn = Collator()
+    >>> x, y = collate_fn((sample1, sample2))
+    >>> x.shape
+    torch.Size([2, 4, 2])
+    >>> y.shape
+    torch.Size([2, 2])
+    >>> x
+    tensor([[[1, 1],
+             [4, 4],
+             [5, 5],
+             [2, 2]],
+    <BLANKLINE>
+            [[0, 2],
+             [4, 4],
+             [0, 1],
+             [2, 8]]])
+    >>> y
+    tensor([[1., 2.],
+            [7., 3.]])
+
+    >>> collate_fn = Collator(channels_first=False, mode='zeropad')
+    >>> x, y = collate_fn((sample1, sample2))
+    >>> x
+    tensor([[[1, 4, 5, 2],
+             [0, 0, 0, 0]],
+    <BLANKLINE>
+            [[0, 4, 0, 2],
+             [2, 4, 1, 8]]])
+    >>> y
+    tensor([[1., 2.],
+            [7., 3.]])
+
+    >>> # Label has shape (), i.e. is scalar.
+    >>> sample1 = (torch.tensor([[3, 4, 3, 2]]), torch.tensor(0))
+    >>> sample2 = (torch.tensor([[2, 4, 8, 2], [9, 4, 1, 8]]), torch.tensor(1))
+    >>> collate_fn = Collator(channels_first=False, mode='zeropad')
+    >>> x, y = collate_fn((sample1, sample2))
+    >>> x
+    tensor([[[3, 4, 3, 2],
+             [0, 0, 0, 0]],
+    <BLANKLINE>
+            [[2, 4, 8, 2],
+             [9, 4, 1, 8]]])
+    >>> y
+    tensor([0, 1])
+    """
+    def __init__(self, channels_first=True, mode='upsample'):
+        # Both options are stored as is and forwarded to ``pad_pcds`` on call.
+        self.mode = mode
+        self.channels_first = channels_first
+    def __call__(self, samples):
+        r"""
+        Collate ``samples`` into a batch.
+
+        Parameters
+        ----------
+        samples : sequence of tuples
+            Each sample is a tuple of tensors ``(pcd, label)`` where
+            ``pcd.shape == (n_points, C)`` and ``label`` has shape
+            ``(n_outputs,)`` or ``()``.
+
+        Returns
+        -------
+        batch : tuple of length 2
+            * ``batch[0] == x`` with shape ``(B, C, T)`` or ``(B, T, C)``, where
+              ``T`` is the size of the largest point cloud.
+            * ``batch[1] == y`` with shape ``(B, n_outputs)`` or ``(B,)``.
+        """
+        # Regroup [(pcd, label), ...] into (pcd, ...) and (label, ...).
+        clouds, targets = zip(*samples)
+        padded = pad_pcds(clouds, channels_first=self.channels_first, mode=self.mode)
+        return padded, torch.stack(targets)
+class PCDDataset(Dataset):
+    r"""
+    ``Dataset`` for point clouds.
+
+    Indexing returns ``pcd`` for unlabeled datasets and ``(pcd, label)`` for
+    labeled ones, as tensors of dtype ``torch.float``.
+
+    .. tip::
+        For implementing your own transforms, have a look at the transforms
+        `tutorial`_.  For more flexibility, consider implementing them as
+        callable instances of classes.
+
+    .. _tutorial: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#transforms
+
+    .. note::
+        If ``path_to_Y`` is specified, ``labels`` and ``index_col`` must be
+        specified as well, since they select the columns that are read from
+        the ``.csv`` file.
+
+    Parameters
+    ----------
+    pcd_names : list
+        List containing the names of the point clouds.
+    path_to_X : str
+        Absolute or relative path to the ``.npz`` file holding the point clouds.
+    path_to_Y : str, optional
+        Absolute or relative path to the ``.csv`` file holding the labels of the
+        point clouds.
+
+        .. warning::
+            The comma ``,`` is assumed as the field separator.
+    index_col : str, optional
+        Column name of the ``.csv`` file to be used as row labels. The names
+        (values) under this column must follow the same naming scheme as in
+        ``pcd_names``.
+    labels : list, optional
+        List containing the names of the properties to be predicted. No effect
+        if ``path_to_Y=None``.
+    transform_x : callable, optional
+        Transforms applied to ``input``, i.e to each point cloud.
+    transform_y : callable, optional
+        Transforms applied to ``output``. No effect if ``path_to_Y=None``.
+
+    Raises
+    ------
+    ValueError
+        If ``labels`` is specified but is not a list.
+
+    See Also
+    --------
+    :mod:`aidsorb.transforms` : For available point cloud transformations.
+    """
+    def __init__(
+            self, pcd_names, path_to_X,
+            path_to_Y=None, index_col=None, labels=None,
+            transform_x=None, transform_y=None,
+            ):
+        # ``isinstance`` also accepts subclasses of list. A plain string is
+        # still rejected, so it is never unpacked character by character.
+        if (labels is not None) and not isinstance(labels, list):
+            raise ValueError('labels must be a list!')
+        self._pcd_names = pcd_names
+        self.path_to_X = path_to_X
+        self.path_to_Y = path_to_Y
+        self.labels = labels
+        self.index_col = index_col
+        self.transform_x = transform_x
+        self.transform_y = transform_y
+        # Both are loaded on first access, see ``__getitem__``.
+        self.X = None
+        self.Y = None
+    @property
+    def pcd_names(self):
+        r"""The names of the point clouds."""
+        return self._pcd_names
+    def __len__(self):
+        return len(self.pcd_names)
+    def __getitem__(self, idx):
+        # Account for np.load and multiprocessing: the files are opened on
+        # first access instead of in ``__init__``.
+        if self.X is None:
+            self.X = np.load(self.path_to_X)
+        if self.Y is None and self.path_to_Y is not None:
+            # Read only the requested labels plus the column used as index.
+            self.Y = pd.read_csv(
+                    self.path_to_Y,
+                    index_col=self.index_col,
+                    usecols=[*self.labels, self.index_col],
+                    )
+        name = self.pcd_names[idx]
+        sample_x = self.X[name]
+        if self.transform_x is not None:
+            sample_x = self.transform_x(sample_x)
+        # Only for labeled datasets.
+        if self.Y is not None:
+            sample_y = self.Y.loc[name].to_numpy()
+            if self.transform_y is not None:
+                sample_y = self.transform_y(sample_y)
+            return (
+                    torch.tensor(sample_x, dtype=torch.float),
+                    torch.tensor(sample_y, dtype=torch.float)
+                    )
+        return torch.tensor(sample_x, dtype=torch.float)