PyPI - nerdd-module - Versions diffs - 0.1.6__tar.gz - Mend

nerdd-module 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

nerdd-module-0.1.6/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2023 Molecular Informatics Vienna
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

nerdd-module-0.1.6/PKG-INFO ADDED Viewed

@@ -0,0 +1,87 @@
+Metadata-Version: 2.1
+Name: nerdd-module
+Version: 0.1.6
+Summary: Base package to create NERDD modules
+Home-page: https://github.com/molinfo-vienna/nerdd-module.git
+Maintainer: Steffen Hirte
+Maintainer-email: steffen.hirte@univie.ac.at
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pandas>=1.2.1
+Requires-Dist: pyyaml>=6.0
+Requires-Dist: filetype~=1.2.0
+Requires-Dist: rich-click>=1.7.1
+Requires-Dist: stringcase>=1.2.0
+Requires-Dist: chembl_structure_pipeline>=1.0.0
+Provides-Extra: dev
+Provides-Extra: test
+Requires-Dist: pytest; extra == "test"
+Requires-Dist: pytest-cov; extra == "test"
+Requires-Dist: pytest-asyncio; extra == "test"
+Requires-Dist: pytest-bdd; extra == "test"
+Requires-Dist: pytest-mock; extra == "test"
+Requires-Dist: pytest-watch; extra == "test"
+Requires-Dist: hypothesis; extra == "test"
+Requires-Dist: hypothesis-rdkit; extra == "test"
+# Nerdd Module
+This package provides the basis to implement molecular prediction modules in the
+NERDD ecosystem.
+## Installation
+```pip install nerdd-module```
+## Implement your own module
+A new module is created by inheriting from the ```AbstractModel``` class. The
+preprocessing pipeline is configured by passing ```preprocessing_pipeline``` to the
+constructor of the superclass. The actual prediction procedure is implemented in
+```_predict_mols```:
+```python
+import pandas as pd
+from typing import List
+from rdkit.Chem import Mol
+from nerdd_module import AbstractModel
+class MyModel(AbstractModel):
+    def __init__(self):
+        super().__init__(
+            preprocessing_pipeline="chembl_structure_pipeline",
+        )
+    def _predict_mols(self, mols: List[Mol], custom_param: int = 5) -> pd.DataFrame:
+        # implement prediction logic and return a dataframe with new columns
+        # containing values per input molecule
+        return pd.DataFrame(dict(predictions=[custom_param]*len(mols)))
+```
+For custom preprocessing, specify ```preprocessing_pipeline="custom"``` when calling
+the constructor of the superclass and override the method ```_preprocess_single_mol```:
+```python
+class MyModel(AbstractModel):
+    def __init__(self):
+        # important:
+        super().__init__(preprocessing_pipeline="custom")
+    def _preprocess_single_mol(self, mol: Mol) -> Tuple[Mol, List[str]]:
+        # implement custom preprocessing logic here
+        # return preprocessed molecule and a list of error messages
+        return preprocessed_mol, errors
+    # ...
+```
+## Contribute
+1. Fork and clone the code
+2. Install test dependencies with ```pip install -e .[test]```
+3. Run tests via ```pytest``` or ```pytest-watch``` (short: ```ptw```)
+## Contributors
+* Steffen Hirte

nerdd-module-0.1.6/README.md ADDED Viewed

@@ -0,0 +1,61 @@
+# Nerdd Module
+This package provides the basis to implement molecular prediction modules in the
+NERDD ecosystem.
+## Installation
+```pip install nerdd-module```
+## Implement your own module
+A new module is created by inheriting from the ```AbstractModel``` class. The
+preprocessing pipeline is configured by passing ```preprocessing_pipeline``` to the
+constructor of the superclass. The actual prediction procedure is implemented in
+```_predict_mols```:
+```python
+import pandas as pd
+from typing import List
+from rdkit.Chem import Mol
+from nerdd_module import AbstractModel
+class MyModel(AbstractModel):
+    def __init__(self):
+        super().__init__(
+            preprocessing_pipeline="chembl_structure_pipeline",
+        )
+    def _predict_mols(self, mols: List[Mol], custom_param: int = 5) -> pd.DataFrame:
+        # implement prediction logic and return a dataframe with new columns
+        # containing values per input molecule
+        return pd.DataFrame(dict(predictions=[custom_param]*len(mols)))
+```
+For custom preprocessing, specify ```preprocessing_pipeline="custom"``` when calling
+the constructor of the superclass and override the method ```_preprocess_single_mol```:
+```python
+class MyModel(AbstractModel):
+    def __init__(self):
+        # important:
+        super().__init__(preprocessing_pipeline="custom")
+    def _preprocess_single_mol(self, mol: Mol) -> Tuple[Mol, List[str]]:
+        # implement custom preprocessing logic here
+        # return preprocessed molecule and a list of error messages
+        return preprocessed_mol, errors
+    # ...
+```
+## Contribute
+1. Fork and clone the code
+2. Install test dependencies with ```pip install -e .[test]```
+3. Run tests via ```pytest``` or ```pytest-watch``` (short: ```ptw```)
+## Contributors
+* Steffen Hirte

nerdd-module-0.1.6/nerdd_module/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+import pkg_resources
+from .abstract_model import *
+from .config import *
+from .version import *
+# Load every entry point registered under the "nerdd-module.plugins" group.
+# This runs as a side effect of importing the package.
+for entry_point in pkg_resources.iter_entry_points("nerdd-module.plugins"):
+    entry_point.load()

nerdd-module-0.1.6/nerdd_module/abstract_model.py ADDED Viewed

@@ -0,0 +1,274 @@
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+import pandas as pd
+from rdkit.Chem import Mol, MolToSmiles
+from .config import AutoConfiguration, Configuration
+from .io import MoleculeEntry, guess_and_read
+from .preprocessing import Pipeline, Step, registry
+__all__ = ["AbstractModel"]
+class CustomPreprocessingStep(Step):
+    """Preprocessing step that delegates its work to a supplied callable."""
+    def __init__(self, fn: Callable[[Mol], Tuple[Mol, List[str]]]):
+        """Keep ``fn`` so that ``_run`` can apply it to a molecule."""
+        super().__init__()
+        self.fn = fn
+    def _run(self, mol: Mol) -> Tuple[Mol, List[str]]:
+        """Call the stored callable with ``mol`` and hand back its result."""
+        outcome = self.fn(mol)
+        return outcome
+class AbstractModel(ABC):
+    """Abstract base class for models that predict properties of molecules.
+    Subclasses implement ``_predict_mols``. This class builds the preprocessing
+    pipeline, runs it on every input molecule, calls ``_predict_mols`` on the
+    molecules that survived preprocessing and merges predictions and error
+    messages into one dataframe.
+    """
+    def __init__(
+        self,
+        preprocessing_pipeline: Union[str, Pipeline, Iterable[Step], None],
+        num_processes: int = 1,
+    ):
+        """Set up the preprocessing pipeline and store ``num_processes``.
+        Args:
+            preprocessing_pipeline: one of
+                * ``None`` or ``"custom"``: a pipeline with a single step that
+                  calls ``self._preprocess_single_mol``,
+                * a ``Pipeline`` instance, which is used as is,
+                * the name of a pipeline contained in ``registry``,
+                * an iterable of ``Step`` instances.
+            num_processes: stored as ``self.num_processes``; it is not used
+                anywhere else in this class.
+        Raises:
+            ValueError: if the name is not in ``registry`` or the argument is
+                none of the supported kinds.
+        """
+        #
+        # preprocessing pipeline
+        #
+        if preprocessing_pipeline is None or preprocessing_pipeline == "custom":
+            self.preprocessing_pipeline = Pipeline(
+                steps=[CustomPreprocessingStep(self._preprocess_single_mol)]
+            )
+        elif isinstance(preprocessing_pipeline, Pipeline):
+            self.preprocessing_pipeline = preprocessing_pipeline
+        elif isinstance(preprocessing_pipeline, str):
+            if preprocessing_pipeline in registry:
+                self.preprocessing_pipeline = registry[preprocessing_pipeline]
+            else:
+                # build the list of valid names separately: placing the two
+                # string literals next to each other would merge them into the
+                # separator passed to join and garble the message
+                valid_names = ", ".join(list(registry.keys()) + ["custom"])
+                raise ValueError(
+                    "Invalid preprocessing pipeline. Choose one of the following: "
+                    + valid_names
+                )
+        elif isinstance(preprocessing_pipeline, Iterable):
+            # mypy assumes that preprocessing_pipeline might be a string (although we
+            # checked this case above) and complains about that when constructing the
+            # pipeline
+            # --> explicitly assert that preprocessing_pipeline is not a string
+            assert not isinstance(preprocessing_pipeline, str)
+            # materialize the steps before checking them: a one-shot iterator
+            # (e.g. a generator) would otherwise be exhausted by the type check
+            # and the pipeline would end up without any steps
+            steps = list(preprocessing_pipeline)
+            if not all(isinstance(step, Step) for step in steps):
+                raise ValueError(
+                    f"Invalid preprocessing pipeline {preprocessing_pipeline}."
+                )
+            self.preprocessing_pipeline = Pipeline(steps=steps)
+        else:
+            raise ValueError(
+                f"Invalid preprocessing pipeline {preprocessing_pipeline}."
+            )
+        #
+        # reading molecules
+        #
+        # add methods for all supported formats
+        # TODO
+        #
+        # other parameters
+        #
+        self.num_processes = num_processes
+    def _preprocess_single_mol(self, mol: Mol) -> Tuple[Mol, List[str]]:
+        """Preprocess one molecule; must be overridden for custom preprocessing.
+        Returns:
+            The preprocessed molecule and a list of error messages.
+        Raises:
+            NotImplementedError: always, in this base implementation.
+        """
+        # if this method is called, the preprocessing_pipeline was set to "custom"
+        # and this method has to be overwritten
+        raise NotImplementedError()
+    @abstractmethod
+    def _predict_mols(self, mols: List[Mol], **kwargs) -> pd.DataFrame:
+        """Predict on preprocessed molecules and return the results as a dataframe.
+        ``_predict_entries`` links the rows to the input molecules via a
+        ``mol_id`` column, a ``mol`` column or the row order, and merges an
+        optional ``prediction_errors`` column into the error messages.
+        """
+        pass
+    def _predict_entries(
+        self,
+        inputs: Iterable[MoleculeEntry],
+        **kwargs,
+    ) -> pd.DataFrame:
+        """Preprocess the given entries, predict and assemble the result table.
+        Args:
+            inputs: entries providing the fields ``input``, ``input_type``,
+                ``source``, ``mol`` and ``load_errors`` (in this order).
+            **kwargs: forwarded unchanged to ``_predict_mols``.
+        Returns:
+            A dataframe starting with the columns ``mol_id``, ``input``,
+            ``input_type``, ``source``, ``name``, ``input_mol``,
+            ``preprocessed_mol`` and ``errors``, followed by all remaining
+            columns produced by ``_predict_mols``.
+        Raises:
+            AssertionError: if the prediction uses a reserved column name,
+                refers to unknown molecule ids, has an unexpected number of
+                rows, or has several rows per molecule without ``atom_id`` or
+                ``derivative_id``.
+        """
+        #
+        # LOAD MOLECULES
+        #
+        df_load = pd.DataFrame(
+            inputs,
+            columns=["input", "input_type", "source", "mol", "load_errors"],
+        )
+        df_load["mol_id"] = range(len(df_load))
+        #
+        # PREPROCESS ALL MOLECULES
+        #
+        df_preprocess = pd.DataFrame(
+            [self.preprocessing_pipeline.run(mol) for mol in df_load.mol],
+            columns=["preprocessed_mol", "preprocessing_errors"],
+        )
+        # necessary for models that create multiple (or zero) entries per molecule
+        df_preprocess["mol_id"] = range(len(df_preprocess))
+        # add raw molecules to dataframe
+        df_preprocess["input_mol"] = df_load.mol
+        # add name to dataframe
+        df_preprocess["name"] = [
+            (mol.GetProp("_Name") if mol is not None and mol.HasProp("_Name") else "")
+            for mol in df_preprocess.input_mol
+        ]
+        # add smiles columns for web UI
+        # NOTE: this helper is defined but not called anywhere in this method
+        def _to_smiles(mol):
+            try:
+                return MolToSmiles(mol)
+            except Exception:
+                # best effort: a molecule that cannot be converted yields None
+                # (KeyboardInterrupt / SystemExit are no longer swallowed)
+                return None
+        #
+        # PREPARE PREDICTION OF MOLECULES
+        #
+        # each molecule gets its unique id (0, 1, ..., n) as its name
+        for mol_id, mol in zip(df_preprocess.mol_id, df_preprocess.preprocessed_mol):
+            if mol is not None:
+                mol.SetProp("_Name", str(mol_id))
+        # do the prediction on molecules that are not None
+        df_valid_subset = df_preprocess[df_preprocess.preprocessed_mol.notnull()]
+        #
+        # PREDICTION
+        #
+        df_predictions = self._predict_mols(
+            df_valid_subset.preprocessed_mol.tolist(), **kwargs
+        )
+        #
+        # POST PROCESSING AND ERROR HANDLING
+        #
+        # make sure that reserved column names do not appear in the output dataframe
+        reserved_column_names = ["input", "name", "input_mol"]
+        assert (
+            set(df_predictions.columns).intersection(reserved_column_names) == set()
+        ), f"Do not use reserved column names {', '.join(reserved_column_names)}!"
+        # during prediction, molecules might have been removed / reordered
+        # there are three ways to connect the predictions to the original molecules:
+        # 1. df_prediction contains a column "mol_id" that contains the molecule ids
+        # 2. df_prediction contains a column "mol" that contains the molecules, which
+        #    have the id as their name so that we can match them to the original
+        # 3. df_prediction has the same length as the number of valid molecules
+        #    (and we assume that the order of the molecules is the same)
+        if "mol_id" in df_predictions.columns:
+            # check that mol_id contains only valid ids
+            assert set(df_predictions.mol_id).issubset(
+                set(df_valid_subset.mol_id)
+            ), "The mol_id column must only contain valid ids!"
+            # use mol_id as index
+            df_predictions.set_index("mol_id", drop=True, inplace=True)
+        elif "mol" in df_predictions.columns:
+            # check that molecule names contain only valid ids
+            names = df_predictions.mol.apply(lambda mol: int(mol.GetProp("_Name")))
+            assert set(names).issubset(
+                set(df_preprocess.mol_id)
+            ), "The molecule names must only contain valid ids!"
+            # use mol_id as index
+            df_predictions.set_index(
+                names,
+                inplace=True,
+            )
+            df_predictions.drop(columns="mol", inplace=True)
+        else:
+            assert len(df_predictions) == len(df_valid_subset), (
+                "The number of predicted molecules must be equal to the number of "
+                "valid input molecules."
+            )
+            # use index from input series (type cast if series was empty)
+            df_predictions.set_index(
+                df_valid_subset.index.astype("int64"), inplace=True
+            )
+        # add column that indicates whether a molecule was missing
+        missing_mol_ids = set(df_preprocess.mol_id).difference(df_predictions.index)
+        df_preprocess["missing"] = df_preprocess.mol_id.isin(missing_mol_ids)
+        # merge the preprocessed molecules with the predictions
+        df_result = df_preprocess.merge(
+            df_predictions, left_on="mol_id", right_index=True, how="left"
+        )
+        # if the result has multiple entries per mol_id, check that atom_id or
+        # derivative_id is present
+        if len(df_result) > df_result.mol_id.nunique():
+            assert (
+                "atom_id" in df_result.columns or "derivative_id" in df_result.columns
+            ), (
+                "The result contains multiple entries per molecule, but does not "
+                "contain atom_id or derivative_id."
+            )
+        # merge errors from preprocessing and prediction
+        if "prediction_errors" in df_result.columns:
+            df_result["errors"] = (
+                df_result.preprocessing_errors + df_result.prediction_errors
+            )
+            df_result.drop(columns=["prediction_errors"], inplace=True)
+        else:
+            df_result["errors"] = df_result.preprocessing_errors
+        # "!1" is appended for every molecule without a prediction row
+        df_result["errors"] = df_result.errors + df_result.missing.map(
+            lambda x: ["!1"] if x else []
+        )
+        df_result.drop(columns=["missing", "preprocessing_errors"], inplace=True)
+        # convert errors to string
+        if "errors" in df_result.columns:
+            df_result["errors"] = df_result.errors.map(lambda x: ", ".join(set(x)))
+        else:
+            df_result["errors"] = ""
+        # delete mol column (not needed anymore)
+        df_load.drop(columns=["mol"], inplace=True)
+        # merge load and prediction
+        df_result = df_result.merge(df_load, on="mol_id", how="left")
+        # merge errors from loading and prediction
+        # (at this point the "errors" column already holds one joined string per row)
+        df_result["errors"] = [
+            ", ".join(set(load_errors + [prediction_errors]))
+            for load_errors, prediction_errors in zip(
+                df_result.load_errors, df_result.errors
+            )
+        ]
+        df_result.drop(columns=["load_errors"], inplace=True)
+        # reorder columns
+        mandatory_columns = [
+            "mol_id",
+            "input",
+            "input_type",
+            "source",
+            "name",
+            "input_mol",
+            "preprocessed_mol",
+            "errors",
+        ]
+        remaining_columns = [c for c in df_result.columns if c not in mandatory_columns]
+        df_result = df_result[mandatory_columns + remaining_columns]
+        return df_result
+    def predict(
+        self,
+        inputs: Union[Iterable[str], Iterable[Mol], str, Mol],
+        input_type=None,
+        **kwargs,
+    ):
+        """Read ``inputs`` with ``guess_and_read`` and predict on the entries.
+        Args:
+            inputs: the molecules to predict on.
+            input_type: accepted, but currently not passed on to the reader.
+            **kwargs: forwarded to ``_predict_entries``.
+        Returns:
+            The dataframe produced by ``_predict_entries``.
+        """
+        entries = guess_and_read(inputs)
+        return self._predict_entries(entries, **kwargs)
+    def get_config(self) -> Configuration:
+        """Return an ``AutoConfiguration`` built for this model instance."""
+        return AutoConfiguration(self)

nerdd-module-0.1.6/nerdd_module/cli.py ADDED Viewed

@@ -0,0 +1,142 @@
+import logging
+import os
+import sys
+import rich_click as click
+from decorator import decorator
+from nerdd_module.io import WriterRegistry
+__all__ = ["auto_cli"]
+input_description = """{description}
+INPUT molecules are provided as file paths or strings. The following formats are
+supported:
+{format_list}
+Note that input formats shouldn't be mixed.
+"""
+def infer_click_type(param):
+    """Translate a job parameter description into a click parameter type.
+    A parameter with a ``choices`` entry becomes a ``click.Choice`` over the
+    ``value`` of each choice. Otherwise the ``type`` entry is looked up among
+    ``float``, ``int``, ``str`` and ``bool``; any other (or a missing) type
+    raises ``KeyError``.
+    """
+    if "choices" not in param:
+        primitive_types = {"float": float, "int": int, "str": str, "bool": bool}
+        return primitive_types[param.get("type")]
+    allowed_values = [choice["value"] for choice in param["choices"]]
+    return click.Choice(allowed_values)
+@decorator
+def auto_cli(f, *args, **kwargs):
+    """Build a command line interface for a model and run it.
+    ``f`` is called without arguments and must return the model. The help
+    text, the usage examples and one ``--<name>`` option per entry in
+    ``job_parameters`` are derived from ``model.get_config().get_dict()``.
+    The options ``--output``, ``--format`` and ``--log-level`` are always
+    added. The resulting click command is invoked immediately.
+    ``*args`` and ``**kwargs`` are accepted but not used.
+    """
+    # infer the command name
+    command_name = os.path.basename(sys.argv[0])
+    # get the model
+    model = f()
+    config = model.get_config().get_dict()
+    # compose cli description
+    description = config.get("description", "")
+    format_list = "\n".join(f"* {fmt}" for fmt in ["smiles", "sdf", "inchi"])
+    help_text = input_description.format(
+        description=description, format_list=format_list
+    )
+    # compose footer with examples
+    examples = []
+    if "example_smiles" in config:
+        examples.append(config["example_smiles"])
+    if examples:
+        footer = "Examples:\n" + "".join(
+            f"* {command_name} {example}\n" for example in examples
+        )
+    else:
+        footer = ""
+    # show_default=True: default values are shown in the help text
+    # show_metavars_column=False: the column types are not in a separate column
+    # append_metavars_help=True: the column types are shown below the help text
+    @click.command(context_settings={"show_default": True}, help=help_text)
+    @click.rich_config(
+        help_config=click.RichHelpConfiguration(
+            use_markdown=True,
+            show_metavars_column=False,
+            append_metavars_help=True,
+            footer_text=footer,
+        )
+    )
+    @click.argument("input", type=click.Path(), nargs=-1, required=True)
+    def main(
+        input,
+        format: str,
+        output: click.Path,
+        log_level: str,
+        **kwargs,
+    ):
+        # the parameter names are dictated by the click option names below
+        logging.basicConfig(level=log_level.upper())
+        df_result = model.predict(input, **kwargs)
+        # write results
+        assert format in WriterRegistry().supported_formats
+        writer = WriterRegistry().get_writer(format)
+        entries = (tup._asdict() for tup in df_result.itertuples(index=False))
+        if output.lower() == "stdout":
+            # stdout is owned by the interpreter and must stay open
+            writer.write(sys.stdout, entries)
+        else:
+            # the context manager closes (and thereby flushes) the output file
+            # once writing is done or if the writer raises
+            with click.open_file(output, "wb") as output_handle:
+                writer.write(output_handle, entries)
+    #
+    # Add job parameters
+    #
+    for param in config["job_parameters"]:
+        main = click.option(
+            f"--{param['name']}",
+            default=param.get("default", None),
+            type=infer_click_type(param),
+            help=param.get("help_text", None),
+        )(main)
+    #
+    # Add other options
+    #
+    main = click.option(
+        "--output",
+        default="stdout",
+        type=click.Path(),
+        help="The output file. If 'stdout' is specified, the output is written to stdout.",
+    )(main)
+    main = click.option(
+        "--format",
+        default="csv",
+        type=click.Choice(["csv", "sdf"], case_sensitive=False),
+        help="The output format.",
+    )(main)
+    main = click.option(
+        "--log-level",
+        default="warning",
+        type=click.Choice(
+            ["debug", "info", "warning", "error", "critical"], case_sensitive=False
+        ),
+        help="The logging level.",
+    )(main)
+    return main()

nerdd-module-0.1.6/nerdd_module/config/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+from .auto_configuration import *
+from .configuration import *
+from .default_configuration import *
+from .dict_configuration import *
+from .merged_configuration import *
+from .yaml_configuration import *

nerdd-module-0.1.6/nerdd_module/config/auto_configuration.py ADDED Viewed

@@ -0,0 +1,48 @@
+import os
+import sys
+from .configuration import Configuration
+from .default_configuration import DefaultConfiguration
+from .dict_configuration import DictConfiguration
+from .merged_configuration import MergedConfiguration
+from .yaml_configuration import YamlConfiguration
+__all__ = ["AutoConfiguration"]
+class AutoConfiguration(Configuration):
+    """Configuration assembled from the sources available for a module.
+    The sources are merged in this order: the default configuration, a
+    ``nerdd.yml`` file (if one is found) and the dictionary returned by the
+    module's ``_get_config`` method (if the module has one).
+    """
+    def __init__(self, nerdd_module):
+        """Collect all configuration sources of ``nerdd_module`` and merge them."""
+        super().__init__()
+        module_class = nerdd_module.__class__
+        # 1. default values always come first
+        layers = [DefaultConfiguration(nerdd_module)]
+        # 2. optional nerdd.yml: the search starts at the path of the file that
+        # defines the module class and moves up one directory at a time until
+        # the file is found or the path cannot be shortened any further
+        current = sys.modules[module_class.__module__].__file__ or ""
+        yaml_path = None
+        while yaml_path is None:
+            candidate = os.path.join(current, "nerdd.yml")
+            parent = os.path.dirname(current)
+            if os.path.isfile(candidate):
+                yaml_path = candidate
+            elif parent == current:
+                # top of the directory tree reached without finding the file
+                break
+            else:
+                current = parent
+        if yaml_path is not None:
+            layers.append(YamlConfiguration(yaml_path))
+        # 3. optional configuration provided by the module itself
+        if hasattr(nerdd_module, "_get_config"):
+            layers.append(DictConfiguration(nerdd_module._get_config()))
+        self.delegate = MergedConfiguration(*layers)
+    def _get_dict(self):
+        """Return the dictionary of the merged configuration."""
+        return self.delegate._get_dict()