PyPI - nerdd-module - Versions diffs - 0.3.37__tar.gz → 0.3.39__tar.gz - Mend

nerdd-module 0.3.37.tar.gz → 0.3.39.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (99) hide show

{nerdd_module-0.3.37 → nerdd_module-0.3.39}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nerdd-module
-Version: 0.3.37
+Version: 0.3.39
 Summary: Base package to create NERDD modules
 Author-email: Steffen Hirte <steffen.hirte@univie.ac.at>
 Maintainer-email: Steffen Hirte <steffen.hirte@univie.ac.at>

{nerdd_module-0.3.37 → nerdd_module-0.3.39}/nerdd_module/cli.py RENAMED Viewed

@@ -56,7 +56,7 @@ def auto_cli(f: Callable[..., Model], *args: Any, **kwargs: Any) -> None:
     input_format_list = "\n".join([f"* {fmt}" for fmt in ["smiles", "sdf", "inchi"]])
     help_text = input_description.format(
-        description=model.description, input_format_list=input_format_list
+        description=model.config.description, input_format_list=input_format_list
     )
     output_format_list = [
@@ -117,7 +117,7 @@ def auto_cli(f: Callable[..., Model], *args: Any, **kwargs: Any) -> None:
     #
     # Add job parameters
     #
-    for param in model.job_parameters:
+    for param in model.config.job_parameters:
         # convert parameter name to spinal case (e.g. "max_confs" -> "max-confs")
         param_name = spinalcase(param.name)
         main = click.option(

{nerdd_module-0.3.37 → nerdd_module-0.3.39}/nerdd_module/config/models.py RENAMED Viewed

@@ -123,7 +123,7 @@ class Module(BaseModel):
         return spinalcase(self.name)
     task: Optional[Task] = None
-    rank: Optional[int] = None
+    rank: Optional[float] = None
     name: str
     batch_size: int = 100
     version: Optional[str] = None
@@ -203,7 +203,7 @@ class Module(BaseModel):
             for i, j in zip(indices[:-1], indices[1:]):
                 assert i + 1 == j, (
                     f"Properties with the same group should appear next to each other, "
-                    f"but group {group} appears at incides {i} and {j}."
+                    f"but group {group} appears at indices {i} and {j}."
                 )
         return values

{nerdd_module-0.3.37 → nerdd_module-0.3.39}/nerdd_module/model/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 from .assign_name_step import *
 from .convert_representations_step import *
 from .model import *
+from .prediction_step import *
 from .read_input_step import *
-from .simple_model import *
 from .write_output_step import *

nerdd_module-0.3.37/nerdd_module/model/simple_model.py → nerdd_module-0.3.39/nerdd_module/model/model.py RENAMED Viewed

@@ -1,4 +1,5 @@
-from abc import abstractmethod
+import logging
+from abc import ABC, abstractmethod
 from functools import cached_property
 from typing import Any, Iterable, List, Optional, Tuple, Union
@@ -8,7 +9,6 @@ from ..config import (
     Configuration,
     DefaultConfiguration,
     DictConfiguration,
-    JobParameter,
     MergedConfiguration,
     Module,
     PackageConfiguration,
@@ -17,21 +17,22 @@ from ..config import (
 from ..input import DepthFirstExplorer
 from ..preprocessing import PreprocessingStep
 from ..problem import Problem
-from ..steps import Step
+from ..steps import OutputStep, Step
 from ..util import get_file_path_to_instance
 from .assign_name_step import AssignNameStep
 from .convert_representations_step import ConvertRepresentationsStep
 from .enforce_schema_step import EnforceSchemaStep
-from .model import Model
+from .prediction_step import PredictionStep
 from .read_input_step import ReadInputStep
 from .write_output_step import WriteOutputStep
-__all__ = ["SimpleModel"]
+logger = logging.getLogger(__name__)
-class SimpleModel(Model):
+class Model(ABC):
     def __init__(self, preprocessing_steps: Iterable[Step] = []) -> None:
         super().__init__()
         assert isinstance(
             preprocessing_steps, Iterable
         ), f"Expected Iterable for argument preprocessing_steps, got {type(preprocessing_steps)}"
@@ -39,8 +40,12 @@ class SimpleModel(Model):
             f"Expected all elements of preprocessing_steps to be of type Step, "
             f"got {[type(step) for step in preprocessing_steps if not isinstance(step, Step)]}"
         )
         self._preprocessing_steps = preprocessing_steps
+    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
+        return mol, []
     def _get_input_steps(
         self, input: Any, input_format: Optional[str], **kwargs: Any
     ) -> List[Step]:
@@ -59,6 +64,10 @@ class SimpleModel(Model):
             CustomPreprocessingStep(self),
         ]
+    @abstractmethod
+    def _predict_mols(self, mols: List[Mol], **kwargs: Any) -> Iterable[dict]:
+        pass
     def _get_postprocessing_steps(self, output_format: Optional[str], **kwargs: Any) -> List[Step]:
         output_format = output_format or "pandas"
         return [
@@ -67,13 +76,46 @@ class SimpleModel(Model):
             WriteOutputStep(output_format, config=self.config, **kwargs),
         ]
-    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
-        return mol, []
+    def predict(
+        self,
+        input: Any,
+        input_format: Optional[str] = None,
+        output_format: Optional[str] = None,
+        **kwargs: Any,
+    ) -> Any:
+        input_steps = self._get_input_steps(input, input_format, **kwargs)
+        preprocessing_steps = self._get_preprocessing_steps(input, input_format, **kwargs)
+        postprocessing_steps = self._get_postprocessing_steps(output_format, **kwargs)
+        output_step = postprocessing_steps[-1]
+        assert isinstance(output_step, OutputStep), "The last step must be an OutputStep."
+        # make mypy happy by restricting the type of self.config.task
+        assert self.config.task is not None
+        steps = [
+            *input_steps,
+            *preprocessing_steps,
+            PredictionStep(
+                self._predict_mols,
+                task=self.config.task,
+                batch_size=self.config.batch_size,
+                **kwargs,
+            ),
+            *postprocessing_steps,
+        ]
-    @abstractmethod
-    def _predict_mols(self, mols: List[Mol], **kwargs: Any) -> List[dict]:
-        pass
+        # build the pipeline from the list of steps
+        pipeline = None
+        for t in steps:
+            pipeline = t(pipeline)
+        # the last pipeline step holds the result
+        return output_step.get_result()
+    #
+    # Configuration
+    #
     def _get_base_config(self) -> Union[Configuration, dict]:
         # get the class of the nerdd module, e.g. <CypstrateModel>
         nerdd_module_class = self.__class__
@@ -107,6 +149,9 @@ class SimpleModel(Model):
         if isinstance(base_config, dict):
             base_config = DictConfiguration(base_config)
+        # ensure that mandatory properties are present
+        base_config = MergedConfiguration(DefaultConfiguration(self), base_config)
         # add default properties mol_id, raw_input, etc.
         task = base_config.get_dict().task
@@ -180,7 +225,6 @@ class SimpleModel(Model):
         ]
         configs = [
-            DefaultConfiguration(self),
             DictConfiguration({"result_properties": default_properties_start}),
             base_config,
             DictConfiguration({"result_properties": default_properties_end}),
@@ -192,24 +236,9 @@ class SimpleModel(Model):
     def config(self) -> Module:
         return self._get_config().get_dict()
-    def _get_batch_size(self) -> int:
-        default = super()._get_batch_size()
-        return self.config.batch_size or default
-    def _get_name(self) -> str:
-        default = super()._get_name()
-        return self.config.name or default
-    def _get_description(self) -> str:
-        default = super()._get_description()
-        return self.config.description or default
-    def _get_job_parameters(self) -> List[JobParameter]:
-        return super()._get_job_parameters() + self.config.job_parameters
 class CustomPreprocessingStep(PreprocessingStep):
-    def __init__(self, model: SimpleModel):
+    def __init__(self, model: Model):
         super().__init__()
         self.model = model

nerdd_module-0.3.37/nerdd_module/model/model.py → nerdd_module-0.3.39/nerdd_module/model/prediction_step.py RENAMED Viewed

@@ -1,114 +1,24 @@
 import logging
-from abc import ABC, abstractmethod
 from collections import defaultdict
-from typing import Any, Iterable, Iterator, List, Optional, Tuple
+from typing import Any, Callable, DefaultDict, Iterator, List, Set, Tuple
-from rdkit.Chem import Mol
-from stringcase import snakecase  # type: ignore
-from ..config import JobParameter
-from ..problem import Problem
-from ..steps import OutputStep, Step
+from ..config import Task
+from ..problem import IncompletePredictionProblem, UnknownPredictionProblem
+from ..steps import Step
 from ..util import call_with_mappings
 logger = logging.getLogger(__name__)
-# an unknown prediction problem indicates that the model raised an exception during
-# prediction
-def UnknownPredictionProblem() -> Problem:
-    return Problem("unknown_prediction_error", "An unknown error occured during prediction.")
-# an incomplete prediction problem indicates that the model successfully returns
-# predictions, but part of the input molecules are missing in the results
-def IncompletePredictionProblem() -> Problem:
-    return Problem("incomplete_prediction_error", "The model couldn't process the molecule.")
-class Model(ABC):
-    def __init__(self) -> None:
-        super().__init__()
-    @abstractmethod
-    def _predict_mols(self, mols: List[Mol], **kwargs: Any) -> Iterable[dict]:
-        pass
-    @abstractmethod
-    def _get_input_steps(
-        self, input: Any, input_format: Optional[str], **kwargs: Any
-    ) -> List[Step]:
-        pass
-    @abstractmethod
-    def _get_preprocessing_steps(
-        self, input: Any, input_format: Optional[str], **kwargs: Any
-    ) -> List[Step]:
-        pass
-    @abstractmethod
-    def _get_postprocessing_steps(self, output_format: Optional[str], **kwargs: Any) -> List[Step]:
-        pass
-    def predict(
-        self,
-        input: Any,
-        input_format: Optional[str] = None,
-        output_format: Optional[str] = None,
-        **kwargs: Any,
-    ) -> Any:
-        input_steps = self._get_input_steps(input, input_format, **kwargs)
-        preprocessing_steps = self._get_preprocessing_steps(input, input_format, **kwargs)
-        postprocessing_steps = self._get_postprocessing_steps(output_format, **kwargs)
-        output_step = postprocessing_steps[-1]
-        assert isinstance(output_step, OutputStep), "The last step must be an OutputStep."
-        steps = [
-            *input_steps,
-            *preprocessing_steps,
-            PredictionStep(self, batch_size=self.batch_size, **kwargs),
-            *postprocessing_steps,
-        ]
-        # build the pipeline from the list of steps
-        pipeline = None
-        for t in steps:
-            pipeline = t(pipeline)
-        # the last pipeline step holds the result
-        return output_step.get_result()
-    #
-    # Properties
-    #
-    def _get_batch_size(self) -> int:
-        return 1
-    batch_size = property(fget=lambda self: self._get_batch_size())
-    def _get_name(self) -> str:
-        return snakecase(self.__class__.__name__)
-    name = property(fget=lambda self: self._get_name())
-    def _get_description(self) -> str:
-        return ""
-    description = property(fget=lambda self: self._get_description())
-    def _get_job_parameters(self) -> List[JobParameter]:
-        return []
-    job_parameters = property(fget=lambda self: self._get_job_parameters())
+__all__ = ["PredictionStep"]
 class PredictionStep(Step):
-    def __init__(self, model: Model, batch_size: int, **kwargs: Any) -> None:
+    def __init__(self, predict_fn: Callable, task: Task, batch_size: int, **kwargs: Any) -> None:
         super().__init__()
-        self.model = model
-        self.batch_size = batch_size
-        self.kwargs = kwargs
+        self._predict_fn = predict_fn
+        self._task = task
+        self._batch_size = batch_size
+        self._kwargs = kwargs
     def _run(self, source: Iterator[dict]) -> Iterator[dict]:
         # We need to process the molecules in batches, because most ML models perform
@@ -131,7 +41,7 @@ class PredictionStep(Step):
             if len(batch) > 0 or len(none_batch) > 0:
                 yield batch, none_batch
-        for batch, none_batch in _batch_and_filter(source, self.batch_size):
+        for batch, none_batch in _batch_and_filter(source, self._batch_size):
             # return the records where mols are None
             yield from none_batch
@@ -151,8 +61,8 @@ class PredictionStep(Step):
             if len(batch) > 0:
                 predictions = list(
                     call_with_mappings(
-                        self.model._predict_mols,
-                        {**self.kwargs, "mols": mols},
+                        self._predict_fn,
+                        {**self._kwargs, "mols": mols},
                     )
                 )
             else:
@@ -208,10 +118,25 @@ class PredictionStep(Step):
                 record["mol_id"] in mol_id_set
             ), f"The mol_id {record['mol_id']} is not in the batch."
+        # depending on the task, we need to check atom_id or derivative_id
+        if self._task == "atom_property_prediction":
+            sub_id_property = "atom_id"
+        elif self._task == "derivative_property_prediction":
+            sub_id_property = "derivative_id"
+        else:
+            sub_id_property = None
         # create a mapping from mol_id to record (for quick access)
-        mol_id_to_record = defaultdict(list)
+        mol_id_to_record: DefaultDict[int, List[dict]] = defaultdict(list)
         for record in predictions:
-            mol_id_to_record[record["mol_id"]].append(record)
+            current_record_list = mol_id_to_record[record["mol_id"]]
+            current_record_list.append(record)
+            if len(current_record_list) > 1 and sub_id_property is None:
+                raise ValueError(
+                    f"There are duplicate records for mol_id={record['mol_id']}, but the "
+                    f"prediction task {self._task} requires unique mol_id values. The duplicates "
+                    f"are: {current_record_list}."
+                )
         # add all records that are missing in the predictions
         for mol_id in temporary_mol_ids:
@@ -224,19 +149,63 @@ class PredictionStep(Step):
                     }
                 )
-        # If the result has multiple entries per mol_id, check that atom_id or
-        # derivative_id is present in multi-entry results.
-        if len(predictions) > len(batch):
-            for _, records in mol_id_to_record.items():
-                if len(records) > 1:
-                    has_atom_id = all("atom_id" in record for record in records)
-                    has_derivative_id = all("derivative_id" in record for record in records)
-                    assert has_atom_id or has_derivative_id, (
-                        "The result contains multiple entries per molecule, but does "
-                        "not contain atom_id or derivative_id."
+        if sub_id_property is not None:
+            # task must be either atom_property_prediction or derivative_property_prediction
+            # -> check consistency of sub_id_property
+            for mol_id, records in mol_id_to_record.items():
+                sub_ids: Set[int] = set()
+                for record in records:
+                    sub_id = record.get(sub_id_property)
+                    if sub_id is not None:
+                        # check that sub_id is an integer
+                        if not isinstance(sub_id, int):
+                            raise ValueError(
+                                f"The {sub_id_property} must be an integer, but got {sub_id}. "
+                                f"Record: {record}"
+                            )
+                        sub_ids.add(sub_id)
+                if (
+                    len(records) == 1
+                    and "problems" in records[0]
+                    and len(records[0]["problems"]) > 0
+                ):
+                    # this record was not predicted, so we skip it
+                    continue
+                elif len(sub_ids) == 0:
+                    # no record has a sub id, we assign them (sequentially)
+                    for i, record in enumerate(records):
+                        record[sub_id_property] = i
+                    continue
+                elif len(sub_ids) < len(records):
+                    # None is not in sub_ids, but the number of unique sub ids is less than
+                    # the number of records.
+                    # -> there must be duplicates
+                    sub_id_list = [record.get(sub_id_property) for record in records]
+                    raise ValueError(
+                        f"The result with mol_id={mol_id} contains multiple entries per "
+                        f"molecule, but the sequence of {sub_id_property} is not unique. "
+                        f"Found: {sub_id_list}."
                     )
-        # TODO: check range and completeness of atom ids and derivative ids
+                else:
+                    min_sub_id = min(sub_ids)
+                    max_sub_id = max(sub_ids)
+                    if min_sub_id != 0:
+                        raise ValueError(
+                            f"The sequence of {sub_id_property} does not start at 0 for "
+                            f"mol_id={mol_id}. Instead, the minimum {sub_id_property} was "
+                            f"{min_sub_id}."
+                        )
+                    elif max_sub_id - min_sub_id + 1 != len(sub_ids):
+                        # there are gaps in the sequence of sub ids
+                        raise ValueError(
+                            f"The result with mol_id={mol_id} contains multiple entries per "
+                            f"molecule, but the sequence of {sub_id_property} has gaps. "
+                            f"Found: {sub_ids}."
+                        )
         for key, records in mol_id_to_record.items():
             for record in records:

{nerdd_module-0.3.37 → nerdd_module-0.3.39}/nerdd_module/problem.py RENAMED Viewed

@@ -2,6 +2,8 @@ from typing import Iterable, NamedTuple
 __all__ = [
     "Problem",
+    "UnknownPredictionProblem",
+    "IncompletePredictionProblem",
     "InvalidSmiles",
     "UnknownProblem",
     "InvalidWeightProblem",
@@ -14,6 +16,18 @@ class Problem(NamedTuple):
     message: str
+# an unknown prediction problem indicates that the model raised an exception during
+# prediction
+def UnknownPredictionProblem() -> Problem:
+    return Problem("unknown_prediction_error", "An unknown error occured during prediction.")
+# an incomplete prediction problem indicates that the model successfully returns
+# predictions, but part of the input molecules are missing in the results
+def IncompletePredictionProblem() -> Problem:
+    return Problem("incomplete_prediction_error", "The model couldn't process the molecule.")
 def InvalidSmiles() -> Problem:
     return Problem(type="invalid_smiles", message="Invalid SMILES string")

{nerdd_module-0.3.37 → nerdd_module-0.3.39}/nerdd_module/tests/models/AtomicMassModel.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from nerdd_module import SimpleModel
+from nerdd_module import Model
 from nerdd_module.preprocessing import Sanitize
 __all__ = ["AtomicMassModel"]
@@ -7,7 +7,7 @@ __all__ = ["AtomicMassModel"]
 allowed_versions = ["mol_ids", "mols", "iterator", "error"]
-class AtomicMassModel(SimpleModel):
+class AtomicMassModel(Model):
     def __init__(self, preprocessing_steps=[Sanitize()], version="mol_ids", **kwargs):
         assert (
             version in allowed_versions

{nerdd_module-0.3.37 → nerdd_module-0.3.39}/nerdd_module/tests/models/MolWeightModel.py RENAMED Viewed

@@ -1,6 +1,6 @@
 from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
-from nerdd_module import SimpleModel
+from nerdd_module import Model
 from nerdd_module.preprocessing import Sanitize
 __all__ = ["MolWeightModel"]
@@ -8,7 +8,7 @@ __all__ = ["MolWeightModel"]
 allowed_versions = ["order_based", "mol_ids", "mols", "iterator", "error"]
-class MolWeightModel(SimpleModel):
+class MolWeightModel(Model):
     def __init__(self, preprocessing_steps=[Sanitize()], version="order_based", **kwargs):
         assert (
             version in allowed_versions

{nerdd_module-0.3.37 → nerdd_module-0.3.39}/nerdd_module.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nerdd-module
-Version: 0.3.37
+Version: 0.3.39
 Summary: Base package to create NERDD modules
 Author-email: Steffen Hirte <steffen.hirte@univie.ac.at>
 Maintainer-email: Steffen Hirte <steffen.hirte@univie.ac.at>

{nerdd_module-0.3.37 → nerdd_module-0.3.39}/nerdd_module.egg-info/SOURCES.txt RENAMED Viewed

@@ -49,8 +49,8 @@ nerdd_module/model/assign_name_step.py
 nerdd_module/model/convert_representations_step.py
 nerdd_module/model/enforce_schema_step.py
 nerdd_module/model/model.py
+nerdd_module/model/prediction_step.py
 nerdd_module/model/read_input_step.py
-nerdd_module/model/simple_model.py
 nerdd_module/model/write_output_step.py
 nerdd_module/output/__init__.py
 nerdd_module/output/csv_writer.py

{nerdd_module-0.3.37 → nerdd_module-0.3.39}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "nerdd-module"
-version = "0.3.37"
+version = "0.3.39"
 description = "Base package to create NERDD modules"
 readme = "README.md"
 license = "BSD-3-Clause"