PyPI - nerdd-module - Versions diffs - 0.3.38__tar.gz → 0.3.40__tar.gz - Mend

nerdd-module 0.3.38.tar.gz → 0.3.40.tar.gz

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (101) hide show

{nerdd_module-0.3.38 → nerdd_module-0.3.40}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nerdd-module
-Version: 0.3.38
+Version: 0.3.40
 Summary: Base package to create NERDD modules
 Author-email: Steffen Hirte <steffen.hirte@univie.ac.at>
 Maintainer-email: Steffen Hirte <steffen.hirte@univie.ac.at>

{nerdd_module-0.3.38 → nerdd_module-0.3.40}/nerdd_module/config/models.py RENAMED Viewed

@@ -203,7 +203,7 @@ class Module(BaseModel):
             for i, j in zip(indices[:-1], indices[1:]):
                 assert i + 1 == j, (
                     f"Properties with the same group should appear next to each other, "
-                    f"but group {group} appears at incides {i} and {j}."
+                    f"but group {group} appears at indices {i} and {j}."
                 )
         return values

{nerdd_module-0.3.38 → nerdd_module-0.3.40}/nerdd_module/model/model.py RENAMED Viewed

@@ -90,10 +90,18 @@ class Model(ABC):
         assert isinstance(output_step, OutputStep), "The last step must be an OutputStep."
+        # make mypy happy by restricting the type of self.config.task
+        assert self.config.task is not None
         steps = [
             *input_steps,
             *preprocessing_steps,
-            PredictionStep(self._predict_mols, batch_size=self.config.batch_size, **kwargs),
+            PredictionStep(
+                self._predict_mols,
+                task=self.config.task,
+                batch_size=self.config.batch_size,
+                **kwargs,
+            ),
             *postprocessing_steps,
         ]
@@ -141,6 +149,9 @@ class Model(ABC):
         if isinstance(base_config, dict):
             base_config = DictConfiguration(base_config)
+        # ensure that mandatory properties are present
+        base_config = MergedConfiguration(DefaultConfiguration(self), base_config)
         # add default properties mol_id, raw_input, etc.
         task = base_config.get_dict().task
@@ -214,7 +225,6 @@ class Model(ABC):
         ]
         configs = [
-            DefaultConfiguration(self),
             DictConfiguration({"result_properties": default_properties_start}),
             base_config,
             DictConfiguration({"result_properties": default_properties_end}),

{nerdd_module-0.3.38 → nerdd_module-0.3.40}/nerdd_module/model/prediction_step.py RENAMED Viewed

@@ -1,7 +1,8 @@
 import logging
 from collections import defaultdict
-from typing import Any, Callable, Iterator, List, Tuple
+from typing import Any, Callable, DefaultDict, Iterator, List, Set, Tuple
+from ..config import Task
 from ..problem import IncompletePredictionProblem, UnknownPredictionProblem
 from ..steps import Step
 from ..util import call_with_mappings
@@ -12,9 +13,10 @@ __all__ = ["PredictionStep"]
 class PredictionStep(Step):
-    def __init__(self, predict_fn: Callable, batch_size: int, **kwargs: Any) -> None:
+    def __init__(self, predict_fn: Callable, task: Task, batch_size: int, **kwargs: Any) -> None:
         super().__init__()
         self._predict_fn = predict_fn
+        self._task = task
         self._batch_size = batch_size
         self._kwargs = kwargs
@@ -116,10 +118,25 @@ class PredictionStep(Step):
                 record["mol_id"] in mol_id_set
             ), f"The mol_id {record['mol_id']} is not in the batch."
+        # depending on the task, we need to check atom_id or derivative_id
+        if self._task == "atom_property_prediction":
+            sub_id_property = "atom_id"
+        elif self._task == "derivative_property_prediction":
+            sub_id_property = "derivative_id"
+        else:
+            sub_id_property = None
         # create a mapping from mol_id to record (for quick access)
-        mol_id_to_record = defaultdict(list)
+        mol_id_to_record: DefaultDict[int, List[dict]] = defaultdict(list)
         for record in predictions:
-            mol_id_to_record[record["mol_id"]].append(record)
+            current_record_list = mol_id_to_record[record["mol_id"]]
+            current_record_list.append(record)
+            if len(current_record_list) > 1 and sub_id_property is None:
+                raise ValueError(
+                    f"There are duplicate records for mol_id={record['mol_id']}, but the "
+                    f"prediction task {self._task} requires unique mol_id values. The duplicates "
+                    f"are: {current_record_list}."
+                )
         # add all records that are missing in the predictions
         for mol_id in temporary_mol_ids:
@@ -132,19 +149,63 @@ class PredictionStep(Step):
                     }
                 )
-        # If the result has multiple entries per mol_id, check that atom_id or
-        # derivative_id is present in multi-entry results.
-        if len(predictions) > len(batch):
-            for _, records in mol_id_to_record.items():
-                if len(records) > 1:
-                    has_atom_id = all("atom_id" in record for record in records)
-                    has_derivative_id = all("derivative_id" in record for record in records)
-                    assert has_atom_id or has_derivative_id, (
-                        "The result contains multiple entries per molecule, but does "
-                        "not contain atom_id or derivative_id."
+        if sub_id_property is not None:
+            # task must be either atom_property_prediction or derivative_property_prediction
+            # -> check consistency of sub_id_property
+            for mol_id, records in mol_id_to_record.items():
+                sub_ids: Set[int] = set()
+                for record in records:
+                    sub_id = record.get(sub_id_property)
+                    if sub_id is not None:
+                        # check that sub_id is an integer
+                        if not isinstance(sub_id, int):
+                            raise ValueError(
+                                f"The {sub_id_property} must be an integer, but got {sub_id}. "
+                                f"Record: {record}"
+                            )
+                        sub_ids.add(sub_id)
+                if (
+                    len(records) == 1
+                    and "problems" in records[0]
+                    and len(records[0]["problems"]) > 0
+                ):
+                    # this record was not predicted, so we skip it
+                    continue
+                elif len(sub_ids) == 0:
+                    # no record has a sub id, we assign them (sequentially)
+                    for i, record in enumerate(records):
+                        record[sub_id_property] = i
+                    continue
+                elif len(sub_ids) < len(records):
+                    # None is not in sub_ids, but the number of unique sub ids is less than
+                    # the number of records.
+                    # -> there must be duplicates
+                    sub_id_list = [record.get(sub_id_property) for record in records]
+                    raise ValueError(
+                        f"The result with mol_id={mol_id} contains multiple entries per "
+                        f"molecule, but the sequence of {sub_id_property} is not unique. "
+                        f"Found: {sub_id_list}."
                     )
-        # TODO: check range and completeness of atom ids and derivative ids
+                else:
+                    min_sub_id = min(sub_ids)
+                    max_sub_id = max(sub_ids)
+                    if min_sub_id != 0:
+                        raise ValueError(
+                            f"The sequence of {sub_id_property} does not start at 0 for "
+                            f"mol_id={mol_id}. Instead, the minimum {sub_id_property} was "
+                            f"{min_sub_id}."
+                        )
+                    elif max_sub_id - min_sub_id + 1 != len(sub_ids):
+                        # there are gaps in the sequence of sub ids
+                        raise ValueError(
+                            f"The result with mol_id={mol_id} contains multiple entries per "
+                            f"molecule, but the sequence of {sub_id_property} has gaps. "
+                            f"Found: {sub_ids}."
+                        )
         for key, records in mol_id_to_record.items():
             for record in records:

{nerdd_module-0.3.38 → nerdd_module-0.3.40}/nerdd_module/preprocessing/__init__.py RENAMED Viewed

@@ -3,5 +3,6 @@ from .chembl_structure_pipeline import *
 from .filter_by_element import *
 from .filter_by_weight import *
 from .preprocessing_step import *
+from .remove_small_fragments import *
 from .remove_stereochemistry import *
 from .sanitize import *

{nerdd_module-0.3.38 → nerdd_module-0.3.40}/nerdd_module/preprocessing/filter_by_weight.py RENAMED Viewed

@@ -6,12 +6,14 @@ from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
 from ..problem import InvalidWeightProblem, Problem
 from .preprocessing_step import PreprocessingStep
+__all__ = ["FilterByWeight"]
 class FilterByWeight(PreprocessingStep):
     def __init__(
         self,
-        min_weight: float,
-        max_weight: float,
+        min_weight: float = 0,
+        max_weight: float = float("inf"),
         remove_invalid_molecules: bool = False,
     ) -> None:
         super().__init__()

nerdd_module-0.3.40/nerdd_module/preprocessing/remove_small_fragments.py ADDED Viewed

@@ -0,0 +1,26 @@
+from typing import List, Optional, Tuple
+from rdkit.Chem import GetMolFrags, Mol
+from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
+from ..problem import Problem
+from .preprocessing_step import PreprocessingStep
+__all__ = ["RemoveSmallFragments"]
+class RemoveSmallFragments(PreprocessingStep):
+    def __init__(
+        self,
+    ) -> None:
+        super().__init__()
+    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
+        fragments = GetMolFrags(mol, asMols=True)
+        if len(fragments) > 1:
+            # select the largest fragment
+            largest_fragment = max(fragments, key=CalcExactMolWt)
+        else:
+            largest_fragment = mol
+        return largest_fragment, []

{nerdd_module-0.3.38 → nerdd_module-0.3.40}/nerdd_module/preprocessing/remove_stereochemistry.py RENAMED Viewed

@@ -6,6 +6,8 @@ from rdkit.Chem import RemoveStereochemistry as remove_stereochemistry
 from ..problem import Problem
 from .preprocessing_step import PreprocessingStep
+__all__ = ["RemoveStereochemistry"]
 class RemoveStereochemistry(PreprocessingStep):
     def __init__(self) -> None:

nerdd_module-0.3.40/nerdd_module/preprocessing/sanitize.py ADDED Viewed

@@ -0,0 +1,31 @@
+import logging
+from typing import List, Optional, Tuple
+from rdkit.Chem import AtomKekulizeException, KekulizeException, Mol, SanitizeMol
+from ..problem import Problem
+from .preprocessing_step import PreprocessingStep
+__all__ = ["Sanitize"]
+logger = logging.getLogger(__name__)
+class Sanitize(PreprocessingStep):
+    def __init__(self) -> None:
+        super().__init__()
+    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
+        try:
+            SanitizeMol(mol)
+            return mol, []
+        except KekulizeException:
+            return None, [Problem("kekulization_error", "Failed kekulizing the molecule.")]
+        except AtomKekulizeException:
+            return None, [
+                Problem("atom_kekulization_error", "Failed kekulizing an atom in the molecule.")
+            ]
+        except Exception as e:
+            logger.exception(e)
+            return None, [Problem("sanitization_error", "Failed sanitizing the molecule.")]

{nerdd_module-0.3.38 → nerdd_module-0.3.40}/nerdd_module/tests/predictions.py RENAMED Viewed

@@ -48,6 +48,12 @@ def predictions_atomic_mass_model(representations, version, multiplier):
         output_format="record_list",
     )
+@when(
+    "all results are considered",
+    target_fixture="subset",
+)
+def subset_without_none(predictions):
+    return predictions
 @when(
     "the subset of the result where the input was not None is considered",

{nerdd_module-0.3.38 → nerdd_module-0.3.40}/nerdd_module.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nerdd-module
-Version: 0.3.38
+Version: 0.3.40
 Summary: Base package to create NERDD modules
 Author-email: Steffen Hirte <steffen.hirte@univie.ac.at>
 Maintainer-email: Steffen Hirte <steffen.hirte@univie.ac.at>

{nerdd_module-0.3.38 → nerdd_module-0.3.40}/nerdd_module.egg-info/SOURCES.txt RENAMED Viewed

@@ -75,6 +75,7 @@ nerdd_module/preprocessing/chembl_structure_pipeline.py
 nerdd_module/preprocessing/filter_by_element.py
 nerdd_module/preprocessing/filter_by_weight.py
 nerdd_module/preprocessing/preprocessing_step.py
+nerdd_module/preprocessing/remove_small_fragments.py
 nerdd_module/preprocessing/remove_stereochemistry.py
 nerdd_module/preprocessing/sanitize.py
 nerdd_module/steps/__init__.py

{nerdd_module-0.3.38 → nerdd_module-0.3.40}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "nerdd-module"
-version = "0.3.38"
+version = "0.3.40"
 description = "Base package to create NERDD modules"
 readme = "README.md"
 license = "BSD-3-Clause"

nerdd_module-0.3.38/nerdd_module/preprocessing/sanitize.py DELETED Viewed

@@ -1,21 +0,0 @@
-from typing import List, Optional, Tuple
-from rdkit.Chem import Mol, SanitizeMol
-from ..problem import Problem
-from .preprocessing_step import PreprocessingStep
-__all__ = ["Sanitize"]
-class Sanitize(PreprocessingStep):
-    def __init__(self) -> None:
-        super().__init__()
-    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
-        problems: List[Problem] = []
-        # sanitize molecule
-        SanitizeMol(mol)
-        return mol, problems