nerdd-module 0.2.4__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/PKG-INFO +6 -3
  2. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/abstract_model.py +16 -11
  3. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/cli.py +1 -1
  4. nerdd_module-0.2.6/nerdd_module/config/default_configuration.py +41 -0
  5. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/config/merged_configuration.py +2 -0
  6. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/__init__.py +0 -4
  7. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/file_reader.py +21 -10
  8. nerdd_module-0.2.6/nerdd_module/input/reader_registry.py +64 -0
  9. nerdd_module-0.2.6/nerdd_module/output/__init__.py +1 -0
  10. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/output}/csv_writer.py +1 -1
  11. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/polyfills/__init__.py +1 -0
  12. {nerdd-module-0.2.4/nerdd_module → nerdd_module-0.2.6/nerdd_module/polyfills}/version.py +2 -4
  13. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/preprocessing/__init__.py +2 -1
  14. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/preprocessing/check_valid_smiles.py +4 -6
  15. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/preprocessing/chembl_structure_pipeline.py +3 -3
  16. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/preprocessing/filter_by_element.py +2 -2
  17. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/preprocessing/filter_by_weight.py +7 -5
  18. nerdd_module-0.2.6/nerdd_module/preprocessing/sanitize.py +18 -0
  19. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/preprocessing/step.py +3 -3
  20. nerdd_module-0.2.6/nerdd_module/problem.py +13 -0
  21. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/tests/checks.py +54 -4
  22. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/tests/representations.py +1 -1
  23. nerdd_module-0.2.6/nerdd_module/version.py +5 -0
  24. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module.egg-info/PKG-INFO +6 -3
  25. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module.egg-info/SOURCES.txt +22 -20
  26. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module.egg-info/requires.txt +5 -1
  27. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/setup.py +13 -6
  28. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/conftest.py +1 -1
  29. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/models/MolWeightModel.py +3 -6
  30. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/models/MolWeightModelWithExplicitMolIds.py +2 -1
  31. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/models/MolWeightModelWithExplicitMols.py +3 -5
  32. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/steps/__init__.py +0 -1
  33. nerdd-module-0.2.4/nerdd_module/config/default_configuration.py +0 -17
  34. nerdd-module-0.2.4/nerdd_module/io/reader_registry.py +0 -30
  35. nerdd-module-0.2.4/nerdd_module/problem.py +0 -8
  36. nerdd-module-0.2.4/tests/steps/molecules.py +0 -54
  37. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/LICENSE +0 -0
  38. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/README.md +0 -0
  39. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/__init__.py +0 -0
  40. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/config/__init__.py +0 -0
  41. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/config/auto_configuration.py +0 -0
  42. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/config/configuration.py +0 -0
  43. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/config/dict_configuration.py +0 -0
  44. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/config/package_configuration.py +0 -0
  45. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/config/yaml_configuration.py +0 -0
  46. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/depth_first_explorer.py +0 -0
  47. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/explorer.py +0 -0
  48. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/gzip_reader.py +0 -0
  49. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/inchi_reader.py +0 -0
  50. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/list_reader.py +0 -0
  51. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/mol_reader.py +0 -0
  52. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/reader.py +0 -0
  53. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/sdf_reader.py +0 -0
  54. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/smiles_reader.py +0 -0
  55. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/string_reader.py +0 -0
  56. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/tar_reader.py +0 -0
  57. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/input}/zip_reader.py +0 -0
  58. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/output}/sdf_writer.py +0 -0
  59. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/output}/writer.py +0 -0
  60. {nerdd-module-0.2.4/nerdd_module/io → nerdd_module-0.2.6/nerdd_module/output}/writer_registry.py +0 -0
  61. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/polyfills/files.py +0 -0
  62. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/polyfills/get_entry_points.py +0 -0
  63. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/preprocessing/empty_pipeline.py +0 -0
  64. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/preprocessing/pipeline.py +0 -0
  65. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/preprocessing/registry.py +0 -0
  66. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/preprocessing/remove_stereochemistry.py +0 -0
  67. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/tests/__init__.py +0 -0
  68. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module/tests/predictions.py +0 -0
  69. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module.egg-info/dependency_links.txt +0 -0
  70. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/nerdd_module.egg-info/top_level.txt +0 -0
  71. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/setup.cfg +0 -0
  72. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/__init__.py +0 -0
  73. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/models/AtomicMassModel.py +0 -0
  74. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/models/__init__.py +0 -0
  75. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/steps/checks.py +0 -0
  76. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/steps/predictors.py +0 -0
  77. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/steps/preprocessing.py +0 -0
  78. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/test_atom_property_prediction.py +0 -0
  79. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/test_molecule_property_prediction.py +0 -0
  80. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/test_preprocessing.py +0 -0
  81. {nerdd-module-0.2.4 → nerdd_module-0.2.6}/tests/test_reading_formats.py +0 -0
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nerdd-module
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Summary: Base package to create NERDD modules
5
- Home-page: https://github.com/molinfo-vienna/nerdd-module.git
5
+ Home-page: https://github.com/molinfo-vienna/nerdd-module
6
6
  Maintainer: Steffen Hirte
7
7
  Maintainer-email: steffen.hirte@univie.ac.at
8
8
  License: BSD 3-Clause License
@@ -33,8 +33,11 @@ Requires-Dist: stringcase>=1.2.0
33
33
  Requires-Dist: decorator>=5.1.1
34
34
  Requires-Dist: importlib-resources>=5; python_version < "3.10"
35
35
  Requires-Dist: importlib-metadata>=4.6; python_version < "3.10"
36
- Requires-Dist: chembl_structure_pipeline>=1.0.0
37
36
  Provides-Extra: dev
37
+ Requires-Dist: black; extra == "dev"
38
+ Requires-Dist: isort; extra == "dev"
39
+ Provides-Extra: csp
40
+ Requires-Dist: chembl_structure_pipeline>=1.0.0; extra == "csp"
38
41
  Provides-Extra: test
39
42
  Requires-Dist: pytest; extra == "test"
40
43
  Requires-Dist: pytest-sugar; extra == "test"
@@ -1,13 +1,13 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
2
+ from typing import Callable, Iterable, List, Tuple, Union
3
3
 
4
4
  import pandas as pd
5
- from rdkit.Chem import Mol, MolToSmiles
5
+ from rdkit.Chem import Mol
6
6
 
7
7
  from .config import AutoConfiguration, Configuration
8
- from .io import DepthFirstExplorer, MoleculeEntry
8
+ from .input import DepthFirstExplorer, MoleculeEntry
9
9
  from .preprocessing import Pipeline, Step, registry
10
- from .problem import Problem
10
+ from .problem import Problem, UnknownProblem
11
11
 
12
12
  __all__ = ["AbstractModel"]
13
13
 
@@ -156,17 +156,20 @@ class AbstractModel(ABC):
156
156
  # (and we assume that the order of the molecules is the same)
157
157
  if "mol_id" in df_predictions.columns:
158
158
  # check that mol_id contains only valid ids
159
- assert set(df_predictions.mol_id).issubset(
160
- set(df_valid_subset.mol_id)
161
- ), "The mol_id column must only contain valid ids!"
159
+ assert set(df_predictions.mol_id).issubset(set(df_valid_subset.mol_id)), (
160
+ f"The mol_id column contains invalid ids: "
161
+ f"{set(df_predictions.mol_id).difference(set(df_valid_subset.mol_id))}."
162
+ )
163
+
162
164
  # use mol_id as index
163
165
  df_predictions.set_index("mol_id", drop=True, inplace=True)
164
166
  elif "mol" in df_predictions.columns:
165
167
  # check that molecule names contain only valid ids
166
168
  names = df_predictions.mol.apply(lambda mol: int(mol.GetProp("_Name")))
167
- assert set(names).issubset(
168
- set(df_preprocess.mol_id)
169
- ), "The molecule names must only contain valid ids!"
169
+ assert set(names).issubset(set(df_preprocess.mol_id)), (
170
+ f"The mol_id column contains invalid ids: "
171
+ f"{set(df_predictions.mol_id).difference(set(df_valid_subset.mol_id))}."
172
+ )
170
173
 
171
174
  # use mol_id as index
172
175
  df_predictions.set_index(
@@ -184,6 +187,8 @@ class AbstractModel(ABC):
184
187
  df_valid_subset.index.astype("int64"), inplace=True
185
188
  )
186
189
 
190
+ # TODO: check derivative_id or atom_id
191
+
187
192
  # add column that indicates whether a molecule was missing
188
193
  missing_mol_ids = set(df_preprocess.mol_id).difference(df_predictions.index)
189
194
  df_preprocess["missing"] = df_preprocess.mol_id.isin(missing_mol_ids)
@@ -212,7 +217,7 @@ class AbstractModel(ABC):
212
217
  else:
213
218
  df_result["errors"] = df_result.preprocessing_errors
214
219
  df_result["errors"] = df_result.errors + df_result.missing.map(
215
- lambda x: ["!1"] if x else []
220
+ lambda x: [UnknownProblem()] if x else []
216
221
  )
217
222
  df_result.drop(columns=["missing", "preprocessing_errors"], inplace=True)
218
223
 
@@ -4,7 +4,7 @@ import sys
4
4
 
5
5
  import rich_click as click
6
6
  from decorator import decorator
7
- from nerdd_module.io import WriterRegistry
7
+ from nerdd_module.output import WriterRegistry
8
8
  from stringcase import spinalcase
9
9
 
10
10
  __all__ = ["auto_cli"]
@@ -0,0 +1,41 @@
1
+ from stringcase import snakecase
2
+
3
+ from ..polyfills import version
4
+ from .configuration import Configuration
5
+
6
+ __all__ = ["DefaultConfiguration"]
7
+
8
+
9
+ class DefaultConfiguration(Configuration):
10
+ def __init__(self, nerdd_module):
11
+ super().__init__()
12
+
13
+ # generate a name from the module name
14
+ class_name = nerdd_module.__class__.__name__
15
+ if class_name.endswith("Model"):
16
+ # remove the "Model" suffix
17
+ # e.g. SkinDoctorModel -> SkinDoctor
18
+ class_name = class_name[: -len("Model")]
19
+
20
+ # convert the class name to snake case
21
+ # e.g. SkinDoctor -> skin_doctor
22
+ name = snakecase(class_name)
23
+
24
+ # append version to the configuration
25
+ try:
26
+ module = nerdd_module.__module__
27
+ root_module = module.split(".", 1)[0]
28
+ version_ = version(root_module)
29
+ except ModuleNotFoundError:
30
+ pass
31
+
32
+ self.config = dict(
33
+ name=name,
34
+ version=version_,
35
+ task="molecular_property_prediction",
36
+ job_parameters=[],
37
+ result_properties=[],
38
+ )
39
+
40
+ def _get_dict(self):
41
+ return self.config
@@ -9,6 +9,8 @@ class MergedConfiguration(Configuration):
9
9
 
10
10
  self.config = dict()
11
11
 
12
+ # merge all configurations starting from the first one
13
+ # --> last configuration has the highest priority
12
14
  for c in configs:
13
15
  self.config.update(c._get_dict())
14
16
 
@@ -1,4 +1,3 @@
1
- from .csv_writer import *
2
1
  from .depth_first_explorer import *
3
2
  from .file_reader import *
4
3
  from .gzip_reader import *
@@ -8,10 +7,7 @@ from .mol_reader import *
8
7
  from .reader import *
9
8
  from .reader_registry import *
10
9
  from .sdf_reader import *
11
- from .sdf_writer import *
12
10
  from .smiles_reader import *
13
11
  from .string_reader import *
14
12
  from .tar_reader import *
15
- from .writer import *
16
- from .writer_registry import *
17
13
  from .zip_reader import *
@@ -1,6 +1,5 @@
1
- import os
2
1
  from pathlib import Path
3
- from typing import Generator
2
+ from typing import Generator, Tuple
4
3
 
5
4
  from .reader import MoleculeEntry, Reader
6
5
  from .reader_registry import register_reader
@@ -8,30 +7,42 @@ from .reader_registry import register_reader
8
7
  __all__ = ["FileReader"]
9
8
 
10
9
 
11
- @register_reader
10
+ @register_reader("data_dir")
12
11
  class FileReader(Reader):
13
- def __init__(self, data_dir="."):
12
+ def __init__(self, data_dir=None):
14
13
  super().__init__()
15
- self.data_dir = Path(data_dir)
14
+ self.data_dir = data_dir
15
+ if self.data_dir is not None:
16
+ self.data_dir = Path(self.data_dir)
16
17
 
17
18
  def read(self, filename, explore) -> Generator[MoleculeEntry, None, None]:
18
19
  assert isinstance(filename, str), "input must be a string"
19
20
 
21
+ # convert filename to path
20
22
  try:
21
23
  path = Path(filename)
22
-
23
- if not path.is_absolute():
24
- path = self.data_dir / path
25
24
  except:
26
25
  raise ValueError("input must be a valid path")
27
26
 
28
- assert self.data_dir in path.parents, "input must be a relative path"
27
+ # convert to absolute path
28
+ if not path.is_absolute():
29
+ if self.data_dir is not None:
30
+ path = self.data_dir / path
31
+ else:
32
+ path = Path(".") / path
33
+
34
+ # check that the file is within the data_dir
35
+ assert (
36
+ self.data_dir is None or self.data_dir in path.parents
37
+ ), "input must be a relative path"
38
+
39
+ # check that the file exists
29
40
  assert path.exists(), "input must be a valid file"
30
41
 
31
42
  with open(path, "rb") as f:
32
43
  for entry in explore(f):
33
44
  if len(entry.source) == 1 and entry.source[0] == "raw_input":
34
- source = tuple()
45
+ source: Tuple[str, ...] = tuple()
35
46
  else:
36
47
  source = entry.source
37
48
  yield entry._replace(source=tuple([filename, *source]))
@@ -0,0 +1,64 @@
1
+ from functools import lru_cache
2
+ from typing import Dict, Generator, List, Tuple, Type
3
+
4
+ from .reader import Reader
5
+
6
+ __all__ = ["ReaderRegistry", "register_reader"]
7
+
8
+
9
+ # lru_cache makes the registry a singleton
10
+ @lru_cache(maxsize=1)
11
+ class ReaderRegistry:
12
+ def __init__(self):
13
+ self._factories: List[Tuple[Type[Reader], Tuple[str, ...], Dict[str, str]]] = []
14
+ self._config = {}
15
+
16
+ def _create_reader(self, ReaderClass: Type[Reader], *args, **kwargs) -> Reader:
17
+ # translate all args
18
+ args = tuple(self._config.get(arg, None) for arg in args)
19
+ # translate all kwargs
20
+ kwargs = {
21
+ k: self._config.get(v, None) for k, v in kwargs.items() if v in self._config
22
+ }
23
+
24
+ return ReaderClass(*args, **kwargs)
25
+
26
+ def register(self, ReaderClass: Type[Reader], *args: str, **kwargs: str):
27
+ assert issubclass(ReaderClass, Reader)
28
+ assert all([isinstance(arg, str) for arg in args])
29
+ assert all(
30
+ [isinstance(k, str) and isinstance(v, str) for k, v in kwargs.items()]
31
+ )
32
+ self._factories.append((ReaderClass, args, kwargs))
33
+
34
+ def readers(self) -> Generator[Reader, None, None]:
35
+ for reader, args, kwargs in self._factories:
36
+ yield self._create_reader(reader, *args, **kwargs)
37
+
38
+ def __iter__(self):
39
+ return iter(self.readers())
40
+
41
+
42
+ def register_reader(*args, **kwargs):
43
+ def wrapper(cls, *args, **kwargs):
44
+ ReaderRegistry().register(cls, *args, **kwargs)
45
+ return cls
46
+
47
+ # Case 1: first argument is a class
48
+ # --> decorator is used without arguments
49
+ # @register_reader
50
+ # class F:
51
+ # ...
52
+ if len(args) > 0 and isinstance(args[0], type):
53
+ return wrapper(args[0], *args[1:], **kwargs)
54
+
55
+ # Case 2: first argument is not a class
56
+ # --> decorator is used with arguments
57
+ # @register_reader("blah")
58
+ # class F:
59
+ # ...
60
+ def inner(cls):
61
+ assert isinstance(cls, type), "Decorator must be used with a class"
62
+ return wrapper(cls, *args, **kwargs)
63
+
64
+ return inner
@@ -0,0 +1 @@
1
+ from .writer_registry import *
@@ -26,5 +26,5 @@ class CsvWriter(Writer):
26
26
  for entry in chain([first_entry], entry_iter):
27
27
  for key, value in entry.items():
28
28
  if isinstance(value, Mol):
29
- entry[key] = MolToSmiles(value)
29
+ entry[key] = MolToSmiles(value, canonical=False)
30
30
  writer.writerow(entry)
@@ -1,2 +1,3 @@
1
1
  from .files import *
2
2
  from .get_entry_points import *
3
+ from .version import *
@@ -1,10 +1,8 @@
1
1
  import sys
2
2
 
3
+ __all__ = ["version"]
4
+
3
5
  if sys.version_info < (3, 10):
4
6
  from importlib_metadata import version
5
7
  else:
6
8
  from importlib.metadata import version
7
-
8
- __all__ = ["__version__"]
9
-
10
- __version__ = version(__package__)
@@ -4,6 +4,7 @@ from .empty_pipeline import *
4
4
  from .filter_by_element import *
5
5
  from .filter_by_weight import *
6
6
  from .pipeline import *
7
- from ..problem import *
8
7
  from .registry import *
8
+ from .remove_stereochemistry import *
9
+ from .sanitize import *
9
10
  from .step import *
@@ -1,8 +1,8 @@
1
- from typing import List, Tuple
1
+ from typing import List, Optional, Tuple
2
2
 
3
3
  from rdkit.Chem import Mol, MolFromSmiles, MolToSmiles
4
4
 
5
- from ..problem import Problem
5
+ from ..problem import InvalidSmiles, Problem
6
6
  from .step import Step
7
7
 
8
8
  __all__ = ["CheckValidSmiles"]
@@ -14,15 +14,13 @@ class CheckValidSmiles(Step):
14
14
  def __init__(self):
15
15
  super().__init__()
16
16
 
17
- def _run(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
17
+ def _run(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
18
18
  errors = []
19
19
 
20
20
  smi = MolToSmiles(mol, True)
21
21
  check_mol = MolFromSmiles(smi)
22
22
  if check_mol is None:
23
- errors.append(
24
- Problem("invalid_smiles", "Cannot convert molecule to SMILES")
25
- )
23
+ errors.append(InvalidSmiles())
26
24
  mol = None
27
25
 
28
26
  return mol, errors
@@ -1,5 +1,5 @@
1
1
  import warnings
2
- from typing import List, Tuple
2
+ from typing import List, Optional, Tuple
3
3
 
4
4
  from rdkit.Chem import Mol
5
5
  from rdkit.rdBase import BlockLogs
@@ -41,7 +41,7 @@ class StandardizeWithCsp(Step):
41
41
  if import_error is not None:
42
42
  raise import_error
43
43
 
44
- def _run(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
44
+ def _run(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
45
45
  errors = []
46
46
 
47
47
  # chembl structure pipeline cannot handle molecules with 3D coordinates
@@ -65,7 +65,7 @@ class GetParentMol(Step):
65
65
  if import_error is not None:
66
66
  raise import_error
67
67
 
68
- def _run(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
68
+ def _run(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
69
69
  errors = []
70
70
 
71
71
  # chembl structure pipeline cannot handle molecules with 3D coordinates
@@ -1,4 +1,4 @@
1
- from typing import Iterable, List, Tuple
1
+ from typing import Iterable, List, Optional, Tuple
2
2
 
3
3
  from rdkit.Chem import Mol
4
4
 
@@ -14,7 +14,7 @@ class FilterByElement(Step):
14
14
  self.allowed_elements = set(allowed_elements)
15
15
  self.remove_invalid_molecules = remove_invalid_molecules
16
16
 
17
- def _run(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
17
+ def _run(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
18
18
  errors = []
19
19
  result_mol = mol
20
20
 
@@ -1,4 +1,4 @@
1
- from typing import List, Tuple
1
+ from typing import List, Optional, Tuple
2
2
 
3
3
  from rdkit.Chem import Mol
4
4
  from rdkit.Chem.Descriptors import MolWt
@@ -14,7 +14,7 @@ class FilterByWeight(Step):
14
14
  self.max_weight = max_weight
15
15
  self.remove_invalid_molecules = remove_invalid_molecules
16
16
 
17
- def _run(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
17
+ def _run(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
18
18
  errors = []
19
19
 
20
20
  weight = MolWt(mol)
@@ -25,9 +25,11 @@ class FilterByWeight(Step):
25
25
  result_mol = mol
26
26
  errors.append(
27
27
  Problem(
28
- "invalid_weight",
29
- f"Molecular weight {weight:.2f} out of range "
30
- f"[{self.min_weight}, {self.max_weight}]",
28
+ type="invalid_weight",
29
+ message=(
30
+ f"Molecular weight {weight:.2f} out of range "
31
+ f"[{self.min_weight}, {self.max_weight}]"
32
+ ),
31
33
  )
32
34
  )
33
35
  else:
@@ -0,0 +1,18 @@
1
+ from rdkit.Chem import SanitizeMol
2
+
3
+ from .step import Step
4
+
5
+ __all__ = ["Sanitize"]
6
+
7
+
8
+ class Sanitize(Step):
9
+ def __init__(self):
10
+ super().__init__()
11
+
12
+ def _run(self, mol):
13
+ errors = []
14
+
15
+ # sanitize molecule
16
+ SanitizeMol(mol)
17
+
18
+ return mol, errors
@@ -1,5 +1,5 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import List, Tuple
2
+ from typing import List, Optional, Tuple
3
3
 
4
4
  from rdkit.Chem import Mol
5
5
 
@@ -12,14 +12,14 @@ class Step(ABC):
12
12
  def __init__(self):
13
13
  pass
14
14
 
15
- def run(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
15
+ def run(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
16
16
  """
17
17
  Runs the step on a molecule.
18
18
  """
19
19
  return self._run(mol)
20
20
 
21
21
  @abstractmethod
22
- def _run(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
22
+ def _run(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
23
23
  """
24
24
  Runs the step on a molecule.
25
25
  """
@@ -0,0 +1,13 @@
1
+ from typing import NamedTuple
2
+
3
+ __all__ = ["Problem", "InvalidSmiles", "UnknownProblem"]
4
+
5
+
6
+ class Problem(NamedTuple):
7
+ type: str
8
+ message: str
9
+
10
+
11
+ InvalidSmiles = lambda: Problem(type="invalid_smiles", message="Invalid SMILES string")
12
+
13
+ UnknownProblem = lambda: Problem(type="unknown", message="Unknown error occurred")
@@ -1,4 +1,5 @@
1
1
  import json
2
+ from ast import literal_eval
2
3
 
3
4
  import numpy as np
4
5
  import pandas as pd
@@ -39,14 +40,17 @@ def check_column_range(subset, column_name, low, high):
39
40
 
40
41
 
41
42
  @then(parsers.parse("the value in column '{column_name}' should be '{expected_value}'"))
42
- def check_column_value(predictions, column_name, expected_value):
43
- value = predictions[column_name].iloc[0]
43
+ def check_column_value(subset, column_name, expected_value):
44
+ if len(subset) == 0:
45
+ return
46
+
47
+ value = subset[column_name].iloc[0]
44
48
 
45
49
  # expected value is always provided as string
46
50
  # try to parse it as a Python literal (e.g. a number) if possible
47
51
  try:
48
- expected_value = float(expected_value)
49
- except ValueError:
52
+ expected_value = literal_eval(expected_value)
53
+ except:
50
54
  pass
51
55
 
52
56
  if expected_value == "(none)":
@@ -132,3 +136,49 @@ def check_column_length(subset, column_name, length):
132
136
  assert (
133
137
  subset[column_name].map(lambda x: len(x) > length)
134
138
  ).all(), f"Column {column_name} has unexpected length"
139
+
140
+
141
+ @then(
142
+ parsers.parse(
143
+ "when '{condition_column_name}' is '{condition_value}' "
144
+ "the value in column '{column_name}' should be '{expected_value}'"
145
+ )
146
+ )
147
+ def check_conditional_column_value(
148
+ subset, condition_column_name, condition_value, column_name, expected_value
149
+ ):
150
+ # expected value is always provided as string
151
+ # try to convert to float if possible
152
+ try:
153
+ expected_value = literal_eval(expected_value)
154
+ except:
155
+ pass
156
+
157
+ # same for condition value
158
+ try:
159
+ condition_value = literal_eval(condition_value)
160
+ except:
161
+ pass
162
+
163
+ # condition value can be (none) to indicate None
164
+ if condition_value == "(none)":
165
+ subset = subset[pd.isnull(subset[condition_column_name])]
166
+ else:
167
+ subset = subset[subset[condition_column_name] == condition_value]
168
+
169
+ value = subset[column_name]
170
+ assert (
171
+ len(value) > 0
172
+ ), f"No rows found for condition {condition_column_name} == {condition_value}"
173
+
174
+ # expected value can be (none) to indicate None
175
+ if expected_value == "(none)":
176
+ # if expected_value is the magic string "(none)", we expect None
177
+ assert pd.isnull(
178
+ value
179
+ ).all(), f"Column {column_name} is assigned to {value} != None"
180
+ else:
181
+ # otherwise, we expect the value to be equal to the expected value
182
+ assert (
183
+ value == expected_value
184
+ ).all(), f"Column {column_name} is assigned to {value} != {expected_value}"
@@ -45,7 +45,7 @@ def representations_from_molecules(molecules, input_type):
45
45
  ),
46
46
  target_fixture="molecules",
47
47
  )
48
- def molecules(num, num_none, random_seed):
48
+ def molecules(num, num_none, random_seed=0):
49
49
  result = None
50
50
 
51
51
  # pytest-bdd and hypothesis don't play well together (yet)
@@ -0,0 +1,5 @@
1
+ from .polyfills import version
2
+
3
+ __all__ = ["__version__"]
4
+
5
+ __version__ = version(__package__)
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nerdd-module
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Summary: Base package to create NERDD modules
5
- Home-page: https://github.com/molinfo-vienna/nerdd-module.git
5
+ Home-page: https://github.com/molinfo-vienna/nerdd-module
6
6
  Maintainer: Steffen Hirte
7
7
  Maintainer-email: steffen.hirte@univie.ac.at
8
8
  License: BSD 3-Clause License
@@ -33,8 +33,11 @@ Requires-Dist: stringcase>=1.2.0
33
33
  Requires-Dist: decorator>=5.1.1
34
34
  Requires-Dist: importlib-resources>=5; python_version < "3.10"
35
35
  Requires-Dist: importlib-metadata>=4.6; python_version < "3.10"
36
- Requires-Dist: chembl_structure_pipeline>=1.0.0
37
36
  Provides-Extra: dev
37
+ Requires-Dist: black; extra == "dev"
38
+ Requires-Dist: isort; extra == "dev"
39
+ Provides-Extra: csp
40
+ Requires-Dist: chembl_structure_pipeline>=1.0.0; extra == "csp"
38
41
  Provides-Extra: test
39
42
  Requires-Dist: pytest; extra == "test"
40
43
  Requires-Dist: pytest-sugar; extra == "test"
@@ -19,28 +19,30 @@ nerdd_module/config/dict_configuration.py
19
19
  nerdd_module/config/merged_configuration.py
20
20
  nerdd_module/config/package_configuration.py
21
21
  nerdd_module/config/yaml_configuration.py
22
- nerdd_module/io/__init__.py
23
- nerdd_module/io/csv_writer.py
24
- nerdd_module/io/depth_first_explorer.py
25
- nerdd_module/io/explorer.py
26
- nerdd_module/io/file_reader.py
27
- nerdd_module/io/gzip_reader.py
28
- nerdd_module/io/inchi_reader.py
29
- nerdd_module/io/list_reader.py
30
- nerdd_module/io/mol_reader.py
31
- nerdd_module/io/reader.py
32
- nerdd_module/io/reader_registry.py
33
- nerdd_module/io/sdf_reader.py
34
- nerdd_module/io/sdf_writer.py
35
- nerdd_module/io/smiles_reader.py
36
- nerdd_module/io/string_reader.py
37
- nerdd_module/io/tar_reader.py
38
- nerdd_module/io/writer.py
39
- nerdd_module/io/writer_registry.py
40
- nerdd_module/io/zip_reader.py
22
+ nerdd_module/input/__init__.py
23
+ nerdd_module/input/depth_first_explorer.py
24
+ nerdd_module/input/explorer.py
25
+ nerdd_module/input/file_reader.py
26
+ nerdd_module/input/gzip_reader.py
27
+ nerdd_module/input/inchi_reader.py
28
+ nerdd_module/input/list_reader.py
29
+ nerdd_module/input/mol_reader.py
30
+ nerdd_module/input/reader.py
31
+ nerdd_module/input/reader_registry.py
32
+ nerdd_module/input/sdf_reader.py
33
+ nerdd_module/input/smiles_reader.py
34
+ nerdd_module/input/string_reader.py
35
+ nerdd_module/input/tar_reader.py
36
+ nerdd_module/input/zip_reader.py
37
+ nerdd_module/output/__init__.py
38
+ nerdd_module/output/csv_writer.py
39
+ nerdd_module/output/sdf_writer.py
40
+ nerdd_module/output/writer.py
41
+ nerdd_module/output/writer_registry.py
41
42
  nerdd_module/polyfills/__init__.py
42
43
  nerdd_module/polyfills/files.py
43
44
  nerdd_module/polyfills/get_entry_points.py
45
+ nerdd_module/polyfills/version.py
44
46
  nerdd_module/preprocessing/__init__.py
45
47
  nerdd_module/preprocessing/check_valid_smiles.py
46
48
  nerdd_module/preprocessing/chembl_structure_pipeline.py
@@ -50,6 +52,7 @@ nerdd_module/preprocessing/filter_by_weight.py
50
52
  nerdd_module/preprocessing/pipeline.py
51
53
  nerdd_module/preprocessing/registry.py
52
54
  nerdd_module/preprocessing/remove_stereochemistry.py
55
+ nerdd_module/preprocessing/sanitize.py
53
56
  nerdd_module/preprocessing/step.py
54
57
  nerdd_module/tests/__init__.py
55
58
  nerdd_module/tests/checks.py
@@ -68,6 +71,5 @@ tests/models/MolWeightModelWithExplicitMols.py
68
71
  tests/models/__init__.py
69
72
  tests/steps/__init__.py
70
73
  tests/steps/checks.py
71
- tests/steps/molecules.py
72
74
  tests/steps/predictors.py
73
75
  tests/steps/preprocessing.py
@@ -5,13 +5,17 @@ filetype~=1.2.0
5
5
  rich-click>=1.7.1
6
6
  stringcase>=1.2.0
7
7
  decorator>=5.1.1
8
- chembl_structure_pipeline>=1.0.0
9
8
 
10
9
  [:python_version < "3.10"]
11
10
  importlib-resources>=5
12
11
  importlib-metadata>=4.6
13
12
 
13
+ [csp]
14
+ chembl_structure_pipeline>=1.0.0
15
+
14
16
  [dev]
17
+ black
18
+ isort
15
19
 
16
20
  [docs]
17
21
  mkdocs
@@ -16,11 +16,11 @@ rdkit_requirement = ["rdkit>=2022.3.3"] if not rdkit_installed else []
16
16
 
17
17
  setup(
18
18
  name="nerdd-module",
19
- version="0.2.4",
19
+ version="0.2.6",
20
20
  maintainer="Steffen Hirte",
21
21
  maintainer_email="steffen.hirte@univie.ac.at",
22
22
  packages=find_packages(),
23
- url="https://github.com/molinfo-vienna/nerdd-module.git",
23
+ url="https://github.com/molinfo-vienna/nerdd-module",
24
24
  description="Base package to create NERDD modules",
25
25
  license="BSD 3-Clause License",
26
26
  long_description=open("README.md").read(),
@@ -36,12 +36,19 @@ setup(
36
36
  # install importlib-resources and importlib-metadata for old Python versions
37
37
  "importlib-resources>=5; python_version<'3.10'",
38
38
  "importlib-metadata>=4.6; python_version<'3.10'",
39
- # note: version 1.0.0 of chembl_structure_pipeline is not available on pypi,
40
- # but it could potentially be installed from github
41
- "chembl_structure_pipeline>=1.0.0",
42
39
  ],
43
40
  extras_require={
44
- "dev": [],
41
+ "dev": [
42
+ "black",
43
+ "isort",
44
+ ],
45
+ "csp": [
46
+ # note: version 1.0.0 of chembl_structure_pipeline is not available on pypi
47
+ # BUT: maybe it was already installed in the current environment manually
48
+ # other note: chembl_structure_pipeline *always* installs a recent version
49
+ # of rdkit
50
+ "chembl_structure_pipeline>=1.0.0"
51
+ ],
45
52
  "test": [
46
53
  "pytest",
47
54
  "pytest-sugar",
@@ -4,4 +4,4 @@
4
4
  # from .steps import *
5
5
  #
6
6
  # instead, we use pytest_plugins to make this work
7
- pytest_plugins = ["tests.steps"]
7
+ pytest_plugins = ["tests.steps", "nerdd_module.tests"]
@@ -1,20 +1,17 @@
1
1
  import pandas as pd
2
2
  from nerdd_module import AbstractModel
3
+ from nerdd_module.preprocessing import Sanitize
3
4
  from rdkit.Chem.Descriptors import MolWt
4
5
 
5
6
  __all__ = ["MolWeightModel"]
6
7
 
7
8
 
8
9
  class MolWeightModel(AbstractModel):
9
- def __init__(self, preprocessing_pipeline="chembl_structure_pipeline", **kwargs):
10
+ def __init__(self, preprocessing_pipeline=[Sanitize()], **kwargs):
10
11
  super().__init__(preprocessing_pipeline, **kwargs)
11
12
 
12
13
  def _predict_mols(self, mols, multiplier):
13
- return pd.DataFrame(
14
- {
15
- "weight": [MolWt(m) * multiplier for m in mols],
16
- }
17
- )
14
+ return pd.DataFrame({"weight": [MolWt(m) * multiplier for m in mols]})
18
15
 
19
16
  def _get_config(self):
20
17
  return {
@@ -1,12 +1,13 @@
1
1
  import pandas as pd
2
2
  from nerdd_module import AbstractModel
3
+ from nerdd_module.preprocessing import Sanitize
3
4
  from rdkit.Chem.Descriptors import MolWt
4
5
 
5
6
  __all__ = ["MolWeightModelWithExplicitMolIds"]
6
7
 
7
8
 
8
9
  class MolWeightModelWithExplicitMolIds(AbstractModel):
9
- def __init__(self, preprocessing_pipeline="chembl_structure_pipeline", **kwargs):
10
+ def __init__(self, preprocessing_pipeline=[Sanitize()], **kwargs):
10
11
  super().__init__(preprocessing_pipeline, **kwargs)
11
12
 
12
13
  def _predict_mols(self, mols, multiplier):
@@ -1,20 +1,18 @@
1
1
  import pandas as pd
2
2
  from nerdd_module import AbstractModel
3
+ from nerdd_module.preprocessing import Sanitize
3
4
  from rdkit.Chem.Descriptors import MolWt
4
5
 
5
6
  __all__ = ["MolWeightModelWithExplicitMols"]
6
7
 
7
8
 
8
9
  class MolWeightModelWithExplicitMols(AbstractModel):
9
- def __init__(self, preprocessing_pipeline="chembl_structure_pipeline", **kwargs):
10
+ def __init__(self, preprocessing_pipeline=[Sanitize()], **kwargs):
10
11
  super().__init__(preprocessing_pipeline, **kwargs)
11
12
 
12
13
  def _predict_mols(self, mols, multiplier):
13
14
  return pd.DataFrame(
14
- {
15
- "mol": mols,
16
- "weight": [MolWt(m) * multiplier for m in mols],
17
- }
15
+ {"mol": mols, "weight": [MolWt(m) * multiplier for m in mols]}
18
16
  )
19
17
 
20
18
  def _get_config(self):
@@ -1,4 +1,3 @@
1
1
  from .checks import *
2
- from .molecules import *
3
2
  from .predictors import *
4
3
  from .preprocessing import *
@@ -1,17 +0,0 @@
1
- from .configuration import Configuration
2
-
3
- __all__ = ["DefaultConfiguration"]
4
-
5
-
6
- class DefaultConfiguration(Configuration):
7
- def __init__(self, nerdd_module):
8
- super().__init__()
9
-
10
- self.config = dict(
11
- task="molecular_property_prediction",
12
- job_parameters=[],
13
- result_properties=[],
14
- )
15
-
16
- def _get_dict(self):
17
- return self.config
@@ -1,30 +0,0 @@
1
- from functools import lru_cache
2
- from typing import Generator, Type
3
-
4
- from .reader import Reader
5
-
6
- __all__ = ["ReaderRegistry", "register_reader"]
7
-
8
-
9
- # lru_cache makes the registry a singleton
10
- @lru_cache(maxsize=1)
11
- class ReaderRegistry:
12
- def __init__(self):
13
- self._factories = []
14
-
15
- def register(self, ReaderClass: Type[Reader], *args, **kwargs):
16
- assert issubclass(ReaderClass, Reader)
17
- self._factories.append(lambda: ReaderClass(*args, **kwargs))
18
-
19
- def readers(self) -> Generator[Reader, None, None]:
20
- for reader in self._factories:
21
- yield reader()
22
-
23
- def __iter__(self):
24
- return iter(map(lambda f: f(), self._factories))
25
-
26
-
27
- def register_reader(clazz, *args, **kwargs):
28
- # TODO: implement both decorator modes
29
- ReaderRegistry().register(clazz, *args, **kwargs)
30
- return clazz
@@ -1,8 +0,0 @@
1
- from typing import NamedTuple
2
-
3
- __all__ = ["Problem"]
4
-
5
-
6
- class Problem(NamedTuple):
7
- type: str
8
- message: str
@@ -1,54 +0,0 @@
1
- import numpy as np
2
- from hypothesis import given as hgiven
3
- from hypothesis import settings
4
- from hypothesis import strategies as st
5
- from hypothesis_rdkit import mols
6
- from pytest_bdd import given, parsers
7
- from rdkit.Chem import MolToInchi, MolToMolBlock, MolToSmiles
8
-
9
-
10
- @given(
11
- parsers.parse(
12
- "a list of {num:d} random molecules, where {num_none:d} entries are None"
13
- ),
14
- target_fixture="molecules",
15
- )
16
- def molecules(num, num_none):
17
- result = None
18
-
19
- @hgiven(st.lists(mols(), min_size=num, max_size=num, unique_by=MolToSmiles))
20
- @settings(max_examples=1, deadline=None)
21
- def generate(mols):
22
- nonlocal result
23
- # ensure that all molecules are valid
24
- result = mols
25
-
26
- generate()
27
-
28
- # replace random entries with None
29
- indices = np.random.choice(num, num_none, replace=False)
30
- for i in indices:
31
- result[i] = None
32
-
33
- return result
34
-
35
-
36
- @given(
37
- parsers.parse("the representations of the molecules as {input_type}"),
38
- target_fixture="representations",
39
- )
40
- def representations(molecules, input_type):
41
- if input_type == "smiles":
42
- converter = MolToSmiles
43
- elif input_type == "mol_block":
44
- converter = MolToMolBlock
45
- elif input_type == "inchi":
46
- converter = MolToInchi
47
- elif input_type == "rdkit_mol":
48
- converter = lambda mol: mol
49
- else:
50
- raise ValueError(f"Unknown input_type: {input_type}")
51
-
52
- result = [converter(mol) if mol is not None else None for mol in molecules]
53
-
54
- return result
File without changes
File without changes
File without changes