nerdd-module 0.1.12__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. nerdd-module-0.2.0/PKG-INFO +70 -0
  2. nerdd-module-0.2.0/README.md +18 -0
  3. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/__init__.py +4 -1
  4. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/abstract_model.py +9 -17
  5. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/cli.py +1 -1
  6. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/default_configuration.py +5 -3
  7. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/io/__init__.py +7 -7
  8. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/io/csv_writer.py +1 -2
  9. nerdd-module-0.2.0/nerdd_module/io/depth_first_explorer.py +111 -0
  10. nerdd-module-0.2.0/nerdd_module/io/explorer.py +13 -0
  11. nerdd-module-0.2.0/nerdd_module/io/file_reader.py +28 -0
  12. nerdd-module-0.2.0/nerdd_module/io/gzip_reader.py +30 -0
  13. nerdd-module-0.2.0/nerdd_module/io/inchi_reader.py +59 -0
  14. nerdd-module-0.2.0/nerdd_module/io/list_reader.py +24 -0
  15. nerdd-module-0.2.0/nerdd_module/io/mol_reader.py +25 -0
  16. nerdd-module-0.2.0/nerdd_module/io/reader.py +25 -0
  17. nerdd-module-0.2.0/nerdd_module/io/reader_registry.py +30 -0
  18. nerdd-module-0.2.0/nerdd_module/io/sdf_reader.py +81 -0
  19. nerdd-module-0.2.0/nerdd_module/io/smiles_reader.py +66 -0
  20. nerdd-module-0.2.0/nerdd_module/io/string_reader.py +22 -0
  21. nerdd-module-0.2.0/nerdd_module/io/tar_reader.py +29 -0
  22. nerdd-module-0.2.0/nerdd_module/io/zip_reader.py +31 -0
  23. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/__init__.py +1 -0
  24. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/check_valid_smiles.py +7 -6
  25. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/chembl_structure_pipeline.py +11 -16
  26. nerdd-module-0.2.0/nerdd_module/preprocessing/filter_by_element.py +39 -0
  27. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/filter_by_weight.py +9 -2
  28. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/pipeline.py +4 -3
  29. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/remove_stereochemistry.py +6 -3
  30. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/step.py +4 -2
  31. nerdd-module-0.2.0/nerdd_module/problem.py +8 -0
  32. nerdd-module-0.2.0/nerdd_module.egg-info/PKG-INFO +70 -0
  33. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module.egg-info/SOURCES.txt +8 -9
  34. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module.egg-info/requires.txt +5 -0
  35. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/setup.py +6 -1
  36. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/models/MolWeightModel.py +1 -1
  37. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/models/MolWeightModelWithExplicitMolIds.py +1 -1
  38. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/models/MolWeightModelWithExplicitMols.py +1 -1
  39. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/steps/checks.py +11 -3
  40. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/steps/molecules.py +6 -4
  41. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/test_molecule_property_prediction.py +0 -8
  42. nerdd-module-0.2.0/tests/test_reading_formats.py +137 -0
  43. nerdd-module-0.1.12/PKG-INFO +0 -90
  44. nerdd-module-0.1.12/README.md +0 -61
  45. nerdd-module-0.1.12/nerdd_module/io/elementary_inchi_reader.py +0 -38
  46. nerdd-module-0.1.12/nerdd_module/io/elementary_mol_block_reader.py +0 -40
  47. nerdd-module-0.1.12/nerdd_module/io/elementary_rdkit_mol_reader.py +0 -24
  48. nerdd-module-0.1.12/nerdd_module/io/elementary_reader.py +0 -30
  49. nerdd-module-0.1.12/nerdd_module/io/elementary_smiles_reader.py +0 -43
  50. nerdd-module-0.1.12/nerdd_module/io/file_reader.py +0 -20
  51. nerdd-module-0.1.12/nerdd_module/io/guess_and_read.py +0 -75
  52. nerdd-module-0.1.12/nerdd_module/io/guessing_reader.py +0 -55
  53. nerdd-module-0.1.12/nerdd_module/io/gzip_file_reader.py +0 -29
  54. nerdd-module-0.1.12/nerdd_module/io/inchi_reader.py +0 -30
  55. nerdd-module-0.1.12/nerdd_module/io/list_reader.py +0 -25
  56. nerdd-module-0.1.12/nerdd_module/io/reader.py +0 -31
  57. nerdd-module-0.1.12/nerdd_module/io/reader_registry.py +0 -44
  58. nerdd-module-0.1.12/nerdd_module/io/sdf_reader.py +0 -49
  59. nerdd-module-0.1.12/nerdd_module/io/smiles_reader.py +0 -31
  60. nerdd-module-0.1.12/nerdd_module/io/splitting_reader.py +0 -28
  61. nerdd-module-0.1.12/nerdd_module/preprocessing/filter_by_element.py +0 -29
  62. nerdd-module-0.1.12/nerdd_module.egg-info/PKG-INFO +0 -90
  63. nerdd-module-0.1.12/tests/test_reading_formats.py +0 -137
  64. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/LICENSE +0 -0
  65. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/__init__.py +0 -0
  66. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/auto_configuration.py +0 -0
  67. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/configuration.py +0 -0
  68. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/dict_configuration.py +0 -0
  69. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/merged_configuration.py +0 -0
  70. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/package_configuration.py +0 -0
  71. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/yaml_configuration.py +0 -0
  72. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/io/sdf_writer.py +0 -0
  73. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/io/writer.py +0 -0
  74. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/io/writer_registry.py +0 -0
  75. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/empty_pipeline.py +0 -0
  76. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/registry.py +0 -0
  77. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/version.py +0 -0
  78. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module.egg-info/dependency_links.txt +0 -0
  79. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module.egg-info/top_level.txt +0 -0
  80. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/setup.cfg +0 -0
  81. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/__init__.py +0 -0
  82. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/conftest.py +0 -0
  83. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/models/AtomicMassModel.py +0 -0
  84. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/models/__init__.py +0 -0
  85. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/steps/__init__.py +0 -0
  86. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/steps/predictors.py +0 -0
  87. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/steps/preprocessing.py +0 -0
  88. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/test_atom_property_prediction.py +0 -0
  89. {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/test_preprocessing.py +0 -0
@@ -0,0 +1,70 @@
1
+ Metadata-Version: 2.1
2
+ Name: nerdd-module
3
+ Version: 0.2.0
4
+ Summary: Base package to create NERDD modules
5
+ Home-page: https://github.com/molinfo-vienna/nerdd-module.git
6
+ Maintainer: Steffen Hirte
7
+ Maintainer-email: steffen.hirte@univie.ac.at
8
+ License: BSD 3-Clause License
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: BSD License
12
+ Classifier: Programming Language :: C
13
+ Classifier: Programming Language :: Python
14
+ Classifier: Topic :: Software Development
15
+ Classifier: Topic :: Scientific/Engineering
16
+ Classifier: Operating System :: Microsoft :: Windows
17
+ Classifier: Operating System :: POSIX
18
+ Classifier: Operating System :: Unix
19
+ Classifier: Operating System :: MacOS
20
+ Classifier: Programming Language :: Python :: 3
21
+ Classifier: Programming Language :: Python :: 3.9
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3.12
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: rdkit>=2022.3.3
28
+ Requires-Dist: pandas>=1.2.1
29
+ Requires-Dist: pyyaml>=6.0
30
+ Requires-Dist: filetype~=1.2.0
31
+ Requires-Dist: rich-click>=1.7.1
32
+ Requires-Dist: stringcase>=1.2.0
33
+ Requires-Dist: decorator>=5.1.1
34
+ Requires-Dist: importlib-resources>=5; python_version < "3.10"
35
+ Requires-Dist: importlib-metadata>=4.6; python_version < "3.10"
36
+ Requires-Dist: chembl_structure_pipeline>=1.0.0
37
+ Provides-Extra: dev
38
+ Provides-Extra: test
39
+ Requires-Dist: pytest; extra == "test"
40
+ Requires-Dist: pytest-sugar; extra == "test"
41
+ Requires-Dist: pytest-cov; extra == "test"
42
+ Requires-Dist: pytest-asyncio; extra == "test"
43
+ Requires-Dist: pytest-bdd; extra == "test"
44
+ Requires-Dist: pytest-mock; extra == "test"
45
+ Requires-Dist: pytest-watch; extra == "test"
46
+ Requires-Dist: hypothesis; extra == "test"
47
+ Requires-Dist: hypothesis-rdkit; extra == "test"
48
+ Provides-Extra: docs
49
+ Requires-Dist: mkdocs; extra == "docs"
50
+ Requires-Dist: mkdocs-material; extra == "docs"
51
+ Requires-Dist: mkdocstrings; extra == "docs"
52
+
53
+ # NERDD Module
54
+
55
+ This package provides the basis to implement molecular prediction modules in the
56
+ NERDD ecosystem.
57
+
58
+ ## Installation
59
+
60
+ ```bash
61
+ pip install -U nerdd-module
62
+ ```
63
+
64
+
65
+ ## Contribute
66
+
67
+ 1. Fork and clone the code
68
+ 2. Install test dependencies with ```pip install -e .[test]```
69
+ 3. Run tests via ```pytest``` or ```pytest-watch``` (short: ```ptw```)
70
+ 4. Build docs via ```pip install -e .[docs]``` and ```mkdocs serve```
@@ -0,0 +1,18 @@
1
+ # NERDD Module
2
+
3
+ This package provides the basis to implement molecular prediction modules in the
4
+ NERDD ecosystem.
5
+
6
+ ## Installation
7
+
8
+ ```bash
9
+ pip install -U nerdd-module
10
+ ```
11
+
12
+
13
+ ## Contribute
14
+
15
+ 1. Fork and clone the code
16
+ 2. Install test dependencies with ```pip install -e .[test]```
17
+ 3. Run tests via ```pytest``` or ```pytest-watch``` (short: ```ptw```)
18
+ 4. Build docs via ```pip install -e .[docs]``` and ```mkdocs serve```
@@ -1,6 +1,7 @@
1
1
  from .abstract_model import *
2
2
  from .cli import *
3
3
  from .config import *
4
+ from .problem import *
4
5
  from .version import *
5
6
 
6
7
  # import entry_points from importlib.metadata or fall back to pkg_resources
@@ -9,11 +10,13 @@ try:
9
10
 
10
11
  def get_entry_points(group):
11
12
  return entry_points().get(group, [])
13
+
12
14
  except ImportError:
13
15
  import pkg_resources
14
-
16
+
15
17
  def get_entry_points(group):
16
18
  return pkg_resources.iter_entry_points(group)
17
19
 
20
+
18
21
  for entry_point in get_entry_points("nerdd-module.plugins"):
19
22
  entry_point.load()
@@ -5,18 +5,19 @@ import pandas as pd
5
5
  from rdkit.Chem import Mol, MolToSmiles
6
6
 
7
7
  from .config import AutoConfiguration, Configuration
8
- from .io import MoleculeEntry, guess_and_read
8
+ from .io import DepthFirstExplorer, MoleculeEntry
9
9
  from .preprocessing import Pipeline, Step, registry
10
+ from .problem import Problem
10
11
 
11
12
  __all__ = ["AbstractModel"]
12
13
 
13
14
 
14
15
  class CustomPreprocessingStep(Step):
15
- def __init__(self, fn: Callable[[Mol], Tuple[Mol, List[str]]]):
16
+ def __init__(self, fn: Callable[[Mol], Tuple[Mol, List[Problem]]]):
16
17
  super().__init__()
17
18
  self.fn = fn
18
19
 
19
- def _run(self, mol: Mol) -> Tuple[Mol, List[str]]:
20
+ def _run(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
20
21
  return self.fn(mol)
21
22
 
22
23
 
@@ -69,7 +70,7 @@ class AbstractModel(ABC):
69
70
  #
70
71
  self.num_processes = num_processes
71
72
 
72
- def _preprocess_single_mol(self, mol: Mol) -> Tuple[Mol, List[str]]:
73
+ def _preprocess_single_mol(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
73
74
  # if this method is called, the preprocessing_pipeline was set to "custom"
74
75
  # and this method has to be overwritten
75
76
  raise NotImplementedError()
@@ -117,13 +118,6 @@ class AbstractModel(ABC):
117
118
  for mol in df_preprocess.input_mol
118
119
  ]
119
120
 
120
- # add smiles columns for web UI
121
- def _to_smiles(mol):
122
- try:
123
- return MolToSmiles(mol)
124
- except:
125
- return None
126
-
127
121
  #
128
122
  # PREPARE PREDICTION OF MOLECULES
129
123
  #
@@ -223,10 +217,8 @@ class AbstractModel(ABC):
223
217
  df_result.drop(columns=["missing", "preprocessing_errors"], inplace=True)
224
218
 
225
219
  # convert errors to string
226
- if "errors" in df_result.columns:
227
- df_result["errors"] = df_result.errors.map(lambda x: ", ".join(set(x)))
228
- else:
229
- df_result["errors"] = ""
220
+ if "errors" not in df_result.columns:
221
+ df_result["errors"] = []
230
222
 
231
223
  # delete mol column (not needed anymore)
232
224
  df_load.drop(columns=["mol"], inplace=True)
@@ -236,7 +228,7 @@ class AbstractModel(ABC):
236
228
 
237
229
  # merge errors from loading and prediction
238
230
  df_result["errors"] = [
239
- ", ".join(set(load_errors + [prediction_errors]))
231
+ load_errors + prediction_errors
240
232
  for load_errors, prediction_errors in zip(
241
233
  df_result.load_errors, df_result.errors
242
234
  )
@@ -266,7 +258,7 @@ class AbstractModel(ABC):
266
258
  input_type=None,
267
259
  **kwargs,
268
260
  ):
269
- entries = guess_and_read(inputs)
261
+ entries = DepthFirstExplorer().explore(inputs)
270
262
 
271
263
  return self._predict_entries(entries, **kwargs)
272
264
 
@@ -62,7 +62,7 @@ def auto_cli(f, *args, **kwargs):
62
62
  if len(examples) > 0:
63
63
  footer = "Examples:\n"
64
64
  for example in examples:
65
- footer += f"* {command_name} {example}\n"
65
+ footer += f'* {command_name} "{example}"\n'
66
66
  else:
67
67
  footer = ""
68
68
 
@@ -7,9 +7,11 @@ class DefaultConfiguration(Configuration):
7
7
  def __init__(self, nerdd_module):
8
8
  super().__init__()
9
9
 
10
- # we do not use default values at the moment
11
- # feel free to add values here if needed
12
- self.config = {}
10
+ self.config = dict(
11
+ task="molecular_property_prediction",
12
+ job_parameters=[],
13
+ result_properties=[],
14
+ )
13
15
 
14
16
  def _get_dict(self):
15
17
  return self.config
@@ -1,17 +1,17 @@
1
1
  from .csv_writer import *
2
- from .elementary_reader import *
3
- from .guess_and_read import *
2
+ from .depth_first_explorer import *
3
+ from .file_reader import *
4
+ from .gzip_reader import *
4
5
  from .inchi_reader import *
5
- from .elementary_inchi_reader import *
6
6
  from .list_reader import *
7
- from .elementary_mol_block_reader import *
8
- from .elementary_rdkit_mol_reader import *
7
+ from .mol_reader import *
9
8
  from .reader import *
10
9
  from .reader_registry import *
11
10
  from .sdf_reader import *
12
11
  from .sdf_writer import *
13
12
  from .smiles_reader import *
14
- from .elementary_smiles_reader import *
15
- from .splitting_reader import *
13
+ from .string_reader import *
14
+ from .tar_reader import *
16
15
  from .writer import *
17
16
  from .writer_registry import *
17
+ from .zip_reader import *
@@ -1,7 +1,6 @@
1
1
  import csv
2
- from io import TextIOWrapper
3
2
  from itertools import chain
4
- from typing import BinaryIO, Dict, Iterable, TextIO, Union
3
+ from typing import Dict, Iterable
5
4
 
6
5
  from rdkit.Chem import Mol, MolToSmiles
7
6
 
@@ -0,0 +1,111 @@
1
+ from itertools import chain, islice, repeat
2
+ from typing import Generator, Iterable, Optional
3
+
4
+ from .explorer import Explorer
5
+ from .reader import MoleculeEntry, Problem, Reader
6
+ from .reader_registry import ReaderRegistry
7
+
8
+ __all__ = ["DepthFirstExplorer"]
9
+
10
+
11
class InvalidInputReader(Reader):
    """Fallback reader for inputs that no other reader could interpret.

    It accepts anything and reports it back as a single entry without a
    molecule, carrying an ``invalid_input`` problem.
    """

    def __init__(self):
        super().__init__()

    def read(self, input, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield exactly one erroneous entry wrapping ``input``.

        ``explore`` is part of the reader interface and is not used here.
        """
        problem = Problem("invalid_input", "Invalid input")
        entry = MoleculeEntry(
            raw_input=input,
            input_type="unknown",
            source=("input",),
            mol=None,
            errors=[problem],
        )
        yield entry

    def __repr__(self) -> str:
        return "InvalidInputReader()"
26
+
27
+
28
class DepthFirstExplorer(Explorer):
    """Explorer that finds a suitable reader for an input by trial.

    Every call to :meth:`explore` tries the candidate readers on the input,
    reads a small sample of entries from each, and keeps the reader whose
    sample contains the most entries with a molecule. Readers receive
    :meth:`explore` as a callback, so nested inputs (e.g. a list of items) are
    explored recursively. One state dict per active exploration is kept on
    ``state_stack``; a state stores the readers that worked for a child
    (``first_guess``) so that they are tried first for its siblings.

    Args:
        readers: readers to try. If ``None``, the readers of the shared
            ``ReaderRegistry`` are used.
        num_test_entries: maximum number of entries sampled from a reader
            while judging it.
        threshold: stored on the instance; not consulted by this class.
        maximum_depth: maximum length of ``state_stack`` (including the root
            state) before exploration is aborted.
    """

    def __init__(
        self,
        readers: Optional[Iterable[Reader]] = None,
        num_test_entries: int = 10,
        threshold: float = 0.5,
        maximum_depth: int = 50,
    ):
        super().__init__()

        if readers is None:
            self.reader_registry = ReaderRegistry()
        else:
            # The candidates are iterated once per explore() call, so a
            # one-shot iterable has to be materialized.
            self.reader_registry = list(readers)

        self.num_test_entries = num_test_entries
        self.threshold = threshold
        self.state_stack = [self.empty_state()]
        self.maximum_depth = maximum_depth

    def empty_state(self):
        """Return a fresh state dict for one node of the exploration tree."""
        return dict(first_guess=[])

    def explore(self, input) -> Generator[MoleculeEntry, None, None]:
        """Yield the molecule entries found in ``input``.

        Raises:
            ValueError: if the exploration is nested deeper than
                ``maximum_depth``.
        """
        # create a new child node and set it as the current node
        state = self.empty_state()
        parent = self.state_stack[-1]
        self.state_stack.append(state)
        depth = len(self.state_stack)

        try:
            if depth > self.maximum_depth:
                raise ValueError(f"Maximum depth of {self.maximum_depth} reached")

            yield from self._read_with_best_reader(input, parent, depth)
        finally:
            # Leave the node again once it is exhausted, failed or was closed.
            # Otherwise the stack grows with every explored sibling and the
            # next sibling would pick this node as its parent.
            self._unwind(state)

    def _read_with_best_reader(
        self, input, parent, depth
    ) -> Generator[MoleculeEntry, None, None]:
        """Pick the best reader for ``input`` and yield all of its entries."""
        readers_iter = chain(
            zip(parent["first_guess"], repeat("guess")),
            zip(self.reader_registry, repeat("builtin")),
        )

        # try all readers and take a sample of the first num_test_entries
        # the reader with most valid molecule entries will be used
        best_reader: Optional[Reader] = None
        best_mode = None
        best_score = 0
        best_ratio = 0.0
        generator = None
        sample = []
        for reader, mode in readers_iter:
            all_valid = False
            try:
                # read at most num_test_entries entries
                generator = reader.read(input, self.explore)
                sample = list(islice(generator, self.num_test_entries))

                # a reader that produced nothing cannot be rated
                if sample:
                    score = sum(1 for entry in sample if entry.mol is not None)
                    ratio = score / len(sample)

                    improved = score > best_score or (
                        score == best_score and ratio > best_ratio
                    )
                    if improved:
                        best_reader = reader
                        best_mode = mode
                        best_score = score
                        best_ratio = ratio

                    all_valid = score == self.num_test_entries
            except Exception:
                # Deliberately broad: readers reject unsuitable input by
                # raising, which only means "try the next reader".
                pass

            if all_valid:
                # keep the partially consumed generator and its sample
                break

            # clean up tree
            while len(self.state_stack) > depth:
                self.state_stack.pop()
            generator = None

        if generator is None:
            # No reader was accepted on the spot: read again from the start
            # with the best candidate (or report the input as invalid).
            if best_reader is None:
                generator = InvalidInputReader().read(input, self.explore)
            else:
                generator = best_reader.read(input, self.explore)
            sample = list(islice(generator, self.num_test_entries))
        elif best_mode is not None and best_mode != "guess":
            # remember the reader for the siblings of this node
            parent["first_guess"].append(best_reader)

        yield from sample
        yield from generator

    def _unwind(self, state) -> None:
        """Remove ``state`` and everything above it from the stack, if present."""
        # Compare by identity (fresh states are equal dicts) and never touch
        # the root state at index 0.
        for index in range(len(self.state_stack) - 1, 0, -1):
            if self.state_stack[index] is state:
                del self.state_stack[index:]
                break
@@ -0,0 +1,13 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Generator
3
+
4
+ from .reader import MoleculeEntry
5
+
6
+
7
class Explorer(ABC):
    """Interface of strategies that turn an input into molecule entries."""

    def __init__(self):
        """Create the explorer; the base class keeps no state."""

    @abstractmethod
    def explore(self, input) -> Generator[MoleculeEntry, None, None]:
        """Produce the molecule entries found in ``input``.

        Subclasses have to override this method.
        """
@@ -0,0 +1,28 @@
1
+ import os
2
+ from typing import Generator
3
+
4
+ from .reader import MoleculeEntry, Reader
5
+ from .reader_registry import register_reader
6
+
7
+ __all__ = ["FileReader"]
8
+
9
+
10
@register_reader
class FileReader(Reader):
    """Reader that opens a path and delegates its content to the explorer."""

    def __init__(self):
        super().__init__()

    def read(self, filename, explore) -> Generator[MoleculeEntry, None, None]:
        """Open ``filename`` in binary mode and yield the entries found in it.

        The source of every entry is prefixed with ``filename``. A source that
        consists only of the ``"raw_input"`` marker is replaced by the filename.

        Raises:
            TypeError: if ``filename`` is not a string naming an existing path.
        """
        is_existing_path = isinstance(filename, str) and os.path.exists(filename)
        if not is_existing_path:
            raise TypeError("input must be a valid filename")

        with open(filename, "rb") as stream:
            for entry in explore(stream):
                is_marker_only = (
                    len(entry.source) == 1 and entry.source[0] == "raw_input"
                )
                inner_source = () if is_marker_only else entry.source
                yield entry._replace(source=(filename, *inner_source))

    def __repr__(self):
        return "FileReader()"
@@ -0,0 +1,30 @@
1
+ import gzip
2
+ from typing import Generator
3
+
4
+ from .reader import MoleculeEntry, Reader
5
+ from .reader_registry import register_reader
6
+
7
+ __all__ = ["GzipReader"]
8
+
9
+
10
@register_reader
class GzipReader(Reader):
    """Reader that decompresses a gzip stream and explores its content."""

    def __init__(self):
        super().__init__()

    def read(self, input_stream, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield the entries found in the decompressed ``input_stream``.

        Raises:
            TypeError: if ``input_stream`` lacks ``read`` or ``seek``.
        """
        is_stream = all(hasattr(input_stream, name) for name in ("read", "seek"))
        if not is_stream:
            raise TypeError("input must be a stream-like object")

        # always start at the beginning of the stream
        input_stream.seek(0)

        with gzip.open(input_stream, "rb") as decompressed:
            # gzip.open will not raise an exception if the file is not a valid
            # gzip file --> provoke the error by reading the first byte
            decompressed.read(1)
            decompressed.seek(0)

            yield from explore(decompressed)

    def __repr__(self) -> str:
        return "GzipReader()"
@@ -0,0 +1,59 @@
1
+ from codecs import getreader
2
+ from typing import Generator
3
+
4
+ from rdkit.Chem import MolFromInchi
5
+ from rdkit.rdBase import BlockLogs
6
+
7
+ from ..problem import Problem
8
+ from .reader import MoleculeEntry, Reader
9
+ from .reader_registry import register_reader
10
+
11
+ __all__ = ["InchiReader"]
12
+
13
+ StreamReader = getreader("utf-8")
14
+
15
+
16
@register_reader
class InchiReader(Reader):
    """Reader that parses a text stream containing one InChI per line."""

    def __init__(self):
        super().__init__()

    def read(self, input_stream, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield one entry per non-empty, non-comment line of ``input_stream``.

        Lines that cannot be parsed produce an entry without a molecule and
        with an ``invalid_inchi`` problem. ``explore`` is not used.

        Raises:
            TypeError: if ``input_stream`` lacks ``read`` or ``seek``.
        """
        if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
            raise TypeError("input must be a stream-like object")

        input_stream.seek(0)

        # decode the stream as UTF-8 text
        reader = StreamReader(input_stream)

        # suppress RDKit warnings
        with BlockLogs():
            for line in reader:
                stripped = line.strip()

                # skip empty lines and comments
                if stripped == "" or stripped.startswith("#"):
                    continue

                try:
                    mol = MolFromInchi(line, sanitize=False)
                except Exception:
                    # Parsing failures must not abort reading, but
                    # KeyboardInterrupt / SystemExit must still propagate.
                    mol = None

                if mol is None:
                    errors = [Problem("invalid_inchi", "Invalid InChI")]
                else:
                    errors = []

                yield MoleculeEntry(
                    raw_input=line,
                    input_type="inchi",
                    source=tuple(["raw_input"]),
                    mol=mol,
                    errors=errors,
                )

    def __repr__(self) -> str:
        return "InchiReader()"
@@ -0,0 +1,24 @@
1
+ from io import BytesIO, StringIO
2
+ from typing import BinaryIO, Generator, Iterable
3
+
4
+ from .reader import MoleculeEntry, Reader
5
+ from .reader_registry import register_reader
6
+
7
+ __all__ = ["ListReader"]
8
+
9
+
10
@register_reader
class ListReader(Reader):
    """Reader that explores every element of an iterable on its own."""

    def __init__(self):
        super().__init__()

    def read(self, input_iterable, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield the entries of all elements of ``input_iterable`` in order.

        Strings, bytes and in-memory streams are iterable as well, but are
        rejected here so that they are handled as a whole by other readers.

        Raises:
            TypeError: if ``input_iterable`` is not an iterable or is one of
                the rejected types.
        """
        # An explicit check instead of ``assert``: assertions are removed when
        # Python runs with -O, which would let strings through.
        if not isinstance(input_iterable, Iterable) or isinstance(
            input_iterable, (str, bytes, BytesIO, StringIO, BinaryIO)
        ):
            raise TypeError(
                f"input must be an iterable, but is {type(input_iterable)}"
            )

        for entry in input_iterable:
            yield from explore(entry)

    def __repr__(self) -> str:
        return "ListReader()"
@@ -0,0 +1,25 @@
1
+ from typing import Generator
2
+
3
+ from rdkit.Chem import Mol
4
+
5
+ from .reader import MoleculeEntry, Reader
6
+ from .reader_registry import register_reader
7
+
8
+
9
@register_reader
class MolReader(Reader):
    """Reader that accepts an already constructed RDKit molecule."""

    def __init__(self):
        super().__init__()

    def read(self, mol, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield a single entry wrapping ``mol``. ``explore`` is not used.

        Raises:
            TypeError: if ``mol`` is not an RDKit ``Mol``.
        """
        # An explicit check instead of ``assert``: assertions are removed when
        # Python runs with -O, which would accept any object as a molecule.
        if not isinstance(mol, Mol):
            raise TypeError(f"input must be an RDKit molecule, but is {type(mol)}")

        yield MoleculeEntry(
            raw_input=mol,
            input_type="rdkit_mol",
            source=tuple(["raw_input"]),
            mol=mol,
            errors=[],
        )

    def __repr__(self) -> str:
        return "MolReader()"
@@ -0,0 +1,25 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Generator, List, NamedTuple, Optional, Tuple
3
+
4
+ from rdkit.Chem import Mol
5
+
6
+ from ..problem import Problem
7
+
8
+ __all__ = ["MoleculeEntry", "Reader"]
9
+
10
+
11
class MoleculeEntry(NamedTuple):
    # One molecule candidate produced by a reader, together with where it came
    # from and what went wrong while reading it.

    # the piece of input the entry was created from
    # NOTE(review): annotated as str, but MolReader stores the RDKit molecule
    # itself here and InvalidInputReader stores the input unchanged
    raw_input: str
    # name of the detected format, e.g. "inchi", "mol_block", "rdkit_mol" or
    # "unknown"
    input_type: str
    # path to the entry within the input; readers start with the single marker
    # "raw_input" and FileReader prefixes the filename
    source: Tuple[str, ...]
    # the parsed molecule, or None if the input could not be parsed
    mol: Optional[Mol]
    # problems found while reading (empty if parsing succeeded)
    errors: List[Problem]
17
+
18
+
19
class Reader(ABC):
    """Base class of all readers that turn an input into molecule entries."""

    def __init__(self):
        super().__init__()

    @abstractmethod
    def read(self, input, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield the molecule entries contained in ``input``.

        ``explore`` is a callable that readers may use to delegate nested
        content (e.g. an opened file) back to the explorer. Subclasses have to
        override this method.
        """
@@ -0,0 +1,30 @@
1
+ from functools import lru_cache
2
+ from typing import Generator, Type
3
+
4
+ from .reader import Reader
5
+
6
+ __all__ = ["ReaderRegistry", "register_reader"]
7
+
8
+
9
# lru_cache makes the registry a singleton: the name ReaderRegistry is bound to
# the cached wrapper, so every call ReaderRegistry() returns the same instance
@lru_cache(maxsize=1)
class ReaderRegistry:
    """Collection of factories that create the registered readers on demand."""

    def __init__(self):
        self._factories = []

    def register(self, ReaderClass: Type[Reader], *args, **kwargs):
        """Register ``ReaderClass``; ``args``/``kwargs`` are passed to it on creation.

        Raises:
            TypeError: if ``ReaderClass`` is not a subclass of ``Reader``.
        """
        # An explicit check instead of ``assert``: assertions are removed when
        # Python runs with -O.
        is_reader_class = isinstance(ReaderClass, type) and issubclass(
            ReaderClass, Reader
        )
        if not is_reader_class:
            raise TypeError(
                f"ReaderClass must be a subclass of Reader, but is {ReaderClass!r}"
            )

        self._factories.append(lambda: ReaderClass(*args, **kwargs))

    def readers(self) -> Generator[Reader, None, None]:
        """Yield a freshly created instance of every registered reader."""
        for factory in self._factories:
            yield factory()

    def __iter__(self):
        # iterating the registry is the same as iterating readers()
        return self.readers()
26
+
27
def register_reader(clazz, *args, **kwargs):
    """Add ``clazz`` to the shared reader registry and return it unchanged.

    Returning the class allows the function to be used as a plain class
    decorator. ``args`` and ``kwargs`` are forwarded to the registry and used
    when the reader is instantiated.
    """
    # TODO: implement both decorator modes
    registry = ReaderRegistry()
    registry.register(clazz, *args, **kwargs)
    return clazz
@@ -0,0 +1,81 @@
1
+ from codecs import getreader
2
+ from typing import Generator
3
+
4
+ from rdkit.Chem import MolFromMolBlock
5
+ from rdkit.rdBase import BlockLogs
6
+
7
+ from ..problem import Problem
8
+ from .reader import MoleculeEntry, Reader
9
+ from .reader_registry import register_reader
10
+
11
+ __all__ = ["SdfReader"]
12
+
13
+ StreamReader = getreader("utf-8")
14
+
15
+
16
@register_reader
class SdfReader(Reader):
    """Reader that splits an SD file stream into mol blocks and parses them.

    Args:
        max_num_lines_mol_block: number of lines after which a block without a
            ``$$$$`` terminator is given up and reading stops.
    """

    def __init__(self, max_num_lines_mol_block: int = 10000):
        super().__init__()
        self.max_num_lines_mol_block = max_num_lines_mol_block

    def read(self, input_stream, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield one entry per mol block found in ``input_stream``.

        Blocks that cannot be parsed produce an entry without a molecule and
        with an ``invalid_mol_block`` problem. ``explore`` is not used.

        Raises:
            TypeError: if ``input_stream`` lacks ``read`` or ``seek``.
        """
        if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
            raise TypeError("input must be a stream-like object")

        input_stream.seek(0)

        # decode the stream as UTF-8 text
        reader = StreamReader(input_stream)

        # suppress RDKit warnings
        with BlockLogs():
            # We do not use SDMolSupplier, because it does not accept a stream-like
            # object as input. The ForwardSDMolSupplier is not suitable either,
            # because it does not allow to return the raw text.
            while True:
                # collect lines to parse as a mol block
                block_lines = []
                num_lines = 0
                line = reader.readline()
                while line:
                    block_lines.append(line)
                    if line.strip() == "$$$$":
                        break

                    num_lines += 1
                    if num_lines > self.max_num_lines_mol_block:
                        break

                    # read next line
                    line = reader.readline()

                # join once instead of concatenating strings line by line
                mol_block = "".join(block_lines)

                if mol_block.strip() != "":
                    try:
                        mol = MolFromMolBlock(mol_block, sanitize=False, removeHs=False)
                    except Exception:
                        # Parsing failures must not abort reading, but
                        # KeyboardInterrupt / SystemExit must still propagate.
                        mol = None

                    if mol is None:
                        errors = [Problem("invalid_mol_block", "Invalid mol block")]
                    else:
                        errors = []

                    yield MoleculeEntry(
                        raw_input=mol_block,
                        input_type="mol_block",
                        source=tuple(["raw_input"]),
                        mol=mol,
                        errors=errors,
                    )

                # We stop reading if
                # (1) we have reached the end of the file OR
                # (2) the last entry had more than max_num_lines_mol_block lines
                #     (this entry is probably not a valid mol block and everything
                #     after it is probably not a valid mol block either)
                if (not line) or (num_lines > self.max_num_lines_mol_block):
                    break

    def __repr__(self) -> str:
        return f"SdfReader(max_num_lines_mol_block={self.max_num_lines_mol_block})"