nerdd-module 0.1.12__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nerdd-module-0.2.0/PKG-INFO +70 -0
- nerdd-module-0.2.0/README.md +18 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/__init__.py +4 -1
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/abstract_model.py +9 -17
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/cli.py +1 -1
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/default_configuration.py +5 -3
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/io/__init__.py +7 -7
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/io/csv_writer.py +1 -2
- nerdd-module-0.2.0/nerdd_module/io/depth_first_explorer.py +111 -0
- nerdd-module-0.2.0/nerdd_module/io/explorer.py +13 -0
- nerdd-module-0.2.0/nerdd_module/io/file_reader.py +28 -0
- nerdd-module-0.2.0/nerdd_module/io/gzip_reader.py +30 -0
- nerdd-module-0.2.0/nerdd_module/io/inchi_reader.py +59 -0
- nerdd-module-0.2.0/nerdd_module/io/list_reader.py +24 -0
- nerdd-module-0.2.0/nerdd_module/io/mol_reader.py +25 -0
- nerdd-module-0.2.0/nerdd_module/io/reader.py +25 -0
- nerdd-module-0.2.0/nerdd_module/io/reader_registry.py +30 -0
- nerdd-module-0.2.0/nerdd_module/io/sdf_reader.py +81 -0
- nerdd-module-0.2.0/nerdd_module/io/smiles_reader.py +66 -0
- nerdd-module-0.2.0/nerdd_module/io/string_reader.py +22 -0
- nerdd-module-0.2.0/nerdd_module/io/tar_reader.py +29 -0
- nerdd-module-0.2.0/nerdd_module/io/zip_reader.py +31 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/__init__.py +1 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/check_valid_smiles.py +7 -6
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/chembl_structure_pipeline.py +11 -16
- nerdd-module-0.2.0/nerdd_module/preprocessing/filter_by_element.py +39 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/filter_by_weight.py +9 -2
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/pipeline.py +4 -3
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/remove_stereochemistry.py +6 -3
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/step.py +4 -2
- nerdd-module-0.2.0/nerdd_module/problem.py +8 -0
- nerdd-module-0.2.0/nerdd_module.egg-info/PKG-INFO +70 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module.egg-info/SOURCES.txt +8 -9
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module.egg-info/requires.txt +5 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/setup.py +6 -1
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/models/MolWeightModel.py +1 -1
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/models/MolWeightModelWithExplicitMolIds.py +1 -1
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/models/MolWeightModelWithExplicitMols.py +1 -1
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/steps/checks.py +11 -3
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/steps/molecules.py +6 -4
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/test_molecule_property_prediction.py +0 -8
- nerdd-module-0.2.0/tests/test_reading_formats.py +137 -0
- nerdd-module-0.1.12/PKG-INFO +0 -90
- nerdd-module-0.1.12/README.md +0 -61
- nerdd-module-0.1.12/nerdd_module/io/elementary_inchi_reader.py +0 -38
- nerdd-module-0.1.12/nerdd_module/io/elementary_mol_block_reader.py +0 -40
- nerdd-module-0.1.12/nerdd_module/io/elementary_rdkit_mol_reader.py +0 -24
- nerdd-module-0.1.12/nerdd_module/io/elementary_reader.py +0 -30
- nerdd-module-0.1.12/nerdd_module/io/elementary_smiles_reader.py +0 -43
- nerdd-module-0.1.12/nerdd_module/io/file_reader.py +0 -20
- nerdd-module-0.1.12/nerdd_module/io/guess_and_read.py +0 -75
- nerdd-module-0.1.12/nerdd_module/io/guessing_reader.py +0 -55
- nerdd-module-0.1.12/nerdd_module/io/gzip_file_reader.py +0 -29
- nerdd-module-0.1.12/nerdd_module/io/inchi_reader.py +0 -30
- nerdd-module-0.1.12/nerdd_module/io/list_reader.py +0 -25
- nerdd-module-0.1.12/nerdd_module/io/reader.py +0 -31
- nerdd-module-0.1.12/nerdd_module/io/reader_registry.py +0 -44
- nerdd-module-0.1.12/nerdd_module/io/sdf_reader.py +0 -49
- nerdd-module-0.1.12/nerdd_module/io/smiles_reader.py +0 -31
- nerdd-module-0.1.12/nerdd_module/io/splitting_reader.py +0 -28
- nerdd-module-0.1.12/nerdd_module/preprocessing/filter_by_element.py +0 -29
- nerdd-module-0.1.12/nerdd_module.egg-info/PKG-INFO +0 -90
- nerdd-module-0.1.12/tests/test_reading_formats.py +0 -137
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/LICENSE +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/__init__.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/auto_configuration.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/configuration.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/dict_configuration.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/merged_configuration.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/package_configuration.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/config/yaml_configuration.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/io/sdf_writer.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/io/writer.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/io/writer_registry.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/empty_pipeline.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/preprocessing/registry.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module/version.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module.egg-info/dependency_links.txt +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/nerdd_module.egg-info/top_level.txt +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/setup.cfg +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/__init__.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/conftest.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/models/AtomicMassModel.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/models/__init__.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/steps/__init__.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/steps/predictors.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/steps/preprocessing.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/test_atom_property_prediction.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.0}/tests/test_preprocessing.py +0 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: nerdd-module
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Base package to create NERDD modules
|
|
5
|
+
Home-page: https://github.com/molinfo-vienna/nerdd-module.git
|
|
6
|
+
Maintainer: Steffen Hirte
|
|
7
|
+
Maintainer-email: steffen.hirte@univie.ac.at
|
|
8
|
+
License: BSD 3-Clause License
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
12
|
+
Classifier: Programming Language :: C
|
|
13
|
+
Classifier: Programming Language :: Python
|
|
14
|
+
Classifier: Topic :: Software Development
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering
|
|
16
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
17
|
+
Classifier: Operating System :: POSIX
|
|
18
|
+
Classifier: Operating System :: Unix
|
|
19
|
+
Classifier: Operating System :: MacOS
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: rdkit>=2022.3.3
|
|
28
|
+
Requires-Dist: pandas>=1.2.1
|
|
29
|
+
Requires-Dist: pyyaml>=6.0
|
|
30
|
+
Requires-Dist: filetype~=1.2.0
|
|
31
|
+
Requires-Dist: rich-click>=1.7.1
|
|
32
|
+
Requires-Dist: stringcase>=1.2.0
|
|
33
|
+
Requires-Dist: decorator>=5.1.1
|
|
34
|
+
Requires-Dist: importlib-resources>=5; python_version < "3.10"
|
|
35
|
+
Requires-Dist: importlib-metadata>=4.6; python_version < "3.10"
|
|
36
|
+
Requires-Dist: chembl_structure_pipeline>=1.0.0
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Provides-Extra: test
|
|
39
|
+
Requires-Dist: pytest; extra == "test"
|
|
40
|
+
Requires-Dist: pytest-sugar; extra == "test"
|
|
41
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
42
|
+
Requires-Dist: pytest-asyncio; extra == "test"
|
|
43
|
+
Requires-Dist: pytest-bdd; extra == "test"
|
|
44
|
+
Requires-Dist: pytest-mock; extra == "test"
|
|
45
|
+
Requires-Dist: pytest-watch; extra == "test"
|
|
46
|
+
Requires-Dist: hypothesis; extra == "test"
|
|
47
|
+
Requires-Dist: hypothesis-rdkit; extra == "test"
|
|
48
|
+
Provides-Extra: docs
|
|
49
|
+
Requires-Dist: mkdocs; extra == "docs"
|
|
50
|
+
Requires-Dist: mkdocs-material; extra == "docs"
|
|
51
|
+
Requires-Dist: mkdocstrings; extra == "docs"
|
|
52
|
+
|
|
53
|
+
# NERDD Module
|
|
54
|
+
|
|
55
|
+
This package provides the basis to implement molecular prediction modules in the
|
|
56
|
+
NERDD ecosystem.
|
|
57
|
+
|
|
58
|
+
## Installation
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install -U nerdd-module
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
## Contribute
|
|
66
|
+
|
|
67
|
+
1. Fork and clone the code
|
|
68
|
+
2. Install test dependencies with ```pip install -e .[test]```
|
|
69
|
+
3. Run tests via ```pytest``` or ```pytest-watch``` (short: ```ptw```)
|
|
70
|
+
4. Build docs via ```pip install -e .[docs]``` and ```mkdocs serve```
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# NERDD Module
|
|
2
|
+
|
|
3
|
+
This package provides the basis to implement molecular prediction modules in the
|
|
4
|
+
NERDD ecosystem.
|
|
5
|
+
|
|
6
|
+
## Installation
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
pip install -U nerdd-module
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
## Contribute
|
|
14
|
+
|
|
15
|
+
1. Fork and clone the code
|
|
16
|
+
2. Install test dependencies with ```pip install -e .[test]```
|
|
17
|
+
3. Run tests via ```pytest``` or ```pytest-watch``` (short: ```ptw```)
|
|
18
|
+
4. Build docs via ```pip install -e .[docs]``` and ```mkdocs serve```
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from .abstract_model import *
|
|
2
2
|
from .cli import *
|
|
3
3
|
from .config import *
|
|
4
|
+
from .problem import *
|
|
4
5
|
from .version import *
|
|
5
6
|
|
|
6
7
|
# import entry_points from importlib.metadata or fall back to pkg_resources
|
|
@@ -9,11 +10,13 @@ try:
|
|
|
9
10
|
|
|
10
11
|
def get_entry_points(group):
|
|
11
12
|
return entry_points().get(group, [])
|
|
13
|
+
|
|
12
14
|
except ImportError:
|
|
13
15
|
import pkg_resources
|
|
14
|
-
|
|
16
|
+
|
|
15
17
|
def get_entry_points(group):
|
|
16
18
|
return pkg_resources.iter_entry_points(group)
|
|
17
19
|
|
|
20
|
+
|
|
18
21
|
for entry_point in get_entry_points("nerdd-module.plugins"):
|
|
19
22
|
entry_point.load()
|
|
@@ -5,18 +5,19 @@ import pandas as pd
|
|
|
5
5
|
from rdkit.Chem import Mol, MolToSmiles
|
|
6
6
|
|
|
7
7
|
from .config import AutoConfiguration, Configuration
|
|
8
|
-
from .io import
|
|
8
|
+
from .io import DepthFirstExplorer, MoleculeEntry
|
|
9
9
|
from .preprocessing import Pipeline, Step, registry
|
|
10
|
+
from .problem import Problem
|
|
10
11
|
|
|
11
12
|
__all__ = ["AbstractModel"]
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
class CustomPreprocessingStep(Step):
|
|
15
|
-
def __init__(self, fn: Callable[[Mol], Tuple[Mol, List[
|
|
16
|
+
def __init__(self, fn: Callable[[Mol], Tuple[Mol, List[Problem]]]):
|
|
16
17
|
super().__init__()
|
|
17
18
|
self.fn = fn
|
|
18
19
|
|
|
19
|
-
def _run(self, mol: Mol) -> Tuple[Mol, List[
|
|
20
|
+
def _run(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
|
|
20
21
|
return self.fn(mol)
|
|
21
22
|
|
|
22
23
|
|
|
@@ -69,7 +70,7 @@ class AbstractModel(ABC):
|
|
|
69
70
|
#
|
|
70
71
|
self.num_processes = num_processes
|
|
71
72
|
|
|
72
|
-
def _preprocess_single_mol(self, mol: Mol) -> Tuple[Mol, List[
|
|
73
|
+
def _preprocess_single_mol(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
|
|
73
74
|
# if this method is called, the preprocessing_pipeline was set to "custom"
|
|
74
75
|
# and this method has to be overwritten
|
|
75
76
|
raise NotImplementedError()
|
|
@@ -117,13 +118,6 @@ class AbstractModel(ABC):
|
|
|
117
118
|
for mol in df_preprocess.input_mol
|
|
118
119
|
]
|
|
119
120
|
|
|
120
|
-
# add smiles columns for web UI
|
|
121
|
-
def _to_smiles(mol):
|
|
122
|
-
try:
|
|
123
|
-
return MolToSmiles(mol)
|
|
124
|
-
except:
|
|
125
|
-
return None
|
|
126
|
-
|
|
127
121
|
#
|
|
128
122
|
# PREPARE PREDICTION OF MOLECULES
|
|
129
123
|
#
|
|
@@ -223,10 +217,8 @@ class AbstractModel(ABC):
|
|
|
223
217
|
df_result.drop(columns=["missing", "preprocessing_errors"], inplace=True)
|
|
224
218
|
|
|
225
219
|
# convert errors to string
|
|
226
|
-
if "errors" in df_result.columns:
|
|
227
|
-
df_result["errors"] =
|
|
228
|
-
else:
|
|
229
|
-
df_result["errors"] = ""
|
|
220
|
+
if "errors" not in df_result.columns:
|
|
221
|
+
df_result["errors"] = []
|
|
230
222
|
|
|
231
223
|
# delete mol column (not needed anymore)
|
|
232
224
|
df_load.drop(columns=["mol"], inplace=True)
|
|
@@ -236,7 +228,7 @@ class AbstractModel(ABC):
|
|
|
236
228
|
|
|
237
229
|
# merge errors from loading and prediction
|
|
238
230
|
df_result["errors"] = [
|
|
239
|
-
|
|
231
|
+
load_errors + prediction_errors
|
|
240
232
|
for load_errors, prediction_errors in zip(
|
|
241
233
|
df_result.load_errors, df_result.errors
|
|
242
234
|
)
|
|
@@ -266,7 +258,7 @@ class AbstractModel(ABC):
|
|
|
266
258
|
input_type=None,
|
|
267
259
|
**kwargs,
|
|
268
260
|
):
|
|
269
|
-
entries =
|
|
261
|
+
entries = DepthFirstExplorer().explore(inputs)
|
|
270
262
|
|
|
271
263
|
return self._predict_entries(entries, **kwargs)
|
|
272
264
|
|
|
@@ -7,9 +7,11 @@ class DefaultConfiguration(Configuration):
|
|
|
7
7
|
def __init__(self, nerdd_module):
|
|
8
8
|
super().__init__()
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
self.config = dict(
|
|
11
|
+
task="molecular_property_prediction",
|
|
12
|
+
job_parameters=[],
|
|
13
|
+
result_properties=[],
|
|
14
|
+
)
|
|
13
15
|
|
|
14
16
|
def _get_dict(self):
|
|
15
17
|
return self.config
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
from .csv_writer import *
|
|
2
|
-
from .
|
|
3
|
-
from .
|
|
2
|
+
from .depth_first_explorer import *
|
|
3
|
+
from .file_reader import *
|
|
4
|
+
from .gzip_reader import *
|
|
4
5
|
from .inchi_reader import *
|
|
5
|
-
from .elementary_inchi_reader import *
|
|
6
6
|
from .list_reader import *
|
|
7
|
-
from .
|
|
8
|
-
from .elementary_rdkit_mol_reader import *
|
|
7
|
+
from .mol_reader import *
|
|
9
8
|
from .reader import *
|
|
10
9
|
from .reader_registry import *
|
|
11
10
|
from .sdf_reader import *
|
|
12
11
|
from .sdf_writer import *
|
|
13
12
|
from .smiles_reader import *
|
|
14
|
-
from .
|
|
15
|
-
from .
|
|
13
|
+
from .string_reader import *
|
|
14
|
+
from .tar_reader import *
|
|
16
15
|
from .writer import *
|
|
17
16
|
from .writer_registry import *
|
|
17
|
+
from .zip_reader import *
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
from itertools import chain, islice, repeat
|
|
2
|
+
from typing import Generator, Iterable, Optional
|
|
3
|
+
|
|
4
|
+
from .explorer import Explorer
|
|
5
|
+
from .reader import MoleculeEntry, Problem, Reader
|
|
6
|
+
from .reader_registry import ReaderRegistry
|
|
7
|
+
|
|
8
|
+
__all__ = ["DepthFirstExplorer"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class InvalidInputReader(Reader):
    """Fallback reader that reports any input as invalid.

    It never parses anything: it emits exactly one entry without a molecule,
    carrying an ``invalid_input`` problem.
    """

    def __init__(self):
        super().__init__()

    def read(self, input, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield a single error entry describing ``input``.

        Args:
            input: The object that could not be read; stored as ``raw_input``.
            explore: Unused; accepted to satisfy the ``Reader`` interface.
        """
        problem = Problem("invalid_input", "Invalid input")
        entry = MoleculeEntry(
            raw_input=input,
            input_type="unknown",
            source=("input",),
            mol=None,
            errors=[problem],
        )
        yield entry

    def __repr__(self) -> str:
        return "InvalidInputReader()"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DepthFirstExplorer(Explorer):
    """Explorer that probes all readers on an input and keeps the best one.

    For every call to :meth:`explore`, each candidate reader reads a small
    sample of the input. The reader that produces the most entries with a
    molecule (``entry.mol is not None``) is used for the whole input. Readers
    receive :meth:`explore` as a callback, so nested inputs are explored
    recursively; ``state_stack`` tracks the nesting.
    """

    def __init__(
        self,
        readers: Optional[Iterable[Reader]] = None,
        num_test_entries: int = 10,
        threshold: float = 0.5,
        maximum_depth: int = 50,
    ):
        """Create an explorer.

        Args:
            readers: Readers to try. If ``None``, the readers registered in
                ``ReaderRegistry`` are used.
            num_test_entries: Maximum number of entries sampled per reader.
            threshold: Stored on the instance; not used by :meth:`explore`.
            maximum_depth: Maximum length of the state stack before
                :meth:`explore` raises ``ValueError``.
        """
        super().__init__()

        if readers is None:
            self.reader_registry = ReaderRegistry()
        else:
            # Materialize once: explore() iterates the readers on every
            # (possibly nested) call, so a one-shot iterator would be exhausted.
            # (Previously a supplied ``readers`` was discarded and explore()
            # failed on ``None``.)
            self.reader_registry = list(readers)

        self.num_test_entries = num_test_entries
        self.threshold = threshold
        self.state_stack = [self.empty_state()]
        self.maximum_depth = maximum_depth

    def empty_state(self):
        """Return a fresh per-node state (readers that worked for children)."""
        return dict(first_guess=[])

    def explore(self, input) -> Generator[MoleculeEntry, None, None]:
        """Yield molecule entries read from ``input`` by the best-scoring reader.

        Args:
            input: Any object a registered reader may understand.

        Yields:
            The entries produced by the selected reader. If no reader yields a
            valid molecule, the single error entry of ``InvalidInputReader``.

        Raises:
            ValueError: If the nesting depth exceeds ``maximum_depth`` (raised
                when the generator is first advanced).
        """
        # create a new child node and set it as the current node
        state = self.empty_state()
        parent = self.state_stack[-1]
        self.state_stack.append(state)

        depth = len(self.state_stack)
        if depth > self.maximum_depth:
            raise ValueError(f"Maximum depth of {self.maximum_depth} reached")

        # readers that fully succeeded on a sibling input are tried first
        readers_iter = chain(
            zip(parent["first_guess"], repeat("guess")),
            zip(self.reader_registry, repeat("builtin")),
        )

        # try all readers and take a sample of the first num_test_entries
        # the reader with most valid molecule entries will be used
        best_reader: Optional[Reader] = None
        best_mode = None
        best_score = 0
        best_ratio = 0.0
        generator = None
        sample = []
        for reader, mode in readers_iter:
            try:
                # read at most num_test_entries entries
                generator = reader.read(input, self.explore)
                sample = list(islice(generator, self.num_test_entries))

                score = sum(1 for entry in sample if entry.mol is not None)
                # an empty sample can never win (score 0, ratio 0.0)
                ratio = score / len(sample) if sample else 0.0

                if score > best_score or (score == best_score and ratio > best_ratio):
                    best_reader = reader
                    best_mode = mode
                    best_score = score
                    best_ratio = ratio

                    # a full sample of valid molecules cannot be beaten:
                    # keep this generator (and its sample) and stop probing
                    if score == self.num_test_entries:
                        break
            except Exception:
                # Deliberate best effort: readers signal "not my kind of
                # input" by raising, so a failing reader is simply skipped.
                pass

            # clean up tree: drop states pushed by nested explore() calls
            while len(self.state_stack) > depth:
                self.state_stack.pop()
            generator = None

        if generator is None:
            # no reader was perfect: re-read from scratch with the best one
            if best_reader is None:
                generator = InvalidInputReader().read(input, self.explore)
            else:
                generator = best_reader.read(input, self.explore)
            sample = list(islice(generator, self.num_test_entries))
        else:
            # remember a perfect builtin reader as first guess for siblings
            if best_mode is not None and best_mode != "guess":
                parent["first_guess"].append(best_reader)

        yield from sample
        yield from generator
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Generator
|
|
3
|
+
|
|
4
|
+
from .reader import MoleculeEntry
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Explorer(ABC):
    """Abstract base class for strategies that turn an input into entries."""

    def __init__(self):
        """Initialize the explorer (no state at this level)."""

    @abstractmethod
    def explore(self, input) -> Generator[MoleculeEntry, None, None]:
        """Yield the molecule entries found in ``input``.

        Subclasses must override this method.
        """
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Generator
|
|
3
|
+
|
|
4
|
+
from .reader import MoleculeEntry, Reader
|
|
5
|
+
from .reader_registry import register_reader
|
|
6
|
+
|
|
7
|
+
__all__ = ["FileReader"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@register_reader
class FileReader(Reader):
    """Reader that opens a path and explores the resulting binary stream."""

    def __init__(self):
        super().__init__()

    def read(self, filename, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield the entries found in the file at ``filename``.

        Args:
            filename: Path of an existing file, given as ``str``.
            explore: Callback used to read the opened binary stream.

        Raises:
            TypeError: If ``filename`` is not a string or does not exist.
        """
        is_existing_path = isinstance(filename, str) and os.path.exists(filename)
        if not is_existing_path:
            raise TypeError("input must be a valid filename")

        with open(filename, "rb") as stream:
            for entry in explore(stream):
                # entries read directly from the stream carry the placeholder
                # source ("raw_input",), which is replaced by the filename
                is_placeholder = (
                    len(entry.source) == 1 and entry.source[0] == "raw_input"
                )
                inner_source = () if is_placeholder else entry.source
                yield entry._replace(source=(filename, *inner_source))

    def __repr__(self):
        return "FileReader()"
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import gzip
|
|
2
|
+
from typing import Generator
|
|
3
|
+
|
|
4
|
+
from .reader import MoleculeEntry, Reader
|
|
5
|
+
from .reader_registry import register_reader
|
|
6
|
+
|
|
7
|
+
__all__ = ["GzipReader"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@register_reader
class GzipReader(Reader):
    """Reader that decompresses a gzip stream and explores its content."""

    def __init__(self):
        super().__init__()

    def read(self, input_stream, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield the entries found in the decompressed ``input_stream``.

        Args:
            input_stream: Object offering ``read`` and ``seek``.
            explore: Callback used to read the decompressed stream.

        Raises:
            TypeError: If ``input_stream`` lacks ``read`` or ``seek``.
        """
        is_stream = all(hasattr(input_stream, name) for name in ("read", "seek"))
        if not is_stream:
            raise TypeError("input must be a stream-like object")

        input_stream.seek(0)

        with gzip.open(input_stream, "rb") as decompressed:
            # gzip.open itself does not complain about non-gzip data, so the
            # first byte is read to provoke the error early, then we rewind
            decompressed.read(1)
            decompressed.seek(0)

            yield from explore(decompressed)

    def __repr__(self) -> str:
        return "GzipReader()"
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from codecs import getreader
|
|
2
|
+
from typing import Generator
|
|
3
|
+
|
|
4
|
+
from rdkit.Chem import MolFromInchi
|
|
5
|
+
from rdkit.rdBase import BlockLogs
|
|
6
|
+
|
|
7
|
+
from ..problem import Problem
|
|
8
|
+
from .reader import MoleculeEntry, Reader
|
|
9
|
+
from .reader_registry import register_reader
|
|
10
|
+
|
|
11
|
+
__all__ = ["InchiReader"]
|
|
12
|
+
|
|
13
|
+
StreamReader = getreader("utf-8")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@register_reader
class InchiReader(Reader):
    """Reader that parses a stream containing one InChI string per line."""

    def __init__(self):
        super().__init__()

    def read(self, input_stream, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield one entry per non-empty, non-comment line of ``input_stream``.

        The stream is rewound and decoded as UTF-8. Lines that cannot be
        parsed yield an entry with ``mol=None`` and an ``invalid_inchi``
        problem.

        Args:
            input_stream: Object offering ``read`` and ``seek``.
            explore: Unused; accepted to satisfy the ``Reader`` interface.

        Raises:
            TypeError: If ``input_stream`` lacks ``read`` or ``seek``.
        """
        if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
            raise TypeError("input must be a stream-like object")

        input_stream.seek(0)

        reader = StreamReader(input_stream)

        # suppress RDKit warnings
        with BlockLogs():
            for line in reader:
                stripped = line.strip()

                # skip empty lines and comments
                if stripped == "" or stripped.startswith("#"):
                    continue

                try:
                    mol = MolFromInchi(line, sanitize=False)
                except Exception:
                    # a parser failure marks the line as invalid; unlike a bare
                    # ``except``, KeyboardInterrupt/SystemExit still propagate
                    mol = None

                if mol is None:
                    errors = [Problem("invalid_inchi", "Invalid InChI")]
                else:
                    errors = []

                yield MoleculeEntry(
                    raw_input=line,
                    input_type="inchi",
                    source=("raw_input",),
                    mol=mol,
                    errors=errors,
                )

    def __repr__(self) -> str:
        return "InchiReader()"
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from io import BytesIO, StringIO
|
|
2
|
+
from typing import BinaryIO, Generator, Iterable
|
|
3
|
+
|
|
4
|
+
from .reader import MoleculeEntry, Reader
|
|
5
|
+
from .reader_registry import register_reader
|
|
6
|
+
|
|
7
|
+
__all__ = ["ListReader"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@register_reader
class ListReader(Reader):
    """Reader that explores every element of an iterable input."""

    def __init__(self):
        super().__init__()

    def read(self, input_iterable, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield the entries found in each element of ``input_iterable``.

        Args:
            input_iterable: Any iterable except strings, bytes and the
                stream types listed below.
            explore: Callback used to read each element.

        Raises:
            TypeError: If ``input_iterable`` is not an accepted iterable.
        """
        # Explicit check instead of ``assert``: assertions are removed under
        # ``python -O``, which would let strings be iterated char by char.
        if not isinstance(input_iterable, Iterable) or isinstance(
            input_iterable, (str, bytes, BytesIO, StringIO, BinaryIO)
        ):
            raise TypeError(
                f"input must be an iterable, but is {type(input_iterable)}"
            )

        for entry in input_iterable:
            yield from explore(entry)

    def __repr__(self) -> str:
        return "ListReader()"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from typing import Generator
|
|
2
|
+
|
|
3
|
+
from rdkit.Chem import Mol
|
|
4
|
+
|
|
5
|
+
from .reader import MoleculeEntry, Reader
|
|
6
|
+
from .reader_registry import register_reader
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@register_reader
class MolReader(Reader):
    """Reader that accepts an RDKit ``Mol`` object as-is."""

    def __init__(self):
        super().__init__()

    def read(self, mol, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield a single entry wrapping ``mol``.

        Args:
            mol: An RDKit ``Mol`` instance.
            explore: Unused; accepted to satisfy the ``Reader`` interface.

        Raises:
            TypeError: If ``mol`` is not a ``Mol`` instance.
        """
        # Explicit check instead of ``assert``: assertions are removed under
        # ``python -O``, which would make this reader accept any input.
        if not isinstance(mol, Mol):
            raise TypeError(f"input must be an RDKit Mol, but is {type(mol)}")

        yield MoleculeEntry(
            raw_input=mol,
            input_type="rdkit_mol",
            source=("raw_input",),
            mol=mol,
            errors=[],
        )

    def __repr__(self) -> str:
        return "MolReader()"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Generator, List, NamedTuple, Optional, Tuple
|
|
3
|
+
|
|
4
|
+
from rdkit.Chem import Mol
|
|
5
|
+
|
|
6
|
+
from ..problem import Problem
|
|
7
|
+
|
|
8
|
+
__all__ = ["MoleculeEntry", "Reader"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MoleculeEntry(NamedTuple):
    """Immutable record describing one molecule produced by a reader."""

    # the original piece of input this entry was created from
    raw_input: str
    # label of the format the input was read as (e.g. "inchi", "mol_block")
    input_type: str
    # path of origins, outermost first
    source: Tuple[str, ...]
    # parsed molecule, or None if the input could not be parsed
    mol: Optional[Mol]
    # problems encountered while reading this entry
    errors: List[Problem]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Reader(ABC):
    """Abstract base class of all input readers."""

    def __init__(self):
        super().__init__()

    @abstractmethod
    def read(self, input, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield the molecule entries contained in ``input``.

        Args:
            input: The object to read.
            explore: Callback for reading nested inputs.

        Subclasses must override this method.
        """
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from functools import lru_cache
|
|
2
|
+
from typing import Generator, Type
|
|
3
|
+
|
|
4
|
+
from .reader import Reader
|
|
5
|
+
|
|
6
|
+
__all__ = ["ReaderRegistry", "register_reader"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# lru_cache makes the registry a singleton
|
|
10
|
+
@lru_cache(maxsize=1)
class ReaderRegistry:
    """Registry of reader factories.

    Because of the ``lru_cache`` decorator, every call ``ReaderRegistry()``
    returns the same cached instance. Note that the decorated name is the
    cache wrapper, not the class itself.
    """

    def __init__(self):
        self._factories = []

    def register(self, ReaderClass: Type[Reader], *args, **kwargs):
        """Register ``ReaderClass``; instances are built with the given arguments.

        Raises:
            TypeError: If ``ReaderClass`` is not a subclass of ``Reader``.
        """
        # Explicit check instead of ``assert``, which is removed under
        # ``python -O``.
        if not issubclass(ReaderClass, Reader):
            raise TypeError(f"{ReaderClass!r} must be a subclass of Reader")
        self._factories.append(lambda: ReaderClass(*args, **kwargs))

    def readers(self) -> Generator[Reader, None, None]:
        """Yield a freshly constructed reader for every registered factory."""
        for factory in self._factories:
            yield factory()

    def __iter__(self):
        """Iterate over freshly constructed readers (same as ``readers()``)."""
        return self.readers()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def register_reader(clazz, *args, **kwargs):
    """Register ``clazz`` with the shared registry and return it unchanged.

    Returning the class allows use as a plain class decorator. Extra
    arguments are forwarded to the registry and used to construct readers.
    """
    # TODO: implement both decorator modes
    registry = ReaderRegistry()
    registry.register(clazz, *args, **kwargs)
    return clazz
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from codecs import getreader
|
|
2
|
+
from typing import Generator
|
|
3
|
+
|
|
4
|
+
from rdkit.Chem import MolFromMolBlock
|
|
5
|
+
from rdkit.rdBase import BlockLogs
|
|
6
|
+
|
|
7
|
+
from ..problem import Problem
|
|
8
|
+
from .reader import MoleculeEntry, Reader
|
|
9
|
+
from .reader_registry import register_reader
|
|
10
|
+
|
|
11
|
+
__all__ = ["SdfReader"]
|
|
12
|
+
|
|
13
|
+
StreamReader = getreader("utf-8")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@register_reader
class SdfReader(Reader):
    """Reader that splits a stream into ``$$$$``-terminated mol blocks."""

    def __init__(self, max_num_lines_mol_block: int = 10000):
        """Create the reader.

        Args:
            max_num_lines_mol_block: Maximum number of lines collected for one
                mol block before reading is abandoned.
        """
        super().__init__()
        self.max_num_lines_mol_block = max_num_lines_mol_block

    def read(self, input_stream, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield one entry per non-blank mol block in ``input_stream``.

        The stream is rewound and decoded as UTF-8. Blocks that cannot be
        parsed yield an entry with ``mol=None`` and an ``invalid_mol_block``
        problem.

        Args:
            input_stream: Object offering ``read`` and ``seek``.
            explore: Unused; accepted to satisfy the ``Reader`` interface.

        Raises:
            TypeError: If ``input_stream`` lacks ``read`` or ``seek``.
        """
        if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
            raise TypeError("input must be a stream-like object")

        input_stream.seek(0)

        reader = StreamReader(input_stream)

        # suppress RDKit warnings
        with BlockLogs():
            # We do not use SDMolSupplier, because it does not accept a stream-like
            # object as input. The ForwardSDMolSupplier is not suitable either,
            # because it does not allow to return the raw text.
            while True:
                # collect lines to parse as a mol block
                block_lines = []
                num_lines = 0
                line = reader.readline()
                while line:
                    block_lines.append(line)
                    if line.strip() == "$$$$":
                        break

                    num_lines += 1
                    if num_lines > self.max_num_lines_mol_block:
                        break

                    # read next line
                    line = reader.readline()

                # join once instead of growing a string line by line
                mol_block = "".join(block_lines)

                if mol_block.strip() != "":
                    try:
                        mol = MolFromMolBlock(mol_block, sanitize=False, removeHs=False)
                    except Exception:
                        # a parser failure marks the block as invalid; unlike a
                        # bare ``except``, KeyboardInterrupt/SystemExit propagate
                        mol = None

                    if mol is None:
                        errors = [Problem("invalid_mol_block", "Invalid mol block")]
                    else:
                        errors = []

                    yield MoleculeEntry(
                        raw_input=mol_block,
                        input_type="mol_block",
                        source=("raw_input",),
                        mol=mol,
                        errors=errors,
                    )

                # We stop reading if
                # (1) we have reached the end of the file OR
                # (2) the last entry had more than max_num_lines_mol_block lines
                #     (this entry is probably not a valid mol block and everything
                #     after it is probably not a valid mol block either)
                if (not line) or (num_lines > self.max_num_lines_mol_block):
                    break

    def __repr__(self) -> str:
        return f"SdfReader(max_num_lines_mol_block={self.max_num_lines_mol_block})"
|