nerdd-module 0.1.12__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nerdd-module-0.2.1/PKG-INFO +70 -0
- nerdd-module-0.2.1/README.md +18 -0
- nerdd-module-0.2.1/nerdd_module/__init__.py +10 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/abstract_model.py +9 -17
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/cli.py +1 -1
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/config/default_configuration.py +5 -3
- nerdd-module-0.2.1/nerdd_module/config/package_configuration.py +31 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/io/__init__.py +7 -7
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/io/csv_writer.py +1 -2
- nerdd-module-0.2.1/nerdd_module/io/depth_first_explorer.py +111 -0
- nerdd-module-0.2.1/nerdd_module/io/explorer.py +13 -0
- nerdd-module-0.2.1/nerdd_module/io/file_reader.py +37 -0
- nerdd-module-0.2.1/nerdd_module/io/gzip_reader.py +30 -0
- nerdd-module-0.2.1/nerdd_module/io/inchi_reader.py +59 -0
- nerdd-module-0.2.1/nerdd_module/io/list_reader.py +24 -0
- nerdd-module-0.2.1/nerdd_module/io/mol_reader.py +25 -0
- nerdd-module-0.2.1/nerdd_module/io/reader.py +25 -0
- nerdd-module-0.2.1/nerdd_module/io/reader_registry.py +30 -0
- nerdd-module-0.2.1/nerdd_module/io/sdf_reader.py +81 -0
- nerdd-module-0.2.1/nerdd_module/io/smiles_reader.py +66 -0
- nerdd-module-0.2.1/nerdd_module/io/string_reader.py +22 -0
- nerdd-module-0.2.1/nerdd_module/io/tar_reader.py +29 -0
- nerdd-module-0.2.1/nerdd_module/io/zip_reader.py +31 -0
- nerdd-module-0.2.1/nerdd_module/polyfills/__init__.py +2 -0
- nerdd-module-0.2.1/nerdd_module/polyfills/files.py +8 -0
- nerdd-module-0.1.12/nerdd_module/__init__.py → nerdd-module-0.2.1/nerdd_module/polyfills/get_entry_points.py +5 -9
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/preprocessing/__init__.py +1 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/preprocessing/check_valid_smiles.py +7 -6
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/preprocessing/chembl_structure_pipeline.py +11 -16
- nerdd-module-0.2.1/nerdd_module/preprocessing/filter_by_element.py +39 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/preprocessing/filter_by_weight.py +9 -2
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/preprocessing/pipeline.py +4 -3
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/preprocessing/remove_stereochemistry.py +6 -3
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/preprocessing/step.py +4 -2
- nerdd-module-0.2.1/nerdd_module/problem.py +8 -0
- nerdd-module-0.2.1/nerdd_module.egg-info/PKG-INFO +70 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module.egg-info/SOURCES.txt +11 -9
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module.egg-info/requires.txt +5 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/setup.py +6 -1
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/models/MolWeightModel.py +1 -1
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/models/MolWeightModelWithExplicitMolIds.py +1 -1
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/models/MolWeightModelWithExplicitMols.py +1 -1
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/steps/checks.py +11 -3
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/steps/molecules.py +6 -5
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/test_molecule_property_prediction.py +0 -8
- nerdd-module-0.2.1/tests/test_reading_formats.py +137 -0
- nerdd-module-0.1.12/PKG-INFO +0 -90
- nerdd-module-0.1.12/README.md +0 -61
- nerdd-module-0.1.12/nerdd_module/config/package_configuration.py +0 -30
- nerdd-module-0.1.12/nerdd_module/io/elementary_inchi_reader.py +0 -38
- nerdd-module-0.1.12/nerdd_module/io/elementary_mol_block_reader.py +0 -40
- nerdd-module-0.1.12/nerdd_module/io/elementary_rdkit_mol_reader.py +0 -24
- nerdd-module-0.1.12/nerdd_module/io/elementary_reader.py +0 -30
- nerdd-module-0.1.12/nerdd_module/io/elementary_smiles_reader.py +0 -43
- nerdd-module-0.1.12/nerdd_module/io/file_reader.py +0 -20
- nerdd-module-0.1.12/nerdd_module/io/guess_and_read.py +0 -75
- nerdd-module-0.1.12/nerdd_module/io/guessing_reader.py +0 -55
- nerdd-module-0.1.12/nerdd_module/io/gzip_file_reader.py +0 -29
- nerdd-module-0.1.12/nerdd_module/io/inchi_reader.py +0 -30
- nerdd-module-0.1.12/nerdd_module/io/list_reader.py +0 -25
- nerdd-module-0.1.12/nerdd_module/io/reader.py +0 -31
- nerdd-module-0.1.12/nerdd_module/io/reader_registry.py +0 -44
- nerdd-module-0.1.12/nerdd_module/io/sdf_reader.py +0 -49
- nerdd-module-0.1.12/nerdd_module/io/smiles_reader.py +0 -31
- nerdd-module-0.1.12/nerdd_module/io/splitting_reader.py +0 -28
- nerdd-module-0.1.12/nerdd_module/preprocessing/filter_by_element.py +0 -29
- nerdd-module-0.1.12/nerdd_module.egg-info/PKG-INFO +0 -90
- nerdd-module-0.1.12/tests/test_reading_formats.py +0 -137
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/LICENSE +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/config/__init__.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/config/auto_configuration.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/config/configuration.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/config/dict_configuration.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/config/merged_configuration.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/config/yaml_configuration.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/io/sdf_writer.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/io/writer.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/io/writer_registry.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/preprocessing/empty_pipeline.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/preprocessing/registry.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module/version.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module.egg-info/dependency_links.txt +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/nerdd_module.egg-info/top_level.txt +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/setup.cfg +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/__init__.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/conftest.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/models/AtomicMassModel.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/models/__init__.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/steps/__init__.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/steps/predictors.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/steps/preprocessing.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/test_atom_property_prediction.py +0 -0
- {nerdd-module-0.1.12 → nerdd-module-0.2.1}/tests/test_preprocessing.py +0 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: nerdd-module
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: Base package to create NERDD modules
|
|
5
|
+
Home-page: https://github.com/molinfo-vienna/nerdd-module.git
|
|
6
|
+
Maintainer: Steffen Hirte
|
|
7
|
+
Maintainer-email: steffen.hirte@univie.ac.at
|
|
8
|
+
License: BSD 3-Clause License
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
12
|
+
Classifier: Programming Language :: C
|
|
13
|
+
Classifier: Programming Language :: Python
|
|
14
|
+
Classifier: Topic :: Software Development
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering
|
|
16
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
17
|
+
Classifier: Operating System :: POSIX
|
|
18
|
+
Classifier: Operating System :: Unix
|
|
19
|
+
Classifier: Operating System :: MacOS
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: rdkit>=2022.3.3
|
|
28
|
+
Requires-Dist: pandas>=1.2.1
|
|
29
|
+
Requires-Dist: pyyaml>=6.0
|
|
30
|
+
Requires-Dist: filetype~=1.2.0
|
|
31
|
+
Requires-Dist: rich-click>=1.7.1
|
|
32
|
+
Requires-Dist: stringcase>=1.2.0
|
|
33
|
+
Requires-Dist: decorator>=5.1.1
|
|
34
|
+
Requires-Dist: importlib-resources>=5; python_version < "3.10"
|
|
35
|
+
Requires-Dist: importlib-metadata>=4.6; python_version < "3.10"
|
|
36
|
+
Requires-Dist: chembl_structure_pipeline>=1.0.0
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Provides-Extra: test
|
|
39
|
+
Requires-Dist: pytest; extra == "test"
|
|
40
|
+
Requires-Dist: pytest-sugar; extra == "test"
|
|
41
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
42
|
+
Requires-Dist: pytest-asyncio; extra == "test"
|
|
43
|
+
Requires-Dist: pytest-bdd; extra == "test"
|
|
44
|
+
Requires-Dist: pytest-mock; extra == "test"
|
|
45
|
+
Requires-Dist: pytest-watch; extra == "test"
|
|
46
|
+
Requires-Dist: hypothesis; extra == "test"
|
|
47
|
+
Requires-Dist: hypothesis-rdkit; extra == "test"
|
|
48
|
+
Provides-Extra: docs
|
|
49
|
+
Requires-Dist: mkdocs; extra == "docs"
|
|
50
|
+
Requires-Dist: mkdocs-material; extra == "docs"
|
|
51
|
+
Requires-Dist: mkdocstrings; extra == "docs"
|
|
52
|
+
|
|
53
|
+
# NERDD Module
|
|
54
|
+
|
|
55
|
+
This package provides the basis to implement molecular prediction modules in the
|
|
56
|
+
NERDD ecosystem.
|
|
57
|
+
|
|
58
|
+
## Installation
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install -U nerdd-module
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
## Contribute
|
|
66
|
+
|
|
67
|
+
1. Fork and clone the code
|
|
68
|
+
2. Install test dependencies with ```pip install -e .[test]```
|
|
69
|
+
3. Run tests via ```pytest``` or ```pytest-watch``` (short: ```ptw```)
|
|
70
|
+
4. Build docs via ```pip install -e .[docs]``` and ```mkdocs serve```
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# NERDD Module
|
|
2
|
+
|
|
3
|
+
This package provides the basis to implement molecular prediction modules in the
|
|
4
|
+
NERDD ecosystem.
|
|
5
|
+
|
|
6
|
+
## Installation
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
pip install -U nerdd-module
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
## Contribute
|
|
14
|
+
|
|
15
|
+
1. Fork and clone the code
|
|
16
|
+
2. Install test dependencies with ```pip install -e .[test]```
|
|
17
|
+
3. Run tests via ```pytest``` or ```pytest-watch``` (short: ```ptw```)
|
|
18
|
+
4. Build docs via ```pip install -e .[docs]``` and ```mkdocs serve```
|
|
@@ -5,18 +5,19 @@ import pandas as pd
|
|
|
5
5
|
from rdkit.Chem import Mol, MolToSmiles
|
|
6
6
|
|
|
7
7
|
from .config import AutoConfiguration, Configuration
|
|
8
|
-
from .io import
|
|
8
|
+
from .io import DepthFirstExplorer, MoleculeEntry
|
|
9
9
|
from .preprocessing import Pipeline, Step, registry
|
|
10
|
+
from .problem import Problem
|
|
10
11
|
|
|
11
12
|
__all__ = ["AbstractModel"]
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
class CustomPreprocessingStep(Step):
|
|
15
|
-
def __init__(self, fn: Callable[[Mol], Tuple[Mol, List[
|
|
16
|
+
def __init__(self, fn: Callable[[Mol], Tuple[Mol, List[Problem]]]):
|
|
16
17
|
super().__init__()
|
|
17
18
|
self.fn = fn
|
|
18
19
|
|
|
19
|
-
def _run(self, mol: Mol) -> Tuple[Mol, List[
|
|
20
|
+
def _run(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
|
|
20
21
|
return self.fn(mol)
|
|
21
22
|
|
|
22
23
|
|
|
@@ -69,7 +70,7 @@ class AbstractModel(ABC):
|
|
|
69
70
|
#
|
|
70
71
|
self.num_processes = num_processes
|
|
71
72
|
|
|
72
|
-
def _preprocess_single_mol(self, mol: Mol) -> Tuple[Mol, List[
|
|
73
|
+
def _preprocess_single_mol(self, mol: Mol) -> Tuple[Mol, List[Problem]]:
|
|
73
74
|
# if this method is called, the preprocessing_pipeline was set to "custom"
|
|
74
75
|
# and this method has to be overwritten
|
|
75
76
|
raise NotImplementedError()
|
|
@@ -117,13 +118,6 @@ class AbstractModel(ABC):
|
|
|
117
118
|
for mol in df_preprocess.input_mol
|
|
118
119
|
]
|
|
119
120
|
|
|
120
|
-
# add smiles columns for web UI
|
|
121
|
-
def _to_smiles(mol):
|
|
122
|
-
try:
|
|
123
|
-
return MolToSmiles(mol)
|
|
124
|
-
except:
|
|
125
|
-
return None
|
|
126
|
-
|
|
127
121
|
#
|
|
128
122
|
# PREPARE PREDICTION OF MOLECULES
|
|
129
123
|
#
|
|
@@ -223,10 +217,8 @@ class AbstractModel(ABC):
|
|
|
223
217
|
df_result.drop(columns=["missing", "preprocessing_errors"], inplace=True)
|
|
224
218
|
|
|
225
219
|
# convert errors to string
|
|
226
|
-
if "errors" in df_result.columns:
|
|
227
|
-
df_result["errors"] =
|
|
228
|
-
else:
|
|
229
|
-
df_result["errors"] = ""
|
|
220
|
+
if "errors" not in df_result.columns:
|
|
221
|
+
df_result["errors"] = []
|
|
230
222
|
|
|
231
223
|
# delete mol column (not needed anymore)
|
|
232
224
|
df_load.drop(columns=["mol"], inplace=True)
|
|
@@ -236,7 +228,7 @@ class AbstractModel(ABC):
|
|
|
236
228
|
|
|
237
229
|
# merge errors from loading and prediction
|
|
238
230
|
df_result["errors"] = [
|
|
239
|
-
|
|
231
|
+
load_errors + prediction_errors
|
|
240
232
|
for load_errors, prediction_errors in zip(
|
|
241
233
|
df_result.load_errors, df_result.errors
|
|
242
234
|
)
|
|
@@ -266,7 +258,7 @@ class AbstractModel(ABC):
|
|
|
266
258
|
input_type=None,
|
|
267
259
|
**kwargs,
|
|
268
260
|
):
|
|
269
|
-
entries =
|
|
261
|
+
entries = DepthFirstExplorer().explore(inputs)
|
|
270
262
|
|
|
271
263
|
return self._predict_entries(entries, **kwargs)
|
|
272
264
|
|
|
@@ -7,9 +7,11 @@ class DefaultConfiguration(Configuration):
|
|
|
7
7
|
def __init__(self, nerdd_module):
|
|
8
8
|
super().__init__()
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
self.config = dict(
|
|
11
|
+
task="molecular_property_prediction",
|
|
12
|
+
job_parameters=[],
|
|
13
|
+
result_properties=[],
|
|
14
|
+
)
|
|
13
15
|
|
|
14
16
|
def _get_dict(self):
|
|
15
17
|
return self.config
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from ..polyfills import files
|
|
2
|
+
from .configuration import Configuration
|
|
3
|
+
from .dict_configuration import DictConfiguration
|
|
4
|
+
from .yaml_configuration import YamlConfiguration
|
|
5
|
+
|
|
6
|
+
__all__ = ["PackageConfiguration"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PackageConfiguration(Configuration):
    """Configuration loaded from a ``nerdd.yml`` file shipped inside a package.

    If the package cannot be found, or it does not contain a ``nerdd.yml``
    file, the configuration is empty.
    """

    def __init__(self, package):
        """Locate ``nerdd.yml`` in the resources of ``package`` and load it.

        Args:
            package: Handed to ``files`` to obtain the resource directory of
                the package (presumably a package name or module, as accepted
                by ``importlib.resources.files`` -- confirm against the
                polyfill).
        """
        super().__init__()

        # get the resource directory
        try:
            root_dir = files(package)
        except ModuleNotFoundError:
            root_dir = None

        # navigate to the config file (only possible if the package was found)
        config_file = None if root_dir is None else root_dir / "nerdd.yml"

        # is_file() belongs to the Traversable protocol of importlib.resources,
        # whereas exists() is only offered by some implementations of it
        if config_file is not None and config_file.is_file():
            self.config = YamlConfiguration(config_file, base_path=root_dir)
        else:
            self.config = DictConfiguration({})

    def _get_dict(self):
        """Return the dictionary of the wrapped configuration."""
        return self.config.get_dict()
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
from .csv_writer import *
|
|
2
|
-
from .
|
|
3
|
-
from .
|
|
2
|
+
from .depth_first_explorer import *
|
|
3
|
+
from .file_reader import *
|
|
4
|
+
from .gzip_reader import *
|
|
4
5
|
from .inchi_reader import *
|
|
5
|
-
from .elementary_inchi_reader import *
|
|
6
6
|
from .list_reader import *
|
|
7
|
-
from .
|
|
8
|
-
from .elementary_rdkit_mol_reader import *
|
|
7
|
+
from .mol_reader import *
|
|
9
8
|
from .reader import *
|
|
10
9
|
from .reader_registry import *
|
|
11
10
|
from .sdf_reader import *
|
|
12
11
|
from .sdf_writer import *
|
|
13
12
|
from .smiles_reader import *
|
|
14
|
-
from .
|
|
15
|
-
from .
|
|
13
|
+
from .string_reader import *
|
|
14
|
+
from .tar_reader import *
|
|
16
15
|
from .writer import *
|
|
17
16
|
from .writer_registry import *
|
|
17
|
+
from .zip_reader import *
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
from itertools import chain, islice, repeat
|
|
2
|
+
from typing import Generator, Iterable, Optional
|
|
3
|
+
|
|
4
|
+
from .explorer import Explorer
|
|
5
|
+
from .reader import MoleculeEntry, Problem, Reader
|
|
6
|
+
from .reader_registry import ReaderRegistry
|
|
7
|
+
|
|
8
|
+
__all__ = ["DepthFirstExplorer"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class InvalidInputReader(Reader):
    """Fallback reader that reports an input no other reader could handle."""

    def __init__(self):
        super().__init__()

    def read(self, input, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield exactly one entry without molecule for ``input``.

        The entry carries an ``invalid_input`` problem. ``explore`` is part of
        the reader interface and is not used here.
        """
        problem = Problem("invalid_input", "Invalid input")
        yield MoleculeEntry(
            raw_input=input,
            input_type="unknown",
            source=("input",),
            mol=None,
            errors=[problem],
        )

    def __repr__(self) -> str:
        return "InvalidInputReader()"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DepthFirstExplorer(Explorer):
    """Explorer that selects a reader for each input by sampling its output.

    Every candidate reader is asked to read the input and the first
    ``num_test_entries`` entries are inspected. The reader producing the most
    entries with a molecule is used. Readers receive ``self.explore`` as a
    callback, so nested inputs (e.g. the content of a file) are explored
    recursively; ``state_stack`` holds one state per nesting level.
    """

    def __init__(
        self,
        readers: Optional[Iterable[Reader]] = None,
        num_test_entries: int = 10,
        threshold: float = 0.5,
        maximum_depth: int = 50,
    ):
        """Create an explorer.

        Args:
            readers: Readers to try. It is iterated anew for every explored
                input, so it has to be re-iterable. Defaults to the global
                ``ReaderRegistry``.
            num_test_entries: Maximum number of entries read from each
                candidate reader to judge it.
            threshold: Stored on the instance; not used by the code of this
                class.
            maximum_depth: Maximum size of the state stack before ``explore``
                raises a ``ValueError``.
        """
        super().__init__()

        if readers is None:
            self.reader_registry = ReaderRegistry()
        else:
            self.reader_registry = readers

        self.num_test_entries = num_test_entries
        self.threshold = threshold
        self.state_stack = [self.empty_state()]
        self.maximum_depth = maximum_depth

    def empty_state(self):
        """Return a fresh state: no reader has been remembered as a guess yet."""
        return dict(first_guess=[])

    def explore(self, input) -> Generator[MoleculeEntry, None, None]:
        """Yield the molecule entries contained in ``input``.

        Readers remembered by the parent state are tried before the registered
        readers. A reader whose sample consists of ``num_test_entries`` valid
        entries is used immediately and remembered in the parent state.
        Otherwise, the best-scoring reader reads the input again from the
        start. If no reader produced a valid entry, a single "invalid input"
        entry is yielded.

        Raises:
            ValueError: If the nesting depth exceeds ``maximum_depth``.
        """
        # create a new child node and set it as the current node
        state = self.empty_state()
        parent = self.state_stack[-1]
        self.state_stack.append(state)

        depth = len(self.state_stack)
        if depth > self.maximum_depth:
            raise ValueError(f"Maximum depth of {self.maximum_depth} reached")

        readers_iter = chain(
            zip(parent["first_guess"], repeat("guess")),
            zip(self.reader_registry, repeat("builtin")),
        )

        # try all readers and take a sample of the first num_test_entries
        # the reader with most valid molecule entries will be used
        best_reader: Optional[Reader] = None
        best_mode = None
        best_score = 0
        best_ratio = 0.0

        # generator and sample are only set if a reader is accepted on the
        # spot, so entries of rejected readers can never be emitted
        generator = None
        sample = []

        for reader, mode in readers_iter:
            try:
                # read at most num_test_entries entries
                candidate_generator = reader.read(input, self.explore)
                candidate = list(islice(candidate_generator, self.num_test_entries))
                score = sum(1 for entry in candidate if entry.mol is not None)

                # a reader producing no entries at all never qualifies
                if candidate:
                    ratio = score / len(candidate)

                    if score > best_score or (
                        score == best_score and ratio > best_ratio
                    ):
                        best_reader = reader
                        best_mode = mode
                        best_score = score
                        best_ratio = ratio

                    if score == self.num_test_entries:
                        # perfect sample: continue with the running generator
                        generator = candidate_generator
                        sample = candidate
                        break
            except Exception:
                # readers signal unsupported input by raising --> try the next
                pass

            # clean up tree: drop states created by nested explore calls
            while len(self.state_stack) > depth:
                self.state_stack.pop()

        if generator is None:
            if best_reader is None:
                generator = InvalidInputReader().read(input, self.explore)
            else:
                # the state was cleaned up after sampling --> read from scratch
                generator = best_reader.read(input, self.explore)
        elif best_mode is not None and best_mode != "guess":
            # remember the successful reader for later inputs of the parent
            parent["first_guess"].append(best_reader)

        yield from sample
        yield from generator
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Generator
|
|
3
|
+
|
|
4
|
+
from .reader import MoleculeEntry
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Explorer(ABC):
    """Abstract base class of strategies turning an input into molecule entries."""

    def __init__(self):
        """Create the explorer; the base class keeps no state."""

    @abstractmethod
    def explore(self, input) -> Generator[MoleculeEntry, None, None]:
        """Produce the molecule entries found in ``input``.

        Has to be implemented by subclasses.
        """
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Generator
|
|
4
|
+
|
|
5
|
+
from .reader import MoleculeEntry, Reader
|
|
6
|
+
from .reader_registry import register_reader
|
|
7
|
+
|
|
8
|
+
__all__ = ["FileReader"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@register_reader
class FileReader(Reader):
    """Reader that opens the file behind a path and explores its content.

    Only files located inside ``data_dir`` are read.
    """

    def __init__(self, data_dir="."):
        """Create the reader.

        Args:
            data_dir: Directory that has to contain all files read by this
                reader. Defaults to the current working directory.
        """
        super().__init__()
        self.data_dir = Path(data_dir)

    def read(self, filename, explore) -> Generator[MoleculeEntry, None, None]:
        """Open ``filename`` in binary mode and yield the entries found in it.

        The open stream is passed to ``explore``. The source of every entry is
        prefixed with ``filename``; the placeholder source ``("raw_input",)``
        is replaced by the file name.

        Raises:
            TypeError: If ``filename`` is not a string.
            ValueError: If ``filename`` is not a valid path, lies outside of
                ``data_dir`` or does not point to a file.
        """
        if not isinstance(filename, str):
            raise TypeError("input must be a string")

        try:
            # resolve both paths: an absolute path is never relative to the
            # relative default "." and ".." segments must not escape data_dir
            path = Path(filename).resolve()
            data_dir = self.data_dir.resolve()
        except (OSError, RuntimeError, ValueError) as err:
            raise ValueError("input must be a valid path") from err

        if not path.is_relative_to(data_dir):
            raise ValueError("input must be a path inside the data directory")
        if not path.is_file():
            raise ValueError("input must be a valid file")

        with path.open("rb") as f:
            for entry in explore(f):
                if tuple(entry.source) == ("raw_input",):
                    source = ()
                else:
                    source = entry.source
                yield entry._replace(source=(filename, *source))

    def __repr__(self):
        return "FileReader()"
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import gzip
|
|
2
|
+
from typing import Generator
|
|
3
|
+
|
|
4
|
+
from .reader import MoleculeEntry, Reader
|
|
5
|
+
from .reader_registry import register_reader
|
|
6
|
+
|
|
7
|
+
__all__ = ["GzipReader"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@register_reader
class GzipReader(Reader):
    """Reader that decompresses a gzip stream and explores the result."""

    def __init__(self):
        super().__init__()

    def read(self, input_stream, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield the entries found in the decompressed ``input_stream``.

        Raises:
            TypeError: If ``input_stream`` has no ``read`` or ``seek`` method.
        """
        is_stream = hasattr(input_stream, "read") and hasattr(input_stream, "seek")
        if not is_stream:
            raise TypeError("input must be a stream-like object")

        # the stream might have been consumed by another reader before
        input_stream.seek(0)

        with gzip.open(input_stream, "rb") as decompressed:
            # gzip.open accepts any stream; invalid gzip data is only noticed
            # when reading --> probe a single byte and rewind
            decompressed.read(1)
            decompressed.seek(0)

            yield from explore(decompressed)

    def __repr__(self) -> str:
        return "GzipReader()"
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from codecs import getreader
|
|
2
|
+
from typing import Generator
|
|
3
|
+
|
|
4
|
+
from rdkit.Chem import MolFromInchi
|
|
5
|
+
from rdkit.rdBase import BlockLogs
|
|
6
|
+
|
|
7
|
+
from ..problem import Problem
|
|
8
|
+
from .reader import MoleculeEntry, Reader
|
|
9
|
+
from .reader_registry import register_reader
|
|
10
|
+
|
|
11
|
+
__all__ = ["InchiReader"]
|
|
12
|
+
|
|
13
|
+
StreamReader = getreader("utf-8")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@register_reader
class InchiReader(Reader):
    """Reader that interprets every line of a byte stream as an InChI string."""

    def __init__(self):
        super().__init__()

    def read(self, input_stream, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield one entry per non-empty, non-comment line of ``input_stream``.

        The stream is rewound and decoded as UTF-8. Lines that cannot be parsed
        yield an entry without molecule carrying an ``invalid_inchi`` problem.
        ``explore`` is part of the reader interface and is not used here.

        Raises:
            TypeError: If ``input_stream`` has no ``read`` or ``seek`` method.
        """
        if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
            raise TypeError("input must be a stream-like object")

        input_stream.seek(0)

        reader = StreamReader(input_stream)

        # suppress RDKit warnings
        with BlockLogs():
            for line in reader:
                stripped = line.strip()

                # skip empty lines and comments
                if not stripped or stripped.startswith("#"):
                    continue

                try:
                    mol = MolFromInchi(line, sanitize=False)
                except Exception:
                    # parse failures are reported as problems below, whereas
                    # KeyboardInterrupt / SystemExit must not be swallowed
                    mol = None

                if mol is None:
                    errors = [Problem("invalid_inchi", "Invalid InChI")]
                else:
                    errors = []

                yield MoleculeEntry(
                    raw_input=line,
                    input_type="inchi",
                    source=("raw_input",),
                    mol=mol,
                    errors=errors,
                )

    def __repr__(self) -> str:
        return "InchiReader()"
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from io import BytesIO, StringIO
|
|
2
|
+
from typing import BinaryIO, Generator, Iterable
|
|
3
|
+
|
|
4
|
+
from .reader import MoleculeEntry, Reader
|
|
5
|
+
from .reader_registry import register_reader
|
|
6
|
+
|
|
7
|
+
__all__ = ["ListReader"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@register_reader
class ListReader(Reader):
    """Reader that explores every element of an iterable separately."""

    def __init__(self):
        super().__init__()

    def read(self, input_iterable, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield the entries found in each element of ``input_iterable``.

        Strings, bytes and in-memory streams are iterable as well, but they
        are rejected here.

        Raises:
            TypeError: If ``input_iterable`` is not an iterable or is one of
                the rejected types.
        """
        is_collection = isinstance(input_iterable, Iterable) and not isinstance(
            input_iterable, (str, bytes, BytesIO, StringIO, BinaryIO)
        )
        # an explicit check (instead of assert) stays active under ``python -O``,
        # where iterating a string character by character would recurse
        if not is_collection:
            raise TypeError(
                f"input must be an iterable, but is {type(input_iterable)}"
            )

        for entry in input_iterable:
            yield from explore(entry)

    def __repr__(self) -> str:
        return "ListReader()"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from typing import Generator
|
|
2
|
+
|
|
3
|
+
from rdkit.Chem import Mol
|
|
4
|
+
|
|
5
|
+
from .reader import MoleculeEntry, Reader
|
|
6
|
+
from .reader_registry import register_reader
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@register_reader
class MolReader(Reader):
    """Reader that accepts an RDKit molecule object as input."""

    def __init__(self):
        super().__init__()

    def read(self, mol, explore) -> Generator[MoleculeEntry, None, None]:
        """Yield a single entry wrapping ``mol``.

        ``explore`` is part of the reader interface and is not used here.

        Raises:
            TypeError: If ``mol`` is not an RDKit ``Mol`` instance.
        """
        # an explicit check (instead of assert) stays active under ``python -O``,
        # where any object would otherwise be emitted as a molecule
        if not isinstance(mol, Mol):
            raise TypeError(f"input must be an RDKit molecule, but is {type(mol)}")

        yield MoleculeEntry(
            raw_input=mol,
            input_type="rdkit_mol",
            source=("raw_input",),
            mol=mol,
            errors=[],
        )

    def __repr__(self) -> str:
        return "MolReader()"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Generator, List, NamedTuple, Optional, Tuple
|
|
3
|
+
|
|
4
|
+
from rdkit.Chem import Mol
|
|
5
|
+
|
|
6
|
+
from ..problem import Problem
|
|
7
|
+
|
|
8
|
+
__all__ = ["MoleculeEntry", "Reader"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MoleculeEntry(NamedTuple):
    """Result of reading one molecule from an input."""

    # the input as it was read; readers store strings (e.g. a line of a file),
    # but also RDKit molecules or arbitrary unsupported objects
    raw_input: object
    # identifier of the input format, e.g. "inchi", "rdkit_mol" or "unknown"
    input_type: str
    # path of sources (e.g. file names) through which the entry was reached
    source: Tuple[str, ...]
    # the parsed molecule or None if reading failed
    mol: Optional[Mol]
    # problems that occurred while reading the entry
    errors: List[Problem]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Reader(ABC):
    """Abstract base class of all readers."""

    def __init__(self):
        super().__init__()

    @abstractmethod
    def read(self, input, explore) -> Generator[MoleculeEntry, None, None]:
        """Produce the molecule entries contained in ``input``.

        ``explore`` is a callback that implementations in this package call
        on nested inputs (e.g. an opened file) to obtain their entries.
        Has to be implemented by subclasses.
        """
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from functools import lru_cache
|
|
2
|
+
from typing import Generator, Type
|
|
3
|
+
|
|
4
|
+
from .reader import Reader
|
|
5
|
+
|
|
6
|
+
__all__ = ["ReaderRegistry", "register_reader"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# lru_cache makes the registry a singleton
|
|
10
|
+
@lru_cache(maxsize=1)
class ReaderRegistry:
    """Collection of factories creating the registered readers.

    The class is wrapped in ``lru_cache``, so every call ``ReaderRegistry()``
    returns the same instance.
    """

    def __init__(self):
        self._factories = []

    def register(self, ReaderClass: Type[Reader], *args, **kwargs):
        """Register ``ReaderClass`` together with its constructor arguments.

        The reader is not instantiated here; a new instance is created with
        ``*args`` and ``**kwargs`` whenever the registry is iterated.

        Raises:
            TypeError: If ``ReaderClass`` is not a subclass of ``Reader``.
        """
        # an explicit check (instead of assert) stays active under ``python -O``
        if not (isinstance(ReaderClass, type) and issubclass(ReaderClass, Reader)):
            raise TypeError(
                f"reader class must be a subclass of Reader, but is {ReaderClass!r}"
            )
        self._factories.append(lambda: ReaderClass(*args, **kwargs))

    def readers(self) -> Generator[Reader, None, None]:
        """Yield a newly created instance of every registered reader."""
        for factory in self._factories:
            yield factory()

    def __iter__(self):
        """Iterate over newly created instances of all registered readers."""
        return self.readers()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def register_reader(clazz, *args, **kwargs):
    """Add ``clazz`` to the shared ``ReaderRegistry`` and return it unchanged.

    Returning the class makes the function usable as a plain class decorator
    (``@register_reader``). ``*args`` and ``**kwargs`` are forwarded to
    ``ReaderRegistry.register``.
    """
    # TODO: implement both decorator modes
    registry = ReaderRegistry()
    registry.register(clazz, *args, **kwargs)
    return clazz
|