nerdd-module 0.2.6__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/PKG-INFO +6 -2
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/__init__.py +4 -4
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/cli.py +12 -17
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/config/__init__.py +1 -1
- nerdd_module-0.3.3/nerdd_module/config/configuration.py +71 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/config/default_configuration.py +8 -12
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/config/dict_configuration.py +4 -5
- nerdd_module-0.3.3/nerdd_module/config/merged_configuration.py +44 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/config/package_configuration.py +5 -0
- nerdd_module-0.3.3/nerdd_module/config/search_yaml_configuration.py +40 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/config/yaml_configuration.py +6 -2
- nerdd_module-0.3.3/nerdd_module/converters/__init__.py +2 -0
- nerdd_module-0.3.3/nerdd_module/converters/converter.py +16 -0
- nerdd_module-0.3.3/nerdd_module/converters/converter_registry.py +61 -0
- nerdd_module-0.3.3/nerdd_module/converters/identity_converter.py +5 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/__init__.py +1 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/depth_first_explorer.py +43 -27
- nerdd_module-0.3.3/nerdd_module/input/explorer.py +16 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/file_reader.py +5 -4
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/gzip_reader.py +2 -2
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/inchi_reader.py +2 -2
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/list_reader.py +2 -2
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/mol_reader.py +2 -2
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/reader.py +2 -2
- nerdd_module-0.3.3/nerdd_module/input/reader_registry.py +41 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/sdf_reader.py +2 -2
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/smiles_reader.py +8 -3
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/string_reader.py +2 -2
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/tar_reader.py +2 -2
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/zip_reader.py +2 -2
- nerdd_module-0.3.3/nerdd_module/model/__init__.py +6 -0
- nerdd_module-0.3.3/nerdd_module/model/add_smiles_step.py +25 -0
- nerdd_module-0.3.3/nerdd_module/model/assign_mol_id_step.py +15 -0
- nerdd_module-0.3.3/nerdd_module/model/assign_name_step.py +19 -0
- nerdd_module-0.3.3/nerdd_module/model/enforce_schema_step.py +29 -0
- nerdd_module-0.3.3/nerdd_module/model/model.py +277 -0
- nerdd_module-0.3.3/nerdd_module/model/read_input_step.py +24 -0
- nerdd_module-0.3.3/nerdd_module/model/simple_model.py +158 -0
- nerdd_module-0.3.3/nerdd_module/model/write_output_step.py +19 -0
- nerdd_module-0.3.3/nerdd_module/output/__init__.py +6 -0
- nerdd_module-0.3.3/nerdd_module/output/csv_writer.py +26 -0
- nerdd_module-0.3.3/nerdd_module/output/file_writer.py +41 -0
- nerdd_module-0.3.3/nerdd_module/output/iterator_writer.py +13 -0
- nerdd_module-0.3.3/nerdd_module/output/pandas_writer.py +16 -0
- nerdd_module-0.3.3/nerdd_module/output/record_list_writer.py +13 -0
- nerdd_module-0.3.3/nerdd_module/output/sdf_writer.py +35 -0
- nerdd_module-0.3.3/nerdd_module/output/writer.py +18 -0
- nerdd_module-0.3.3/nerdd_module/output/writer_registry.py +50 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/preprocessing/__init__.py +1 -4
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/preprocessing/check_valid_smiles.py +6 -6
- nerdd_module-0.3.3/nerdd_module/preprocessing/chembl_structure_pipeline.py +78 -0
- nerdd_module-0.3.3/nerdd_module/preprocessing/filter_by_element.py +67 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/preprocessing/filter_by_weight.py +9 -12
- nerdd_module-0.3.3/nerdd_module/preprocessing/preprocessing_step.py +61 -0
- nerdd_module-0.3.3/nerdd_module/preprocessing/remove_stereochemistry.py +26 -0
- nerdd_module-0.3.3/nerdd_module/preprocessing/sanitize.py +20 -0
- nerdd_module-0.3.3/nerdd_module/problem.py +17 -0
- nerdd_module-0.3.3/nerdd_module/steps/__init__.py +3 -0
- nerdd_module-0.3.3/nerdd_module/steps/map_step.py +38 -0
- nerdd_module-0.3.3/nerdd_module/steps/output_step.py +27 -0
- nerdd_module-0.3.3/nerdd_module/steps/step.py +27 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/tests/__init__.py +1 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/tests/checks.py +64 -37
- nerdd_module-0.3.3/nerdd_module/tests/models/AtomicMassModel.py +51 -0
- nerdd_module-0.3.3/nerdd_module/tests/models/MolWeightModel.py +43 -0
- nerdd_module-0.3.3/nerdd_module/tests/models/__init__.py +2 -0
- nerdd_module-0.3.3/nerdd_module/tests/predictions.py +10 -0
- nerdd_module-0.3.3/nerdd_module/tests/predictors.py +42 -0
- nerdd_module-0.3.3/nerdd_module/tests/preprocessing/DummyPreprocessingStep.py +25 -0
- nerdd_module-0.3.3/nerdd_module/tests/preprocessing/__init__.py +1 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/tests/representations.py +9 -3
- nerdd_module-0.3.3/nerdd_module/util/__init__.py +3 -0
- nerdd_module-0.3.3/nerdd_module/util/call_with_mappings.py +53 -0
- nerdd_module-0.3.3/nerdd_module/util/class_decorator.py +29 -0
- nerdd_module-0.3.3/nerdd_module/util/package.py +24 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module.egg-info/PKG-INFO +6 -2
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module.egg-info/SOURCES.txt +35 -16
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module.egg-info/requires.txt +5 -1
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/setup.py +11 -6
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/tests/steps/__init__.py +1 -1
- nerdd_module-0.3.3/tests/steps/checks.py +77 -0
- nerdd_module-0.3.3/tests/steps/input.py +114 -0
- nerdd_module-0.3.3/tests/steps/preprocessing.py +111 -0
- nerdd_module-0.3.3/tests/test_features.py +3 -0
- nerdd_module-0.2.6/nerdd_module/abstract_model.py +0 -271
- nerdd_module-0.2.6/nerdd_module/config/auto_configuration.py +0 -62
- nerdd_module-0.2.6/nerdd_module/config/configuration.py +0 -52
- nerdd_module-0.2.6/nerdd_module/config/merged_configuration.py +0 -18
- nerdd_module-0.2.6/nerdd_module/input/explorer.py +0 -13
- nerdd_module-0.2.6/nerdd_module/input/reader_registry.py +0 -64
- nerdd_module-0.2.6/nerdd_module/output/__init__.py +0 -1
- nerdd_module-0.2.6/nerdd_module/output/csv_writer.py +0 -30
- nerdd_module-0.2.6/nerdd_module/output/sdf_writer.py +0 -35
- nerdd_module-0.2.6/nerdd_module/output/writer.py +0 -45
- nerdd_module-0.2.6/nerdd_module/output/writer_registry.py +0 -40
- nerdd_module-0.2.6/nerdd_module/preprocessing/chembl_structure_pipeline.py +0 -124
- nerdd_module-0.2.6/nerdd_module/preprocessing/empty_pipeline.py +0 -8
- nerdd_module-0.2.6/nerdd_module/preprocessing/filter_by_element.py +0 -39
- nerdd_module-0.2.6/nerdd_module/preprocessing/pipeline.py +0 -53
- nerdd_module-0.2.6/nerdd_module/preprocessing/registry.py +0 -20
- nerdd_module-0.2.6/nerdd_module/preprocessing/remove_stereochemistry.py +0 -24
- nerdd_module-0.2.6/nerdd_module/preprocessing/sanitize.py +0 -18
- nerdd_module-0.2.6/nerdd_module/preprocessing/step.py +0 -26
- nerdd_module-0.2.6/nerdd_module/problem.py +0 -13
- nerdd_module-0.2.6/nerdd_module/tests/predictions.py +0 -30
- nerdd_module-0.2.6/tests/models/AtomicMassModel.py +0 -29
- nerdd_module-0.2.6/tests/models/MolWeightModel.py +0 -25
- nerdd_module-0.2.6/tests/models/MolWeightModelWithExplicitMolIds.py +0 -30
- nerdd_module-0.2.6/tests/models/MolWeightModelWithExplicitMols.py +0 -27
- nerdd_module-0.2.6/tests/models/__init__.py +0 -4
- nerdd_module-0.2.6/tests/steps/checks.py +0 -45
- nerdd_module-0.2.6/tests/steps/predictors.py +0 -52
- nerdd_module-0.2.6/tests/steps/preprocessing.py +0 -9
- nerdd_module-0.2.6/tests/test_atom_property_prediction.py +0 -66
- nerdd_module-0.2.6/tests/test_molecule_property_prediction.py +0 -60
- nerdd_module-0.2.6/tests/test_preprocessing.py +0 -12
- nerdd_module-0.2.6/tests/test_reading_formats.py +0 -137
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/LICENSE +0 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/README.md +0 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/polyfills/__init__.py +0 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/polyfills/files.py +0 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/polyfills/get_entry_points.py +0 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/polyfills/version.py +0 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/version.py +0 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module.egg-info/dependency_links.txt +0 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module.egg-info/top_level.txt +0 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/setup.cfg +0 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/tests/__init__.py +0 -0
- {nerdd_module-0.2.6 → nerdd_module-0.3.3}/tests/conftest.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: nerdd-module
|
|
3
|
-
Version: 0.2.6
|
|
3
|
+
Version: 0.3.3
|
|
4
4
|
Summary: Base package to create NERDD modules
|
|
5
5
|
Home-page: https://github.com/molinfo-vienna/nerdd-module
|
|
6
6
|
Maintainer: Steffen Hirte
|
|
@@ -35,7 +35,11 @@ Requires-Dist: importlib-resources>=5; python_version < "3.10"
|
|
|
35
35
|
Requires-Dist: importlib-metadata>=4.6; python_version < "3.10"
|
|
36
36
|
Provides-Extra: dev
|
|
37
37
|
Requires-Dist: black; extra == "dev"
|
|
38
|
-
Requires-Dist:
|
|
38
|
+
Requires-Dist: mypy; extra == "dev"
|
|
39
|
+
Requires-Dist: pandas-stubs; extra == "dev"
|
|
40
|
+
Requires-Dist: types-PyYAML; extra == "dev"
|
|
41
|
+
Requires-Dist: types-decorator; extra == "dev"
|
|
42
|
+
Requires-Dist: types-setuptools; extra == "dev"
|
|
39
43
|
Provides-Extra: csp
|
|
40
44
|
Requires-Dist: chembl_structure_pipeline>=1.0.0; extra == "csp"
|
|
41
45
|
Provides-Extra: test
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from .abstract_model import *
|
|
2
1
|
from .cli import *
|
|
3
|
-
from .
|
|
2
|
+
from .input import ReaderRegistry
|
|
3
|
+
from .model import *
|
|
4
|
+
from .output import WriterRegistry
|
|
5
|
+
from .polyfills import get_entry_points
|
|
4
6
|
from .problem import *
|
|
5
7
|
from .version import *
|
|
6
|
-
from .polyfills import get_entry_points
|
|
7
|
-
|
|
8
8
|
|
|
9
9
|
for entry_point in get_entry_points("nerdd-module.plugins"):
|
|
10
10
|
entry_point.load()
|
|
@@ -4,8 +4,7 @@ import sys
|
|
|
4
4
|
|
|
5
5
|
import rich_click as click
|
|
6
6
|
from decorator import decorator
|
|
7
|
-
from
|
|
8
|
-
from stringcase import spinalcase
|
|
7
|
+
from stringcase import spinalcase # type: ignore
|
|
9
8
|
|
|
10
9
|
__all__ = ["auto_cli"]
|
|
11
10
|
|
|
@@ -14,7 +13,7 @@ input_description = """{description}
|
|
|
14
13
|
INPUT molecules are provided as file paths or strings. The following formats are
|
|
15
14
|
supported:
|
|
16
15
|
|
|
17
|
-
{
|
|
16
|
+
{input_format_list}
|
|
18
17
|
|
|
19
18
|
Note that input formats shouldn't be mixed.
|
|
20
19
|
"""
|
|
@@ -43,17 +42,19 @@ def auto_cli(f, *args, **kwargs):
|
|
|
43
42
|
# get the model
|
|
44
43
|
model = f()
|
|
45
44
|
|
|
46
|
-
config = model.get_config()
|
|
45
|
+
config = model.get_config()
|
|
47
46
|
|
|
48
47
|
# compose cli description
|
|
49
48
|
description = config.get("description", "")
|
|
50
49
|
|
|
51
|
-
|
|
50
|
+
input_format_list = "\n".join([f"* {fmt}" for fmt in ["smiles", "sdf", "inchi"]])
|
|
52
51
|
|
|
53
52
|
help_text = input_description.format(
|
|
54
|
-
description=description,
|
|
53
|
+
description=description, input_format_list=input_format_list
|
|
55
54
|
)
|
|
56
55
|
|
|
56
|
+
output_format_list = ["sdf", "csv"]
|
|
57
|
+
|
|
57
58
|
# compose footer with examples
|
|
58
59
|
examples = []
|
|
59
60
|
if "example_smiles" in config:
|
|
@@ -88,21 +89,15 @@ def auto_cli(f, *args, **kwargs):
|
|
|
88
89
|
):
|
|
89
90
|
logging.basicConfig(level=log_level.upper())
|
|
90
91
|
|
|
91
|
-
df_result = model.predict(input, **kwargs)
|
|
92
|
-
|
|
93
92
|
# write results
|
|
94
|
-
assert format in
|
|
95
|
-
writer = WriterRegistry().get_writer(format)
|
|
93
|
+
assert format in output_format_list, f"Unknown output format: {format}"
|
|
96
94
|
|
|
97
|
-
if output.lower() == "stdout":
|
|
98
|
-
assert not writer.writes_bytes, "stdout does not support binary output"
|
|
95
|
+
if str(output).lower() == "stdout":
|
|
99
96
|
output_handle = sys.stdout
|
|
100
97
|
else:
|
|
101
|
-
|
|
102
|
-
output_handle = click.open_file(output, mode)
|
|
98
|
+
output_handle = click.open_file(str(output), "wb")
|
|
103
99
|
|
|
104
|
-
|
|
105
|
-
writer.write(output_handle, entries)
|
|
100
|
+
model.predict(input, output_format=format, output_file=output_handle, **kwargs)
|
|
106
101
|
|
|
107
102
|
#
|
|
108
103
|
# Add job parameters
|
|
@@ -130,7 +125,7 @@ def auto_cli(f, *args, **kwargs):
|
|
|
130
125
|
main = click.option(
|
|
131
126
|
"--format",
|
|
132
127
|
default="csv",
|
|
133
|
-
type=click.Choice(
|
|
128
|
+
type=click.Choice(output_format_list, case_sensitive=False),
|
|
134
129
|
help="The output format.",
|
|
135
130
|
)(main)
|
|
136
131
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
from .auto_configuration import *
|
|
2
1
|
from .configuration import *
|
|
3
2
|
from .default_configuration import *
|
|
4
3
|
from .dict_configuration import *
|
|
5
4
|
from .merged_configuration import *
|
|
6
5
|
from .package_configuration import *
|
|
6
|
+
from .search_yaml_configuration import *
|
|
7
7
|
from .yaml_configuration import *
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
__all__ = ["Configuration"]
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_property_columns_of_type(config, t) -> List[dict]:
|
|
9
|
+
return [c for c in config["result_properties"] if c.get("level", "molecule") == t]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Configuration(ABC):
|
|
13
|
+
def __init__(self):
|
|
14
|
+
pass
|
|
15
|
+
|
|
16
|
+
@lru_cache
|
|
17
|
+
def get_dict(self) -> dict:
|
|
18
|
+
config = self._get_dict()
|
|
19
|
+
|
|
20
|
+
if "result_properties" not in config:
|
|
21
|
+
config["result_properties"] = []
|
|
22
|
+
|
|
23
|
+
# check that a module can only predict atom or derivative properties, not both
|
|
24
|
+
num_atom_properties = len(get_property_columns_of_type(config, "atom"))
|
|
25
|
+
num_derivative_properties = len(
|
|
26
|
+
get_property_columns_of_type(config, "derivative")
|
|
27
|
+
)
|
|
28
|
+
assert (
|
|
29
|
+
num_atom_properties == 0 or num_derivative_properties == 0
|
|
30
|
+
), "A module can only predict atom or derivative properties, not both."
|
|
31
|
+
|
|
32
|
+
return config
|
|
33
|
+
|
|
34
|
+
@abstractmethod
|
|
35
|
+
def _get_dict(self) -> dict:
|
|
36
|
+
pass
|
|
37
|
+
|
|
38
|
+
def is_empty(self) -> bool:
|
|
39
|
+
return self.get_dict() == {}
|
|
40
|
+
|
|
41
|
+
def molecular_property_columns(self) -> List[dict]:
|
|
42
|
+
return get_property_columns_of_type(self, "molecule")
|
|
43
|
+
|
|
44
|
+
def atom_property_columns(self) -> List[dict]:
|
|
45
|
+
return get_property_columns_of_type(self, "atom")
|
|
46
|
+
|
|
47
|
+
def derivative_property_columns(self) -> List[dict]:
|
|
48
|
+
return get_property_columns_of_type(self, "derivative")
|
|
49
|
+
|
|
50
|
+
def get_task(self) -> str:
|
|
51
|
+
# if task is specified in the config, use that
|
|
52
|
+
config = self.get_dict()
|
|
53
|
+
if "task" in config:
|
|
54
|
+
return config["task"]
|
|
55
|
+
|
|
56
|
+
# try to derive the task from the result_properties
|
|
57
|
+
num_atom_properties = len(self.atom_property_columns())
|
|
58
|
+
num_derivative_properties = len(self.derivative_property_columns())
|
|
59
|
+
|
|
60
|
+
if num_atom_properties > 0:
|
|
61
|
+
return "atom_property_prediction"
|
|
62
|
+
elif num_derivative_properties > 0:
|
|
63
|
+
return "derivative_property_prediction"
|
|
64
|
+
else:
|
|
65
|
+
return "molecular_property_prediction"
|
|
66
|
+
|
|
67
|
+
def __getitem__(self, key):
|
|
68
|
+
return self.get_dict()[key]
|
|
69
|
+
|
|
70
|
+
def __repr__(self):
|
|
71
|
+
return f"{self.__class__.__name__}({self._get_dict()})"
|
|
@@ -1,15 +1,13 @@
|
|
|
1
|
-
from stringcase import snakecase
|
|
1
|
+
from stringcase import snakecase # type: ignore
|
|
2
2
|
|
|
3
3
|
from ..polyfills import version
|
|
4
|
-
from .configuration import Configuration
|
|
4
|
+
from .dict_configuration import DictConfiguration
|
|
5
5
|
|
|
6
6
|
__all__ = ["DefaultConfiguration"]
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
class DefaultConfiguration(Configuration):
|
|
9
|
+
class DefaultConfiguration(DictConfiguration):
|
|
10
10
|
def __init__(self, nerdd_module):
|
|
11
|
-
super().__init__()
|
|
12
|
-
|
|
13
11
|
# generate a name from the module name
|
|
14
12
|
class_name = nerdd_module.__class__.__name__
|
|
15
13
|
if class_name.endswith("Model"):
|
|
@@ -25,17 +23,15 @@ class DefaultConfiguration(Configuration):
|
|
|
25
23
|
try:
|
|
26
24
|
module = nerdd_module.__module__
|
|
27
25
|
root_module = module.split(".", 1)[0]
|
|
28
|
-
|
|
26
|
+
package_version = version(root_module)
|
|
29
27
|
except ModuleNotFoundError:
|
|
30
|
-
|
|
28
|
+
package_version = "0.0.1"
|
|
31
29
|
|
|
32
|
-
|
|
30
|
+
config = dict(
|
|
33
31
|
name=name,
|
|
34
|
-
version=
|
|
35
|
-
task="molecular_property_prediction",
|
|
32
|
+
version=package_version,
|
|
36
33
|
job_parameters=[],
|
|
37
34
|
result_properties=[],
|
|
38
35
|
)
|
|
39
36
|
|
|
40
|
-
|
|
41
|
-
return self.config
|
|
37
|
+
super().__init__(config)
|
|
@@ -4,10 +4,9 @@ __all__ = ["DictConfiguration"]
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
class DictConfiguration(Configuration):
|
|
7
|
-
def __init__(self, config):
|
|
7
|
+
def __init__(self, config: dict) -> None:
|
|
8
8
|
super().__init__()
|
|
9
|
+
self._config = config
|
|
9
10
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def _get_dict(self):
|
|
13
|
-
return self.config
|
|
11
|
+
def _get_dict(self) -> dict:
|
|
12
|
+
return self._config
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from collections import Counter
|
|
2
|
+
|
|
3
|
+
from .configuration import Configuration
|
|
4
|
+
from .dict_configuration import DictConfiguration
|
|
5
|
+
|
|
6
|
+
__all__ = ["MergedConfiguration"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def merge(*args):
|
|
10
|
+
assert len(args) > 0
|
|
11
|
+
|
|
12
|
+
first_entry = args[0]
|
|
13
|
+
assert all(isinstance(d, type(first_entry)) for d in args)
|
|
14
|
+
|
|
15
|
+
if isinstance(first_entry, list):
|
|
16
|
+
return [e for d in args for e in d]
|
|
17
|
+
if isinstance(first_entry, dict):
|
|
18
|
+
count_fields = Counter([k for d in args for k in d.keys()])
|
|
19
|
+
|
|
20
|
+
# merge fields that occur in multiple dicts
|
|
21
|
+
overlapping_fields = [k for k, v in count_fields.items() if v > 1]
|
|
22
|
+
merged_overlapping_fields = {
|
|
23
|
+
k: merge(*[d[k] for d in args if k in d]) for k in overlapping_fields
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
# collect fields that occur in only one dict
|
|
27
|
+
non_overlapping_fields = [k for k, v in count_fields.items() if v == 1]
|
|
28
|
+
merged_non_overlapping_fields = {
|
|
29
|
+
k: v for d in args for k, v in d.items() if k in non_overlapping_fields
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
return {
|
|
33
|
+
**merged_non_overlapping_fields,
|
|
34
|
+
**merged_overlapping_fields,
|
|
35
|
+
}
|
|
36
|
+
else:
|
|
37
|
+
# merge all configurations starting from the first one
|
|
38
|
+
# --> last configuration has the highest priority
|
|
39
|
+
return args[-1]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class MergedConfiguration(DictConfiguration):
|
|
43
|
+
def __init__(self, *configs: Configuration):
|
|
44
|
+
super().__init__(merge(*[c.get_dict() for c in configs]))
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
1
3
|
from ..polyfills import files
|
|
2
4
|
from .configuration import Configuration
|
|
3
5
|
from .dict_configuration import DictConfiguration
|
|
@@ -5,6 +7,8 @@ from .yaml_configuration import YamlConfiguration
|
|
|
5
7
|
|
|
6
8
|
__all__ = ["PackageConfiguration"]
|
|
7
9
|
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
8
12
|
|
|
9
13
|
class PackageConfiguration(Configuration):
|
|
10
14
|
def __init__(self, package):
|
|
@@ -23,6 +27,7 @@ class PackageConfiguration(Configuration):
|
|
|
23
27
|
config_file = root_dir / "nerdd.yml"
|
|
24
28
|
|
|
25
29
|
if config_file is not None and config_file.exists():
|
|
30
|
+
logger.info(f"Found configuration file in package: {config_file}")
|
|
26
31
|
self.config = YamlConfiguration(config_file, base_path=root_dir)
|
|
27
32
|
else:
|
|
28
33
|
self.config = DictConfiguration({})
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
from .configuration import Configuration
|
|
7
|
+
from .dict_configuration import DictConfiguration
|
|
8
|
+
from .yaml_configuration import YamlConfiguration
|
|
9
|
+
|
|
10
|
+
__all__ = ["SearchYamlConfiguration"]
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SearchYamlConfiguration(DictConfiguration):
|
|
16
|
+
def __init__(self, start: str, base_path: Optional[str] = None) -> None:
|
|
17
|
+
# provide a default configuration if no configuration file is found
|
|
18
|
+
config: Configuration = DictConfiguration({})
|
|
19
|
+
|
|
20
|
+
if start is not None:
|
|
21
|
+
# start at the directory containing the file where nerdd_module_class is
|
|
22
|
+
# defined and go up the directory tree until nerdd.yml is found (or root is
|
|
23
|
+
# reached)
|
|
24
|
+
leaf = start
|
|
25
|
+
while True:
|
|
26
|
+
if os.path.isfile(os.path.join(leaf, "nerdd.yml")):
|
|
27
|
+
default_config_file = os.path.join(leaf, "nerdd.yml")
|
|
28
|
+
break
|
|
29
|
+
elif leaf == os.path.dirname(leaf): # reached root
|
|
30
|
+
default_config_file = None
|
|
31
|
+
break
|
|
32
|
+
leaf = os.path.dirname(leaf)
|
|
33
|
+
|
|
34
|
+
if default_config_file is not None:
|
|
35
|
+
logger.info(
|
|
36
|
+
f"Found configuration file in project directory: {default_config_file}"
|
|
37
|
+
)
|
|
38
|
+
config = YamlConfiguration(default_config_file, base_path)
|
|
39
|
+
|
|
40
|
+
super().__init__(config.get_dict())
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import base64
|
|
2
2
|
import os
|
|
3
3
|
import pathlib
|
|
4
|
+
from typing import Optional, Union
|
|
5
|
+
from pathlib import Path
|
|
4
6
|
|
|
5
|
-
import filetype
|
|
7
|
+
import filetype # type: ignore
|
|
6
8
|
import yaml
|
|
7
9
|
|
|
8
10
|
from .configuration import Configuration
|
|
@@ -26,7 +28,9 @@ def image_constructor(loader, node):
|
|
|
26
28
|
|
|
27
29
|
|
|
28
30
|
class YamlConfiguration(Configuration):
|
|
29
|
-
def __init__(
|
|
31
|
+
def __init__(
|
|
32
|
+
self, handle: Union[str, Path], base_path: Optional[Union[str, Path]] = None
|
|
33
|
+
) -> None:
|
|
30
34
|
super().__init__()
|
|
31
35
|
|
|
32
36
|
if base_path is None:
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
__all__ = ["Converter"]
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Converter(ABC):
|
|
8
|
+
def __init__(self):
|
|
9
|
+
super().__init__()
|
|
10
|
+
|
|
11
|
+
@abstractmethod
|
|
12
|
+
def _convert(self, input: Any, context: dict, **kwargs) -> Any:
|
|
13
|
+
pass
|
|
14
|
+
|
|
15
|
+
def convert(self, input: Any, context: dict, **kwargs) -> Any:
|
|
16
|
+
return self._convert(input, context, **kwargs)
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from functools import lru_cache, partial
|
|
2
|
+
from typing import Callable, Dict, Tuple, Type
|
|
3
|
+
|
|
4
|
+
from ..util import call_with_mappings, class_decorator
|
|
5
|
+
from .converter import Converter
|
|
6
|
+
from .identity_converter import IdentityConverter
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"ConverterRegistry",
|
|
10
|
+
"register_representation",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
ConverterFactory = Callable[[dict], Converter]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# lru_cache makes the registry a singleton
|
|
18
|
+
@lru_cache(maxsize=1)
|
|
19
|
+
class ConverterRegistry:
|
|
20
|
+
def __init__(self) -> None:
|
|
21
|
+
self._factories: Dict[Tuple[str, str], ConverterFactory] = {}
|
|
22
|
+
|
|
23
|
+
def register(
|
|
24
|
+
self,
|
|
25
|
+
data_type: str,
|
|
26
|
+
output_format: str,
|
|
27
|
+
ConverterClass: Type[Converter],
|
|
28
|
+
*args: str,
|
|
29
|
+
**kwargs: str,
|
|
30
|
+
):
|
|
31
|
+
assert issubclass(ConverterClass, Converter)
|
|
32
|
+
assert all([isinstance(arg, str) for arg in args])
|
|
33
|
+
assert all(
|
|
34
|
+
[isinstance(k, str) and isinstance(v, str) for k, v in kwargs.items()]
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
self._factories[(data_type, output_format)] = partial(
|
|
38
|
+
call_with_mappings, ConverterClass, args_mapping=args, kwargs_mapping=kwargs
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
def get_converter(
|
|
42
|
+
self, data_type: str, output_format: str, return_default=True, **kwargs
|
|
43
|
+
) -> Converter:
|
|
44
|
+
if (data_type, output_format) not in self._factories:
|
|
45
|
+
if return_default:
|
|
46
|
+
return IdentityConverter()
|
|
47
|
+
else:
|
|
48
|
+
raise ValueError(
|
|
49
|
+
f"Unknown data type '{data_type}' or output format '{output_format}'"
|
|
50
|
+
)
|
|
51
|
+
return self._factories[(data_type, output_format)](kwargs)
|
|
52
|
+
|
|
53
|
+
def get_output_formats(self) -> frozenset:
|
|
54
|
+
return frozenset(self._factories.keys())
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@class_decorator
|
|
58
|
+
def register_representation(
|
|
59
|
+
cls: Type[Converter], data_type: str, output_format: str, *args, **kwargs
|
|
60
|
+
):
|
|
61
|
+
ConverterRegistry().register(data_type, output_format, cls, *args, **kwargs)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from itertools import chain, islice, repeat
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import Iterable, Iterator, Optional
|
|
3
3
|
|
|
4
4
|
from .explorer import Explorer
|
|
5
5
|
from .reader import MoleculeEntry, Problem, Reader
|
|
@@ -12,7 +12,7 @@ class InvalidInputReader(Reader):
|
|
|
12
12
|
def __init__(self):
|
|
13
13
|
super().__init__()
|
|
14
14
|
|
|
15
|
-
def read(self, input, explore) ->
|
|
15
|
+
def read(self, input, explore) -> Iterator[MoleculeEntry]:
|
|
16
16
|
yield MoleculeEntry(
|
|
17
17
|
raw_input=input,
|
|
18
18
|
input_type="unknown",
|
|
@@ -36,31 +36,31 @@ class DepthFirstExplorer(Explorer):
|
|
|
36
36
|
super().__init__()
|
|
37
37
|
|
|
38
38
|
if readers is None:
|
|
39
|
-
self.
|
|
39
|
+
self._reader_registry = list(ReaderRegistry().get_readers())
|
|
40
40
|
else:
|
|
41
|
-
self.
|
|
41
|
+
self._reader_registry = list(readers)
|
|
42
42
|
|
|
43
|
-
self.
|
|
44
|
-
self.
|
|
45
|
-
self.
|
|
46
|
-
self.
|
|
43
|
+
self._num_test_entries = num_test_entries
|
|
44
|
+
self._threshold = threshold
|
|
45
|
+
self._state_stack = [self._empty_state()]
|
|
46
|
+
self._maximum_depth = maximum_depth
|
|
47
47
|
|
|
48
|
-
def
|
|
48
|
+
def _empty_state(self):
|
|
49
49
|
return dict(first_guess=[])
|
|
50
50
|
|
|
51
|
-
def explore(self, input) ->
|
|
51
|
+
def explore(self, input) -> Iterator[MoleculeEntry]:
|
|
52
52
|
# create a new child node and set it as the current node
|
|
53
|
-
state = self.
|
|
54
|
-
parent = self.
|
|
55
|
-
self.
|
|
53
|
+
state = self._empty_state()
|
|
54
|
+
parent = self._state_stack[-1]
|
|
55
|
+
self._state_stack.append(state)
|
|
56
56
|
|
|
57
|
-
depth = len(self.
|
|
58
|
-
if depth > self.
|
|
59
|
-
raise ValueError(f"Maximum depth of {self.
|
|
57
|
+
depth = len(self._state_stack)
|
|
58
|
+
if depth > self._maximum_depth:
|
|
59
|
+
raise ValueError(f"Maximum depth of {self._maximum_depth} reached")
|
|
60
60
|
|
|
61
61
|
readers_iter = chain(
|
|
62
62
|
zip(parent["first_guess"], repeat("guess")),
|
|
63
|
-
zip(self.
|
|
63
|
+
zip(self._reader_registry, repeat("builtin")),
|
|
64
64
|
)
|
|
65
65
|
|
|
66
66
|
# try all readers and take a sample of the first num_test_entries
|
|
@@ -69,40 +69,56 @@ class DepthFirstExplorer(Explorer):
|
|
|
69
69
|
best_mode = None
|
|
70
70
|
best_score = 0
|
|
71
71
|
best_ratio = 0.0
|
|
72
|
+
best_num_invalid_results = 0
|
|
72
73
|
generator = None
|
|
73
74
|
sample = []
|
|
74
75
|
for reader, mode in readers_iter:
|
|
75
76
|
try:
|
|
76
77
|
# read at most num_test_entries entries
|
|
77
|
-
generator =
|
|
78
|
-
sample = list(islice(generator, self.
|
|
78
|
+
generator = self._read(reader, input)
|
|
79
|
+
sample = list(islice(generator, self._num_test_entries))
|
|
79
80
|
valid_entries = [entry for entry in sample if entry.mol is not None]
|
|
80
81
|
|
|
81
82
|
score = len(valid_entries)
|
|
82
83
|
ratio = len(valid_entries) / len(sample)
|
|
83
|
-
|
|
84
|
-
|
|
84
|
+
num_invalid_results = len(sample) - len(valid_entries)
|
|
85
|
+
|
|
86
|
+
if (
|
|
87
|
+
score > best_score
|
|
88
|
+
# if the score is the same, prefer the reader with higher ratio
|
|
89
|
+
# of valid entries
|
|
90
|
+
or (score == best_score and ratio > best_ratio)
|
|
91
|
+
# if the ratio is the same, prefer the reader with less invalid
|
|
92
|
+
# results
|
|
93
|
+
or (
|
|
94
|
+
score == best_score
|
|
95
|
+
and ratio == best_ratio
|
|
96
|
+
and num_invalid_results < best_num_invalid_results
|
|
97
|
+
)
|
|
98
|
+
):
|
|
85
99
|
best_reader = reader
|
|
86
100
|
best_mode = mode
|
|
87
101
|
best_score = score
|
|
88
102
|
best_ratio = ratio
|
|
103
|
+
best_num_invalid_results = num_invalid_results
|
|
89
104
|
|
|
90
|
-
if score == self.
|
|
105
|
+
if score == self._num_test_entries:
|
|
91
106
|
break
|
|
92
107
|
except Exception:
|
|
93
108
|
pass
|
|
94
109
|
|
|
95
110
|
# clean up tree
|
|
96
|
-
while len(self.
|
|
97
|
-
self.
|
|
111
|
+
while len(self._state_stack) > depth:
|
|
112
|
+
self._state_stack.pop()
|
|
98
113
|
generator = None
|
|
99
114
|
|
|
100
115
|
if generator is None:
|
|
101
116
|
if best_reader is None:
|
|
102
|
-
generator = InvalidInputReader()
|
|
117
|
+
generator = self._read(InvalidInputReader(), input)
|
|
118
|
+
sample = []
|
|
103
119
|
else:
|
|
104
|
-
generator =
|
|
105
|
-
sample = list(islice(generator, self.
|
|
120
|
+
generator = self._read(best_reader, input)
|
|
121
|
+
sample = list(islice(generator, self._num_test_entries))
|
|
106
122
|
else:
|
|
107
123
|
if best_mode is not None and best_mode != "guess":
|
|
108
124
|
parent["first_guess"].append(best_reader)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Iterator
|
|
3
|
+
|
|
4
|
+
from .reader import MoleculeEntry, Reader
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Explorer(ABC):
|
|
8
|
+
def __init__(self):
|
|
9
|
+
pass
|
|
10
|
+
|
|
11
|
+
@abstractmethod
|
|
12
|
+
def explore(self, input) -> Iterator[MoleculeEntry]:
|
|
13
|
+
pass
|
|
14
|
+
|
|
15
|
+
def _read(self, reader: Reader, input) -> Iterator[MoleculeEntry]:
|
|
16
|
+
return reader.read(input, self.explore)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
+
from os import PathLike
|
|
1
2
|
from pathlib import Path
|
|
2
|
-
from typing import
|
|
3
|
+
from typing import Iterator, Optional, Tuple, Union
|
|
3
4
|
|
|
4
5
|
from .reader import MoleculeEntry, Reader
|
|
5
6
|
from .reader_registry import register_reader
|
|
@@ -7,15 +8,15 @@ from .reader_registry import register_reader
|
|
|
7
8
|
__all__ = ["FileReader"]
|
|
8
9
|
|
|
9
10
|
|
|
10
|
-
@register_reader
|
|
11
|
+
@register_reader
|
|
11
12
|
class FileReader(Reader):
|
|
12
|
-
def __init__(self, data_dir=None):
|
|
13
|
+
def __init__(self, data_dir: Union[str, PathLike, None] = None):
|
|
13
14
|
super().__init__()
|
|
14
15
|
self.data_dir = data_dir
|
|
15
16
|
if self.data_dir is not None:
|
|
16
17
|
self.data_dir = Path(self.data_dir)
|
|
17
18
|
|
|
18
|
-
def read(self, filename, explore) ->
|
|
19
|
+
def read(self, filename, explore) -> Iterator[MoleculeEntry]:
|
|
19
20
|
assert isinstance(filename, str), "input must be a string"
|
|
20
21
|
|
|
21
22
|
# convert filename to path
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import gzip
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import Iterator
|
|
3
3
|
|
|
4
4
|
from .reader import MoleculeEntry, Reader
|
|
5
5
|
from .reader_registry import register_reader
|
|
@@ -12,7 +12,7 @@ class GzipReader(Reader):
|
|
|
12
12
|
def __init__(self):
|
|
13
13
|
super().__init__()
|
|
14
14
|
|
|
15
|
-
def read(self, input_stream, explore) ->
|
|
15
|
+
def read(self, input_stream, explore) -> Iterator[MoleculeEntry]:
|
|
16
16
|
if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
|
|
17
17
|
raise TypeError("input must be a stream-like object")
|
|
18
18
|
|