nerdd-module 0.2.6__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/PKG-INFO +6 -2
  2. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/__init__.py +4 -4
  3. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/cli.py +12 -17
  4. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/config/__init__.py +1 -1
  5. nerdd_module-0.3.3/nerdd_module/config/configuration.py +71 -0
  6. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/config/default_configuration.py +8 -12
  7. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/config/dict_configuration.py +4 -5
  8. nerdd_module-0.3.3/nerdd_module/config/merged_configuration.py +44 -0
  9. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/config/package_configuration.py +5 -0
  10. nerdd_module-0.3.3/nerdd_module/config/search_yaml_configuration.py +40 -0
  11. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/config/yaml_configuration.py +6 -2
  12. nerdd_module-0.3.3/nerdd_module/converters/__init__.py +2 -0
  13. nerdd_module-0.3.3/nerdd_module/converters/converter.py +16 -0
  14. nerdd_module-0.3.3/nerdd_module/converters/converter_registry.py +61 -0
  15. nerdd_module-0.3.3/nerdd_module/converters/identity_converter.py +5 -0
  16. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/__init__.py +1 -0
  17. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/depth_first_explorer.py +43 -27
  18. nerdd_module-0.3.3/nerdd_module/input/explorer.py +16 -0
  19. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/file_reader.py +5 -4
  20. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/gzip_reader.py +2 -2
  21. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/inchi_reader.py +2 -2
  22. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/list_reader.py +2 -2
  23. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/mol_reader.py +2 -2
  24. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/reader.py +2 -2
  25. nerdd_module-0.3.3/nerdd_module/input/reader_registry.py +41 -0
  26. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/sdf_reader.py +2 -2
  27. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/smiles_reader.py +8 -3
  28. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/string_reader.py +2 -2
  29. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/tar_reader.py +2 -2
  30. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/input/zip_reader.py +2 -2
  31. nerdd_module-0.3.3/nerdd_module/model/__init__.py +6 -0
  32. nerdd_module-0.3.3/nerdd_module/model/add_smiles_step.py +25 -0
  33. nerdd_module-0.3.3/nerdd_module/model/assign_mol_id_step.py +15 -0
  34. nerdd_module-0.3.3/nerdd_module/model/assign_name_step.py +19 -0
  35. nerdd_module-0.3.3/nerdd_module/model/enforce_schema_step.py +29 -0
  36. nerdd_module-0.3.3/nerdd_module/model/model.py +277 -0
  37. nerdd_module-0.3.3/nerdd_module/model/read_input_step.py +24 -0
  38. nerdd_module-0.3.3/nerdd_module/model/simple_model.py +158 -0
  39. nerdd_module-0.3.3/nerdd_module/model/write_output_step.py +19 -0
  40. nerdd_module-0.3.3/nerdd_module/output/__init__.py +6 -0
  41. nerdd_module-0.3.3/nerdd_module/output/csv_writer.py +26 -0
  42. nerdd_module-0.3.3/nerdd_module/output/file_writer.py +41 -0
  43. nerdd_module-0.3.3/nerdd_module/output/iterator_writer.py +13 -0
  44. nerdd_module-0.3.3/nerdd_module/output/pandas_writer.py +16 -0
  45. nerdd_module-0.3.3/nerdd_module/output/record_list_writer.py +13 -0
  46. nerdd_module-0.3.3/nerdd_module/output/sdf_writer.py +35 -0
  47. nerdd_module-0.3.3/nerdd_module/output/writer.py +18 -0
  48. nerdd_module-0.3.3/nerdd_module/output/writer_registry.py +50 -0
  49. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/preprocessing/__init__.py +1 -4
  50. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/preprocessing/check_valid_smiles.py +6 -6
  51. nerdd_module-0.3.3/nerdd_module/preprocessing/chembl_structure_pipeline.py +78 -0
  52. nerdd_module-0.3.3/nerdd_module/preprocessing/filter_by_element.py +67 -0
  53. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/preprocessing/filter_by_weight.py +9 -12
  54. nerdd_module-0.3.3/nerdd_module/preprocessing/preprocessing_step.py +61 -0
  55. nerdd_module-0.3.3/nerdd_module/preprocessing/remove_stereochemistry.py +26 -0
  56. nerdd_module-0.3.3/nerdd_module/preprocessing/sanitize.py +20 -0
  57. nerdd_module-0.3.3/nerdd_module/problem.py +17 -0
  58. nerdd_module-0.3.3/nerdd_module/steps/__init__.py +3 -0
  59. nerdd_module-0.3.3/nerdd_module/steps/map_step.py +38 -0
  60. nerdd_module-0.3.3/nerdd_module/steps/output_step.py +27 -0
  61. nerdd_module-0.3.3/nerdd_module/steps/step.py +27 -0
  62. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/tests/__init__.py +1 -0
  63. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/tests/checks.py +64 -37
  64. nerdd_module-0.3.3/nerdd_module/tests/models/AtomicMassModel.py +51 -0
  65. nerdd_module-0.3.3/nerdd_module/tests/models/MolWeightModel.py +43 -0
  66. nerdd_module-0.3.3/nerdd_module/tests/models/__init__.py +2 -0
  67. nerdd_module-0.3.3/nerdd_module/tests/predictions.py +10 -0
  68. nerdd_module-0.3.3/nerdd_module/tests/predictors.py +42 -0
  69. nerdd_module-0.3.3/nerdd_module/tests/preprocessing/DummyPreprocessingStep.py +25 -0
  70. nerdd_module-0.3.3/nerdd_module/tests/preprocessing/__init__.py +1 -0
  71. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/tests/representations.py +9 -3
  72. nerdd_module-0.3.3/nerdd_module/util/__init__.py +3 -0
  73. nerdd_module-0.3.3/nerdd_module/util/call_with_mappings.py +53 -0
  74. nerdd_module-0.3.3/nerdd_module/util/class_decorator.py +29 -0
  75. nerdd_module-0.3.3/nerdd_module/util/package.py +24 -0
  76. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module.egg-info/PKG-INFO +6 -2
  77. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module.egg-info/SOURCES.txt +35 -16
  78. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module.egg-info/requires.txt +5 -1
  79. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/setup.py +11 -6
  80. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/tests/steps/__init__.py +1 -1
  81. nerdd_module-0.3.3/tests/steps/checks.py +77 -0
  82. nerdd_module-0.3.3/tests/steps/input.py +114 -0
  83. nerdd_module-0.3.3/tests/steps/preprocessing.py +111 -0
  84. nerdd_module-0.3.3/tests/test_features.py +3 -0
  85. nerdd_module-0.2.6/nerdd_module/abstract_model.py +0 -271
  86. nerdd_module-0.2.6/nerdd_module/config/auto_configuration.py +0 -62
  87. nerdd_module-0.2.6/nerdd_module/config/configuration.py +0 -52
  88. nerdd_module-0.2.6/nerdd_module/config/merged_configuration.py +0 -18
  89. nerdd_module-0.2.6/nerdd_module/input/explorer.py +0 -13
  90. nerdd_module-0.2.6/nerdd_module/input/reader_registry.py +0 -64
  91. nerdd_module-0.2.6/nerdd_module/output/__init__.py +0 -1
  92. nerdd_module-0.2.6/nerdd_module/output/csv_writer.py +0 -30
  93. nerdd_module-0.2.6/nerdd_module/output/sdf_writer.py +0 -35
  94. nerdd_module-0.2.6/nerdd_module/output/writer.py +0 -45
  95. nerdd_module-0.2.6/nerdd_module/output/writer_registry.py +0 -40
  96. nerdd_module-0.2.6/nerdd_module/preprocessing/chembl_structure_pipeline.py +0 -124
  97. nerdd_module-0.2.6/nerdd_module/preprocessing/empty_pipeline.py +0 -8
  98. nerdd_module-0.2.6/nerdd_module/preprocessing/filter_by_element.py +0 -39
  99. nerdd_module-0.2.6/nerdd_module/preprocessing/pipeline.py +0 -53
  100. nerdd_module-0.2.6/nerdd_module/preprocessing/registry.py +0 -20
  101. nerdd_module-0.2.6/nerdd_module/preprocessing/remove_stereochemistry.py +0 -24
  102. nerdd_module-0.2.6/nerdd_module/preprocessing/sanitize.py +0 -18
  103. nerdd_module-0.2.6/nerdd_module/preprocessing/step.py +0 -26
  104. nerdd_module-0.2.6/nerdd_module/problem.py +0 -13
  105. nerdd_module-0.2.6/nerdd_module/tests/predictions.py +0 -30
  106. nerdd_module-0.2.6/tests/models/AtomicMassModel.py +0 -29
  107. nerdd_module-0.2.6/tests/models/MolWeightModel.py +0 -25
  108. nerdd_module-0.2.6/tests/models/MolWeightModelWithExplicitMolIds.py +0 -30
  109. nerdd_module-0.2.6/tests/models/MolWeightModelWithExplicitMols.py +0 -27
  110. nerdd_module-0.2.6/tests/models/__init__.py +0 -4
  111. nerdd_module-0.2.6/tests/steps/checks.py +0 -45
  112. nerdd_module-0.2.6/tests/steps/predictors.py +0 -52
  113. nerdd_module-0.2.6/tests/steps/preprocessing.py +0 -9
  114. nerdd_module-0.2.6/tests/test_atom_property_prediction.py +0 -66
  115. nerdd_module-0.2.6/tests/test_molecule_property_prediction.py +0 -60
  116. nerdd_module-0.2.6/tests/test_preprocessing.py +0 -12
  117. nerdd_module-0.2.6/tests/test_reading_formats.py +0 -137
  118. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/LICENSE +0 -0
  119. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/README.md +0 -0
  120. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/polyfills/__init__.py +0 -0
  121. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/polyfills/files.py +0 -0
  122. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/polyfills/get_entry_points.py +0 -0
  123. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/polyfills/version.py +0 -0
  124. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module/version.py +0 -0
  125. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module.egg-info/dependency_links.txt +0 -0
  126. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/nerdd_module.egg-info/top_level.txt +0 -0
  127. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/setup.cfg +0 -0
  128. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/tests/__init__.py +0 -0
  129. {nerdd_module-0.2.6 → nerdd_module-0.3.3}/tests/conftest.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nerdd-module
3
- Version: 0.2.6
3
+ Version: 0.3.3
4
4
  Summary: Base package to create NERDD modules
5
5
  Home-page: https://github.com/molinfo-vienna/nerdd-module
6
6
  Maintainer: Steffen Hirte
@@ -35,7 +35,11 @@ Requires-Dist: importlib-resources>=5; python_version < "3.10"
35
35
  Requires-Dist: importlib-metadata>=4.6; python_version < "3.10"
36
36
  Provides-Extra: dev
37
37
  Requires-Dist: black; extra == "dev"
38
- Requires-Dist: isort; extra == "dev"
38
+ Requires-Dist: mypy; extra == "dev"
39
+ Requires-Dist: pandas-stubs; extra == "dev"
40
+ Requires-Dist: types-PyYAML; extra == "dev"
41
+ Requires-Dist: types-decorator; extra == "dev"
42
+ Requires-Dist: types-setuptools; extra == "dev"
39
43
  Provides-Extra: csp
40
44
  Requires-Dist: chembl_structure_pipeline>=1.0.0; extra == "csp"
41
45
  Provides-Extra: test
@@ -1,10 +1,10 @@
1
- from .abstract_model import *
2
1
  from .cli import *
3
- from .config import *
2
+ from .input import ReaderRegistry
3
+ from .model import *
4
+ from .output import WriterRegistry
5
+ from .polyfills import get_entry_points
4
6
  from .problem import *
5
7
  from .version import *
6
- from .polyfills import get_entry_points
7
-
8
8
 
9
9
  for entry_point in get_entry_points("nerdd-module.plugins"):
10
10
  entry_point.load()
@@ -4,8 +4,7 @@ import sys
4
4
 
5
5
  import rich_click as click
6
6
  from decorator import decorator
7
- from nerdd_module.output import WriterRegistry
8
- from stringcase import spinalcase
7
+ from stringcase import spinalcase # type: ignore
9
8
 
10
9
  __all__ = ["auto_cli"]
11
10
 
@@ -14,7 +13,7 @@ input_description = """{description}
14
13
  INPUT molecules are provided as file paths or strings. The following formats are
15
14
  supported:
16
15
 
17
- {format_list}
16
+ {input_format_list}
18
17
 
19
18
  Note that input formats shouldn't be mixed.
20
19
  """
@@ -43,17 +42,19 @@ def auto_cli(f, *args, **kwargs):
43
42
  # get the model
44
43
  model = f()
45
44
 
46
- config = model.get_config().get_dict()
45
+ config = model.get_config()
47
46
 
48
47
  # compose cli description
49
48
  description = config.get("description", "")
50
49
 
51
- format_list = "\n".join([f"* {fmt}" for fmt in ["smiles", "sdf", "inchi"]])
50
+ input_format_list = "\n".join([f"* {fmt}" for fmt in ["smiles", "sdf", "inchi"]])
52
51
 
53
52
  help_text = input_description.format(
54
- description=description, format_list=format_list
53
+ description=description, input_format_list=input_format_list
55
54
  )
56
55
 
56
+ output_format_list = ["sdf", "csv"]
57
+
57
58
  # compose footer with examples
58
59
  examples = []
59
60
  if "example_smiles" in config:
@@ -88,21 +89,15 @@ def auto_cli(f, *args, **kwargs):
88
89
  ):
89
90
  logging.basicConfig(level=log_level.upper())
90
91
 
91
- df_result = model.predict(input, **kwargs)
92
-
93
92
  # write results
94
- assert format in WriterRegistry().supported_formats
95
- writer = WriterRegistry().get_writer(format)
93
+ assert format in output_format_list, f"Unknown output format: {format}"
96
94
 
97
- if output.lower() == "stdout":
98
- assert not writer.writes_bytes, "stdout does not support binary output"
95
+ if str(output).lower() == "stdout":
99
96
  output_handle = sys.stdout
100
97
  else:
101
- mode = "wb" if writer.writes_bytes else "w"
102
- output_handle = click.open_file(output, mode)
98
+ output_handle = click.open_file(str(output), "wb")
103
99
 
104
- entries = (tup._asdict() for tup in df_result.itertuples(index=False))
105
- writer.write(output_handle, entries)
100
+ model.predict(input, output_format=format, output_file=output_handle, **kwargs)
106
101
 
107
102
  #
108
103
  # Add job parameters
@@ -130,7 +125,7 @@ def auto_cli(f, *args, **kwargs):
130
125
  main = click.option(
131
126
  "--format",
132
127
  default="csv",
133
- type=click.Choice(WriterRegistry().supported_formats, case_sensitive=False),
128
+ type=click.Choice(output_format_list, case_sensitive=False),
134
129
  help="The output format.",
135
130
  )(main)
136
131
 
@@ -1,7 +1,7 @@
1
- from .auto_configuration import *
2
1
  from .configuration import *
3
2
  from .default_configuration import *
4
3
  from .dict_configuration import *
5
4
  from .merged_configuration import *
6
5
  from .package_configuration import *
6
+ from .search_yaml_configuration import *
7
7
  from .yaml_configuration import *
@@ -0,0 +1,71 @@
1
+ from abc import ABC, abstractmethod
2
+ from functools import lru_cache
3
+ from typing import List
4
+
5
+ __all__ = ["Configuration"]
6
+
7
+
8
+ def get_property_columns_of_type(config, t) -> List[dict]:
9
+ return [c for c in config["result_properties"] if c.get("level", "molecule") == t]
10
+
11
+
12
+ class Configuration(ABC):
13
+ def __init__(self):
14
+ pass
15
+
16
+ @lru_cache
17
+ def get_dict(self) -> dict:
18
+ config = self._get_dict()
19
+
20
+ if "result_properties" not in config:
21
+ config["result_properties"] = []
22
+
23
+ # check that a module can only predict atom or derivative properties, not both
24
+ num_atom_properties = len(get_property_columns_of_type(config, "atom"))
25
+ num_derivative_properties = len(
26
+ get_property_columns_of_type(config, "derivative")
27
+ )
28
+ assert (
29
+ num_atom_properties == 0 or num_derivative_properties == 0
30
+ ), "A module can only predict atom or derivative properties, not both."
31
+
32
+ return config
33
+
34
+ @abstractmethod
35
+ def _get_dict(self) -> dict:
36
+ pass
37
+
38
+ def is_empty(self) -> bool:
39
+ return self.get_dict() == {}
40
+
41
+ def molecular_property_columns(self) -> List[dict]:
42
+ return get_property_columns_of_type(self, "molecule")
43
+
44
+ def atom_property_columns(self) -> List[dict]:
45
+ return get_property_columns_of_type(self, "atom")
46
+
47
+ def derivative_property_columns(self) -> List[dict]:
48
+ return get_property_columns_of_type(self, "derivative")
49
+
50
+ def get_task(self) -> str:
51
+ # if task is specified in the config, use that
52
+ config = self.get_dict()
53
+ if "task" in config:
54
+ return config["task"]
55
+
56
+ # try to derive the task from the result_properties
57
+ num_atom_properties = len(self.atom_property_columns())
58
+ num_derivative_properties = len(self.derivative_property_columns())
59
+
60
+ if num_atom_properties > 0:
61
+ return "atom_property_prediction"
62
+ elif num_derivative_properties > 0:
63
+ return "derivative_property_prediction"
64
+ else:
65
+ return "molecular_property_prediction"
66
+
67
+ def __getitem__(self, key):
68
+ return self.get_dict()[key]
69
+
70
+ def __repr__(self):
71
+ return f"{self.__class__.__name__}({self._get_dict()})"
@@ -1,15 +1,13 @@
1
- from stringcase import snakecase
1
+ from stringcase import snakecase # type: ignore
2
2
 
3
3
  from ..polyfills import version
4
- from .configuration import Configuration
4
+ from .dict_configuration import DictConfiguration
5
5
 
6
6
  __all__ = ["DefaultConfiguration"]
7
7
 
8
8
 
9
- class DefaultConfiguration(Configuration):
9
+ class DefaultConfiguration(DictConfiguration):
10
10
  def __init__(self, nerdd_module):
11
- super().__init__()
12
-
13
11
  # generate a name from the module name
14
12
  class_name = nerdd_module.__class__.__name__
15
13
  if class_name.endswith("Model"):
@@ -25,17 +23,15 @@ class DefaultConfiguration(Configuration):
25
23
  try:
26
24
  module = nerdd_module.__module__
27
25
  root_module = module.split(".", 1)[0]
28
- version_ = version(root_module)
26
+ package_version = version(root_module)
29
27
  except ModuleNotFoundError:
30
- pass
28
+ package_version = "0.0.1"
31
29
 
32
- self.config = dict(
30
+ config = dict(
33
31
  name=name,
34
- version=version_,
35
- task="molecular_property_prediction",
32
+ version=package_version,
36
33
  job_parameters=[],
37
34
  result_properties=[],
38
35
  )
39
36
 
40
- def _get_dict(self):
41
- return self.config
37
+ super().__init__(config)
@@ -4,10 +4,9 @@ __all__ = ["DictConfiguration"]
4
4
 
5
5
 
6
6
  class DictConfiguration(Configuration):
7
- def __init__(self, config):
7
+ def __init__(self, config: dict) -> None:
8
8
  super().__init__()
9
+ self._config = config
9
10
 
10
- self.config = config
11
-
12
- def _get_dict(self):
13
- return self.config
11
+ def _get_dict(self) -> dict:
12
+ return self._config
@@ -0,0 +1,44 @@
1
+ from collections import Counter
2
+
3
+ from .configuration import Configuration
4
+ from .dict_configuration import DictConfiguration
5
+
6
+ __all__ = ["MergedConfiguration"]
7
+
8
+
9
+ def merge(*args):
10
+ assert len(args) > 0
11
+
12
+ first_entry = args[0]
13
+ assert all(isinstance(d, type(first_entry)) for d in args)
14
+
15
+ if isinstance(first_entry, list):
16
+ return [e for d in args for e in d]
17
+ if isinstance(first_entry, dict):
18
+ count_fields = Counter([k for d in args for k in d.keys()])
19
+
20
+ # merge fields that occur in multiple dicts
21
+ overlapping_fields = [k for k, v in count_fields.items() if v > 1]
22
+ merged_overlapping_fields = {
23
+ k: merge(*[d[k] for d in args if k in d]) for k in overlapping_fields
24
+ }
25
+
26
+ # collect fields that occur in only one dict
27
+ non_overlapping_fields = [k for k, v in count_fields.items() if v == 1]
28
+ merged_non_overlapping_fields = {
29
+ k: v for d in args for k, v in d.items() if k in non_overlapping_fields
30
+ }
31
+
32
+ return {
33
+ **merged_non_overlapping_fields,
34
+ **merged_overlapping_fields,
35
+ }
36
+ else:
37
+ # merge all configurations starting from the first one
38
+ # --> last configuration has the highest priority
39
+ return args[-1]
40
+
41
+
42
+ class MergedConfiguration(DictConfiguration):
43
+ def __init__(self, *configs: Configuration):
44
+ super().__init__(merge(*[c.get_dict() for c in configs]))
@@ -1,3 +1,5 @@
1
+ import logging
2
+
1
3
  from ..polyfills import files
2
4
  from .configuration import Configuration
3
5
  from .dict_configuration import DictConfiguration
@@ -5,6 +7,8 @@ from .yaml_configuration import YamlConfiguration
5
7
 
6
8
  __all__ = ["PackageConfiguration"]
7
9
 
10
+ logger = logging.getLogger(__name__)
11
+
8
12
 
9
13
  class PackageConfiguration(Configuration):
10
14
  def __init__(self, package):
@@ -23,6 +27,7 @@ class PackageConfiguration(Configuration):
23
27
  config_file = root_dir / "nerdd.yml"
24
28
 
25
29
  if config_file is not None and config_file.exists():
30
+ logger.info(f"Found configuration file in package: {config_file}")
26
31
  self.config = YamlConfiguration(config_file, base_path=root_dir)
27
32
  else:
28
33
  self.config = DictConfiguration({})
@@ -0,0 +1,40 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+ from typing import Any, Optional
5
+
6
+ from .configuration import Configuration
7
+ from .dict_configuration import DictConfiguration
8
+ from .yaml_configuration import YamlConfiguration
9
+
10
+ __all__ = ["SearchYamlConfiguration"]
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class SearchYamlConfiguration(DictConfiguration):
16
+ def __init__(self, start: str, base_path: Optional[str] = None) -> None:
17
+ # provide a default configuration if no configuration file is found
18
+ config: Configuration = DictConfiguration({})
19
+
20
+ if start is not None:
21
+ # start at the directory containing the file where nerdd_module_class is
22
+ # defined and go up the directory tree until nerdd.yml is found (or root is
23
+ # reached)
24
+ leaf = start
25
+ while True:
26
+ if os.path.isfile(os.path.join(leaf, "nerdd.yml")):
27
+ default_config_file = os.path.join(leaf, "nerdd.yml")
28
+ break
29
+ elif leaf == os.path.dirname(leaf): # reached root
30
+ default_config_file = None
31
+ break
32
+ leaf = os.path.dirname(leaf)
33
+
34
+ if default_config_file is not None:
35
+ logger.info(
36
+ f"Found configuration file in project directory: {default_config_file}"
37
+ )
38
+ config = YamlConfiguration(default_config_file, base_path)
39
+
40
+ super().__init__(config.get_dict())
@@ -1,8 +1,10 @@
1
1
  import base64
2
2
  import os
3
3
  import pathlib
4
+ from typing import Optional, Union
5
+ from pathlib import Path
4
6
 
5
- import filetype
7
+ import filetype # type: ignore
6
8
  import yaml
7
9
 
8
10
  from .configuration import Configuration
@@ -26,7 +28,9 @@ def image_constructor(loader, node):
26
28
 
27
29
 
28
30
  class YamlConfiguration(Configuration):
29
- def __init__(self, handle, base_path=None):
31
+ def __init__(
32
+ self, handle: Union[str, Path], base_path: Optional[Union[str, Path]] = None
33
+ ) -> None:
30
34
  super().__init__()
31
35
 
32
36
  if base_path is None:
@@ -0,0 +1,2 @@
1
+ from .converter import *
2
+ from .converter_registry import *
@@ -0,0 +1,16 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any
3
+
4
+ __all__ = ["Converter"]
5
+
6
+
7
+ class Converter(ABC):
8
+ def __init__(self):
9
+ super().__init__()
10
+
11
+ @abstractmethod
12
+ def _convert(self, input: Any, context: dict, **kwargs) -> Any:
13
+ pass
14
+
15
+ def convert(self, input: Any, context: dict, **kwargs) -> Any:
16
+ return self._convert(input, context, **kwargs)
@@ -0,0 +1,61 @@
1
+ from functools import lru_cache, partial
2
+ from typing import Callable, Dict, Tuple, Type
3
+
4
+ from ..util import call_with_mappings, class_decorator
5
+ from .converter import Converter
6
+ from .identity_converter import IdentityConverter
7
+
8
+ __all__ = [
9
+ "ConverterRegistry",
10
+ "register_representation",
11
+ ]
12
+
13
+
14
+ ConverterFactory = Callable[[dict], Converter]
15
+
16
+
17
+ # lru_cache makes the registry a singleton
18
+ @lru_cache(maxsize=1)
19
+ class ConverterRegistry:
20
+ def __init__(self) -> None:
21
+ self._factories: Dict[Tuple[str, str], ConverterFactory] = {}
22
+
23
+ def register(
24
+ self,
25
+ data_type: str,
26
+ output_format: str,
27
+ ConverterClass: Type[Converter],
28
+ *args: str,
29
+ **kwargs: str,
30
+ ):
31
+ assert issubclass(ConverterClass, Converter)
32
+ assert all([isinstance(arg, str) for arg in args])
33
+ assert all(
34
+ [isinstance(k, str) and isinstance(v, str) for k, v in kwargs.items()]
35
+ )
36
+
37
+ self._factories[(data_type, output_format)] = partial(
38
+ call_with_mappings, ConverterClass, args_mapping=args, kwargs_mapping=kwargs
39
+ )
40
+
41
+ def get_converter(
42
+ self, data_type: str, output_format: str, return_default=True, **kwargs
43
+ ) -> Converter:
44
+ if (data_type, output_format) not in self._factories:
45
+ if return_default:
46
+ return IdentityConverter()
47
+ else:
48
+ raise ValueError(
49
+ f"Unknown data type '{data_type}' or output format '{output_format}'"
50
+ )
51
+ return self._factories[(data_type, output_format)](kwargs)
52
+
53
+ def get_output_formats(self) -> frozenset:
54
+ return frozenset(self._factories.keys())
55
+
56
+
57
+ @class_decorator
58
+ def register_representation(
59
+ cls: Type[Converter], data_type: str, output_format: str, *args, **kwargs
60
+ ):
61
+ ConverterRegistry().register(data_type, output_format, cls, *args, **kwargs)
@@ -0,0 +1,5 @@
1
+ from .converter import Converter
2
+
3
+
4
+ class IdentityConverter(Converter):
5
+ pass
@@ -1,4 +1,5 @@
1
1
  from .depth_first_explorer import *
2
+ from .explorer import *
2
3
  from .file_reader import *
3
4
  from .gzip_reader import *
4
5
  from .inchi_reader import *
@@ -1,5 +1,5 @@
1
1
  from itertools import chain, islice, repeat
2
- from typing import Generator, Iterable, Optional
2
+ from typing import Iterable, Iterator, Optional
3
3
 
4
4
  from .explorer import Explorer
5
5
  from .reader import MoleculeEntry, Problem, Reader
@@ -12,7 +12,7 @@ class InvalidInputReader(Reader):
12
12
  def __init__(self):
13
13
  super().__init__()
14
14
 
15
- def read(self, input, explore) -> Generator[MoleculeEntry, None, None]:
15
+ def read(self, input, explore) -> Iterator[MoleculeEntry]:
16
16
  yield MoleculeEntry(
17
17
  raw_input=input,
18
18
  input_type="unknown",
@@ -36,31 +36,31 @@ class DepthFirstExplorer(Explorer):
36
36
  super().__init__()
37
37
 
38
38
  if readers is None:
39
- self.reader_registry = ReaderRegistry()
39
+ self._reader_registry = list(ReaderRegistry().get_readers())
40
40
  else:
41
- self.reader_registry = readers
41
+ self._reader_registry = list(readers)
42
42
 
43
- self.num_test_entries = num_test_entries
44
- self.threshold = threshold
45
- self.state_stack = [self.empty_state()]
46
- self.maximum_depth = maximum_depth
43
+ self._num_test_entries = num_test_entries
44
+ self._threshold = threshold
45
+ self._state_stack = [self._empty_state()]
46
+ self._maximum_depth = maximum_depth
47
47
 
48
- def empty_state(self):
48
+ def _empty_state(self):
49
49
  return dict(first_guess=[])
50
50
 
51
- def explore(self, input) -> Generator[MoleculeEntry, None, None]:
51
+ def explore(self, input) -> Iterator[MoleculeEntry]:
52
52
  # create a new child node and set it as the current node
53
- state = self.empty_state()
54
- parent = self.state_stack[-1]
55
- self.state_stack.append(state)
53
+ state = self._empty_state()
54
+ parent = self._state_stack[-1]
55
+ self._state_stack.append(state)
56
56
 
57
- depth = len(self.state_stack)
58
- if depth > self.maximum_depth:
59
- raise ValueError(f"Maximum depth of {self.maximum_depth} reached")
57
+ depth = len(self._state_stack)
58
+ if depth > self._maximum_depth:
59
+ raise ValueError(f"Maximum depth of {self._maximum_depth} reached")
60
60
 
61
61
  readers_iter = chain(
62
62
  zip(parent["first_guess"], repeat("guess")),
63
- zip(self.reader_registry, repeat("builtin")),
63
+ zip(self._reader_registry, repeat("builtin")),
64
64
  )
65
65
 
66
66
  # try all readers and take a sample of the first num_test_entries
@@ -69,40 +69,56 @@ class DepthFirstExplorer(Explorer):
69
69
  best_mode = None
70
70
  best_score = 0
71
71
  best_ratio = 0.0
72
+ best_num_invalid_results = 0
72
73
  generator = None
73
74
  sample = []
74
75
  for reader, mode in readers_iter:
75
76
  try:
76
77
  # read at most num_test_entries entries
77
- generator = reader.read(input, self.explore)
78
- sample = list(islice(generator, self.num_test_entries))
78
+ generator = self._read(reader, input)
79
+ sample = list(islice(generator, self._num_test_entries))
79
80
  valid_entries = [entry for entry in sample if entry.mol is not None]
80
81
 
81
82
  score = len(valid_entries)
82
83
  ratio = len(valid_entries) / len(sample)
83
-
84
- if score > best_score or (score == best_score and ratio > best_ratio):
84
+ num_invalid_results = len(sample) - len(valid_entries)
85
+
86
+ if (
87
+ score > best_score
88
+ # if the score is the same, prefer the reader with higher ratio
89
+ # of valid entries
90
+ or (score == best_score and ratio > best_ratio)
91
+ # if the ratio is the same, prefer the reader with less invalid
92
+ # results
93
+ or (
94
+ score == best_score
95
+ and ratio == best_ratio
96
+ and num_invalid_results < best_num_invalid_results
97
+ )
98
+ ):
85
99
  best_reader = reader
86
100
  best_mode = mode
87
101
  best_score = score
88
102
  best_ratio = ratio
103
+ best_num_invalid_results = num_invalid_results
89
104
 
90
- if score == self.num_test_entries:
105
+ if score == self._num_test_entries:
91
106
  break
92
107
  except Exception:
93
108
  pass
94
109
 
95
110
  # clean up tree
96
- while len(self.state_stack) > depth:
97
- self.state_stack.pop()
111
+ while len(self._state_stack) > depth:
112
+ self._state_stack.pop()
98
113
  generator = None
99
114
 
100
115
  if generator is None:
101
116
  if best_reader is None:
102
- generator = InvalidInputReader().read(input, self.explore)
117
+ generator = self._read(InvalidInputReader(), input)
118
+ sample = []
103
119
  else:
104
- generator = best_reader.read(input, self.explore)
105
- sample = list(islice(generator, self.num_test_entries))
120
+ generator = self._read(best_reader, input)
121
+ sample = list(islice(generator, self._num_test_entries))
106
122
  else:
107
123
  if best_mode is not None and best_mode != "guess":
108
124
  parent["first_guess"].append(best_reader)
@@ -0,0 +1,16 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Iterator
3
+
4
+ from .reader import MoleculeEntry, Reader
5
+
6
+
7
+ class Explorer(ABC):
8
+ def __init__(self):
9
+ pass
10
+
11
+ @abstractmethod
12
+ def explore(self, input) -> Iterator[MoleculeEntry]:
13
+ pass
14
+
15
+ def _read(self, reader: Reader, input) -> Iterator[MoleculeEntry]:
16
+ return reader.read(input, self.explore)
@@ -1,5 +1,6 @@
1
+ from os import PathLike
1
2
  from pathlib import Path
2
- from typing import Generator, Tuple
3
+ from typing import Iterator, Optional, Tuple, Union
3
4
 
4
5
  from .reader import MoleculeEntry, Reader
5
6
  from .reader_registry import register_reader
@@ -7,15 +8,15 @@ from .reader_registry import register_reader
7
8
  __all__ = ["FileReader"]
8
9
 
9
10
 
10
- @register_reader("data_dir")
11
+ @register_reader
11
12
  class FileReader(Reader):
12
- def __init__(self, data_dir=None):
13
+ def __init__(self, data_dir: Union[str, PathLike, None] = None):
13
14
  super().__init__()
14
15
  self.data_dir = data_dir
15
16
  if self.data_dir is not None:
16
17
  self.data_dir = Path(self.data_dir)
17
18
 
18
- def read(self, filename, explore) -> Generator[MoleculeEntry, None, None]:
19
+ def read(self, filename, explore) -> Iterator[MoleculeEntry]:
19
20
  assert isinstance(filename, str), "input must be a string"
20
21
 
21
22
  # convert filename to path
@@ -1,5 +1,5 @@
1
1
  import gzip
2
- from typing import Generator
2
+ from typing import Iterator
3
3
 
4
4
  from .reader import MoleculeEntry, Reader
5
5
  from .reader_registry import register_reader
@@ -12,7 +12,7 @@ class GzipReader(Reader):
12
12
  def __init__(self):
13
13
  super().__init__()
14
14
 
15
- def read(self, input_stream, explore) -> Generator[MoleculeEntry, None, None]:
15
+ def read(self, input_stream, explore) -> Iterator[MoleculeEntry]:
16
16
  if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
17
17
  raise TypeError("input must be a stream-like object")
18
18