nerdd-module 0.3.49__tar.gz → 0.3.51__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/PKG-INFO +1 -1
  2. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/depth_first_explorer.py +6 -3
  3. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/gzip_reader.py +11 -1
  4. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/list_reader.py +14 -1
  5. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/sdf_reader.py +1 -1
  6. nerdd_module-0.3.51/nerdd_module/input/tar_reader.py +50 -0
  7. nerdd_module-0.3.51/nerdd_module/input/zip_reader.py +52 -0
  8. nerdd_module-0.3.51/nerdd_module/preprocessing/__init__.py +81 -0
  9. nerdd_module-0.3.51/nerdd_module/preprocessing/check_valid_smiles.py +74 -0
  10. nerdd_module-0.3.51/nerdd_module/preprocessing/chembl_structure_pipeline.py +183 -0
  11. nerdd_module-0.3.51/nerdd_module/preprocessing/filter_by_element.py +148 -0
  12. nerdd_module-0.3.51/nerdd_module/preprocessing/filter_by_weight.py +95 -0
  13. nerdd_module-0.3.51/nerdd_module/preprocessing/preprocessing_step.py +139 -0
  14. nerdd_module-0.3.51/nerdd_module/preprocessing/remove_hydrogens.py +104 -0
  15. nerdd_module-0.3.51/nerdd_module/preprocessing/remove_small_fragments.py +76 -0
  16. nerdd_module-0.3.51/nerdd_module/preprocessing/remove_stereochemistry.py +81 -0
  17. nerdd_module-0.3.51/nerdd_module/preprocessing/sanitize.py +91 -0
  18. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module.egg-info/PKG-INFO +1 -1
  19. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module.egg-info/SOURCES.txt +1 -0
  20. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/pyproject.toml +1 -1
  21. nerdd_module-0.3.49/nerdd_module/input/tar_reader.py +0 -34
  22. nerdd_module-0.3.49/nerdd_module/input/zip_reader.py +0 -37
  23. nerdd_module-0.3.49/nerdd_module/preprocessing/__init__.py +0 -8
  24. nerdd_module-0.3.49/nerdd_module/preprocessing/check_valid_smiles.py +0 -26
  25. nerdd_module-0.3.49/nerdd_module/preprocessing/chembl_structure_pipeline.py +0 -77
  26. nerdd_module-0.3.49/nerdd_module/preprocessing/filter_by_element.py +0 -57
  27. nerdd_module-0.3.49/nerdd_module/preprocessing/filter_by_weight.py +0 -34
  28. nerdd_module-0.3.49/nerdd_module/preprocessing/preprocessing_step.py +0 -61
  29. nerdd_module-0.3.49/nerdd_module/preprocessing/remove_small_fragments.py +0 -26
  30. nerdd_module-0.3.49/nerdd_module/preprocessing/remove_stereochemistry.py +0 -26
  31. nerdd_module-0.3.49/nerdd_module/preprocessing/sanitize.py +0 -31
  32. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/LICENSE +0 -0
  33. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/README.md +0 -0
  34. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/__init__.py +0 -0
  35. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/cli.py +0 -0
  36. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/__init__.py +0 -0
  37. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/configuration.py +0 -0
  38. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/default_configuration.py +0 -0
  39. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/dict_configuration.py +0 -0
  40. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/merged_configuration.py +0 -0
  41. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/models.py +0 -0
  42. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/package_configuration.py +0 -0
  43. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/search_yaml_configuration.py +0 -0
  44. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/yaml_configuration.py +0 -0
  45. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/__init__.py +0 -0
  46. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/basic_type_converter.py +0 -0
  47. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/converter.py +0 -0
  48. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/converter_config.py +0 -0
  49. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/mol_converter.py +0 -0
  50. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/problem_list_converter.py +0 -0
  51. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/representation_converter.py +0 -0
  52. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/source_list_converter.py +0 -0
  53. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/void_converter.py +0 -0
  54. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/__init__.py +0 -0
  55. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/explorer.py +0 -0
  56. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/file_reader.py +0 -0
  57. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/inchi_reader.py +0 -0
  58. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/mol_reader.py +0 -0
  59. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/reader.py +0 -0
  60. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/reader_config.py +0 -0
  61. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/smiles_reader.py +0 -0
  62. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/stream_reader.py +0 -0
  63. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/string_reader.py +0 -0
  64. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/__init__.py +0 -0
  65. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/assign_name_step.py +0 -0
  66. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/convert_representations_step.py +0 -0
  67. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/enforce_schema_step.py +0 -0
  68. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/model.py +0 -0
  69. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/prediction_step.py +0 -0
  70. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/read_input_step.py +0 -0
  71. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/write_output_step.py +0 -0
  72. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/__init__.py +0 -0
  73. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/csv_writer.py +0 -0
  74. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/file_writer.py +0 -0
  75. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/iterator_writer.py +0 -0
  76. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/pandas_writer.py +0 -0
  77. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/record_list_writer.py +0 -0
  78. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/sdf_writer.py +0 -0
  79. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/writer.py +0 -0
  80. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/writer_config.py +0 -0
  81. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/__init__.py +0 -0
  82. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/block_logs.py +0 -0
  83. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/files.py +0 -0
  84. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/get_entry_points.py +0 -0
  85. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/literal.py +0 -0
  86. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/typed_dict.py +0 -0
  87. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/types.py +0 -0
  88. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/version.py +0 -0
  89. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/problem.py +0 -0
  90. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/py.typed +0 -0
  91. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/steps/__init__.py +0 -0
  92. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/steps/map_step.py +0 -0
  93. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/steps/output_step.py +0 -0
  94. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/steps/step.py +0 -0
  95. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/__init__.py +0 -0
  96. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/checks.py +0 -0
  97. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/files.py +0 -0
  98. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/models/AtomicMassModel.py +0 -0
  99. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/models/MolWeightModel.py +0 -0
  100. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/models/__init__.py +0 -0
  101. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/predictions.py +0 -0
  102. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/preprocessing/DummyPreprocessingStep.py +0 -0
  103. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/preprocessing/__init__.py +0 -0
  104. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/representations.py +0 -0
  105. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/util/__init__.py +0 -0
  106. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/util/call_with_mappings.py +0 -0
  107. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/util/package.py +0 -0
  108. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/version.py +0 -0
  109. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module.egg-info/dependency_links.txt +0 -0
  110. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module.egg-info/requires.txt +0 -0
  111. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module.egg-info/top_level.txt +0 -0
  112. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/setup.cfg +0 -0
  113. {nerdd_module-0.3.49 → nerdd_module-0.3.51}/tests/test_features.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nerdd-module
3
- Version: 0.3.49
3
+ Version: 0.3.51
4
4
  Summary: Base package to create NERDD modules
5
5
  Author-email: Steffen Hirte <steffen.hirte@univie.ac.at>
6
6
  Maintainer-email: Steffen Hirte <steffen.hirte@univie.ac.at>
@@ -8,16 +8,17 @@ __all__ = ["DepthFirstExplorer"]
8
8
 
9
9
 
10
10
  class InvalidInputReader(Reader):
11
- def __init__(self) -> None:
11
+ def __init__(self, message: str = "Invalid input") -> None:
12
12
  super().__init__()
13
+ self.message = message
13
14
 
14
15
  def read(self, input: Any, explore: ExploreCallable) -> Iterator[MoleculeEntry]:
15
16
  yield MoleculeEntry(
16
17
  raw_input=input,
17
18
  input_type="unknown",
18
- source=("input",),
19
+ source=("raw_input",),
19
20
  mol=None,
20
- errors=[Problem("invalid_input", "Invalid input")],
21
+ errors=[Problem("invalid_input", self.message)],
21
22
  )
22
23
 
23
24
  def __repr__(self) -> str:
@@ -120,6 +121,8 @@ class DepthFirstExplorer(Explorer):
120
121
  if best_mode == "builtin":
121
122
  parent["first_guess"].append(best_reader)
122
123
 
124
+ # In order to get more fine-grained error messages, we do not handle exceptions here and
125
+ # rely on the readers to do so.
123
126
  yield from sample
124
127
  yield from generator
125
128
 
@@ -1,6 +1,7 @@
1
1
  import gzip
2
2
  from typing import Any, Iterator
3
3
 
4
+ from ..problem import Problem
4
5
  from .reader import ExploreCallable, MoleculeEntry, Reader
5
6
 
6
7
  __all__ = ["GzipReader"]
@@ -22,7 +23,16 @@ class GzipReader(Reader):
22
23
  f.read(1)
23
24
  f.seek(0)
24
25
 
25
- yield from explore(f)
26
+ try:
27
+ yield from explore(f)
28
+ except Exception as e:
29
+ yield MoleculeEntry(
30
+ raw_input="<gzip>",
31
+ input_type="gzip",
32
+ source=("raw_input",),
33
+ mol=None,
34
+ errors=[Problem("invalid_input", f"Invalid gzip file: {e}")],
35
+ )
26
36
 
27
37
  def __repr__(self) -> str:
28
38
  return "GzipReader()"
@@ -1,6 +1,7 @@
1
1
  from io import IOBase
2
2
  from typing import Any, Iterable, Iterator
3
3
 
4
+ from ..problem import Problem
4
5
  from .reader import ExploreCallable, MoleculeEntry, Reader
5
6
 
6
7
  __all__ = ["ListReader"]
@@ -16,7 +17,19 @@ class ListReader(Reader):
16
17
  ), f"input must be an iterable, but is {type(input_iterable)}"
17
18
 
18
19
  for entry in input_iterable:
19
- yield from explore(entry)
20
+ try:
21
+ yield from explore(entry)
22
+ except Exception as e:
23
+ raw_input = str(entry)
24
+ if len(raw_input) > 100:
25
+ raw_input = raw_input[:97] + "..."
26
+ yield MoleculeEntry(
27
+ raw_input=raw_input,
28
+ input_type="unknown",
29
+ source=(),
30
+ mol=None,
31
+ errors=[Problem("invalid_list_entry", f"Could not read list entry: {e}")],
32
+ )
20
33
 
21
34
  def __repr__(self) -> str:
22
35
  return "ListReader()"
@@ -11,7 +11,7 @@ __all__ = ["SdfReader"]
11
11
 
12
12
 
13
13
  class SdfReader(StreamReader):
14
- def __init__(self, max_num_lines_mol_block: int = 10_000) -> None:
14
+ def __init__(self, max_num_lines_mol_block: int = 100_000) -> None:
15
15
  super().__init__()
16
16
  self.max_num_lines_mol_block = max_num_lines_mol_block
17
17
 
@@ -0,0 +1,50 @@
1
+ import tarfile
2
+ from typing import Any, Iterator, Tuple
3
+
4
+ from ..problem import Problem
5
+ from .reader import ExploreCallable, MoleculeEntry, Reader
6
+
7
+ __all__ = ["TarReader"]
8
+
9
+
10
+ class TarReader(Reader):
11
+ def __init__(self) -> None:
12
+ super().__init__()
13
+
14
+ def read(self, input_stream: Any, explore: ExploreCallable) -> Iterator[MoleculeEntry]:
15
+ if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
16
+ raise TypeError("input must be a stream-like object")
17
+
18
+ input_stream.seek(0)
19
+
20
+ with tarfile.open(fileobj=input_stream, mode="r") as tar:
21
+ for member in tar.getmembers():
22
+ if not member.isfile():
23
+ continue
24
+
25
+ try:
26
+ for entry in explore(tar.extractfile(member)):
27
+ # the underlying reader only sees the file content as a stream
28
+ # -> it might believe that the source is "raw_input"
29
+ # -> we need to correct that here
30
+ if len(entry.source) == 1 and entry.source[0] == "raw_input":
31
+ source: Tuple[str, ...] = tuple()
32
+ else:
33
+ source = entry.source
34
+ yield entry._replace(source=(member.name, *source))
35
+ except Exception as e:
36
+ yield MoleculeEntry(
37
+ raw_input="<tar>",
38
+ input_type="unknown",
39
+ source=(member.name,),
40
+ mol=None,
41
+ errors=[
42
+ Problem(
43
+ "invalid_tar_member",
44
+ f"Could not read tar member '{member.name}': {e}",
45
+ )
46
+ ],
47
+ )
48
+
49
+ def __repr__(self) -> str:
50
+ return "TarReader()"
@@ -0,0 +1,52 @@
1
+ import zipfile
2
+ from typing import Any, Iterator, Tuple
3
+
4
+ from ..problem import Problem
5
+ from .reader import ExploreCallable, MoleculeEntry, Reader
6
+
7
+ __all__ = ["ZipReader"]
8
+
9
+
10
+ class ZipReader(Reader):
11
+ def __init__(self) -> None:
12
+ super().__init__()
13
+
14
+ def read(self, input_stream: Any, explore: ExploreCallable) -> Iterator[MoleculeEntry]:
15
+ if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
16
+ raise TypeError("input must be a stream-like object")
17
+
18
+ input_stream.seek(0)
19
+
20
+ with zipfile.ZipFile(input_stream, "r") as zipf:
21
+ for member in zipf.namelist():
22
+ # check if the member is a file
23
+ if member.endswith("/"):
24
+ continue
25
+
26
+ try:
27
+ with zipf.open(member, "r") as f:
28
+ for entry in explore(f):
29
+ # the underlying reader only sees the file content as a stream
30
+ # -> it might believe that the source is "raw_input"
31
+ # -> we need to correct that here
32
+ if len(entry.source) == 1 and entry.source[0] == "raw_input":
33
+ source: Tuple[str, ...] = tuple()
34
+ else:
35
+ source = entry.source
36
+
37
+ yield entry._replace(source=(member, *source))
38
+ except Exception as e:
39
+ yield MoleculeEntry(
40
+ raw_input="<zip>",
41
+ input_type="unknown",
42
+ source=(member,),
43
+ mol=None,
44
+ errors=[
45
+ Problem(
46
+ "invalid_zip_member", f"Could not read zip member '{member}': {e}"
47
+ )
48
+ ],
49
+ )
50
+
51
+ def __repr__(self) -> str:
52
+ return "ZipReader()"
@@ -0,0 +1,81 @@
1
+ """
2
+ Molecular preprocessing pipeline components.
3
+
4
+ This package provides a comprehensive set of preprocessing steps for molecular data processing
5
+ pipelines. These steps can be chained together to clean, standardize, and validate molecular
6
+ datasets commonly used in cheminformatics and drug discovery.
7
+
8
+ The preprocessing steps inherit from the base `PreprocessingStep` class and can be easily combined
9
+ to create custom preprocessing pipelines. Each step operates on molecular records and can transform
10
+ molecules, report problems, or filter out invalid structures.
11
+
12
+ Available Preprocessing Steps
13
+ -----------------------------
14
+
15
+ - `CheckValidSmiles` : Validates molecules through SMILES round-trip conversion
16
+ - `Sanitize` : Validates and corrects molecular structures using RDKit sanitization
17
+ - `FilterByWeight` : Filters molecules based on molecular weight thresholds
18
+ - `FilterByElement` : Filters molecules based on allowed elemental composition
19
+ - `StandardizeWithCsp` : Standardizes molecules using ChEMBL Structure Pipeline
20
+ - `GetParentMolWithCsp` : Extracts parent molecules using ChEMBL Structure Pipeline
21
+ - `RemoveHydrogens` : Removes hydrogen atoms from molecular representations
22
+ - `RemoveSmallFragments` : Removes small fragments, keeping only the largest component
23
+ - `RemoveStereochemistry` : Removes stereochemical information from molecules
24
+
25
+ Base Classes
26
+ ------------
27
+
28
+ - `PreprocessingStep` : Abstract base class for all preprocessing steps
29
+
30
+ Examples
31
+ --------
32
+
33
+ Basic usage of individual preprocessing steps:
34
+
35
+ >>> from nerdd_module.preprocessing import FilterByWeight, RemoveHydrogens, Sanitize
36
+ >>>
37
+ >>> # Create preprocessing steps
38
+ >>> weight_filter = FilterByWeight(min_weight=150, max_weight=500)
39
+ >>> hydrogen_remover = RemoveHydrogens()
40
+ >>> sanitizer = Sanitize()
41
+
42
+ Creating a complete preprocessing pipeline:
43
+
44
+ >>> from nerdd_module.preprocessing import (
45
+ ... CheckValidSmiles, FilterByElement, RemoveSmallFragments,
46
+ ... Sanitize, StandardizeWithCsp, ORGANIC_SUBSET
47
+ ... )
48
+ >>>
49
+ >>> # Define a comprehensive preprocessing pipeline
50
+ >>> pipeline_steps = [
51
+ ... Sanitize(), # Sanitize molecules
52
+ ... CheckValidSmiles(), # Validate SMILES representation
53
+ ... RemoveSmallFragments(), # Remove salts and solvents
54
+ ... FilterByElement(ORGANIC_SUBSET), # Keep only organic molecules
55
+ ... StandardizeWithCsp(), # Standardize using chembl_structure_pipeline
56
+ ... ]
57
+
58
+
59
+ Notes
60
+ -----
61
+ * All preprocessing steps follow the same interface defined by `PreprocessingStep`
62
+ * Steps can be chained together to create comprehensive preprocessing pipelines
63
+ * Problems encountered during preprocessing are accumulated in the record's "problems" list
64
+ * Some steps require optional dependencies (e.g., `chembl_structure_pipeline`)
65
+ * The order of preprocessing steps can significantly impact the final results
66
+
67
+ See Also
68
+ --------
69
+ nerdd_module.steps : Base classes for pipeline steps
70
+ nerdd_module.problem : Problem reporting classes used by preprocessing steps
71
+ """
72
+
73
+ from .check_valid_smiles import *
74
+ from .chembl_structure_pipeline import *
75
+ from .filter_by_element import *
76
+ from .filter_by_weight import *
77
+ from .preprocessing_step import *
78
+ from .remove_hydrogens import *
79
+ from .remove_small_fragments import *
80
+ from .remove_stereochemistry import *
81
+ from .sanitize import *
@@ -0,0 +1,74 @@
1
+ """
2
+ SMILES validation preprocessing step for molecular data.
3
+
4
+ This module provides functionality to validate molecular representations by converting them to
5
+ SMILES format and attempting to parse them back. This round-trip validation ensures that molecules
6
+ can be properly serialized and deserialized as SMILES strings.
7
+ """
8
+
9
+ from typing import List, Optional, Tuple
10
+
11
+ from rdkit.Chem import Mol, MolFromSmiles, MolToSmiles
12
+
13
+ from ..problem import InvalidSmiles, Problem
14
+ from .preprocessing_step import PreprocessingStep
15
+
16
+ __all__ = ["CheckValidSmiles"]
17
+
18
+
19
+ class CheckValidSmiles(PreprocessingStep):
20
+ """
21
+ Preprocessing step that validates molecules through SMILES round-trip conversion.
22
+
23
+ This class validates molecular representations by converting them to SMILES format and then
24
+ attempting to parse the SMILES back to a molecule object. This round-trip validation ensures
25
+ that molecules can be properly represented as SMILES strings, which is an indicator for a valid
26
+ molecular structure. Molecules that fail the round-trip test are considered invalid and removed.
27
+
28
+ Parameters
29
+ ----------
30
+ None
31
+
32
+ Examples
33
+ --------
34
+ >>> # Create a SMILES validation step
35
+ >>> smiles_check = CheckValidSmiles()
36
+ """
37
+
38
+ def __init__(self) -> None:
39
+ super().__init__()
40
+
41
+ def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
42
+ """
43
+ Validate a molecule through SMILES round-trip conversion.
44
+
45
+ Converts the input molecule to a canonical SMILES string and then attempts to parse it back
46
+ to a molecule object. If the round-trip conversion fails, the molecule is considered
47
+ invalid.
48
+
49
+ Parameters
50
+ ----------
51
+ mol : Mol
52
+ RDKit Mol object representing the molecule to be validated.
53
+
54
+ Returns
55
+ -------
56
+ Tuple[Optional[Mol], List[Problem]]
57
+ A tuple containing:
58
+ * The original molecule if SMILES validation succeeded, or None if validation failed
59
+ * An empty list if validation succeeded, or a list containing an InvalidSmiles problem
60
+ if validation failed
61
+
62
+ Notes
63
+ -----
64
+ The validation process converts the molecule to canonical SMILES.
65
+ """
66
+ problems = []
67
+
68
+ smi = MolToSmiles(mol, True)
69
+ check_mol = MolFromSmiles(smi)
70
+ if check_mol is None:
71
+ problems.append(InvalidSmiles())
72
+ mol = None
73
+
74
+ return mol, problems
@@ -0,0 +1,183 @@
1
+ """
2
+ ChEMBL Structure Pipeline preprocessing steps for molecular data.
3
+
4
+ This module provides preprocessing steps that utilize the ChEMBL Structure Pipeline library for
5
+ molecule standardization and parent molecule extraction.
6
+ """
7
+
8
+ import warnings
9
+ from typing import List, Optional, Tuple
10
+
11
+ from rdkit.Chem import Mol
12
+
13
+ from ..polyfills import BlockLogs
14
+ from ..problem import Problem
15
+ from .preprocessing_step import PreprocessingStep
16
+
17
+ # before importing chembl_structure_pipeline, we need to suppress RDKit warnings
18
+ warnings.filterwarnings(
19
+ "ignore",
20
+ category=DeprecationWarning,
21
+ module="rdkit.Chem.MolStandardize",
22
+ )
23
+
24
+ # We check if chembl_structure_pipeline is installed. Since importing this library already logs
25
+ # messages, we suppress them using RDKit's BlockLogs.
26
+ with BlockLogs():
27
+ try:
28
+ from chembl_structure_pipeline import get_parent_mol, standardize_mol
29
+
30
+ import_error = None
31
+ except ImportError as e:
32
+ # raise ImportError later when using this class
33
+ # --> this allows using the rest of the package without chembl_structure_pipeline
34
+ import_error = e
35
+
36
+ __all__ = ["GetParentMolWithCsp", "StandardizeWithCsp"]
37
+
38
+
39
+ class StandardizeWithCsp(PreprocessingStep):
40
+ """
41
+ Preprocessing step that standardizes molecules using ChEMBL Structure Pipeline.
42
+
43
+ This class applies the ChEMBL Structure Pipeline standardization procedures to normalize
44
+ molecular representations. The standardization includes tautomer normalization, charge
45
+ neutralization, and other structural standardizations commonly used in pharmaceutical databases.
46
+
47
+ Parameters
48
+ ----------
49
+ None
50
+
51
+ Raises
52
+ ------
53
+ ImportError
54
+ If the chembl_structure_pipeline library is not installed.
55
+
56
+ Examples
57
+ --------
58
+ >>> # Create a standardization step (requires chembl_structure_pipeline)
59
+ >>> standardize_step = StandardizeWithCsp()
60
+
61
+ Notes
62
+ -----
63
+ * Requires the chembl_structure_pipeline library to be installed
64
+ * Automatically removes 3D conformers as the pipeline cannot handle them
65
+ * Uses ChEMBL's standardize_mol function which applies comprehensive molecular standardization
66
+ procedures
67
+ * If standardization fails, the original molecule is returned with a problem
68
+ """
69
+
70
+ def __init__(self) -> None:
71
+ super().__init__()
72
+
73
+ if import_error is not None:
74
+ raise import_error
75
+
76
+ def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
77
+ """
78
+ Standardize a molecule using ChEMBL Structure Pipeline.
79
+
80
+ Applies ChEMBL's standardization procedures to normalize the molecular representation. The
81
+ process removes 3D conformers before applying the standardize_mol function.
82
+
83
+ Parameters
84
+ ----------
85
+ mol : Mol
86
+ RDKit Mol object representing the molecule to be standardized.
87
+
88
+ Returns
89
+ -------
90
+ Tuple[Optional[Mol], List[Problem]]
91
+ A tuple containing:
92
+ * The standardized molecule if successful, or the original molecule if standardization
93
+ failed
94
+ * An empty list if standardization succeeded, or a list containing a Problem instance
95
+ with code "csp_error" if standardization failed
96
+ """
97
+ problems: List[Problem] = []
98
+
99
+ # chembl structure pipeline cannot handle molecules with 3D coordinates
100
+ # --> delete conformers
101
+ mol.RemoveAllConformers()
102
+
103
+ # standardization via chembl structure pipeline
104
+ preprocessed_mol = standardize_mol(mol)
105
+
106
+ if preprocessed_mol is None:
107
+ problems.append(Problem("csp_error", "Could not standardize the molecule."))
108
+ preprocessed_mol = mol
109
+
110
+ return preprocessed_mol, problems
111
+
112
+
113
+ class GetParentMolWithCsp(PreprocessingStep):
114
+ """
115
+ Preprocessing step that extracts parent molecules using ChEMBL Structure Pipeline.
116
+
117
+ This class uses the ChEMBL Structure Pipeline to identify and extract the parent molecule from
118
+ complex molecular structures. This process removes salts, solvents, and other fragments while
119
+ applying ChEMBL's standardization rules.
120
+
121
+ Parameters
122
+ ----------
123
+ None
124
+
125
+ Raises
126
+ ------
127
+ ImportError
128
+ If the chembl_structure_pipeline library is not installed.
129
+
130
+ Examples
131
+ --------
132
+ >>> # Create a parent molecule extraction step
133
+ >>> get_parent_step = GetParentMolWithCsp()
134
+
135
+ Notes
136
+ -----
137
+ * Requires the chembl_structure_pipeline library to be installed
138
+ * Automatically removes 3D conformers as the pipeline cannot handle them
139
+ * Applies the get_parent_mol function from the chembl_structure_pipeline library
140
+ * If parent extraction fails or is flagged for exclusion, the original molecule is returned with
141
+ a Problem instance
142
+ """
143
+
144
+ def __init__(self) -> None:
145
+ super().__init__()
146
+
147
+ if import_error is not None:
148
+ raise import_error
149
+
150
+ def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
151
+ """
152
+ Extract the parent molecule using ChEMBL Structure Pipeline.
153
+
154
+ Identifies and returns the main molecular component. The process removes 3D conformers,
155
+ because chembl_structure_pipeline cannot handle them.
156
+
157
+ Parameters
158
+ ----------
159
+ mol : Mol
160
+ RDKit Mol object representing the molecule from which to extract the parent structure.
161
+
162
+ Returns
163
+ -------
164
+ Tuple[Optional[Mol], List[Problem]]
165
+ A tuple containing:
166
+ * The parent molecule if successful, or the original molecule if extraction failed
167
+ * An empty list if extraction succeeded, or a list containing a Problem instance with
168
+ code "csp_error" if extraction failed or was flagged for exclusion
169
+ """
170
+ problems = []
171
+
172
+ # chembl structure pipeline cannot handle molecules with 3D coordinates
173
+ # --> delete conformers
174
+ mol.RemoveAllConformers()
175
+
176
+ # get parent molecule via chembl structure pipeline
177
+ preprocessed_mol, exclude_flag = get_parent_mol(mol)
178
+ if exclude_flag or preprocessed_mol is None:
179
+ problems.append(Problem("csp_error", "Could not remove small fragments."))
180
+ if preprocessed_mol is None:
181
+ preprocessed_mol = mol
182
+
183
+ return preprocessed_mol, problems