nerdd-module 0.3.50__tar.gz → 0.3.51__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/PKG-INFO +1 -1
  2. nerdd_module-0.3.51/nerdd_module/preprocessing/__init__.py +81 -0
  3. nerdd_module-0.3.51/nerdd_module/preprocessing/check_valid_smiles.py +74 -0
  4. nerdd_module-0.3.51/nerdd_module/preprocessing/chembl_structure_pipeline.py +183 -0
  5. nerdd_module-0.3.51/nerdd_module/preprocessing/filter_by_element.py +148 -0
  6. nerdd_module-0.3.51/nerdd_module/preprocessing/filter_by_weight.py +95 -0
  7. nerdd_module-0.3.51/nerdd_module/preprocessing/preprocessing_step.py +139 -0
  8. nerdd_module-0.3.51/nerdd_module/preprocessing/remove_hydrogens.py +104 -0
  9. nerdd_module-0.3.51/nerdd_module/preprocessing/remove_small_fragments.py +76 -0
  10. nerdd_module-0.3.51/nerdd_module/preprocessing/remove_stereochemistry.py +81 -0
  11. nerdd_module-0.3.51/nerdd_module/preprocessing/sanitize.py +91 -0
  12. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module.egg-info/PKG-INFO +1 -1
  13. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module.egg-info/SOURCES.txt +1 -0
  14. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/pyproject.toml +1 -1
  15. nerdd_module-0.3.50/nerdd_module/preprocessing/__init__.py +0 -8
  16. nerdd_module-0.3.50/nerdd_module/preprocessing/check_valid_smiles.py +0 -26
  17. nerdd_module-0.3.50/nerdd_module/preprocessing/chembl_structure_pipeline.py +0 -77
  18. nerdd_module-0.3.50/nerdd_module/preprocessing/filter_by_element.py +0 -57
  19. nerdd_module-0.3.50/nerdd_module/preprocessing/filter_by_weight.py +0 -34
  20. nerdd_module-0.3.50/nerdd_module/preprocessing/preprocessing_step.py +0 -61
  21. nerdd_module-0.3.50/nerdd_module/preprocessing/remove_small_fragments.py +0 -26
  22. nerdd_module-0.3.50/nerdd_module/preprocessing/remove_stereochemistry.py +0 -26
  23. nerdd_module-0.3.50/nerdd_module/preprocessing/sanitize.py +0 -39
  24. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/LICENSE +0 -0
  25. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/README.md +0 -0
  26. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/__init__.py +0 -0
  27. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/cli.py +0 -0
  28. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/__init__.py +0 -0
  29. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/configuration.py +0 -0
  30. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/default_configuration.py +0 -0
  31. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/dict_configuration.py +0 -0
  32. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/merged_configuration.py +0 -0
  33. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/models.py +0 -0
  34. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/package_configuration.py +0 -0
  35. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/search_yaml_configuration.py +0 -0
  36. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/yaml_configuration.py +0 -0
  37. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/__init__.py +0 -0
  38. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/basic_type_converter.py +0 -0
  39. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/converter.py +0 -0
  40. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/converter_config.py +0 -0
  41. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/mol_converter.py +0 -0
  42. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/problem_list_converter.py +0 -0
  43. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/representation_converter.py +0 -0
  44. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/source_list_converter.py +0 -0
  45. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/void_converter.py +0 -0
  46. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/__init__.py +0 -0
  47. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/depth_first_explorer.py +0 -0
  48. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/explorer.py +0 -0
  49. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/file_reader.py +0 -0
  50. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/gzip_reader.py +0 -0
  51. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/inchi_reader.py +0 -0
  52. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/list_reader.py +0 -0
  53. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/mol_reader.py +0 -0
  54. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/reader.py +0 -0
  55. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/reader_config.py +0 -0
  56. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/sdf_reader.py +0 -0
  57. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/smiles_reader.py +0 -0
  58. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/stream_reader.py +0 -0
  59. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/string_reader.py +0 -0
  60. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/tar_reader.py +0 -0
  61. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/zip_reader.py +0 -0
  62. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/__init__.py +0 -0
  63. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/assign_name_step.py +0 -0
  64. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/convert_representations_step.py +0 -0
  65. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/enforce_schema_step.py +0 -0
  66. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/model.py +0 -0
  67. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/prediction_step.py +0 -0
  68. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/read_input_step.py +0 -0
  69. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/write_output_step.py +0 -0
  70. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/__init__.py +0 -0
  71. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/csv_writer.py +0 -0
  72. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/file_writer.py +0 -0
  73. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/iterator_writer.py +0 -0
  74. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/pandas_writer.py +0 -0
  75. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/record_list_writer.py +0 -0
  76. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/sdf_writer.py +0 -0
  77. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/writer.py +0 -0
  78. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/writer_config.py +0 -0
  79. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/__init__.py +0 -0
  80. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/block_logs.py +0 -0
  81. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/files.py +0 -0
  82. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/get_entry_points.py +0 -0
  83. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/literal.py +0 -0
  84. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/typed_dict.py +0 -0
  85. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/types.py +0 -0
  86. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/version.py +0 -0
  87. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/problem.py +0 -0
  88. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/py.typed +0 -0
  89. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/steps/__init__.py +0 -0
  90. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/steps/map_step.py +0 -0
  91. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/steps/output_step.py +0 -0
  92. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/steps/step.py +0 -0
  93. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/__init__.py +0 -0
  94. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/checks.py +0 -0
  95. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/files.py +0 -0
  96. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/models/AtomicMassModel.py +0 -0
  97. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/models/MolWeightModel.py +0 -0
  98. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/models/__init__.py +0 -0
  99. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/predictions.py +0 -0
  100. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/preprocessing/DummyPreprocessingStep.py +0 -0
  101. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/preprocessing/__init__.py +0 -0
  102. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/representations.py +0 -0
  103. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/util/__init__.py +0 -0
  104. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/util/call_with_mappings.py +0 -0
  105. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/util/package.py +0 -0
  106. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/version.py +0 -0
  107. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module.egg-info/dependency_links.txt +0 -0
  108. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module.egg-info/requires.txt +0 -0
  109. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module.egg-info/top_level.txt +0 -0
  110. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/setup.cfg +0 -0
  111. {nerdd_module-0.3.50 → nerdd_module-0.3.51}/tests/test_features.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nerdd-module
3
- Version: 0.3.50
3
+ Version: 0.3.51
4
4
  Summary: Base package to create NERDD modules
5
5
  Author-email: Steffen Hirte <steffen.hirte@univie.ac.at>
6
6
  Maintainer-email: Steffen Hirte <steffen.hirte@univie.ac.at>
@@ -0,0 +1,81 @@
1
+ """
2
+ Molecular preprocessing pipeline components.
3
+
4
+ This package provides a comprehensive set of preprocessing steps for molecular data processing
5
+ pipelines. These steps can be chained together to clean, standardize, and validate molecular
6
+ datasets commonly used in cheminformatics and drug discovery.
7
+
8
+ The preprocessing steps inherit from the base `PreprocessingStep` class and can be easily combined
9
+ to create custom preprocessing pipelines. Each step operates on molecular records and can transform
10
+ molecules, report problems, or filter out invalid structures.
11
+
12
+ Available Preprocessing Steps
13
+ -----------------------------
14
+
15
+ - `CheckValidSmiles` : Validates molecules through SMILES round-trip conversion
16
+ - `Sanitize` : Validates and corrects molecular structures using RDKit sanitization
17
+ - `FilterByWeight` : Filters molecules based on molecular weight thresholds
18
+ - `FilterByElement` : Filters molecules based on allowed elemental composition
19
+ - `StandardizeWithCsp` : Standardizes molecules using ChEMBL Structure Pipeline
20
+ - `GetParentMolWithCsp` : Extracts parent molecules using ChEMBL Structure Pipeline
21
+ - `RemoveHydrogens` : Removes hydrogen atoms from molecular representations
22
+ - `RemoveSmallFragments` : Removes small fragments, keeping only the largest component
23
+ - `RemoveStereochemistry` : Removes stereochemical information from molecules
24
+
25
+ Base Classes
26
+ ------------
27
+
28
+ - `PreprocessingStep` : Abstract base class for all preprocessing steps
29
+
30
+ Examples
31
+ --------
32
+
33
+ Basic usage of individual preprocessing steps:
34
+
35
+ >>> from nerdd_module.preprocessing import FilterByWeight, RemoveHydrogens, Sanitize
36
+ >>>
37
+ >>> # Create preprocessing steps
38
+ >>> weight_filter = FilterByWeight(min_weight=150, max_weight=500)
39
+ >>> hydrogen_remover = RemoveHydrogens()
40
+ >>> sanitizer = Sanitize()
41
+
42
+ Creating a complete preprocessing pipeline:
43
+
44
+ >>> from nerdd_module.preprocessing import (
45
+ ... CheckValidSmiles, FilterByElement, RemoveSmallFragments,
46
+ ... Sanitize, StandardizeWithCsp, ORGANIC_SUBSET
47
+ ... )
48
+ >>>
49
+ >>> # Define a comprehensive preprocessing pipeline
50
+ >>> pipeline_steps = [
51
+ ... Sanitize(), # Sanitize molecules
52
+ ... CheckValidSmiles(), # Validate SMILES representation
53
+ ... RemoveSmallFragments(), # Remove salts and solvents
54
+ ... FilterByElement(ORGANIC_SUBSET), # Keep only organic molecules
55
+ ... StandardizeWithCsp(), # Standardize using chembl_structure_pipeline
56
+ ... ]
57
+
58
+
59
+ Notes
60
+ -----
61
+ * All preprocessing steps follow the same interface defined by `PreprocessingStep`
62
+ * Steps can be chained together to create comprehensive preprocessing pipelines
63
+ * Problems encountered during preprocessing are accumulated in the record's "problems" list
64
+ * Some steps require optional dependencies (e.g., `chembl_structure_pipeline`)
65
+ * The order of preprocessing steps can significantly impact the final results
66
+
67
+ See Also
68
+ --------
69
+ nerdd_module.steps : Base classes for pipeline steps
70
+ nerdd_module.problem : Problem reporting classes used by preprocessing steps
71
+ """
72
+
73
+ from .check_valid_smiles import *
74
+ from .chembl_structure_pipeline import *
75
+ from .filter_by_element import *
76
+ from .filter_by_weight import *
77
+ from .preprocessing_step import *
78
+ from .remove_hydrogens import *
79
+ from .remove_small_fragments import *
80
+ from .remove_stereochemistry import *
81
+ from .sanitize import *
@@ -0,0 +1,74 @@
1
+ """
2
+ SMILES validation preprocessing step for molecular data.
3
+
4
+ This module provides functionality to validate molecular representations by converting them to
5
+ SMILES format and attempting to parse them back. This round-trip validation ensures that molecules
6
+ can be properly serialized and deserialized as SMILES strings.
7
+ """
8
+
9
+ from typing import List, Optional, Tuple
10
+
11
+ from rdkit.Chem import Mol, MolFromSmiles, MolToSmiles
12
+
13
+ from ..problem import InvalidSmiles, Problem
14
+ from .preprocessing_step import PreprocessingStep
15
+
16
+ __all__ = ["CheckValidSmiles"]
17
+
18
+
19
class CheckValidSmiles(PreprocessingStep):
    """
    Preprocessing step that rejects molecules failing a SMILES round trip.

    Every molecule is written out as a SMILES string and that string is parsed again. If parsing
    does not yield a molecule, the input is treated as invalid: it is replaced by None and an
    ``InvalidSmiles`` problem is reported. Molecules that pass are handed on unchanged.

    Parameters
    ----------
    None

    Examples
    --------
    >>> # Create a SMILES validation step
    >>> smiles_check = CheckValidSmiles()
    """

    def __init__(self) -> None:
        super().__init__()

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Check that a molecule can be serialized to SMILES and parsed back.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object representing the molecule to be validated.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            ``(mol, [])`` with the untouched input molecule if the generated SMILES could be
            parsed, otherwise ``(None, [InvalidSmiles()])``.

        Notes
        -----
        Only the success of the round trip is checked; the re-parsed molecule itself is discarded
        and never replaces the input.
        """
        round_trip_smiles = MolToSmiles(mol, isomericSmiles=True)

        # MolFromSmiles signals a parse failure by returning None
        if MolFromSmiles(round_trip_smiles) is not None:
            return mol, []

        return None, [InvalidSmiles()]
@@ -0,0 +1,183 @@
1
+ """
2
+ ChEMBL Structure Pipeline preprocessing steps for molecular data.
3
+
4
+ This module provides preprocessing steps that utilize the ChEMBL Structure Pipeline library for
5
+ molecule standardization and parent molecule extraction.
6
+ """
7
+
8
+ import warnings
9
+ from typing import List, Optional, Tuple
10
+
11
+ from rdkit.Chem import Mol
12
+
13
+ from ..polyfills import BlockLogs
14
+ from ..problem import Problem
15
+ from .preprocessing_step import PreprocessingStep
16
+
17
+ # before importing chembl_structure_pipeline, we need to suppress RDKit warnings
18
+ warnings.filterwarnings(
19
+ "ignore",
20
+ category=DeprecationWarning,
21
+ module="rdkit.Chem.MolStandardize",
22
+ )
23
+
24
+ # We check if chembl_structure_pipeline is installed. Since importing this library already logs
25
+ # messages, we suppress them using RDKit's BlockLogs.
26
+ with BlockLogs():
27
+ try:
28
+ from chembl_structure_pipeline import get_parent_mol, standardize_mol
29
+
30
+ import_error = None
31
+ except ImportError as e:
32
+ # raise ImportError later when using this class
33
+ # --> this allows the rest of the package to be used without chembl_structure_pipeline
34
+ import_error = e
35
+
36
+ __all__ = ["GetParentMolWithCsp", "StandardizeWithCsp"]
37
+
38
+
39
class StandardizeWithCsp(PreprocessingStep):
    """
    Preprocessing step that standardizes molecules with the ChEMBL Structure Pipeline.

    Each molecule is passed through ``standardize_mol`` from the ``chembl_structure_pipeline``
    package. When that call yields no molecule, the input molecule is kept and a problem with code
    "csp_error" is reported instead.

    Parameters
    ----------
    None

    Raises
    ------
    ImportError
        On instantiation, if the chembl_structure_pipeline library could not be imported.

    Examples
    --------
    >>> # Create a standardization step (requires chembl_structure_pipeline)
    >>> standardize_step = StandardizeWithCsp()

    Notes
    -----
    * Requires the chembl_structure_pipeline library to be installed
    * All conformers are removed from the molecule before standardization, because the pipeline
      cannot handle 3D coordinates. The removal happens in place on the molecule that was passed
      in.
    * If standardization fails, the (conformer-free) input molecule is returned with a problem
    """

    def __init__(self) -> None:
        super().__init__()

        # the optional dependency is checked lazily: importing the module never fails, but
        # creating the step does
        if import_error is not None:
            raise import_error

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Standardize a single molecule.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object representing the molecule to be standardized. Its conformers are
            removed in place.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            ``(standardized_mol, [])`` if ``standardize_mol`` returned a molecule, otherwise the
            input molecule together with a single Problem with code "csp_error".
        """
        # chembl structure pipeline cannot handle molecules with 3D coordinates
        # --> drop all conformers first
        mol.RemoveAllConformers()

        standardized = standardize_mol(mol)
        if standardized is not None:
            return standardized, []

        # fall back to the input molecule and report the failure
        return mol, [Problem("csp_error", "Could not standardize the molecule.")]
111
+
112
+
113
class GetParentMolWithCsp(PreprocessingStep):
    """
    Preprocessing step that extracts the parent molecule with the ChEMBL Structure Pipeline.

    Each molecule is passed through ``get_parent_mol`` from the ``chembl_structure_pipeline``
    package, which returns a parent molecule together with an exclusion flag. A problem with code
    "csp_error" is reported when the flag is set or no parent molecule was returned.

    Parameters
    ----------
    None

    Raises
    ------
    ImportError
        On instantiation, if the chembl_structure_pipeline library could not be imported.

    Examples
    --------
    >>> # Create a parent molecule extraction step
    >>> get_parent_step = GetParentMolWithCsp()

    Notes
    -----
    * Requires the chembl_structure_pipeline library to be installed
    * All conformers are removed from the molecule before extraction, because the pipeline cannot
      handle 3D coordinates. The removal happens in place on the molecule that was passed in.
    * If the pipeline returns a parent molecule but flags it for exclusion, that parent molecule is
      still returned (along with the problem). Only when no parent molecule is returned at all
      does the step fall back to the input molecule.
    """

    def __init__(self) -> None:
        super().__init__()

        # the optional dependency is checked lazily: importing the module never fails, but
        # creating the step does
        if import_error is not None:
            raise import_error

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Extract the parent structure of a single molecule.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object representing the molecule from which to extract the parent structure.
            Its conformers are removed in place.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            A tuple containing:
            * The molecule returned by ``get_parent_mol``, or the input molecule if that result
              was None
            * An empty list on success, or a list with one Problem with code "csp_error" if the
              result was None or flagged for exclusion
        """
        # chembl structure pipeline cannot handle molecules with 3D coordinates
        # --> drop all conformers first
        mol.RemoveAllConformers()

        parent, excluded = get_parent_mol(mol)
        extraction_failed = parent is None

        issues: List[Problem] = []
        if excluded or extraction_failed:
            issues.append(Problem("csp_error", "Could not remove small fragments."))

        return (mol if extraction_failed else parent), issues
@@ -0,0 +1,148 @@
1
+ """
2
+ Element filtering preprocessing step for molecular data.
3
+
4
+ This module provides functionality to filter molecules based on their elemental composition,
5
+ allowing only molecules containing specified allowed elements to pass through the processing
6
+ pipeline.
7
+ """
8
+
9
+ from typing import Iterable, List, Optional, Set, Tuple
10
+
11
+ from rdkit.Chem import Mol
12
+
13
+ from ..problem import InvalidElementsProblem, Problem
14
+ from .preprocessing_step import PreprocessingStep
15
+
16
+ __all__ = ["FilterByElement", "ORGANIC_SUBSET"]
17
+
18
ORGANIC_SUBSET = ["H", "B", "C", "N", "O", "F", "Si", "P", "S", "Cl", "Se", "Br", "I"]
"""
List[str] : Atomic symbols of elements typically present in organic and drug-like molecules.

Intended as a ready-made argument for the FilterByElement class when molecules should be restricted
to organic chemistry space.

Symbols contained in the list, in order:
* H (Hydrogen)
* B (Boron)
* C (Carbon)
* N (Nitrogen)
* O (Oxygen)
* F (Fluorine)
* Si (Silicon)
* P (Phosphorus)
* S (Sulfur)
* Cl (Chlorine)
* Se (Selenium)
* Br (Bromine)
* I (Iodine)

Examples
--------
>>> # Only molecules made up of the elements above pass without a problem
>>> filter_step = FilterByElement(ORGANIC_SUBSET)
"""
60
+
61
+
62
class FilterByElement(PreprocessingStep):
    """
    Preprocessing step that filters molecules based on elemental composition.

    The atomic symbols occurring in a molecule are compared against a set of allowed elements.
    Molecules containing any other element are reported with an ``InvalidElementsProblem`` and
    optionally removed from the pipeline.

    Parameters
    ----------
    allowed_elements : Iterable[str]
        An iterable of atomic symbols that are allowed in molecules. Symbols are
        case-insensitive: each one is normalized to the capitalization of atomic symbols (first
        letter uppercase, remaining letters lowercase) before it is stored.
    remove_invalid_molecules : bool, optional
        If True, molecules containing disallowed elements are set to None (removed). If False,
        invalid molecules are kept. Default is False.

    Examples
    --------
    >>> # Allow only carbon, nitrogen, oxygen, and hydrogen
    >>> filter_step = FilterByElement(['C', 'N', 'O', 'H'])

    >>> # Use predefined organic subset, removing invalid molecules
    >>> filter_step = FilterByElement(ORGANIC_SUBSET, remove_invalid_molecules=True)

    Notes
    -----
    * Element symbols are normalized to proper case (e.g., 'cl' and 'CL' both become 'Cl')
    * Even if remove_invalid_molecules is set to False, molecules with invalid elements are still
      marked with a problem instance
    * Hydrogen atoms are handled specially since they may not be explicit atoms of the molecule;
      such hydrogens are detected via GetTotalNumHs()
    """

    def __init__(
        self, allowed_elements: Iterable[str], remove_invalid_molecules: bool = False
    ) -> None:
        super().__init__()
        # Symbols are compared against atom.GetSymbol(), so they are brought into the same
        # capitalization. Slicing (instead of indexing) keeps an empty string from raising.
        self._allowed_elements = {
            symbol[:1].upper() + symbol[1:].lower() for symbol in allowed_elements
        }
        self._hydrogen_in_allowed_elements = "H" in self._allowed_elements
        self._remove_invalid_molecules = remove_invalid_molecules

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Filter a molecule by comparing its elemental composition against allowed elements.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object representing the molecule to be validated.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            A tuple containing:
            * The original molecule if all elements are allowed, or None if disallowed elements are
              found and remove_invalid_molecules is True
            * A list containing an InvalidElementsProblem if disallowed elements are found,
              otherwise an empty list

        Notes
        -----
        Hydrogens that are not explicit atoms of the molecule do not show up when iterating over
        mol.GetAtoms(). If hydrogen is not allowed, they are therefore detected separately via
        atom.GetTotalNumHs().
        """
        elements: Set[str] = {atom.GetSymbol() for atom in mol.GetAtoms()}
        invalid_elements = elements - self._allowed_elements

        # special case: hydrogens attached to an atom without being atoms of their own are not
        # yielded by mol.GetAtoms() --> look at the hydrogen count of each atom instead
        if not self._hydrogen_in_allowed_elements and any(
            atom.GetTotalNumHs() > 0 for atom in mol.GetAtoms()
        ):
            invalid_elements.add("H")

        if not invalid_elements:
            return mol, []

        # the problem is reported in any case; the molecule is only dropped on request
        result_mol = None if self._remove_invalid_molecules else mol
        return result_mol, [InvalidElementsProblem(invalid_elements)]
@@ -0,0 +1,95 @@
1
+ """
2
+ Molecular weight filtering preprocessing step.
3
+
4
+ This module provides a preprocessing step that filters molecules based on their molecular weight.
5
+ This is desirable for models having a runtime scaling with molecule size.
6
+ """
7
+
8
+ from typing import List, Optional, Tuple
9
+
10
+ from rdkit.Chem import Mol
11
+ from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
12
+
13
+ from ..problem import InvalidWeightProblem, Problem
14
+ from .preprocessing_step import PreprocessingStep
15
+
16
+ __all__ = ["FilterByWeight"]
17
+
18
+
19
class FilterByWeight(PreprocessingStep):
    """
    Preprocessing step that flags molecules whose weight lies outside a given range.

    The weight of each molecule is compared against a lower and an upper threshold. A molecule
    lighter than the lower or heavier than the upper threshold is reported with an
    ``InvalidWeightProblem`` and optionally removed from the pipeline.

    Parameters
    ----------
    min_weight : float, optional
        Minimum allowed molecular weight in Daltons (Da). Default is 0.
    max_weight : float, optional
        Maximum allowed molecular weight in Daltons (Da). Default is infinity.
    remove_invalid_molecules : bool, optional
        If True, molecules outside the weight range are set to None (removed). If False, invalid
        molecules are kept. Default is False.

    Examples
    --------
    >>> # Flag molecules outside of 150 to 500 Da, but keep them
    >>> filter_step = FilterByWeight(min_weight=150, max_weight=500)

    >>> # Remove molecules above 1000 Da
    >>> filter_step = FilterByWeight(max_weight=1000, remove_invalid_molecules=True)

    >>> # Only set a minimum weight threshold
    >>> filter_step = FilterByWeight(min_weight=100)

    Notes
    -----
    * Both thresholds are inclusive: a weight equal to min_weight or max_weight is valid
    * Even if remove_invalid_molecules is set to False, molecules with invalid weight are still
      marked with a problem instance
    * The weight is the exact molecular weight as computed by RDKit's CalcExactMolWt function
    """

    def __init__(
        self,
        min_weight: float = 0,
        max_weight: float = float("inf"),
        remove_invalid_molecules: bool = False,
    ) -> None:
        super().__init__()
        self._min_weight = min_weight
        self._max_weight = max_weight
        self._remove_invalid_molecules = remove_invalid_molecules

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Check the weight of a single molecule against the configured thresholds.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object representing the molecule to be validated.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            A tuple containing:
            * The original molecule if within weight bounds, or None if outside bounds and
              remove_invalid_molecules is True
            * A list containing an InvalidWeightProblem if the molecule is outside the weight
              bounds, otherwise an empty list
        """
        mass = CalcExactMolWt(mol)

        # two separate comparisons on purpose (not a chained min <= mass <= max)
        too_light = mass < self._min_weight
        too_heavy = mass > self._max_weight
        if not (too_light or too_heavy):
            return mol, []

        # the problem is reported in any case; the molecule is only dropped on request
        kept_mol = None if self._remove_invalid_molecules else mol
        return kept_mol, [InvalidWeightProblem(mass, self._min_weight, self._max_weight)]