nerdd-module 0.3.50__tar.gz → 0.3.52__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/PKG-INFO +1 -1
  2. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/config/models.py +26 -8
  3. nerdd_module-0.3.52/nerdd_module/preprocessing/__init__.py +81 -0
  4. nerdd_module-0.3.52/nerdd_module/preprocessing/check_valid_smiles.py +74 -0
  5. nerdd_module-0.3.52/nerdd_module/preprocessing/chembl_structure_pipeline.py +183 -0
  6. nerdd_module-0.3.52/nerdd_module/preprocessing/filter_by_element.py +148 -0
  7. nerdd_module-0.3.52/nerdd_module/preprocessing/filter_by_weight.py +95 -0
  8. nerdd_module-0.3.52/nerdd_module/preprocessing/preprocessing_step.py +139 -0
  9. nerdd_module-0.3.52/nerdd_module/preprocessing/remove_hydrogens.py +104 -0
  10. nerdd_module-0.3.52/nerdd_module/preprocessing/remove_small_fragments.py +76 -0
  11. nerdd_module-0.3.52/nerdd_module/preprocessing/remove_stereochemistry.py +81 -0
  12. nerdd_module-0.3.52/nerdd_module/preprocessing/sanitize.py +91 -0
  13. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module.egg-info/PKG-INFO +1 -1
  14. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module.egg-info/SOURCES.txt +1 -0
  15. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/pyproject.toml +1 -1
  16. nerdd_module-0.3.50/nerdd_module/preprocessing/__init__.py +0 -8
  17. nerdd_module-0.3.50/nerdd_module/preprocessing/check_valid_smiles.py +0 -26
  18. nerdd_module-0.3.50/nerdd_module/preprocessing/chembl_structure_pipeline.py +0 -77
  19. nerdd_module-0.3.50/nerdd_module/preprocessing/filter_by_element.py +0 -57
  20. nerdd_module-0.3.50/nerdd_module/preprocessing/filter_by_weight.py +0 -34
  21. nerdd_module-0.3.50/nerdd_module/preprocessing/preprocessing_step.py +0 -61
  22. nerdd_module-0.3.50/nerdd_module/preprocessing/remove_small_fragments.py +0 -26
  23. nerdd_module-0.3.50/nerdd_module/preprocessing/remove_stereochemistry.py +0 -26
  24. nerdd_module-0.3.50/nerdd_module/preprocessing/sanitize.py +0 -39
  25. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/LICENSE +0 -0
  26. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/README.md +0 -0
  27. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/__init__.py +0 -0
  28. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/cli.py +0 -0
  29. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/config/__init__.py +0 -0
  30. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/config/configuration.py +0 -0
  31. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/config/default_configuration.py +0 -0
  32. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/config/dict_configuration.py +0 -0
  33. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/config/merged_configuration.py +0 -0
  34. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/config/package_configuration.py +0 -0
  35. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/config/search_yaml_configuration.py +0 -0
  36. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/config/yaml_configuration.py +0 -0
  37. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/converters/__init__.py +0 -0
  38. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/converters/basic_type_converter.py +0 -0
  39. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/converters/converter.py +0 -0
  40. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/converters/converter_config.py +0 -0
  41. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/converters/mol_converter.py +0 -0
  42. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/converters/problem_list_converter.py +0 -0
  43. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/converters/representation_converter.py +0 -0
  44. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/converters/source_list_converter.py +0 -0
  45. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/converters/void_converter.py +0 -0
  46. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/__init__.py +0 -0
  47. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/depth_first_explorer.py +0 -0
  48. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/explorer.py +0 -0
  49. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/file_reader.py +0 -0
  50. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/gzip_reader.py +0 -0
  51. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/inchi_reader.py +0 -0
  52. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/list_reader.py +0 -0
  53. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/mol_reader.py +0 -0
  54. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/reader.py +0 -0
  55. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/reader_config.py +0 -0
  56. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/sdf_reader.py +0 -0
  57. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/smiles_reader.py +0 -0
  58. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/stream_reader.py +0 -0
  59. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/string_reader.py +0 -0
  60. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/tar_reader.py +0 -0
  61. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/input/zip_reader.py +0 -0
  62. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/model/__init__.py +0 -0
  63. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/model/assign_name_step.py +0 -0
  64. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/model/convert_representations_step.py +0 -0
  65. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/model/enforce_schema_step.py +0 -0
  66. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/model/model.py +0 -0
  67. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/model/prediction_step.py +0 -0
  68. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/model/read_input_step.py +0 -0
  69. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/model/write_output_step.py +0 -0
  70. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/output/__init__.py +0 -0
  71. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/output/csv_writer.py +0 -0
  72. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/output/file_writer.py +0 -0
  73. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/output/iterator_writer.py +0 -0
  74. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/output/pandas_writer.py +0 -0
  75. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/output/record_list_writer.py +0 -0
  76. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/output/sdf_writer.py +0 -0
  77. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/output/writer.py +0 -0
  78. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/output/writer_config.py +0 -0
  79. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/polyfills/__init__.py +0 -0
  80. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/polyfills/block_logs.py +0 -0
  81. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/polyfills/files.py +0 -0
  82. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/polyfills/get_entry_points.py +0 -0
  83. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/polyfills/literal.py +0 -0
  84. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/polyfills/typed_dict.py +0 -0
  85. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/polyfills/types.py +0 -0
  86. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/polyfills/version.py +0 -0
  87. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/problem.py +0 -0
  88. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/py.typed +0 -0
  89. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/steps/__init__.py +0 -0
  90. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/steps/map_step.py +0 -0
  91. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/steps/output_step.py +0 -0
  92. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/steps/step.py +0 -0
  93. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/tests/__init__.py +0 -0
  94. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/tests/checks.py +0 -0
  95. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/tests/files.py +0 -0
  96. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/tests/models/AtomicMassModel.py +0 -0
  97. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/tests/models/MolWeightModel.py +0 -0
  98. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/tests/models/__init__.py +0 -0
  99. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/tests/predictions.py +0 -0
  100. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/tests/preprocessing/DummyPreprocessingStep.py +0 -0
  101. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/tests/preprocessing/__init__.py +0 -0
  102. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/tests/representations.py +0 -0
  103. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/util/__init__.py +0 -0
  104. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/util/call_with_mappings.py +0 -0
  105. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/util/package.py +0 -0
  106. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module/version.py +0 -0
  107. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module.egg-info/dependency_links.txt +0 -0
  108. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module.egg-info/requires.txt +0 -0
  109. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/nerdd_module.egg-info/top_level.txt +0 -0
  110. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/setup.cfg +0 -0
  111. {nerdd_module-0.3.50 → nerdd_module-0.3.52}/tests/test_features.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nerdd-module
3
- Version: 0.3.50
3
+ Version: 0.3.52
4
4
  Summary: Base package to create NERDD modules
5
5
  Author-email: Steffen Hirte <steffen.hirte@univie.ac.at>
6
6
  Maintainer-email: Steffen Hirte <steffen.hirte@univie.ac.at>
@@ -1,3 +1,5 @@
1
+ """Pydantic models describing module metadata, schema, and parameters."""
2
+
1
3
  from typing import Any, List, Optional, Union
2
4
 
3
5
  from pydantic import BaseModel, computed_field, model_validator
@@ -7,6 +9,8 @@ from ..polyfills import Literal
7
9
 
8
10
 
9
11
  class Partner(BaseModel):
12
+ """Partner organization metadata for module marketing/attribution."""
13
+
10
14
  name: str
11
15
  logo: str
12
16
  url: Optional[str] = None
@@ -31,14 +35,18 @@ class Author(BaseModel):
31
35
 
32
36
 
33
37
  class Publication(BaseModel):
34
- title: str
35
- authors: List[Author] = []
36
- journal: str
37
- year: int
38
+ """Reference to a publication related to the module or model."""
39
+
40
+ title: Optional[str] = None
41
+ authors: Optional[List[Author]] = None
42
+ journal: Optional[str] = None
43
+ year: Optional[int] = None
38
44
  doi: Optional[str]
39
45
 
40
46
 
41
47
  class ColorPalette(BaseModel):
48
+ """Optional color mapping for visualizing categorical or numeric outputs."""
49
+
42
50
  type: Optional[str] = None
43
51
  name: Optional[str] = None
44
52
  domain: Optional[Union[List[str], List[float], List[int], List[bool]]] = None
@@ -47,6 +55,8 @@ class ColorPalette(BaseModel):
47
55
 
48
56
 
49
57
  class Choice(BaseModel):
58
+ """Select options for job parameters or categorical result properties."""
59
+
50
60
  value: Union[str, int, float, bool]
51
61
  label: Optional[str] = None
52
62
 
@@ -55,6 +65,8 @@ JobType = Literal["int", "integer", "float", "bool", "boolean", "str", "string"]
55
65
 
56
66
 
57
67
  class JobParameter(BaseModel):
68
+ """Definition of a user-configurable parameter for a job run."""
69
+
58
70
  name: str
59
71
  type: JobType
60
72
  visible_name: Optional[str] = None
@@ -64,6 +76,7 @@ class JobParameter(BaseModel):
64
76
  choices: Optional[List[Choice]] = None
65
77
 
66
78
  def validate_value(self, value: Any) -> None:
79
+ """Validate a provided value against type and choices."""
67
80
  if self.type == ["int", "integer"]:
68
81
  if not isinstance(value, int):
69
82
  raise ValueError(
@@ -107,11 +120,15 @@ FormatSpec = Union[List[str], str]
107
120
 
108
121
 
109
122
  class IncludeExcludeFormatSpec(BaseModel):
123
+ """Visibility filter for result properties per output format."""
124
+
110
125
  include: Optional[FormatSpec]
111
126
  exclude: Optional[FormatSpec]
112
127
 
113
128
 
114
129
  class ResultProperty(BaseModel):
130
+ """Schema entry for a model output property."""
131
+
115
132
  name: str
116
133
  type: str
117
134
  visible_name: Optional[str] = None
@@ -129,6 +146,7 @@ class ResultProperty(BaseModel):
129
146
  color_palette: Optional[ColorPalette] = None
130
147
 
131
148
  def is_visible(self, output_format: str) -> bool:
149
+ """Return True if property should be shown for the given format."""
132
150
  formats = self.formats
133
151
 
134
152
  if formats is None:
@@ -144,6 +162,8 @@ class ResultProperty(BaseModel):
144
162
 
145
163
 
146
164
  class Module(BaseModel):
165
+ """Full module configuration: metadata, schema, parameters, and validation."""
166
+
147
167
  @computed_field # type: ignore[prop-decorator]
148
168
  @property
149
169
  def id(self) -> str:
@@ -194,6 +214,7 @@ class Module(BaseModel):
194
214
  @model_validator(mode="after")
195
215
  @classmethod
196
216
  def validate_model(cls, values: Any) -> Any:
217
+ """Enforce consistency between task type and declared result properties."""
197
218
  assert isinstance(values, Module)
198
219
 
199
220
  num_atom_properties = len(values.get_property_columns_of_type("atom"))
@@ -244,10 +265,7 @@ class Module(BaseModel):
244
265
  return values
245
266
 
246
267
  def validate_job_parameters(self, params: dict) -> None:
247
- """
248
- Validate the job parameters against the module's job parameters.
249
- Raises an error if a parameter is missing or has an invalid type.
250
- """
268
+ """Validate provided job parameters against the module declaration."""
251
269
  # make sure that all job parameters are present
252
270
  for param in self.job_parameters:
253
271
  if param.name not in params and param.required:
@@ -0,0 +1,81 @@
1
+ """
2
+ Molecular preprocessing pipeline components.
3
+
4
+ This package provides a comprehensive set of preprocessing steps for molecular data processing
5
+ pipelines. These steps can be chained together to clean, standardize, and validate molecular
6
+ datasets commonly used in cheminformatics and drug discovery.
7
+
8
+ The preprocessing steps inherit from the base `PreprocessingStep` class and can be easily combined
9
+ to create custom preprocessing pipelines. Each step operates on molecular records and can transform
10
+ molecules, report problems, or filter out invalid structures.
11
+
12
+ Available Preprocessing Steps
13
+ -----------------------------
14
+
15
+ - `CheckValidSmiles` : Validates molecules through SMILES round-trip conversion
16
+ - `Sanitize` : Validates and corrects molecular structures using RDKit sanitization
17
+ - `FilterByWeight` : Filters molecules based on molecular weight thresholds
18
+ - `FilterByElement` : Filters molecules based on allowed elemental composition
19
+ - `StandardizeWithCsp` : Standardizes molecules using ChEMBL Structure Pipeline
20
+ - `GetParentMolWithCsp` : Extracts parent molecules using ChEMBL Structure Pipeline
21
+ - `RemoveHydrogens` : Removes hydrogen atoms from molecular representations
22
+ - `RemoveSmallFragments` : Removes small fragments, keeping only the largest component
23
+ - `RemoveStereochemistry` : Removes stereochemical information from molecules
24
+
25
+ Base Classes
26
+ ------------
27
+
28
+ - `PreprocessingStep` : Abstract base class for all preprocessing steps
29
+
30
+ Examples
31
+ --------
32
+
33
+ Basic usage of individual preprocessing steps:
34
+
35
+ >>> from nerdd_module.preprocessing import FilterByWeight, RemoveHydrogens, Sanitize
36
+ >>>
37
+ >>> # Create preprocessing steps
38
+ >>> weight_filter = FilterByWeight(min_weight=150, max_weight=500)
39
+ >>> hydrogen_remover = RemoveHydrogens()
40
+ >>> sanitizer = Sanitize()
41
+
42
+ Creating a complete preprocessing pipeline:
43
+
44
+ >>> from nerdd_module.preprocessing import (
45
+ ... CheckValidSmiles, FilterByElement, RemoveSmallFragments,
46
+ ... Sanitize, StandardizeWithCsp, ORGANIC_SUBSET
47
+ ... )
48
+ >>>
49
+ >>> # Define a comprehensive preprocessing pipeline
50
+ >>> pipeline_steps = [
51
+ ... Sanitize(), # Sanitize molecules
52
+ ... CheckValidSmiles(), # Validate SMILES representation
53
+ ... RemoveSmallFragments(), # Remove salts and solvents
54
+ ... FilterByElement(ORGANIC_SUBSET), # Keep only organic molecules
55
+ ... StandardizeWithCsp(), # Standardize using chembl_structure_pipeline
56
+ ... ]
57
+
58
+
59
+ Notes
60
+ -----
61
+ * All preprocessing steps follow the same interface defined by `PreprocessingStep`
62
+ * Steps can be chained together to create comprehensive preprocessing pipelines
63
+ * Problems encountered during preprocessing are accumulated in the record's "problems" list
64
+ * Some steps require optional dependencies (e.g., `chembl_structure_pipeline`)
65
+ * The order of preprocessing steps can significantly impact the final results
66
+
67
+ See Also
68
+ --------
69
+ nerdd_module.steps : Base classes for pipeline steps
70
+ nerdd_module.problem : Problem reporting classes used by preprocessing steps
71
+ """
72
+
73
+ from .check_valid_smiles import *
74
+ from .chembl_structure_pipeline import *
75
+ from .filter_by_element import *
76
+ from .filter_by_weight import *
77
+ from .preprocessing_step import *
78
+ from .remove_hydrogens import *
79
+ from .remove_small_fragments import *
80
+ from .remove_stereochemistry import *
81
+ from .sanitize import *
@@ -0,0 +1,74 @@
1
+ """
2
+ SMILES validation preprocessing step for molecular data.
3
+
4
+ This module provides functionality to validate molecular representations by converting them to
5
+ SMILES format and attempting to parse them back. This round-trip validation ensures that molecules
6
+ can be properly serialized and deserialized as SMILES strings.
7
+ """
8
+
9
+ from typing import List, Optional, Tuple
10
+
11
+ from rdkit.Chem import Mol, MolFromSmiles, MolToSmiles
12
+
13
+ from ..problem import InvalidSmiles, Problem
14
+ from .preprocessing_step import PreprocessingStep
15
+
16
+ __all__ = ["CheckValidSmiles"]
17
+
18
+
19
class CheckValidSmiles(PreprocessingStep):
    """
    Reject molecules that do not survive a SMILES round trip.

    Every molecule is written out as a SMILES string and that string is parsed again. If the
    generated SMILES cannot be parsed, the molecule is dropped and an ``InvalidSmiles`` problem is
    reported. Otherwise the molecule is passed on unchanged.

    Parameters
    ----------
    None

    Examples
    --------
    >>> smiles_check = CheckValidSmiles()
    """

    def __init__(self) -> None:
        super().__init__()

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Check that a molecule can be serialized to SMILES and parsed back.

        Parameters
        ----------
        mol : Mol
            RDKit molecule to validate.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            ``(mol, [])`` if the generated SMILES could be parsed again, otherwise
            ``(None, [InvalidSmiles()])``.

        Notes
        -----
        Only the parseability of the generated SMILES is checked. The re-parsed molecule itself is
        discarded and the input molecule is what gets returned on success.
        """
        # NOTE(review): the positional True is presumably RDKit's isomericSmiles flag -- confirm
        roundtrip_mol = MolFromSmiles(MolToSmiles(mol, True))

        if roundtrip_mol is not None:
            return mol, []

        return None, [InvalidSmiles()]
@@ -0,0 +1,183 @@
1
+ """
2
+ ChEMBL Structure Pipeline preprocessing steps for molecular data.
3
+
4
+ This module provides preprocessing steps that utilize the ChEMBL Structure Pipeline library for
5
+ molecule standardization and parent molecule extraction.
6
+ """
7
+
8
+ import warnings
9
+ from typing import List, Optional, Tuple
10
+
11
+ from rdkit.Chem import Mol
12
+
13
+ from ..polyfills import BlockLogs
14
+ from ..problem import Problem
15
+ from .preprocessing_step import PreprocessingStep
16
+
17
+ # before importing chembl_structure_pipeline, we need to suppress RDKit warnings
18
+ warnings.filterwarnings(
19
+ "ignore",
20
+ category=DeprecationWarning,
21
+ module="rdkit.Chem.MolStandardize",
22
+ )
23
+
24
+ # We check if chembl_structure_pipeline is installed. Since importing this library already logs
25
+ # messages, we suppress them using RDKit's BlockLogs.
26
+ with BlockLogs():
27
+ try:
28
+ from chembl_structure_pipeline import get_parent_mol, standardize_mol
29
+
30
+ import_error = None
31
+ except ImportError as e:
32
+ # raise ImportError later when using this class
33
+ # --> this allows the rest of the package to be used without chembl_structure_pipeline
34
+ import_error = e
35
+
36
+ __all__ = ["GetParentMolWithCsp", "StandardizeWithCsp"]
37
+
38
+
39
class StandardizeWithCsp(PreprocessingStep):
    """
    Standardize molecules with the ChEMBL Structure Pipeline.

    The step delegates to ``standardize_mol`` from the ``chembl_structure_pipeline`` package. If
    that function returns ``None``, the input molecule is kept and a ``csp_error`` problem is
    reported.

    Parameters
    ----------
    None

    Raises
    ------
    ImportError
        On instantiation, if ``chembl_structure_pipeline`` could not be imported.

    Examples
    --------
    >>> # requires chembl_structure_pipeline to be installed
    >>> standardize_step = StandardizeWithCsp()

    Notes
    -----
    * All conformers are removed from the molecule before it is handed to the pipeline, because
      the pipeline cannot handle 3D coordinates
    * The conformers are removed in place, i.e. the molecule passed in is modified
    """

    def __init__(self) -> None:
        super().__init__()

        # the optional dependency failed to import at module load time --> report it now
        if import_error is not None:
            raise import_error

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Run ``standardize_mol`` on a single molecule.

        Parameters
        ----------
        mol : Mol
            RDKit molecule to standardize. Its conformers are removed in place.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            ``(standardized_mol, [])`` on success. If the pipeline returns ``None``, the input
            molecule together with a single ``Problem`` of type "csp_error".
        """
        # chembl structure pipeline cannot handle molecules with 3D coordinates
        mol.RemoveAllConformers()

        standardized = standardize_mol(mol)
        if standardized is not None:
            return standardized, []

        # fall back to the (conformer-free) input molecule and record the failure
        failure = Problem("csp_error", "Could not standardize the molecule.")
        return mol, [failure]
111
+
112
+
113
class GetParentMolWithCsp(PreprocessingStep):
    """
    Extract the parent molecule with the ChEMBL Structure Pipeline.

    The step delegates to ``get_parent_mol`` from the ``chembl_structure_pipeline`` package, which
    returns a parent molecule together with an exclusion flag.

    Parameters
    ----------
    None

    Raises
    ------
    ImportError
        On instantiation, if ``chembl_structure_pipeline`` could not be imported.

    Examples
    --------
    >>> # requires chembl_structure_pipeline to be installed
    >>> get_parent_step = GetParentMolWithCsp()

    Notes
    -----
    * All conformers are removed from the molecule (in place) before it is handed to the
      pipeline, because the pipeline cannot handle 3D coordinates
    * A ``csp_error`` problem is reported if the pipeline sets the exclusion flag or returns no
      molecule
    * The input molecule is only returned if the pipeline returns no molecule at all; a parent
      molecule that is flagged for exclusion is still returned (together with the problem)
    """

    def __init__(self) -> None:
        super().__init__()

        # the optional dependency failed to import at module load time --> report it now
        if import_error is not None:
            raise import_error

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Run ``get_parent_mol`` on a single molecule.

        Parameters
        ----------
        mol : Mol
            RDKit molecule to process. Its conformers are removed in place.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            A tuple containing:
            * The parent molecule returned by the pipeline, or the input molecule if the pipeline
              returned ``None``
            * An empty list, or a list with one ``Problem`` of type "csp_error" if the pipeline
              set the exclusion flag or returned ``None``
        """
        # chembl structure pipeline cannot handle molecules with 3D coordinates
        mol.RemoveAllConformers()

        parent, excluded = get_parent_mol(mol)

        issues: List[Problem] = []
        if excluded or parent is None:
            issues.append(Problem("csp_error", "Could not remove small fragments."))

        # only fall back to the input molecule if the pipeline produced nothing
        result = mol if parent is None else parent
        return result, issues
@@ -0,0 +1,148 @@
1
+ """
2
+ Element filtering preprocessing step for molecular data.
3
+
4
+ This module provides functionality to filter molecules based on their elemental composition,
5
+ allowing only molecules containing specified allowed elements to pass through the processing
6
+ pipeline.
7
+ """
8
+
9
+ from typing import Iterable, List, Optional, Set, Tuple
10
+
11
+ from rdkit.Chem import Mol
12
+
13
+ from ..problem import InvalidElementsProblem, Problem
14
+ from .preprocessing_step import PreprocessingStep
15
+
16
+ __all__ = ["FilterByElement", "ORGANIC_SUBSET"]
17
+
18
ORGANIC_SUBSET = ["H", "B", "C", "N", "O", "F", "Si", "P", "S", "Cl", "Se", "Br", "I"]
"""
List[str] : Atomic symbols of elements commonly found in organic molecules.

A convenient preset for ``FilterByElement`` that restricts molecules to elements typically present
in organic and drug-like compounds: hydrogen (H), boron (B), carbon (C), nitrogen (N), oxygen (O),
fluorine (F), silicon (Si), phosphorus (P), sulfur (S), chlorine (Cl), selenium (Se), bromine (Br)
and iodine (I).

Examples
--------
>>> # only molecules made up of the elements above pass without a problem
>>> filter_step = FilterByElement(ORGANIC_SUBSET)
"""
60
+
61
+
62
class FilterByElement(PreprocessingStep):
    """
    Preprocessing step that filters molecules based on elemental composition.

    Every molecule is compared against a set of allowed elements. A molecule containing any other
    element is reported with an ``InvalidElementsProblem`` and, optionally, removed.

    Parameters
    ----------
    allowed_elements : Iterable[str]
        Atomic symbols that are allowed in molecules. Symbols are case-insensitive: each one is
        normalized to proper case (first letter uppercase, rest lowercase) before comparison, so
        ``"cl"``, ``"CL"`` and ``"Cl"`` are equivalent.
    remove_invalid_molecules : bool, optional
        If True, molecules containing disallowed elements are replaced by None (removed). If False,
        they are kept. Default is False.

    Examples
    --------
    >>> # Allow only carbon, nitrogen, oxygen, and hydrogen
    >>> filter_step = FilterByElement(['C', 'N', 'O', 'H'])

    >>> # Use predefined organic subset, removing invalid molecules
    >>> filter_step = FilterByElement(ORGANIC_SUBSET, remove_invalid_molecules=True)

    Notes
    -----
    * Even if remove_invalid_molecules is False, molecules with disallowed elements are still
      marked with a problem
    * Hydrogen is handled separately: if "H" is not allowed, any atom with a hydrogen count above
      zero (``GetTotalNumHs()``) makes the molecule invalid, even without explicit hydrogen atoms
    * An empty string in allowed_elements is accepted and has no effect
    """

    def __init__(
        self, allowed_elements: Iterable[str], remove_invalid_molecules: bool = False
    ) -> None:
        super().__init__()
        # Normalize to the proper-case form ("Cl", "Br", ...) that the symbols returned by
        # atom.GetSymbol() are compared against. Slicing (instead of indexing) keeps an empty
        # string from raising an IndexError.
        self._allowed_elements = {
            symbol[:1].upper() + symbol[1:].lower() for symbol in allowed_elements
        }
        self._hydrogen_in_allowed_elements = "H" in self._allowed_elements
        self._remove_invalid_molecules = remove_invalid_molecules

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Filter a molecule by comparing its elemental composition against allowed elements.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object representing the molecule to be validated.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            A tuple containing:
            * The original molecule, or None if disallowed elements are found and
              remove_invalid_molecules is True
            * A list containing one InvalidElementsProblem (carrying the set of disallowed
              symbols) if disallowed elements are found, otherwise an empty list
        """
        elements: Set[str] = {atom.GetSymbol() for atom in mol.GetAtoms()}
        invalid_elements = elements - self._allowed_elements

        # special case: hydrogens may be implicit and then do not show up in mol.GetAtoms()
        # --> look at the hydrogen count of each atom instead
        if not self._hydrogen_in_allowed_elements and any(
            atom.GetTotalNumHs() > 0 for atom in mol.GetAtoms()
        ):
            invalid_elements.add("H")

        if not invalid_elements:
            return mol, []

        problems: List[Problem] = [InvalidElementsProblem(invalid_elements)]
        result_mol = None if self._remove_invalid_molecules else mol
        return result_mol, problems