nerdd-module 0.3.50__tar.gz → 0.3.51__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/PKG-INFO +1 -1
- nerdd_module-0.3.51/nerdd_module/preprocessing/__init__.py +81 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/check_valid_smiles.py +74 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/chembl_structure_pipeline.py +183 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/filter_by_element.py +148 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/filter_by_weight.py +95 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/preprocessing_step.py +139 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/remove_hydrogens.py +104 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/remove_small_fragments.py +76 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/remove_stereochemistry.py +81 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/sanitize.py +91 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module.egg-info/PKG-INFO +1 -1
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module.egg-info/SOURCES.txt +1 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/pyproject.toml +1 -1
- nerdd_module-0.3.50/nerdd_module/preprocessing/__init__.py +0 -8
- nerdd_module-0.3.50/nerdd_module/preprocessing/check_valid_smiles.py +0 -26
- nerdd_module-0.3.50/nerdd_module/preprocessing/chembl_structure_pipeline.py +0 -77
- nerdd_module-0.3.50/nerdd_module/preprocessing/filter_by_element.py +0 -57
- nerdd_module-0.3.50/nerdd_module/preprocessing/filter_by_weight.py +0 -34
- nerdd_module-0.3.50/nerdd_module/preprocessing/preprocessing_step.py +0 -61
- nerdd_module-0.3.50/nerdd_module/preprocessing/remove_small_fragments.py +0 -26
- nerdd_module-0.3.50/nerdd_module/preprocessing/remove_stereochemistry.py +0 -26
- nerdd_module-0.3.50/nerdd_module/preprocessing/sanitize.py +0 -39
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/LICENSE +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/README.md +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/__init__.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/cli.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/__init__.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/configuration.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/default_configuration.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/dict_configuration.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/merged_configuration.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/models.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/package_configuration.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/search_yaml_configuration.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/config/yaml_configuration.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/__init__.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/basic_type_converter.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/converter.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/converter_config.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/mol_converter.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/problem_list_converter.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/representation_converter.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/source_list_converter.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/converters/void_converter.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/__init__.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/depth_first_explorer.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/explorer.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/file_reader.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/gzip_reader.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/inchi_reader.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/list_reader.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/mol_reader.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/reader.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/reader_config.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/sdf_reader.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/smiles_reader.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/stream_reader.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/string_reader.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/tar_reader.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/input/zip_reader.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/__init__.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/assign_name_step.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/convert_representations_step.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/enforce_schema_step.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/model.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/prediction_step.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/read_input_step.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/model/write_output_step.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/__init__.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/csv_writer.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/file_writer.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/iterator_writer.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/pandas_writer.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/record_list_writer.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/sdf_writer.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/writer.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/output/writer_config.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/__init__.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/block_logs.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/files.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/get_entry_points.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/literal.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/typed_dict.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/types.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/polyfills/version.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/problem.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/py.typed +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/steps/__init__.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/steps/map_step.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/steps/output_step.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/steps/step.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/__init__.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/checks.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/files.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/models/AtomicMassModel.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/models/MolWeightModel.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/models/__init__.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/predictions.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/preprocessing/DummyPreprocessingStep.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/preprocessing/__init__.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/tests/representations.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/util/__init__.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/util/call_with_mappings.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/util/package.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module/version.py +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module.egg-info/dependency_links.txt +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module.egg-info/requires.txt +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/nerdd_module.egg-info/top_level.txt +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/setup.cfg +0 -0
- {nerdd_module-0.3.50 → nerdd_module-0.3.51}/tests/test_features.py +0 -0
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Molecular preprocessing pipeline components.
|
|
3
|
+
|
|
4
|
+
This package provides a comprehensive set of preprocessing steps for molecular data processing
|
|
5
|
+
pipelines. These steps can be chained together to clean, standardize, and validate molecular
|
|
6
|
+
datasets commonly used in cheminformatics and drug discovery.
|
|
7
|
+
|
|
8
|
+
The preprocessing steps inherit from the base `PreprocessingStep` class and can be easily combined
|
|
9
|
+
to create custom preprocessing pipelines. Each step operates on molecular records and can transform
|
|
10
|
+
molecules, report problems, or filter out invalid structures.
|
|
11
|
+
|
|
12
|
+
Available Preprocessing Steps
|
|
13
|
+
-----------------------------
|
|
14
|
+
|
|
15
|
+
- `CheckValidSmiles` : Validates molecules through SMILES round-trip conversion
|
|
16
|
+
- `Sanitize` : Validates and corrects molecular structures using RDKit sanitization
|
|
17
|
+
- `FilterByWeight` : Filters molecules based on molecular weight thresholds
|
|
18
|
+
- `FilterByElement` : Filters molecules based on allowed elemental composition
|
|
19
|
+
- `StandardizeWithCsp` : Standardizes molecules using ChEMBL Structure Pipeline
|
|
20
|
+
- `GetParentMolWithCsp` : Extracts parent molecules using ChEMBL Structure Pipeline
|
|
21
|
+
- `RemoveHydrogens` : Removes hydrogen atoms from molecular representations
|
|
22
|
+
- `RemoveSmallFragments` : Removes small fragments, keeping only the largest component
|
|
23
|
+
- `RemoveStereochemistry` : Removes stereochemical information from molecules
|
|
24
|
+
|
|
25
|
+
Base Classes
|
|
26
|
+
------------
|
|
27
|
+
|
|
28
|
+
- `PreprocessingStep` : Abstract base class for all preprocessing steps
|
|
29
|
+
|
|
30
|
+
Examples
|
|
31
|
+
--------
|
|
32
|
+
|
|
33
|
+
Basic usage of individual preprocessing steps:
|
|
34
|
+
|
|
35
|
+
>>> from nerdd_module.preprocessing import FilterByWeight, RemoveHydrogens, Sanitize
|
|
36
|
+
>>>
|
|
37
|
+
>>> # Create preprocessing steps
|
|
38
|
+
>>> weight_filter = FilterByWeight(min_weight=150, max_weight=500)
|
|
39
|
+
>>> hydrogen_remover = RemoveHydrogens()
|
|
40
|
+
>>> sanitizer = Sanitize()
|
|
41
|
+
|
|
42
|
+
Creating a complete preprocessing pipeline:
|
|
43
|
+
|
|
44
|
+
>>> from nerdd_module.preprocessing import (
|
|
45
|
+
... CheckValidSmiles, FilterByElement, RemoveSmallFragments,
|
|
46
|
+
... Sanitize, StandardizeWithCsp, ORGANIC_SUBSET
|
|
47
|
+
... )
|
|
48
|
+
>>>
|
|
49
|
+
>>> # Define a comprehensive preprocessing pipeline
|
|
50
|
+
>>> pipeline_steps = [
|
|
51
|
+
... Sanitize(), # Sanitize molecules
|
|
52
|
+
... CheckValidSmiles(), # Validate SMILES representation
|
|
53
|
+
... RemoveSmallFragments(), # Remove salts and solvents
|
|
54
|
+
... FilterByElement(ORGANIC_SUBSET), # Keep only organic molecules
|
|
55
|
+
... StandardizeWithCsp(), # Standardize using chembl_structure_pipeline
|
|
56
|
+
... ]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
Notes
|
|
60
|
+
-----
|
|
61
|
+
* All preprocessing steps follow the same interface defined by `PreprocessingStep`
|
|
62
|
+
* Steps can be chained together to create comprehensive preprocessing pipelines
|
|
63
|
+
* Problems encountered during preprocessing are accumulated in the record's "problems" list
|
|
64
|
+
* Some steps require optional dependencies (e.g., `chembl_structure_pipeline`)
|
|
65
|
+
* The order of preprocessing steps can significantly impact the final results
|
|
66
|
+
|
|
67
|
+
See Also
|
|
68
|
+
--------
|
|
69
|
+
nerdd_module.steps : Base classes for pipeline steps
|
|
70
|
+
nerdd_module.problem : Problem reporting classes used by preprocessing steps
|
|
71
|
+
"""
|
|
72
|
+
|
|
73
|
+
from .check_valid_smiles import *
|
|
74
|
+
from .chembl_structure_pipeline import *
|
|
75
|
+
from .filter_by_element import *
|
|
76
|
+
from .filter_by_weight import *
|
|
77
|
+
from .preprocessing_step import *
|
|
78
|
+
from .remove_hydrogens import *
|
|
79
|
+
from .remove_small_fragments import *
|
|
80
|
+
from .remove_stereochemistry import *
|
|
81
|
+
from .sanitize import *
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SMILES validation preprocessing step for molecular data.
|
|
3
|
+
|
|
4
|
+
This module provides functionality to validate molecular representations by converting them to
|
|
5
|
+
SMILES format and attempting to parse them back. This round-trip validation ensures that molecules
|
|
6
|
+
can be properly serialized and deserialized as SMILES strings.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
from rdkit.Chem import Mol, MolFromSmiles, MolToSmiles
|
|
12
|
+
|
|
13
|
+
from ..problem import InvalidSmiles, Problem
|
|
14
|
+
from .preprocessing_step import PreprocessingStep
|
|
15
|
+
|
|
16
|
+
__all__ = ["CheckValidSmiles"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CheckValidSmiles(PreprocessingStep):
    """
    Reject molecules that do not survive a SMILES round trip.

    Every molecule is written out as a SMILES string and that string is parsed again. If parsing
    does not yield a molecule, the input is considered invalid: it is replaced by None and an
    InvalidSmiles problem is reported. Molecules that pass the check are returned unchanged.

    Parameters
    ----------
    None

    Examples
    --------
    >>> # Create a SMILES validation step
    >>> smiles_check = CheckValidSmiles()
    """

    def __init__(self) -> None:
        super().__init__()

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Check a single molecule via SMILES serialization and re-parsing.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object to validate.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            * ``(mol, [])`` if the generated SMILES could be parsed again; the molecule that is
              returned is the input object itself, not the re-parsed copy
            * ``(None, [InvalidSmiles()])`` if parsing the generated SMILES failed

        Notes
        -----
        The re-parsed molecule is only used as a validity probe and is discarded afterwards.
        """
        # The second positional argument is passed as True, exactly as before (in RDKit's
        # signature this is the isomericSmiles flag).
        round_trip_smiles = MolToSmiles(mol, True)

        # MolFromSmiles signals a parse failure by returning None
        if MolFromSmiles(round_trip_smiles) is None:
            return None, [InvalidSmiles()]

        return mol, []
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ChEMBL Structure Pipeline preprocessing steps for molecular data.
|
|
3
|
+
|
|
4
|
+
This module provides preprocessing steps that utilize the ChEMBL Structure Pipeline library for
|
|
5
|
+
molecule standardization and parent molecule extraction.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import warnings
|
|
9
|
+
from typing import List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
from rdkit.Chem import Mol
|
|
12
|
+
|
|
13
|
+
from ..polyfills import BlockLogs
|
|
14
|
+
from ..problem import Problem
|
|
15
|
+
from .preprocessing_step import PreprocessingStep
|
|
16
|
+
|
|
17
|
+
# Ignore DeprecationWarnings attributed to rdkit.Chem.MolStandardize. The filter has to be
# installed before chembl_structure_pipeline is imported below.
warnings.filterwarnings(
    "ignore",
    category=DeprecationWarning,
    module="rdkit.Chem.MolStandardize",
)

# Probe whether chembl_structure_pipeline is installed. Importing the library already logs
# messages, so the import runs inside RDKit's BlockLogs to suppress them.
with BlockLogs():
    try:
        from chembl_structure_pipeline import get_parent_mol, standardize_mol

        import_error = None
    except ImportError as e:
        # Do not fail at module import time: remember the exception and re-raise it when one of
        # the classes in this module is instantiated.
        # --> this allows using the rest of the package without chembl_structure_pipeline
        import_error = e
|
|
35
|
+
|
|
36
|
+
__all__ = ["GetParentMolWithCsp", "StandardizeWithCsp"]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class StandardizeWithCsp(PreprocessingStep):
    """
    Preprocessing step that runs ``standardize_mol`` from chembl_structure_pipeline.

    The molecule is stripped of its conformers and then handed to the ChEMBL Structure Pipeline
    for standardization. If the pipeline does not return a molecule, the input molecule is kept
    and a problem with code "csp_error" is reported.

    Parameters
    ----------
    None

    Raises
    ------
    ImportError
        If the chembl_structure_pipeline library is not installed. The error is raised when the
        step is instantiated, not when this module is imported.

    Examples
    --------
    >>> # Create a standardization step (requires chembl_structure_pipeline)
    >>> standardize_step = StandardizeWithCsp()

    Notes
    -----
    * Requires the chembl_structure_pipeline library to be installed
    * Conformers are removed from the molecule that is passed in (in place), because the
      pipeline cannot handle molecules with 3D coordinates
    * If standardization fails, the original molecule is returned together with a problem
    """

    def __init__(self) -> None:
        super().__init__()

        # surface the import failure recorded at module import time
        if import_error is not None:
            raise import_error

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Standardize one molecule with the ChEMBL Structure Pipeline.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object to standardize. Its conformers are removed in place.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            * ``(standardized molecule, [])`` if ``standardize_mol`` returned a molecule
            * ``(mol, [Problem("csp_error", ...)])`` if ``standardize_mol`` returned None
        """
        # chembl structure pipeline cannot handle molecules with 3D coordinates
        # --> drop all conformers before calling it
        mol.RemoveAllConformers()

        standardized = standardize_mol(mol)
        if standardized is not None:
            return standardized, []

        # fall back to the (conformer-free) input and report the failure
        failure: List[Problem] = [Problem("csp_error", "Could not standardize the molecule.")]
        return mol, failure
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class GetParentMolWithCsp(PreprocessingStep):
    """
    Preprocessing step that runs ``get_parent_mol`` from chembl_structure_pipeline.

    The molecule is stripped of its conformers and then handed to the ChEMBL Structure Pipeline,
    which returns a parent molecule together with an exclusion flag. A problem with code
    "csp_error" is reported if the flag is set or if no molecule was returned.

    Parameters
    ----------
    None

    Raises
    ------
    ImportError
        If the chembl_structure_pipeline library is not installed. The error is raised when the
        step is instantiated, not when this module is imported.

    Examples
    --------
    >>> # Create a parent molecule extraction step
    >>> get_parent_step = GetParentMolWithCsp()

    Notes
    -----
    * Requires the chembl_structure_pipeline library to be installed
    * Conformers are removed from the molecule that is passed in (in place), because the
      pipeline cannot handle molecules with 3D coordinates
    * If the pipeline returns no molecule, the original molecule is returned with a problem
    * If the pipeline returns a molecule but sets the exclusion flag, that returned molecule is
      kept and a problem is reported alongside it
    """

    def __init__(self) -> None:
        super().__init__()

        # surface the import failure recorded at module import time
        if import_error is not None:
            raise import_error

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Extract the parent molecule of one molecule with the ChEMBL Structure Pipeline.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object to process. Its conformers are removed in place.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            * The molecule returned by ``get_parent_mol``, or the input molecule if the pipeline
              returned None
            * An empty list on success, or a list with one Problem with code "csp_error" if the
              exclusion flag was set or no molecule was returned
        """
        # chembl structure pipeline cannot handle molecules with 3D coordinates
        # --> drop all conformers before calling it
        mol.RemoveAllConformers()

        parent, excluded = get_parent_mol(mol)

        issues: List[Problem] = []
        if excluded or parent is None:
            issues.append(Problem("csp_error", "Could not remove small fragments."))

        # keep the input molecule when the pipeline produced nothing
        return (mol if parent is None else parent), issues
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Element filtering preprocessing step for molecular data.
|
|
3
|
+
|
|
4
|
+
This module provides functionality to filter molecules based on their elemental composition,
|
|
5
|
+
allowing only molecules containing specified allowed elements to pass through the processing
|
|
6
|
+
pipeline.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Iterable, List, Optional, Set, Tuple
|
|
10
|
+
|
|
11
|
+
from rdkit.Chem import Mol
|
|
12
|
+
|
|
13
|
+
from ..problem import InvalidElementsProblem, Problem
|
|
14
|
+
from .preprocessing_step import PreprocessingStep
|
|
15
|
+
|
|
16
|
+
__all__ = ["FilterByElement", "ORGANIC_SUBSET"]
|
|
17
|
+
|
|
18
|
+
ORGANIC_SUBSET = [
    "H", "B", "C", "N", "O", "F", "Si",
    "P", "S", "Cl", "Se", "Br", "I",
]  # fmt: skip
"""
List[str] : Predefined list of element symbols commonly found in organic molecules.

The list holds the atomic symbols of elements that are typically present in organic and drug-like
molecules. It is meant as a convenient preset for the FilterByElement class to restrict molecules
to organic chemistry space.

The elements included are (in this order):
* H (Hydrogen)
* B (Boron)
* C (Carbon)
* N (Nitrogen)
* O (Oxygen)
* F (Fluorine)
* Si (Silicon)
* P (Phosphorus)
* S (Sulfur)
* Cl (Chlorine)
* Se (Selenium)
* Br (Bromine)
* I (Iodine)

Examples
--------
>>> filter_step = FilterByElement(ORGANIC_SUBSET)
>>> # This will only allow molecules containing organic elements
"""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class FilterByElement(PreprocessingStep):
    """
    Preprocessing step that filters molecules based on elemental composition.

    The element symbols of a molecule are compared against a set of allowed elements. If the
    molecule contains any other element, an InvalidElementsProblem listing the offending symbols
    is reported and the molecule is optionally removed from the pipeline.

    Parameters
    ----------
    allowed_elements : Iterable[str]
        An iterable of atomic symbols that are allowed in molecules. Symbols are matched
        case-insensitively: each one is normalized to proper case (first letter uppercase, rest
        lowercase), so ``'cl'``, ``'CL'`` and ``'Cl'`` all denote chlorine.
    remove_invalid_molecules : bool, optional
        If True, molecules containing disallowed elements are set to None (removed). If False,
        invalid molecules are kept. Default is False.

    Examples
    --------
    >>> # Allow only carbon, nitrogen, oxygen, and hydrogen
    >>> filter_step = FilterByElement(['C', 'N', 'O', 'H'])

    >>> # Use predefined organic subset, removing invalid molecules
    >>> filter_step = FilterByElement(ORGANIC_SUBSET, remove_invalid_molecules=True)

    Notes
    -----
    * Element symbols are normalized to proper case (e.g., 'cl' and 'CL' both become 'Cl')
    * Even if remove_invalid_molecules is set to False, molecules with invalid elements are still
      marked with a problem instance
    * Hydrogen atoms are handled specially: implicit hydrogens are not part of the atom list, so
      they are detected via GetTotalNumHs() on the remaining atoms
    """

    def __init__(
        self, allowed_elements: Iterable[str], remove_invalid_molecules: bool = False
    ) -> None:
        super().__init__()
        # str.capitalize() uppercases the first character AND lowercases the rest, which is the
        # documented normalization. The previous expression (a[0].upper() + a[1:]) left the tail
        # untouched, so "CL" never matched the symbol "Cl" and an empty string raised IndexError.
        self._allowed_elements: Set[str] = {
            symbol.capitalize() for symbol in allowed_elements
        }
        self._hydrogen_in_allowed_elements = "H" in self._allowed_elements
        self._remove_invalid_molecules = remove_invalid_molecules

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Filter a molecule by comparing its elemental composition against allowed elements.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object representing the molecule to be validated.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            A tuple containing:
            * The original molecule if all elements are allowed, or None if disallowed elements
              are found and remove_invalid_molecules is True
            * A list containing an InvalidElementsProblem if disallowed elements are found,
              otherwise an empty list

        Notes
        -----
        Hydrogen is only checked separately when it is not among the allowed elements. In that
        case a single atom with GetTotalNumHs() > 0 is enough to flag "H" as invalid.
        """
        present_elements: Set[str] = {atom.GetSymbol() for atom in mol.GetAtoms()}
        invalid_elements = present_elements - self._allowed_elements

        # special case: implicit hydrogens do not show up in mol.GetAtoms(), so look at the
        # hydrogen count of every atom instead
        if not self._hydrogen_in_allowed_elements and any(
            atom.GetTotalNumHs() > 0 for atom in mol.GetAtoms()
        ):
            invalid_elements.add("H")

        if not invalid_elements:
            return mol, []

        # the problem is reported even when the molecule itself is kept
        result_mol = None if self._remove_invalid_molecules else mol
        return result_mol, [InvalidElementsProblem(invalid_elements)]
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Molecular weight filtering preprocessing step.
|
|
3
|
+
|
|
4
|
+
This module provides a preprocessing step that filters molecules based on their molecular weight.
|
|
5
|
+
This is desirable for models having a runtime scaling with molecule size.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List, Optional, Tuple
|
|
9
|
+
|
|
10
|
+
from rdkit.Chem import Mol
|
|
11
|
+
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
|
|
12
|
+
|
|
13
|
+
from ..problem import InvalidWeightProblem, Problem
|
|
14
|
+
from .preprocessing_step import PreprocessingStep
|
|
15
|
+
|
|
16
|
+
__all__ = ["FilterByWeight"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class FilterByWeight(PreprocessingStep):
    """
    Preprocessing step that flags molecules outside a molecular weight window.

    The weight of each molecule is computed with RDKit's CalcExactMolWt and compared against a
    lower and an upper threshold. Molecules outside the window are reported with an
    InvalidWeightProblem and optionally removed from the pipeline.

    Parameters
    ----------
    min_weight : float, optional
        Minimum allowed molecular weight in Daltons (Da). Default is 0.
    max_weight : float, optional
        Maximum allowed molecular weight in Daltons (Da). Default is infinity.
    remove_invalid_molecules : bool, optional
        If True, molecules outside the weight range are set to None (removed). If False, invalid
        molecules are kept. Default is False.

    Examples
    --------
    >>> # Filter molecules between 150 and 500 Da, keeping invalid ones
    >>> filter_step = FilterByWeight(min_weight=150, max_weight=500)

    >>> # Filter molecules below 1000 Da, removing invalid ones
    >>> filter_step = FilterByWeight(max_weight=1000, remove_invalid_molecules=True)

    >>> # Only set minimum weight threshold
    >>> filter_step = FilterByWeight(min_weight=100)

    Notes
    -----
    * Both thresholds are inclusive: a weight equal to min_weight or max_weight is accepted
    * Even if remove_invalid_molecules is set to False, molecules with invalid weight are still
      marked with a problem instance
    * The molecular weight is calculated using RDKit's CalcExactMolWt function
    """

    def __init__(
        self,
        min_weight: float = 0,
        max_weight: float = float("inf"),
        remove_invalid_molecules: bool = False,
    ) -> None:
        super().__init__()
        self._remove_invalid_molecules = remove_invalid_molecules
        self._min_weight = min_weight
        self._max_weight = max_weight

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Check the weight of one molecule against the configured thresholds.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object representing the molecule to be validated.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            * ``(mol, [])`` if the weight lies within the bounds
            * ``(mol, [InvalidWeightProblem(...)])`` if the weight is out of bounds and
              remove_invalid_molecules is False
            * ``(None, [InvalidWeightProblem(...)])`` if the weight is out of bounds and
              remove_invalid_molecules is True
        """
        weight = CalcExactMolWt(mol)

        too_light = weight < self._min_weight
        too_heavy = weight > self._max_weight
        if not (too_light or too_heavy):
            return mol, []

        # the problem is reported even when the molecule itself is kept
        kept_mol = None if self._remove_invalid_molecules else mol
        return kept_mol, [InvalidWeightProblem(weight, self._min_weight, self._max_weight)]
|