nerdd-module 0.3.49__tar.gz → 0.3.51__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/PKG-INFO +1 -1
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/depth_first_explorer.py +6 -3
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/gzip_reader.py +11 -1
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/list_reader.py +14 -1
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/sdf_reader.py +1 -1
- nerdd_module-0.3.51/nerdd_module/input/tar_reader.py +50 -0
- nerdd_module-0.3.51/nerdd_module/input/zip_reader.py +52 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/__init__.py +81 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/check_valid_smiles.py +74 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/chembl_structure_pipeline.py +183 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/filter_by_element.py +148 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/filter_by_weight.py +95 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/preprocessing_step.py +139 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/remove_hydrogens.py +104 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/remove_small_fragments.py +76 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/remove_stereochemistry.py +81 -0
- nerdd_module-0.3.51/nerdd_module/preprocessing/sanitize.py +91 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module.egg-info/PKG-INFO +1 -1
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module.egg-info/SOURCES.txt +1 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/pyproject.toml +1 -1
- nerdd_module-0.3.49/nerdd_module/input/tar_reader.py +0 -34
- nerdd_module-0.3.49/nerdd_module/input/zip_reader.py +0 -37
- nerdd_module-0.3.49/nerdd_module/preprocessing/__init__.py +0 -8
- nerdd_module-0.3.49/nerdd_module/preprocessing/check_valid_smiles.py +0 -26
- nerdd_module-0.3.49/nerdd_module/preprocessing/chembl_structure_pipeline.py +0 -77
- nerdd_module-0.3.49/nerdd_module/preprocessing/filter_by_element.py +0 -57
- nerdd_module-0.3.49/nerdd_module/preprocessing/filter_by_weight.py +0 -34
- nerdd_module-0.3.49/nerdd_module/preprocessing/preprocessing_step.py +0 -61
- nerdd_module-0.3.49/nerdd_module/preprocessing/remove_small_fragments.py +0 -26
- nerdd_module-0.3.49/nerdd_module/preprocessing/remove_stereochemistry.py +0 -26
- nerdd_module-0.3.49/nerdd_module/preprocessing/sanitize.py +0 -31
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/LICENSE +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/README.md +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/__init__.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/cli.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/__init__.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/configuration.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/default_configuration.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/dict_configuration.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/merged_configuration.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/models.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/package_configuration.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/search_yaml_configuration.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/config/yaml_configuration.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/__init__.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/basic_type_converter.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/converter.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/converter_config.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/mol_converter.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/problem_list_converter.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/representation_converter.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/source_list_converter.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/converters/void_converter.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/__init__.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/explorer.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/file_reader.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/inchi_reader.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/mol_reader.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/reader.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/reader_config.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/smiles_reader.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/stream_reader.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/input/string_reader.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/__init__.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/assign_name_step.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/convert_representations_step.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/enforce_schema_step.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/model.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/prediction_step.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/read_input_step.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/model/write_output_step.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/__init__.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/csv_writer.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/file_writer.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/iterator_writer.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/pandas_writer.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/record_list_writer.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/sdf_writer.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/writer.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/output/writer_config.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/__init__.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/block_logs.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/files.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/get_entry_points.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/literal.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/typed_dict.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/types.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/polyfills/version.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/problem.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/py.typed +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/steps/__init__.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/steps/map_step.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/steps/output_step.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/steps/step.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/__init__.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/checks.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/files.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/models/AtomicMassModel.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/models/MolWeightModel.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/models/__init__.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/predictions.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/preprocessing/DummyPreprocessingStep.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/preprocessing/__init__.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/tests/representations.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/util/__init__.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/util/call_with_mappings.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/util/package.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module/version.py +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module.egg-info/dependency_links.txt +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module.egg-info/requires.txt +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/nerdd_module.egg-info/top_level.txt +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/setup.cfg +0 -0
- {nerdd_module-0.3.49 → nerdd_module-0.3.51}/tests/test_features.py +0 -0
|
@@ -8,16 +8,17 @@ __all__ = ["DepthFirstExplorer"]
|
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class InvalidInputReader(Reader):
|
|
11
|
-
def __init__(self) -> None:
|
|
11
|
+
def __init__(self, message: str = "Invalid input") -> None:
|
|
12
12
|
super().__init__()
|
|
13
|
+
self.message = message
|
|
13
14
|
|
|
14
15
|
def read(self, input: Any, explore: ExploreCallable) -> Iterator[MoleculeEntry]:
|
|
15
16
|
yield MoleculeEntry(
|
|
16
17
|
raw_input=input,
|
|
17
18
|
input_type="unknown",
|
|
18
|
-
source=("
|
|
19
|
+
source=("raw_input",),
|
|
19
20
|
mol=None,
|
|
20
|
-
errors=[Problem("invalid_input",
|
|
21
|
+
errors=[Problem("invalid_input", self.message)],
|
|
21
22
|
)
|
|
22
23
|
|
|
23
24
|
def __repr__(self) -> str:
|
|
@@ -120,6 +121,8 @@ class DepthFirstExplorer(Explorer):
|
|
|
120
121
|
if best_mode == "builtin":
|
|
121
122
|
parent["first_guess"].append(best_reader)
|
|
122
123
|
|
|
124
|
+
# In order to get more fine-grained error messages, we do not handle exceptions here and
|
|
125
|
+
# rely on the readers to do so.
|
|
123
126
|
yield from sample
|
|
124
127
|
yield from generator
|
|
125
128
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import gzip
|
|
2
2
|
from typing import Any, Iterator
|
|
3
3
|
|
|
4
|
+
from ..problem import Problem
|
|
4
5
|
from .reader import ExploreCallable, MoleculeEntry, Reader
|
|
5
6
|
|
|
6
7
|
__all__ = ["GzipReader"]
|
|
@@ -22,7 +23,16 @@ class GzipReader(Reader):
|
|
|
22
23
|
f.read(1)
|
|
23
24
|
f.seek(0)
|
|
24
25
|
|
|
25
|
-
|
|
26
|
+
try:
|
|
27
|
+
yield from explore(f)
|
|
28
|
+
except Exception as e:
|
|
29
|
+
yield MoleculeEntry(
|
|
30
|
+
raw_input="<gzip>",
|
|
31
|
+
input_type="gzip",
|
|
32
|
+
source=("raw_input",),
|
|
33
|
+
mol=None,
|
|
34
|
+
errors=[Problem("invalid_input", f"Invalid gzip file: {e}")],
|
|
35
|
+
)
|
|
26
36
|
|
|
27
37
|
def __repr__(self) -> str:
|
|
28
38
|
return "GzipReader()"
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from io import IOBase
|
|
2
2
|
from typing import Any, Iterable, Iterator
|
|
3
3
|
|
|
4
|
+
from ..problem import Problem
|
|
4
5
|
from .reader import ExploreCallable, MoleculeEntry, Reader
|
|
5
6
|
|
|
6
7
|
__all__ = ["ListReader"]
|
|
@@ -16,7 +17,19 @@ class ListReader(Reader):
|
|
|
16
17
|
), f"input must be an iterable, but is {type(input_iterable)}"
|
|
17
18
|
|
|
18
19
|
for entry in input_iterable:
|
|
19
|
-
|
|
20
|
+
try:
|
|
21
|
+
yield from explore(entry)
|
|
22
|
+
except Exception as e:
|
|
23
|
+
raw_input = str(entry)
|
|
24
|
+
if len(raw_input) > 100:
|
|
25
|
+
raw_input = raw_input[:97] + "..."
|
|
26
|
+
yield MoleculeEntry(
|
|
27
|
+
raw_input=raw_input,
|
|
28
|
+
input_type="unknown",
|
|
29
|
+
source=(),
|
|
30
|
+
mol=None,
|
|
31
|
+
errors=[Problem("invalid_list_entry", f"Could not read list entry: {e}")],
|
|
32
|
+
)
|
|
20
33
|
|
|
21
34
|
def __repr__(self) -> str:
|
|
22
35
|
return "ListReader()"
|
|
@@ -11,7 +11,7 @@ __all__ = ["SdfReader"]
|
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class SdfReader(StreamReader):
|
|
14
|
-
def __init__(self, max_num_lines_mol_block: int =
|
|
14
|
+
def __init__(self, max_num_lines_mol_block: int = 100_000) -> None:
|
|
15
15
|
super().__init__()
|
|
16
16
|
self.max_num_lines_mol_block = max_num_lines_mol_block
|
|
17
17
|
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import tarfile
|
|
2
|
+
from typing import Any, Iterator, Tuple
|
|
3
|
+
|
|
4
|
+
from ..problem import Problem
|
|
5
|
+
from .reader import ExploreCallable, MoleculeEntry, Reader
|
|
6
|
+
|
|
7
|
+
__all__ = ["TarReader"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TarReader(Reader):
    """
    Reader that opens a tar archive and explores every regular file stored in it.

    Each member's content is handed to the ``explore`` callable as a stream. The entries that
    come back get the member name prepended to their ``source`` so that a result can be traced
    back to the file inside the archive.
    """

    def __init__(self) -> None:
        super().__init__()

    def read(self, input_stream: Any, explore: ExploreCallable) -> Iterator[MoleculeEntry]:
        """
        Yield the entries found in all regular files of a tar archive.

        Parameters
        ----------
        input_stream : Any
            Object providing ``read`` and ``seek``; it is rewound to position 0 before the
            archive is opened.
        explore : ExploreCallable
            Callable invoked with the stream of each member; it must return an iterable of
            entries.

        Yields
        ------
        MoleculeEntry
            The entries produced by ``explore`` with the member name prepended to ``source``.
            If reading a member raises an exception, a single entry with ``mol=None`` and an
            ``invalid_tar_member`` problem is yielded for that member instead.

        Raises
        ------
        TypeError
            If ``input_stream`` lacks a ``read`` or ``seek`` attribute.
        """
        if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
            raise TypeError("input must be a stream-like object")

        input_stream.seek(0)

        with tarfile.open(fileobj=input_stream, mode="r") as tar:
            for member in tar.getmembers():
                # directories, links, devices etc. carry no molecule data
                if not member.isfile():
                    continue

                try:
                    member_stream = tar.extractfile(member)
                    if member_stream is None:
                        # not expected for regular files; report it like any other read failure
                        raise ValueError("member has no readable content")

                    # close the extracted stream deterministically (mirrors ZipReader)
                    with member_stream:
                        for entry in explore(member_stream):
                            # the underlying reader only sees the file content as a stream
                            # -> it might believe that the source is "raw_input"
                            # -> we need to correct that here
                            if len(entry.source) == 1 and entry.source[0] == "raw_input":
                                source: Tuple[str, ...] = tuple()
                            else:
                                source = entry.source
                            yield entry._replace(source=(member.name, *source))
                except Exception as e:
                    # a broken member must not abort the whole archive: report and move on
                    yield MoleculeEntry(
                        raw_input="<tar>",
                        input_type="unknown",
                        source=(member.name,),
                        mol=None,
                        errors=[
                            Problem(
                                "invalid_tar_member",
                                f"Could not read tar member '{member.name}': {e}",
                            )
                        ],
                    )

    def __repr__(self) -> str:
        return "TarReader()"
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import zipfile
|
|
2
|
+
from typing import Any, Iterator, Tuple
|
|
3
|
+
|
|
4
|
+
from ..problem import Problem
|
|
5
|
+
from .reader import ExploreCallable, MoleculeEntry, Reader
|
|
6
|
+
|
|
7
|
+
__all__ = ["ZipReader"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ZipReader(Reader):
    """
    Reader that opens a zip archive and explores every file stored in it.

    The content of each member is passed to the ``explore`` callable as a stream and the
    resulting entries are re-labelled so that their ``source`` starts with the member name.
    """

    def __init__(self) -> None:
        super().__init__()

    def read(self, input_stream: Any, explore: ExploreCallable) -> Iterator[MoleculeEntry]:
        """
        Yield the entries found in all files of a zip archive.

        Parameters
        ----------
        input_stream : Any
            Object providing ``read`` and ``seek``; it is rewound to position 0 before the
            archive is opened.
        explore : ExploreCallable
            Callable invoked with the opened stream of each member.

        Yields
        ------
        MoleculeEntry
            The entries produced by ``explore`` with the member name prepended to ``source``.
            If reading a member raises an exception, one entry with ``mol=None`` and an
            ``invalid_zip_member`` problem is yielded for that member.

        Raises
        ------
        TypeError
            If ``input_stream`` lacks a ``read`` or ``seek`` attribute.
        """
        looks_like_stream = hasattr(input_stream, "read") and hasattr(input_stream, "seek")
        if not looks_like_stream:
            raise TypeError("input must be a stream-like object")

        input_stream.seek(0)

        with zipfile.ZipFile(input_stream, "r") as archive:
            for name in archive.namelist():
                # names ending with a slash denote directories, which hold no data themselves
                if name.endswith("/"):
                    continue

                try:
                    with archive.open(name, "r") as member_stream:
                        for entry in explore(member_stream):
                            # Nested readers only see an anonymous stream and may label it
                            # "raw_input"; that placeholder is dropped in favour of the
                            # member name.
                            is_placeholder = (
                                len(entry.source) == 1 and entry.source[0] == "raw_input"
                            )
                            inner_source: Tuple[str, ...] = (
                                tuple() if is_placeholder else entry.source
                            )
                            yield entry._replace(source=(name, *inner_source))
                except Exception as error:
                    # a broken member is reported and the remaining members are still processed
                    failure = Problem(
                        "invalid_zip_member", f"Could not read zip member '{name}': {error}"
                    )
                    yield MoleculeEntry(
                        raw_input="<zip>",
                        input_type="unknown",
                        source=(name,),
                        mol=None,
                        errors=[failure],
                    )

    def __repr__(self) -> str:
        return "ZipReader()"
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Molecular preprocessing pipeline components.
|
|
3
|
+
|
|
4
|
+
This package provides a comprehensive set of preprocessing steps for molecular data processing
|
|
5
|
+
pipelines. These steps can be chained together to clean, standardize, and validate molecular
|
|
6
|
+
datasets commonly used in cheminformatics and drug discovery.
|
|
7
|
+
|
|
8
|
+
The preprocessing steps inherit from the base `PreprocessingStep` class and can be easily combined
|
|
9
|
+
to create custom preprocessing pipelines. Each step operates on molecular records and can transform
|
|
10
|
+
molecules, report problems, or filter out invalid structures.
|
|
11
|
+
|
|
12
|
+
Available Preprocessing Steps
|
|
13
|
+
-----------------------------
|
|
14
|
+
|
|
15
|
+
- `CheckValidSmiles` : Validates molecules through SMILES round-trip conversion
|
|
16
|
+
- `Sanitize` : Validates and corrects molecular structures using RDKit sanitization
|
|
17
|
+
- `FilterByWeight` : Filters molecules based on molecular weight thresholds
|
|
18
|
+
- `FilterByElement` : Filters molecules based on allowed elemental composition
|
|
19
|
+
- `StandardizeWithCsp` : Standardizes molecules using ChEMBL Structure Pipeline
|
|
20
|
+
- `GetParentMolWithCsp` : Extracts parent molecules using ChEMBL Structure Pipeline
|
|
21
|
+
- `RemoveHydrogens` : Removes hydrogen atoms from molecular representations
|
|
22
|
+
- `RemoveSmallFragments` : Removes small fragments, keeping only the largest component
|
|
23
|
+
- `RemoveStereochemistry` : Removes stereochemical information from molecules
|
|
24
|
+
|
|
25
|
+
Base Classes
|
|
26
|
+
------------
|
|
27
|
+
|
|
28
|
+
- `PreprocessingStep` : Abstract base class for all preprocessing steps
|
|
29
|
+
|
|
30
|
+
Examples
|
|
31
|
+
--------
|
|
32
|
+
|
|
33
|
+
Basic usage of individual preprocessing steps:
|
|
34
|
+
|
|
35
|
+
>>> from nerdd_module.preprocessing import FilterByWeight, RemoveHydrogens, Sanitize
|
|
36
|
+
>>>
|
|
37
|
+
>>> # Create preprocessing steps
|
|
38
|
+
>>> weight_filter = FilterByWeight(min_weight=150, max_weight=500)
|
|
39
|
+
>>> hydrogen_remover = RemoveHydrogens()
|
|
40
|
+
>>> sanitizer = Sanitize()
|
|
41
|
+
|
|
42
|
+
Creating a complete preprocessing pipeline:
|
|
43
|
+
|
|
44
|
+
>>> from nerdd_module.preprocessing import (
|
|
45
|
+
... CheckValidSmiles, FilterByElement, RemoveSmallFragments,
|
|
46
|
+
... Sanitize, StandardizeWithCsp, ORGANIC_SUBSET
|
|
47
|
+
... )
|
|
48
|
+
>>>
|
|
49
|
+
>>> # Define a comprehensive preprocessing pipeline
|
|
50
|
+
>>> pipeline_steps = [
|
|
51
|
+
... Sanitize(), # Sanitize molecules
|
|
52
|
+
... CheckValidSmiles(), # Validate SMILES representation
|
|
53
|
+
... RemoveSmallFragments(), # Remove salts and solvents
|
|
54
|
+
... FilterByElement(ORGANIC_SUBSET), # Keep only organic molecules
|
|
55
|
+
... StandardizeWithCsp(), # Standardize using chembl_structure_pipeline
|
|
56
|
+
... ]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
Notes
|
|
60
|
+
-----
|
|
61
|
+
* All preprocessing steps follow the same interface defined by `PreprocessingStep`
|
|
62
|
+
* Steps can be chained together to create comprehensive preprocessing pipelines
|
|
63
|
+
* Problems encountered during preprocessing are accumulated in the record's "problems" list
|
|
64
|
+
* Some steps require optional dependencies (e.g., `chembl_structure_pipeline`)
|
|
65
|
+
* The order of preprocessing steps can significantly impact the final results
|
|
66
|
+
|
|
67
|
+
See Also
|
|
68
|
+
--------
|
|
69
|
+
nerdd_module.steps : Base classes for pipeline steps
|
|
70
|
+
nerdd_module.problem : Problem reporting classes used by preprocessing steps
|
|
71
|
+
"""
|
|
72
|
+
|
|
73
|
+
from .check_valid_smiles import *
|
|
74
|
+
from .chembl_structure_pipeline import *
|
|
75
|
+
from .filter_by_element import *
|
|
76
|
+
from .filter_by_weight import *
|
|
77
|
+
from .preprocessing_step import *
|
|
78
|
+
from .remove_hydrogens import *
|
|
79
|
+
from .remove_small_fragments import *
|
|
80
|
+
from .remove_stereochemistry import *
|
|
81
|
+
from .sanitize import *
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SMILES validation preprocessing step for molecular data.
|
|
3
|
+
|
|
4
|
+
This module provides functionality to validate molecular representations by converting them to
|
|
5
|
+
SMILES format and attempting to parse them back. This round-trip validation ensures that molecules
|
|
6
|
+
can be properly serialized and deserialized as SMILES strings.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
from rdkit.Chem import Mol, MolFromSmiles, MolToSmiles
|
|
12
|
+
|
|
13
|
+
from ..problem import InvalidSmiles, Problem
|
|
14
|
+
from .preprocessing_step import PreprocessingStep
|
|
15
|
+
|
|
16
|
+
__all__ = ["CheckValidSmiles"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CheckValidSmiles(PreprocessingStep):
    """
    Preprocessing step that rejects molecules failing a SMILES round trip.

    The molecule is written out as a SMILES string and that string is parsed again. When the
    parser cannot build a molecule from it, the input is treated as invalid and dropped.

    Parameters
    ----------
    None

    Examples
    --------
    >>> # Create a SMILES validation step
    >>> smiles_check = CheckValidSmiles()
    """

    def __init__(self) -> None:
        super().__init__()

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Check that a molecule survives conversion to SMILES and back.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object to validate.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            ``(mol, [])`` with the unchanged input molecule when the generated SMILES can be
            parsed, otherwise ``(None, [InvalidSmiles()])``.
        """
        # second positional argument is presumably RDKit's isomericSmiles flag -- confirm
        smiles = MolToSmiles(mol, True)

        if MolFromSmiles(smiles) is None:
            return None, [InvalidSmiles()]

        return mol, []
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ChEMBL Structure Pipeline preprocessing steps for molecular data.
|
|
3
|
+
|
|
4
|
+
This module provides preprocessing steps that utilize the ChEMBL Structure Pipeline library for
|
|
5
|
+
molecule standardization and parent molecule extraction.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import warnings
|
|
9
|
+
from typing import List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
from rdkit.Chem import Mol
|
|
12
|
+
|
|
13
|
+
from ..polyfills import BlockLogs
|
|
14
|
+
from ..problem import Problem
|
|
15
|
+
from .preprocessing_step import PreprocessingStep
|
|
16
|
+
|
|
17
|
+
# before importing chembl_structure_pipeline, we need to suppress RDKit warnings
|
|
18
|
+
warnings.filterwarnings(
|
|
19
|
+
"ignore",
|
|
20
|
+
category=DeprecationWarning,
|
|
21
|
+
module="rdkit.Chem.MolStandardize",
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
# We check if chembl_structure_pipeline is installed. Since importing this library already logs
|
|
25
|
+
# messages, we suppress them using RDKit's BlockLogs.
|
|
26
|
+
with BlockLogs():
|
|
27
|
+
try:
|
|
28
|
+
from chembl_structure_pipeline import get_parent_mol, standardize_mol
|
|
29
|
+
|
|
30
|
+
import_error = None
|
|
31
|
+
except ImportError as e:
|
|
32
|
+
# raise ImportError later when using this class
|
|
33
|
+
# --> this allows using the rest of the package without chembl_structure_pipeline
|
|
34
|
+
import_error = e
|
|
35
|
+
|
|
36
|
+
__all__ = ["GetParentMolWithCsp", "StandardizeWithCsp"]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class StandardizeWithCsp(PreprocessingStep):
    """
    Preprocessing step that standardizes molecules with the ChEMBL Structure Pipeline.

    The step delegates to ``standardize_mol`` from the ``chembl_structure_pipeline`` package.

    Parameters
    ----------
    None

    Raises
    ------
    ImportError
        On construction, if importing ``chembl_structure_pipeline`` failed when this module was
        loaded.

    Examples
    --------
    >>> # Create a standardization step (requires chembl_structure_pipeline)
    >>> standardize_step = StandardizeWithCsp()

    Notes
    -----
    * All conformers are removed from the input molecule (in place) before standardization
    * If ``standardize_mol`` returns None, the input molecule is returned together with a
      ``csp_error`` problem
    """

    def __init__(self) -> None:
        super().__init__()

        # the import failure recorded at module load time is surfaced only when the step is used
        if import_error is not None:
            raise import_error

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Standardize a single molecule.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object to standardize. Its conformers are removed in place.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            ``(standardized_mol, [])`` on success. If ``standardize_mol`` returns None, the
            input molecule and a list with one ``csp_error`` problem are returned instead.
        """
        # chembl structure pipeline cannot handle molecules with 3D coordinates
        # --> drop all conformers first
        mol.RemoveAllConformers()

        standardized = standardize_mol(mol)
        if standardized is not None:
            return standardized, []

        # fall back to the (conformer-free) input molecule and report the failure
        return mol, [Problem("csp_error", "Could not standardize the molecule.")]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class GetParentMolWithCsp(PreprocessingStep):
    """
    Preprocessing step that extracts the parent molecule with the ChEMBL Structure Pipeline.

    The step delegates to ``get_parent_mol`` from the ``chembl_structure_pipeline`` package.

    Parameters
    ----------
    None

    Raises
    ------
    ImportError
        On construction, if importing ``chembl_structure_pipeline`` failed when this module was
        loaded.

    Examples
    --------
    >>> # Create a parent molecule extraction step
    >>> get_parent_step = GetParentMolWithCsp()

    Notes
    -----
    * All conformers are removed from the input molecule (in place) before extraction
    * A ``csp_error`` problem is reported if ``get_parent_mol`` sets its exclusion flag or
      returns no molecule
    * The input molecule is returned only when ``get_parent_mol`` returns no molecule; a flagged
      parent molecule is still returned
    """

    def __init__(self) -> None:
        super().__init__()

        # the import failure recorded at module load time is surfaced only when the step is used
        if import_error is not None:
            raise import_error

    def _preprocess(self, mol: Mol) -> Tuple[Optional[Mol], List[Problem]]:
        """
        Extract the parent structure of a single molecule.

        Parameters
        ----------
        mol : Mol
            RDKit Mol object to process. Its conformers are removed in place.

        Returns
        -------
        Tuple[Optional[Mol], List[Problem]]
            The molecule returned by ``get_parent_mol`` (or the input molecule if that result is
            None) and a list of problems. The list holds one ``csp_error`` problem when the
            exclusion flag is set or no molecule was returned; otherwise it is empty.
        """
        # chembl structure pipeline cannot handle molecules with 3D coordinates
        # --> drop all conformers first
        mol.RemoveAllConformers()

        parent, exclude_flag = get_parent_mol(mol)

        problems = []
        if exclude_flag or parent is None:
            problems.append(Problem("csp_error", "Could not remove small fragments."))

        result = mol if parent is None else parent
        return result, problems
|