mutcleaner 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mutcleaner/__init__.py +58 -0
- mutcleaner/cleaners/__init__.py +108 -0
- mutcleaner/cleaners/antitoxin_pard3_cleaner.py +341 -0
- mutcleaner/cleaners/antitoxin_pard3_custom_cleaners.py +137 -0
- mutcleaner/cleaners/archstabms_1e10_cleaner.py +286 -0
- mutcleaner/cleaners/archstabms_1e10_custom_cleaners.py +69 -0
- mutcleaner/cleaners/base_config.py +207 -0
- mutcleaner/cleaners/basic_cleaners.py +2329 -0
- mutcleaner/cleaners/cdna_proteolysis_cleaner.py +332 -0
- mutcleaner/cleaners/cdna_proteolysis_custom_cleaners.py +283 -0
- mutcleaner/cleaners/ctxm_cleaner.py +304 -0
- mutcleaner/cleaners/ddg_dtm_cleaners.py +347 -0
- mutcleaner/cleaners/human_domainome_custom_cleaners.py +481 -0
- mutcleaner/cleaners/human_domainome_sup2_cleaner.py +356 -0
- mutcleaner/cleaners/human_myoglobin_cleaner.py +322 -0
- mutcleaner/cleaners/human_myoglobin_custom_cleaners.py +79 -0
- mutcleaner/cleaners/proteingym_dms_substitutions_cleaner.py +329 -0
- mutcleaner/cleaners/proteingym_dms_substitutions_custom_cleaners.py +203 -0
- mutcleaner/cleaners/rbd_ace2_cleaner.py +364 -0
- mutcleaner/cleaners/rbd_antibody_cleaner.py +367 -0
- mutcleaner/cleaners/rbd_custom_cleaners.py +190 -0
- mutcleaner/cleaners/trpb_cleaner.py +298 -0
- mutcleaner/core/__init__.py +43 -0
- mutcleaner/core/alphabet.py +124 -0
- mutcleaner/core/codon.py +83 -0
- mutcleaner/core/constants.py +134 -0
- mutcleaner/core/dataset.py +1546 -0
- mutcleaner/core/mutation.py +739 -0
- mutcleaner/core/pipeline.py +1031 -0
- mutcleaner/core/sequence.py +774 -0
- mutcleaner/core/types.py +27 -0
- mutcleaner/utils/__init__.py +39 -0
- mutcleaner/utils/cleaner_workers.py +391 -0
- mutcleaner/utils/data_source.py +381 -0
- mutcleaner/utils/dataset_builders.py +296 -0
- mutcleaner/utils/label_resolvers.py +262 -0
- mutcleaner/utils/mutation_converter.py +51 -0
- mutcleaner/utils/raw_data_downloader.py +743 -0
- mutcleaner/utils/sequence_io.py +517 -0
- mutcleaner/utils/type_converter.py +313 -0
- mutcleaner-0.1.0.dist-info/METADATA +296 -0
- mutcleaner-0.1.0.dist-info/RECORD +45 -0
- mutcleaner-0.1.0.dist-info/WHEEL +5 -0
- mutcleaner-0.1.0.dist-info/licenses/LICENSE +28 -0
- mutcleaner-0.1.0.dist-info/top_level.txt +1 -0
mutcleaner/__init__.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# mutcleaner/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
MutCleaner: An efficient framework for cleaning, standardizing, and processing biological mutation data.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
__author__ = "Yuxiang Tang and Ziyu Shi"
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
|
9
|
+
|
|
10
|
+
from .core import (
|
|
11
|
+
# Alphabet
|
|
12
|
+
alphabet,
|
|
13
|
+
# Codon
|
|
14
|
+
codon,
|
|
15
|
+
# Mutation
|
|
16
|
+
mutation,
|
|
17
|
+
# Sequence
|
|
18
|
+
sequence,
|
|
19
|
+
# Dataset
|
|
20
|
+
MutationDataset,
|
|
21
|
+
# Pipeline
|
|
22
|
+
Pipeline,
|
|
23
|
+
pipeline_step,
|
|
24
|
+
multiout_step,
|
|
25
|
+
create_pipeline,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
from .cleaners import (
|
|
29
|
+
basic_cleaners,
|
|
30
|
+
cdna_proteolysis_cleaner,
|
|
31
|
+
human_domainome_sup2_cleaner,
|
|
32
|
+
proteingym_dms_substitutions_cleaner,
|
|
33
|
+
ddg_dtm_cleaners,
|
|
34
|
+
archstabms_1e10_cleaner,
|
|
35
|
+
human_myoglobin_cleaner,
|
|
36
|
+
ctxm_cleaner,
|
|
37
|
+
trpb_cleaner,
|
|
38
|
+
antitoxin_pard3_cleaner,
|
|
39
|
+
rbd_antibody_cleaner,
|
|
40
|
+
rbd_ace2_cleaner,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
from .utils import (
|
|
44
|
+
download,
|
|
45
|
+
download_cdna_proteolysis_source_file,
|
|
46
|
+
download_proteingym_source_file,
|
|
47
|
+
download_human_domainome_source_file,
|
|
48
|
+
download_ddg_dtm_source_file,
|
|
49
|
+
list_datasets_with_built_in_cleaners,
|
|
50
|
+
show_download_instructions,
|
|
51
|
+
download_archstabms1e10_source_file,
|
|
52
|
+
download_human_myoglobin_source_file,
|
|
53
|
+
download_ctxm_source_file,
|
|
54
|
+
download_trpb_source_file,
|
|
55
|
+
download_antitoxin_pard3_source_file,
|
|
56
|
+
download_rbd_antibody_source_file,
|
|
57
|
+
download_rbd_ace2_source_file,
|
|
58
|
+
)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Dataset-specific cleaning pipelines for MutCleaner."""
|
|
2
|
+
|
|
3
|
+
from .cdna_proteolysis_cleaner import (
|
|
4
|
+
CDNAProteolysisCleanerConfig,
|
|
5
|
+
create_cdna_proteolysis_cleaner,
|
|
6
|
+
clean_cdna_proteolysis_dataset,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
from .proteingym_dms_substitutions_cleaner import (
|
|
10
|
+
ProteinGymCleanerConfig,
|
|
11
|
+
create_proteingym_dms_substitutions_cleaner,
|
|
12
|
+
clean_proteingym_dms_substitutions_dataset,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
from .human_domainome_sup2_cleaner import (
|
|
16
|
+
HumanDomainomeSup2CleanerConfig,
|
|
17
|
+
create_human_domainome_sup2_cleaner,
|
|
18
|
+
clean_human_domainome_sup2_dataset,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
from .ddg_dtm_cleaners import (
|
|
22
|
+
DdgDtmCleanerConfig,
|
|
23
|
+
create_ddg_dtm_cleaner,
|
|
24
|
+
clean_ddg_dtm_dataset,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
from .archstabms_1e10_cleaner import (
|
|
28
|
+
ArchStabMS1E10CleanerConfig,
|
|
29
|
+
create_archstabms_1e10_cleaner,
|
|
30
|
+
clean_archstabms_1e10_dataset,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
from .antitoxin_pard3_cleaner import (
|
|
34
|
+
AntitoxinParD3CleanerConfig,
|
|
35
|
+
create_antitoxin_pard3_cleaner,
|
|
36
|
+
clean_antitoxin_pard3_dataset,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
from .trpb_cleaner import (
|
|
40
|
+
TrpBCleanerConfig,
|
|
41
|
+
create_trpb_cleaner,
|
|
42
|
+
clean_trpb_dataset,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
from .ctxm_cleaner import (
|
|
46
|
+
CTXMCleanerConfig,
|
|
47
|
+
create_ctxm_cleaner,
|
|
48
|
+
clean_ctxm_dataset,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
from .human_myoglobin_cleaner import (
|
|
52
|
+
HumanMyoglobinCleanerConfig,
|
|
53
|
+
create_human_myoglobin_cleaner,
|
|
54
|
+
clean_human_myoglobin_dataset,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
from .rbd_antibody_cleaner import (
|
|
58
|
+
RBDAntibodyCleanerConfig,
|
|
59
|
+
create_rbd_antibody_cleaner,
|
|
60
|
+
clean_rbd_antibody_dataset,
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
from .rbd_ace2_cleaner import (
|
|
64
|
+
RBDACE2CleanerConfig,
|
|
65
|
+
create_rbd_ace2_cleaner,
|
|
66
|
+
clean_rbd_ace2_dataset,
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# Public API of the cleaners package: for every supported dataset, the
# configuration class plus its ``create_*`` / ``clean_*`` entry points.
# Each name is listed exactly once (the ArchStabMS1E10 trio used to be
# duplicated).
__all__ = [
    "create_cdna_proteolysis_cleaner",
    "clean_cdna_proteolysis_dataset",
    "CDNAProteolysisCleanerConfig",
    "create_proteingym_dms_substitutions_cleaner",
    "clean_proteingym_dms_substitutions_dataset",
    "ProteinGymCleanerConfig",
    "create_human_domainome_sup2_cleaner",
    "clean_human_domainome_sup2_dataset",
    "HumanDomainomeSup2CleanerConfig",
    "create_ddg_dtm_cleaner",
    "clean_ddg_dtm_dataset",
    "DdgDtmCleanerConfig",
    "ArchStabMS1E10CleanerConfig",
    "create_archstabms_1e10_cleaner",
    "clean_archstabms_1e10_dataset",
    "AntitoxinParD3CleanerConfig",
    "create_antitoxin_pard3_cleaner",
    "clean_antitoxin_pard3_dataset",
    "TrpBCleanerConfig",
    "create_trpb_cleaner",
    "clean_trpb_dataset",
    "CTXMCleanerConfig",
    "create_ctxm_cleaner",
    "clean_ctxm_dataset",
    "HumanMyoglobinCleanerConfig",
    "create_human_myoglobin_cleaner",
    "clean_human_myoglobin_dataset",
    "RBDAntibodyCleanerConfig",
    "create_rbd_antibody_cleaner",
    "clean_rbd_antibody_dataset",
    "RBDACE2CleanerConfig",
    "create_rbd_ace2_cleaner",
    "clean_rbd_ace2_dataset",
]
|
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
# mutcleaner/cleaners/antitoxin_pard3_cleaner.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import logging
|
|
9
|
+
|
|
10
|
+
from .base_config import BaseCleanerConfig
|
|
11
|
+
from .basic_cleaners import (
|
|
12
|
+
read_dataset,
|
|
13
|
+
extract_and_rename_columns,
|
|
14
|
+
filter_and_clean_data,
|
|
15
|
+
convert_data_types,
|
|
16
|
+
convert_to_mutation_dataset_format,
|
|
17
|
+
validate_mutations,
|
|
18
|
+
add_columns,
|
|
19
|
+
apply_mutations_to_sequences,
|
|
20
|
+
average_labels_by_name,
|
|
21
|
+
subtract_labels_by_wt,
|
|
22
|
+
)
|
|
23
|
+
from .antitoxin_pard3_custom_cleaners import (
|
|
24
|
+
simplify_mutations,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
from ..core.dataset import MutationDataset
|
|
28
|
+
from ..core.pipeline import Pipeline, create_pipeline
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
32
|
+
|
|
33
|
+
__all__ = [
|
|
34
|
+
"AntitoxinParD3CleanerConfig",
|
|
35
|
+
"create_antitoxin_pard3_cleaner",
|
|
36
|
+
"clean_antitoxin_pard3_dataset",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def __dir__() -> List[str]:
    """Restrict ``dir()`` on this module to the names declared in ``__all__``."""
    public_names = __all__
    return public_names
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Create module logger
|
|
45
|
+
logger = logging.getLogger(__name__)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class AntitoxinParD3CleanerConfig(BaseCleanerConfig):
    """
    Configuration class for Antitoxin dataset cleaner.
    Inherits from BaseCleanerConfig and adds Antitoxin-specific configuration options.

    Simply run `mutcleaner.download_antitoxin_pard3_source_file()` to download the dataset.

    Alternatively, the raw Antitoxin file can be obtained from:

    - Hugging Face: https://huggingface.co/datasets/xulab-research/TidyMut/blob/main/antitoxin/antitoxin.csv

    Attributes
    ----------
    column_mapping : Dict[str, str]
        Mapping from source to target column names
    filters : Dict[str, Callable]
        Filter conditions for data cleaning
    wt_sequence : str
        Wildtype sequence for the dataset, used for mutation validation
    type_conversions : Dict[str, str]
        Data type conversion specifications
    validate_mut_workers : int
        Number of workers for mutation validation, set to -1 to use all available CPUs
    process_workers : int
        Number of workers for applying mutations to sequences, set to -1 to use all available CPUs
    label_columns : List[str]
        List of score columns to process
    primary_label_column : str
        Primary score column for the dataset
    pipeline_name : str
        Name of the cleaning pipeline
    """

    # Column mapping configuration
    column_mapping: Dict[str, str] = field(
        default_factory=lambda: {
            "mutation": "mut_info",
            "label": "label",
        }
    )

    # Data filtering configuration: keep only rows whose label parses as a number
    filters: Dict[str, Callable] = field(
        default_factory=lambda: {
            "label": lambda s: pd.to_numeric(s, errors="coerce").notna()
        }
    )

    # Type conversion configuration
    type_conversions: Dict[str, str] = field(default_factory=lambda: {"label": "float"})

    # Mutation validation parameters
    validate_mut_workers: int = 16

    process_workers: int = 16

    # Score columns configuration
    label_columns: List[str] = field(default_factory=lambda: ["label"])

    primary_label_column: str = "label"

    # Override default pipeline name
    pipeline_name: str = "Antitoxin Pipeline"

    # Wild-type sequence, obtained from the article.
    # The annotation is required: without it ``@dataclass`` treats the name as
    # a plain class attribute instead of a field, so it could not be passed to
    # the constructor. It is declared last so that the positional order of the
    # pre-existing fields is unchanged.
    wt_sequence: str = "MANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRELRHDDIRRLRQLWDEGKASGRPEPVDFDALRKEARQKLTEVPPNGR"

    def validate(self) -> None:
        """Validate Antitoxin-specific configuration parameters

        Runs the parent validation first, then checks that ``label_columns``
        is non-empty, that ``primary_label_column`` is one of them, and that
        ``column_mapping`` contains the ``"mutation"`` key.

        Raises
        ------
        ValueError
            If configuration is invalid
        """
        # Call parent validation
        super().validate()

        # Validate score columns
        if not self.label_columns:
            raise ValueError("label_columns cannot be empty")

        if self.primary_label_column not in self.label_columns:
            raise ValueError(
                f"primary_label_column '{self.primary_label_column}' "
                f"must be in label_columns {self.label_columns}"
            )

        # Validate column mapping
        required_mappings = {"mutation"}
        missing = required_mappings - set(self.column_mapping.keys())
        if missing:
            raise ValueError(f"Missing required column mappings: {missing}")
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def create_antitoxin_pard3_cleaner(
    dataset_or_path: Optional[Union[pd.DataFrame, str, Path]] = None,
    config: Optional[
        Union[AntitoxinParD3CleanerConfig, Dict[str, Any], str, Path]
    ] = None,
) -> Pipeline:
    """Create Antitoxin dataset cleaning pipeline

    Parameters
    ----------
    dataset_or_path : Optional[Union[pd.DataFrame, str, Path]], default=None
        Raw dataset DataFrame or file path to Antitoxin dataset.
        Although the default is None, a DataFrame or a path is required:
        any other value ends in a RuntimeError.
    config : Optional[Union[AntitoxinParD3CleanerConfig, Dict[str, Any], str, Path]]
        Configuration for the cleaning pipeline. Can be:
        - AntitoxinParD3CleanerConfig object
        - Dictionary with configuration parameters (merged with defaults)
        - Path to JSON configuration file (str or Path)
        - None (uses default configuration)

    Returns
    -------
    Pipeline
        The cleaning pipeline with all steps registered as delayed steps
        (nothing is executed here; see ``clean_antitoxin_pard3_dataset``).

    Raises
    ------
    TypeError
        If config has invalid type
    RuntimeError
        If building the pipeline fails for any reason, including an
        unsupported ``dataset_or_path`` type. The original exception is
        attached as ``__cause__``.
    """
    # Handle configuration parameter
    if config is None:
        final_config = AntitoxinParD3CleanerConfig()
    elif isinstance(config, AntitoxinParD3CleanerConfig):
        final_config = config
    elif isinstance(config, dict):
        # Partial configuration - merge with defaults
        default_config = AntitoxinParD3CleanerConfig()
        final_config = default_config.merge(config)
    elif isinstance(config, (str, Path)):
        # Load from file
        final_config = AntitoxinParD3CleanerConfig.from_json(config)
    else:
        raise TypeError(
            f"config must be AntitoxinParD3CleanerConfig, dict, str, Path or None, "
            f"got {type(config)}"
        )

    # Log configuration summary
    logger.info(
        f"Antitoxin dataset will be cleaned with pipeline: {final_config.pipeline_name}"
    )
    logger.debug(f"Configuration:\n{final_config.get_summary()}")

    # Name of the mutation column after renaming; used by most steps below.
    mutation_column = final_config.column_mapping.get("mutation", "mutation")

    try:
        # Create pipeline
        pipeline = create_pipeline(dataset_or_path, final_config.pipeline_name)

        # Add cleaning steps
        pipeline = (
            pipeline.delayed_then(
                extract_and_rename_columns,
                column_mapping=final_config.column_mapping,
            )
            .delayed_then(
                filter_and_clean_data,
                filters=final_config.filters,
            )
            .delayed_then(
                convert_data_types,
                type_conversions=final_config.type_conversions,
            )
            .delayed_then(
                add_columns,
                # Keys are column names, values are the constant to fill in.
                # Every later step reads the "name" and "wt_seq" columns, so
                # the name entry must be {"name": "antitoxin"}; it was
                # previously inverted ({"antitoxin": "name"}).
                # NOTE(review): add_columns is defined elsewhere - confirm
                # the key -> column-name convention.
                columns_to_add={
                    "name": "antitoxin",
                    "wt_seq": final_config.wt_sequence,
                },
            )
            .delayed_then(
                # Input tokens are ":"-separated; simplify_mutations emits
                # ","-separated tokens, which is what validate_mutations
                # is told to expect below.
                simplify_mutations,
                mutation_column=mutation_column,
                mutation_sep=":",
            )
            .delayed_then(
                validate_mutations,
                mutation_column=mutation_column,
                mutation_sep=",",
                is_zero_based=True,
                exclude_patterns="WT",
                num_workers=final_config.validate_mut_workers,
            )
            .delayed_then(
                average_labels_by_name,
                name_columns=(
                    "name",
                    mutation_column,
                ),
                label_columns=final_config.primary_label_column,
            )
            .delayed_then(
                subtract_labels_by_wt,
                name_column="name",
                label_columns=final_config.primary_label_column,
                mutation_column=mutation_column,
                wt_identifier="WT",
                in_place=True,
            )
            .delayed_then(
                apply_mutations_to_sequences,
                sequence_column="wt_seq",
                name_column="name",
                mutation_column=mutation_column,
                is_zero_based=True,
                sequence_type="protein",
                num_workers=final_config.process_workers,
            )
            .delayed_then(
                convert_to_mutation_dataset_format,
                name_column="name",
                mutation_column=mutation_column,
                sequence_column="wt_seq",
                label_column=final_config.primary_label_column,
                is_zero_based=True,
            )
        )

        # A path input needs a CSV-reading step registered at index 0;
        # a DataFrame is used as-is; anything else is rejected.
        if isinstance(dataset_or_path, (str, Path)):
            pipeline.add_delayed_step(read_dataset, 0, file_format="csv")
        elif not isinstance(dataset_or_path, pd.DataFrame):
            raise TypeError(
                f"dataset_or_path must be pd.DataFrame or str/Path, "
                f"got {type(dataset_or_path)}"
            )

        return pipeline

    except Exception as e:
        logger.error(f"Error in creating antitoxin cleaning pipeline: {str(e)}")
        raise RuntimeError(
            f"Error in creating antitoxin cleaning pipeline: {str(e)}"
        ) from e
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def clean_antitoxin_pard3_dataset(
    pipeline: Pipeline,
) -> Tuple[Pipeline, MutationDataset]:
    """Clean Antitoxin dataset using configurable pipeline

    Executes the given pipeline, unpacks ``pipeline.data`` into a cleaned
    DataFrame and its reference sequences, and builds a ``MutationDataset``
    from them.

    Parameters
    ----------
    pipeline : Pipeline
        Antitoxin dataset cleaning pipeline, as returned by
        ``create_antitoxin_pard3_cleaner``

    Returns
    -------
    Tuple[Pipeline, MutationDataset]
        - Pipeline: The executed pipeline
        - MutationDataset: The cleaned Antitoxin dataset

    Raises
    ------
    RuntimeError
        If executing the pipeline or building the dataset fails. The
        original exception is attached as ``__cause__``.

    Examples
    --------
    Use default configuration:

    >>> pipeline = create_antitoxin_pard3_cleaner(df)  # df is raw Antitoxin dataset file

    Use partial configuration:

    >>> pipeline = create_antitoxin_pard3_cleaner(df, config={
    ...     "validate_mut_workers": 8,
    ... })

    Load configuration from file:

    >>> pipeline = create_antitoxin_pard3_cleaner(df, config="config.json")
    >>> pipeline, dataset = clean_antitoxin_pard3_dataset(pipeline)
    """
    try:
        # Run pipeline
        pipeline.execute()

        # Extract results
        antitoxin_dataset_df, antitoxin_ref_seq = pipeline.data
        antitoxin_dataset = MutationDataset.from_dataframe(
            antitoxin_dataset_df, antitoxin_ref_seq
        )

        logger.info(
            f"Successfully cleaned antitoxin dataset: "
            f"{len(antitoxin_dataset_df)} mutations from {len(antitoxin_ref_seq)} proteins"
        )

        return pipeline, antitoxin_dataset
    except Exception as e:
        logger.error(f"Error in running antitoxin dataset cleaning pipeline: {str(e)}")
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(
            f"Error in running antitoxin dataset cleaning pipeline: {str(e)}"
        ) from e
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import numpy as np
|
|
4
|
+
from typing import List
|
|
5
|
+
from tqdm import tqdm
|
|
6
|
+
|
|
7
|
+
from ..core.pipeline import pipeline_step
|
|
8
|
+
from ..core.sequence import ProteinSequence
|
|
9
|
+
|
|
10
|
+
# Only names actually defined in this module may be exported; a stale entry
# ("add_wild_type_sequence") made ``from ... import *`` raise AttributeError.
__all__ = ["infer_wt_sequence", "simplify_mutations"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@pipeline_step
def infer_wt_sequence(
    dataset: pd.DataFrame,
    mutation_col: str,
    sequence_col: str = "mut_seq",
    wt_column: str = "wt_seq",
) -> pd.DataFrame:
    """
    Infer the wild-type sequence from mutated sequences and mutation information.

    For every position ``i`` the first row whose mutation string has ``"_"``
    at ``i`` is treated as unmutated there, and that row's sequence character
    becomes the wild-type residue. If no row has ``"_"`` at a position, the
    residue cannot be recovered: a warning is logged and ``"X"`` is used.

    Parameters
    ----------
    dataset : pd.DataFrame
        Dataset holding the mutated sequences and the mutation information
    mutation_col : str
        Column containing the per-position mutation strings
    sequence_col : str, default="mut_seq"
        Column containing the mutated sequences
    wt_column : str, default="wt_seq"
        Column that receives the inferred wild-type sequence

    Returns
    -------
    pd.DataFrame
        A copy of ``dataset`` with ``wt_column`` set to the inferred
        wild-type sequence on every row

    Raises
    ------
    ValueError
        If the dataset is empty, or if the sequences / mutation strings do
        not all share the same length
    """
    import logging

    if dataset.empty:
        raise ValueError("Dataset is empty. Cannot infer WT sequence.")

    dataset = dataset.copy()

    sequences: List[str] = dataset[sequence_col].tolist()
    mutation_seqs: List[str] = dataset[mutation_col].tolist()

    # Explicit length check: np.array(..., dtype="U") silently pads shorter
    # strings, so relying on reshape() to fail never detected ragged input.
    seq_len = len(sequences[0])
    if any(len(value) != seq_len for value in (*sequences, *mutation_seqs)):
        raise ValueError("Sequences in the DataFrame are not of equal length.")

    if seq_len == 0:
        # Nothing to infer; keep the "empty wild type" result.
        dataset[wt_column] = ""
        return dataset

    n_rows = len(sequences)
    # View each fixed-width unicode string as a row of single characters.
    mut_matrix = np.array(mutation_seqs, dtype="U").view("U1").reshape(n_rows, seq_len)
    seq_matrix = np.array(sequences, dtype="U").view("U1").reshape(n_rows, seq_len)

    unmutated = mut_matrix == "_"
    recoverable = unmutated.any(axis=0)
    # argmax on a boolean column yields the first True row (0 if none; masked below).
    first_unmutated_row = unmutated.argmax(axis=0)
    wt_chars = np.where(
        recoverable,
        seq_matrix[first_unmutated_row, np.arange(seq_len)],
        "X",
    )

    log = logging.getLogger(__name__)
    for position in np.flatnonzero(~recoverable):
        log.warning(
            "Position %d is mutated in All sequences. "
            "Cannot recover wild-type sequence.",
            int(position),
        )

    dataset[wt_column] = "".join(wt_chars.tolist())

    return dataset
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@pipeline_step
def simplify_mutations(
    dataset: pd.DataFrame,
    mutation_column: str = "mut_info",
    mutation_sep: str = ":",
) -> pd.DataFrame:
    """
    Drop no-change tokens (e.g. ``L47L``) from each mutation string.

    Each cell is split on ``mutation_sep``; tokens of the form
    ``<letter><digits><letter>`` whose two letters are identical are removed.
    Tokens that do not match that form are kept untouched, and no rows are
    filtered. Remaining tokens are joined with ``","``; a cell left with no
    tokens becomes ``"WT"``. Missing values and blank strings pass through.

    Parameters
    ----------
    dataset : pd.DataFrame
        Input dataset; it is not modified
    mutation_column : str, default="mut_info"
        Column holding the mutation strings
    mutation_sep : str, default=":"
        Separator between tokens in the input strings

    Returns
    -------
    pd.DataFrame
        Copy of ``dataset`` with ``mutation_column`` simplified

    Raises
    ------
    ValueError
        If ``mutation_column`` is not a column of ``dataset``

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     "name": ["protein1", "protein1", "protein1"],
    ...     "mut_info": ["L47L:D51D:I52I", "L47A:D51D:I52A", "L47_:D51D:I52I"],
    ... })
    >>> df = simplify_mutations(df)
    >>> df["mut_info"].tolist()
    ['WT', 'L47A,I52A', 'L47_']
    """
    if mutation_column not in dataset.columns:
        raise ValueError(f"the {mutation_column} is not in the dataset")

    # Recognises a single well-formed token: residue, position, residue.
    token_pattern = re.compile(r"^([A-Z\*_])(\d+)([A-Z\*_])$")

    def _simplify_cell(raw):
        if pd.isna(raw):
            return raw

        text = str(raw).strip()
        if not text:
            return text

        retained = []
        for token in (piece.strip() for piece in text.split(mutation_sep)):
            if not token:
                continue
            parsed = token_pattern.match(token)
            if parsed is None:
                # Unrecognised token: keep verbatim.
                retained.append(token)
            elif parsed.group(1) != parsed.group(3):
                retained.append("".join(parsed.group(1, 2, 3)))

        return ",".join(retained) if retained else "WT"

    result = dataset.copy()
    result[mutation_column] = result[mutation_column].apply(_simplify_cell)
    return result
|