mutcleaner 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. mutcleaner/__init__.py +58 -0
  2. mutcleaner/cleaners/__init__.py +108 -0
  3. mutcleaner/cleaners/antitoxin_pard3_cleaner.py +341 -0
  4. mutcleaner/cleaners/antitoxin_pard3_custom_cleaners.py +137 -0
  5. mutcleaner/cleaners/archstabms_1e10_cleaner.py +286 -0
  6. mutcleaner/cleaners/archstabms_1e10_custom_cleaners.py +69 -0
  7. mutcleaner/cleaners/base_config.py +207 -0
  8. mutcleaner/cleaners/basic_cleaners.py +2329 -0
  9. mutcleaner/cleaners/cdna_proteolysis_cleaner.py +332 -0
  10. mutcleaner/cleaners/cdna_proteolysis_custom_cleaners.py +283 -0
  11. mutcleaner/cleaners/ctxm_cleaner.py +304 -0
  12. mutcleaner/cleaners/ddg_dtm_cleaners.py +347 -0
  13. mutcleaner/cleaners/human_domainome_custom_cleaners.py +481 -0
  14. mutcleaner/cleaners/human_domainome_sup2_cleaner.py +356 -0
  15. mutcleaner/cleaners/human_myoglobin_cleaner.py +322 -0
  16. mutcleaner/cleaners/human_myoglobin_custom_cleaners.py +79 -0
  17. mutcleaner/cleaners/proteingym_dms_substitutions_cleaner.py +329 -0
  18. mutcleaner/cleaners/proteingym_dms_substitutions_custom_cleaners.py +203 -0
  19. mutcleaner/cleaners/rbd_ace2_cleaner.py +364 -0
  20. mutcleaner/cleaners/rbd_antibody_cleaner.py +367 -0
  21. mutcleaner/cleaners/rbd_custom_cleaners.py +190 -0
  22. mutcleaner/cleaners/trpb_cleaner.py +298 -0
  23. mutcleaner/core/__init__.py +43 -0
  24. mutcleaner/core/alphabet.py +124 -0
  25. mutcleaner/core/codon.py +83 -0
  26. mutcleaner/core/constants.py +134 -0
  27. mutcleaner/core/dataset.py +1546 -0
  28. mutcleaner/core/mutation.py +739 -0
  29. mutcleaner/core/pipeline.py +1031 -0
  30. mutcleaner/core/sequence.py +774 -0
  31. mutcleaner/core/types.py +27 -0
  32. mutcleaner/utils/__init__.py +39 -0
  33. mutcleaner/utils/cleaner_workers.py +391 -0
  34. mutcleaner/utils/data_source.py +381 -0
  35. mutcleaner/utils/dataset_builders.py +296 -0
  36. mutcleaner/utils/label_resolvers.py +262 -0
  37. mutcleaner/utils/mutation_converter.py +51 -0
  38. mutcleaner/utils/raw_data_downloader.py +743 -0
  39. mutcleaner/utils/sequence_io.py +517 -0
  40. mutcleaner/utils/type_converter.py +313 -0
  41. mutcleaner-0.1.0.dist-info/METADATA +296 -0
  42. mutcleaner-0.1.0.dist-info/RECORD +45 -0
  43. mutcleaner-0.1.0.dist-info/WHEEL +5 -0
  44. mutcleaner-0.1.0.dist-info/licenses/LICENSE +28 -0
  45. mutcleaner-0.1.0.dist-info/top_level.txt +1 -0
mutcleaner/__init__.py ADDED
@@ -0,0 +1,58 @@
1
+ # mutcleaner/__init__.py
2
+ """
3
+ MutCleaner: An efficient framework for cleaning, standardizing, and processing biological mutation data.
4
+ """
5
+
6
+ __author__ = "Yuxiang Tang and Ziyu Shi"
7
+
8
+ __version__ = "0.1.0"
9
+
10
+ from .core import (
11
+ # Alphabet
12
+ alphabet,
13
+ # Codon
14
+ codon,
15
+ # Mutation
16
+ mutation,
17
+ # Sequence
18
+ sequence,
19
+ # Dataset
20
+ MutationDataset,
21
+ # Pipeline
22
+ Pipeline,
23
+ pipeline_step,
24
+ multiout_step,
25
+ create_pipeline,
26
+ )
27
+
28
+ from .cleaners import (
29
+ basic_cleaners,
30
+ cdna_proteolysis_cleaner,
31
+ human_domainome_sup2_cleaner,
32
+ proteingym_dms_substitutions_cleaner,
33
+ ddg_dtm_cleaners,
34
+ archstabms_1e10_cleaner,
35
+ human_myoglobin_cleaner,
36
+ ctxm_cleaner,
37
+ trpb_cleaner,
38
+ antitoxin_pard3_cleaner,
39
+ rbd_antibody_cleaner,
40
+ rbd_ace2_cleaner,
41
+ )
42
+
43
+ from .utils import (
44
+ download,
45
+ download_cdna_proteolysis_source_file,
46
+ download_proteingym_source_file,
47
+ download_human_domainome_source_file,
48
+ download_ddg_dtm_source_file,
49
+ list_datasets_with_built_in_cleaners,
50
+ show_download_instructions,
51
+ download_archstabms1e10_source_file,
52
+ download_human_myoglobin_source_file,
53
+ download_ctxm_source_file,
54
+ download_trpb_source_file,
55
+ download_antitoxin_pard3_source_file,
56
+ download_rbd_antibody_source_file,
57
+ download_rbd_ace2_source_file,
58
+ )
@@ -0,0 +1,108 @@
1
+ """Dataset-specific cleaning pipelines for MutCleaner."""
2
+
3
+ from .cdna_proteolysis_cleaner import (
4
+ CDNAProteolysisCleanerConfig,
5
+ create_cdna_proteolysis_cleaner,
6
+ clean_cdna_proteolysis_dataset,
7
+ )
8
+
9
+ from .proteingym_dms_substitutions_cleaner import (
10
+ ProteinGymCleanerConfig,
11
+ create_proteingym_dms_substitutions_cleaner,
12
+ clean_proteingym_dms_substitutions_dataset,
13
+ )
14
+
15
+ from .human_domainome_sup2_cleaner import (
16
+ HumanDomainomeSup2CleanerConfig,
17
+ create_human_domainome_sup2_cleaner,
18
+ clean_human_domainome_sup2_dataset,
19
+ )
20
+
21
+ from .ddg_dtm_cleaners import (
22
+ DdgDtmCleanerConfig,
23
+ create_ddg_dtm_cleaner,
24
+ clean_ddg_dtm_dataset,
25
+ )
26
+
27
+ from .archstabms_1e10_cleaner import (
28
+ ArchStabMS1E10CleanerConfig,
29
+ create_archstabms_1e10_cleaner,
30
+ clean_archstabms_1e10_dataset,
31
+ )
32
+
33
+ from .antitoxin_pard3_cleaner import (
34
+ AntitoxinParD3CleanerConfig,
35
+ create_antitoxin_pard3_cleaner,
36
+ clean_antitoxin_pard3_dataset,
37
+ )
38
+
39
+ from .trpb_cleaner import (
40
+ TrpBCleanerConfig,
41
+ create_trpb_cleaner,
42
+ clean_trpb_dataset,
43
+ )
44
+
45
+ from .ctxm_cleaner import (
46
+ CTXMCleanerConfig,
47
+ create_ctxm_cleaner,
48
+ clean_ctxm_dataset,
49
+ )
50
+
51
+ from .human_myoglobin_cleaner import (
52
+ HumanMyoglobinCleanerConfig,
53
+ create_human_myoglobin_cleaner,
54
+ clean_human_myoglobin_dataset,
55
+ )
56
+
57
+ from .rbd_antibody_cleaner import (
58
+ RBDAntibodyCleanerConfig,
59
+ create_rbd_antibody_cleaner,
60
+ clean_rbd_antibody_dataset,
61
+ )
62
+
63
+ from .rbd_ace2_cleaner import (
64
+ RBDACE2CleanerConfig,
65
+ create_rbd_ace2_cleaner,
66
+ clean_rbd_ace2_dataset,
67
+ )
68
+
69
+
70
+
71
# Public API of the ``cleaners`` package: for every supported dataset, the
# config class, the pipeline factory and the one-shot cleaning function.
# Each name is listed exactly once.
__all__ = [
    "create_cdna_proteolysis_cleaner",
    "clean_cdna_proteolysis_dataset",
    "CDNAProteolysisCleanerConfig",
    "create_proteingym_dms_substitutions_cleaner",
    "clean_proteingym_dms_substitutions_dataset",
    "ProteinGymCleanerConfig",
    "create_human_domainome_sup2_cleaner",
    "clean_human_domainome_sup2_dataset",
    "HumanDomainomeSup2CleanerConfig",
    "create_ddg_dtm_cleaner",
    "clean_ddg_dtm_dataset",
    "DdgDtmCleanerConfig",
    "ArchStabMS1E10CleanerConfig",
    "create_archstabms_1e10_cleaner",
    "clean_archstabms_1e10_dataset",
    "AntitoxinParD3CleanerConfig",
    "create_antitoxin_pard3_cleaner",
    "clean_antitoxin_pard3_dataset",
    "TrpBCleanerConfig",
    "create_trpb_cleaner",
    "clean_trpb_dataset",
    "CTXMCleanerConfig",
    "create_ctxm_cleaner",
    "clean_ctxm_dataset",
    "HumanMyoglobinCleanerConfig",
    "create_human_myoglobin_cleaner",
    "clean_human_myoglobin_dataset",
    "RBDAntibodyCleanerConfig",
    "create_rbd_antibody_cleaner",
    "clean_rbd_antibody_dataset",
    "RBDACE2CleanerConfig",
    "create_rbd_ace2_cleaner",
    "clean_rbd_ace2_dataset",
]
@@ -0,0 +1,341 @@
1
+ # mutcleaner/cleaners/antitoxin_pard3_cleaner.py
2
+ from __future__ import annotations
3
+
4
+ import pandas as pd
5
+ from typing import TYPE_CHECKING
6
+ from dataclasses import dataclass, field
7
+ from pathlib import Path
8
+ import logging
9
+
10
+ from .base_config import BaseCleanerConfig
11
+ from .basic_cleaners import (
12
+ read_dataset,
13
+ extract_and_rename_columns,
14
+ filter_and_clean_data,
15
+ convert_data_types,
16
+ convert_to_mutation_dataset_format,
17
+ validate_mutations,
18
+ add_columns,
19
+ apply_mutations_to_sequences,
20
+ average_labels_by_name,
21
+ subtract_labels_by_wt,
22
+ )
23
+ from .antitoxin_pard3_custom_cleaners import (
24
+ simplify_mutations,
25
+ )
26
+
27
+ from ..core.dataset import MutationDataset
28
+ from ..core.pipeline import Pipeline, create_pipeline
29
+
30
+ if TYPE_CHECKING:
31
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
32
+
33
+ __all__ = [
34
+ "AntitoxinParD3CleanerConfig",
35
+ "create_antitoxin_pard3_cleaner",
36
+ "clean_antitoxin_pard3_dataset",
37
+ ]
38
+
39
+
40
def __dir__() -> List[str]:
    """Return the module's public names, i.e. the contents of ``__all__``."""
    return __all__
42
+
43
+
44
+ # Create module logger
45
+ logger = logging.getLogger(__name__)
46
+
47
+
48
@dataclass
class AntitoxinParD3CleanerConfig(BaseCleanerConfig):
    """
    Configuration class for the Antitoxin (ParD3) dataset cleaner.

    Inherits from BaseCleanerConfig and adds Antitoxin-specific options.

    Simply run `mutcleaner.download_antitoxin_pard3_source_file()` to download
    the dataset.

    Alternatively, the raw Antitoxin file can be obtained from:

    - Hugging Face: https://huggingface.co/datasets/xulab-research/TidyMut/blob/main/antitoxin/antitoxin.csv

    Attributes
    ----------
    column_mapping : Dict[str, str]
        Mapping from source to target column names
    filters : Dict[str, Callable]
        Filter conditions for data cleaning
    type_conversions : Dict[str, str]
        Data type conversion specifications
    validate_mut_workers : int
        Number of workers for mutation validation, set to -1 to use all available CPUs
    process_workers : int
        Number of workers for applying mutations to sequences, set to -1 to use all available CPUs
    label_columns : List[str]
        List of score columns to process
    primary_label_column : str
        Primary score column for the dataset
    pipeline_name : str
        Name of the cleaning pipeline
    wt_sequence : str
        Wildtype sequence for the dataset, used for mutation validation
    """

    # Column mapping configuration
    column_mapping: Dict[str, str] = field(
        default_factory=lambda: {
            "mutation": "mut_info",
            "label": "label",
        }
    )

    # Data filtering configuration: keep only rows whose label is numeric
    filters: Dict[str, Callable] = field(
        default_factory=lambda: {
            "label": lambda s: pd.to_numeric(s, errors="coerce").notna()
        }
    )

    # Type conversion configuration
    type_conversions: Dict[str, str] = field(default_factory=lambda: {"label": "float"})

    # Mutation validation parameters
    validate_mut_workers: int = 16

    process_workers: int = 16

    # Score columns configuration
    label_columns: List[str] = field(default_factory=lambda: ["label"])

    primary_label_column: str = "label"

    # Override default pipeline name
    pipeline_name: str = "Antitoxin Pipeline"

    # Wild-type sequence, obtained from the article.
    # Annotated so that it is a real dataclass field (an un-annotated
    # assignment is only a class attribute and cannot be passed to the
    # constructor). Declared last so that the positional order of the
    # previously existing fields is unchanged.
    wt_sequence: str = "MANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRELRHDDIRRLRQLWDEGKASGRPEPVDFDALRKEARQKLTEVPPNGR"

    def validate(self) -> None:
        """Validate Antitoxin-specific configuration parameters

        Raises
        ------
        ValueError
            If configuration is invalid
        """
        # Call parent validation
        super().validate()

        # Validate score columns
        if not self.label_columns:
            raise ValueError("label_columns cannot be empty")

        if self.primary_label_column not in self.label_columns:
            raise ValueError(
                f"primary_label_column '{self.primary_label_column}' "
                f"must be in label_columns {self.label_columns}"
            )

        # Validate column mapping
        required_mappings = {"mutation"}
        missing = required_mappings - set(self.column_mapping.keys())
        if missing:
            raise ValueError(f"Missing required column mappings: {missing}")
142
+
143
+
144
def create_antitoxin_pard3_cleaner(
    dataset_or_path: Optional[Union[pd.DataFrame, str, Path]] = None,
    config: Optional[
        Union[AntitoxinParD3CleanerConfig, Dict[str, Any], str, Path]
    ] = None,
) -> Pipeline:
    """Create Antitoxin dataset cleaning pipeline

    Parameters
    ----------
    dataset_or_path : Optional[Union[pd.DataFrame, str, Path]], default=None
        Raw dataset DataFrame or file path to Antitoxin dataset.
        Any other value (including ``None``) is rejected.
    config : Optional[Union[AntitoxinParD3CleanerConfig, Dict[str, Any], str, Path]]
        Configuration for the cleaning pipeline. Can be:
        - AntitoxinParD3CleanerConfig object
        - Dictionary with configuration parameters (merged with defaults)
        - Path to JSON configuration file (str or Path)
        - None (uses default configuration)

    Returns
    -------
    Pipeline
        Pipeline: The cleaning pipeline used

    Raises
    ------
    TypeError
        If config has invalid type
    RuntimeError
        If building the pipeline fails for any reason. This includes an
        unsupported ``dataset_or_path`` type: the ``TypeError`` raised for it
        is wrapped and available as ``__cause__``.
    """
    # Handle configuration parameter
    if config is None:
        final_config = AntitoxinParD3CleanerConfig()
    elif isinstance(config, AntitoxinParD3CleanerConfig):
        final_config = config
    elif isinstance(config, dict):
        # Partial configuration - merge with defaults
        final_config = AntitoxinParD3CleanerConfig().merge(config)
    elif isinstance(config, (str, Path)):
        # Load from file
        final_config = AntitoxinParD3CleanerConfig.from_json(config)
    else:
        raise TypeError(
            f"config must be AntitoxinParD3CleanerConfig, dict, str, Path or None, "
            f"got {type(config)}"
        )

    # Log configuration summary
    logger.info(
        f"Antitoxin dataset will be cleaned with pipeline: {final_config.pipeline_name}"
    )
    logger.debug(f"Configuration:\n{final_config.get_summary()}")

    try:
        # Name of the mutation column after renaming; used by most steps below.
        mutation_column = final_config.column_mapping.get("mutation", "mutation")

        # Create pipeline
        pipeline = create_pipeline(dataset_or_path, final_config.pipeline_name)

        # Add cleaning steps
        pipeline = (
            pipeline.delayed_then(
                extract_and_rename_columns,
                column_mapping=final_config.column_mapping,
            )
            .delayed_then(
                filter_and_clean_data,
                filters=final_config.filters,
            )
            .delayed_then(
                convert_data_types,
                type_conversions=final_config.type_conversions,
            )
            .delayed_then(
                add_columns,
                # NOTE(review): later steps read a "name" column, while this
                # mapping is keyed "antitoxin" -> "name" next to
                # "wt_seq" -> <sequence>. Confirm the key/value orientation
                # expected by add_columns; left unchanged here.
                columns_to_add={
                    "antitoxin": "name",
                    "wt_seq": final_config.wt_sequence,
                },
            )
            .delayed_then(
                # Input mutations are ":"-separated; this step emits
                # ","-separated mutations (or "WT"), hence the different
                # separator passed to validate_mutations below.
                simplify_mutations,
                mutation_column=mutation_column,
                mutation_sep=":",
            )
            .delayed_then(
                validate_mutations,
                mutation_column=mutation_column,
                mutation_sep=",",
                is_zero_based=True,
                exclude_patterns="WT",
                num_workers=final_config.validate_mut_workers,
            )
            .delayed_then(
                average_labels_by_name,
                name_columns=("name", mutation_column),
                label_columns=final_config.primary_label_column,
            )
            .delayed_then(
                subtract_labels_by_wt,
                name_column="name",
                label_columns=final_config.primary_label_column,
                mutation_column=mutation_column,
                wt_identifier="WT",
                in_place=True,
            )
            .delayed_then(
                apply_mutations_to_sequences,
                sequence_column="wt_seq",
                name_column="name",
                mutation_column=mutation_column,
                is_zero_based=True,
                sequence_type="protein",
                num_workers=final_config.process_workers,
            )
            .delayed_then(
                convert_to_mutation_dataset_format,
                name_column="name",
                mutation_column=mutation_column,
                sequence_column="wt_seq",
                label_column=final_config.primary_label_column,
                is_zero_based=True,
            )
        )

        # Create pipeline based on dataset_or_path type
        if isinstance(dataset_or_path, (str, Path)):
            # A path was given: register a CSV reading step at index 0.
            pipeline.add_delayed_step(read_dataset, 0, file_format="csv")
        elif not isinstance(dataset_or_path, pd.DataFrame):
            raise TypeError(
                f"dataset_or_path must be pd.DataFrame or str/Path, "
                f"got {type(dataset_or_path)}"
            )

        return pipeline

    except Exception as e:
        logger.error(f"Error in creating antitoxin cleaning pipeline: {str(e)}")
        # Chain the original exception so the root cause stays in the traceback.
        raise RuntimeError(
            f"Error in creating antitoxin cleaning pipeline: {str(e)}"
        ) from e
286
+
287
+
288
def clean_antitoxin_pard3_dataset(
    pipeline: Pipeline,
) -> Tuple[Pipeline, MutationDataset]:
    """Clean Antitoxin dataset using configurable pipeline

    Parameters
    ----------
    pipeline : Pipeline
        Antitoxin dataset cleaning pipeline

    Returns
    -------
    Tuple[Pipeline, MutationDataset]
        - Pipeline: The cleaned pipeline
        - MutationDataset: The cleaned Antitoxin dataset

    Raises
    ------
    RuntimeError
        If executing the pipeline or building the dataset fails; the original
        exception is available as ``__cause__``.

    Examples
    --------
    Use default configuration:

    >>> pipeline = create_antitoxin_pard3_cleaner(df)  # df is raw Antitoxin dataset file

    Use partial configuration:

    >>> pipeline = create_antitoxin_pard3_cleaner(df, config={
    ...     "validate_mut_workers": 8,
    ... })

    Load configuration from file:

    >>> pipeline = create_antitoxin_pard3_cleaner(df, config="config.json")
    >>> pipeline, dataset = clean_antitoxin_pard3_dataset(pipeline)
    """
    try:
        # Run pipeline
        pipeline.execute()

        # Extract results: the pipeline data unpacks into a dataframe and
        # the reference sequences.
        antitoxin_dataset_df, antitoxin_ref_seq = pipeline.data
        antitoxin_dataset = MutationDataset.from_dataframe(
            antitoxin_dataset_df, antitoxin_ref_seq
        )

        logger.info(
            f"Successfully cleaned antitoxin dataset: "
            f"{len(antitoxin_dataset_df)} mutations from {len(antitoxin_ref_seq)} proteins"
        )

        return pipeline, antitoxin_dataset
    except Exception as e:
        logger.error(f"Error in running antitoxin dataset cleaning pipeline: {str(e)}")
        # Chain the original exception so the root cause stays in the traceback.
        raise RuntimeError(
            f"Error in running antitoxin dataset cleaning pipeline: {str(e)}"
        ) from e
@@ -0,0 +1,137 @@
1
+ import re
2
+ import pandas as pd
3
+ import numpy as np
4
+ from typing import List
5
+ from tqdm import tqdm
6
+
7
+ from ..core.pipeline import pipeline_step
8
+ from ..core.sequence import ProteinSequence
9
+
10
# Only names actually defined in this module may be exported; listing an
# undefined name makes ``from ... import *`` raise AttributeError.
__all__ = ["infer_wt_sequence", "simplify_mutations"]
11
+
12
+
13
@pipeline_step
def infer_wt_sequence(
    dataset: pd.DataFrame,
    mutation_col: str,
    sequence_col: str = "mut_seq",
    wt_column: str = "wt_seq",
) -> pd.DataFrame:
    """
    Infer the wild-type sequence from mutated sequences and mutation strings.

    For every position, the first row whose mutation string has ``"_"`` at
    that position is looked up, and the residue of that row's sequence at the
    same position is taken as the wild-type residue. Positions for which no
    row has ``"_"`` cannot be recovered and are filled with ``"X"``.

    Parameters
    ----------
    dataset : pd.DataFrame
        Dataset containing the mutated sequences and the mutation strings
    mutation_col : str
        Column containing the per-position mutation strings
    sequence_col : str, default="mut_seq"
        Column containing the mutated sequences
    wt_column : str, default="wt_seq"
        Column in which the inferred wild-type sequence is stored

    Returns
    -------
    pd.DataFrame
        Copy of the dataset with the inferred wild-type sequence added to
        every row

    Raises
    ------
    ValueError
        If the dataset is empty, or if the sequences and mutation strings do
        not all have the same length
    """
    import warnings

    if dataset.empty:
        raise ValueError("Dataset is empty. Cannot infer WT sequence.")

    dataset = dataset.copy()

    sequences: List[str] = [str(seq) for seq in dataset[sequence_col].tolist()]
    mutation_seqs: List[str] = [str(mut) for mut in dataset[mutation_col].tolist()]

    # NumPy pads fixed-width unicode arrays to the longest item, so a reshape
    # never fails on ragged input; the lengths must be checked explicitly.
    seq_len = len(sequences[0])
    if any(len(seq) != seq_len for seq in sequences) or any(
        len(mut) != seq_len for mut in mutation_seqs
    ):
        raise ValueError("Sequences in the DataFrame are not of equal length.")

    if seq_len == 0:
        dataset[wt_column] = ""
        return dataset

    n_rows = len(dataset)
    # Character matrices of shape (n_rows, seq_len): one cell per residue.
    mut_matrix = np.array(mutation_seqs, dtype="U").view("U1").reshape(n_rows, seq_len)
    seq_matrix = np.array(sequences, dtype="U").view("U1").reshape(n_rows, seq_len)

    unmutated = mut_matrix == "_"
    recoverable = unmutated.any(axis=0)
    # argmax on a boolean column returns the index of its first True entry.
    first_unmutated_row = unmutated.argmax(axis=0)
    wt_chars = np.where(
        recoverable,
        seq_matrix[first_unmutated_row, np.arange(seq_len)],
        "X",
    )

    for position in np.flatnonzero(~recoverable):
        warnings.warn(
            f"Position {position} is mutated in All sequences. "
            "Cannot recover wild-type sequence."
        )

    dataset[wt_column] = "".join(wt_chars.tolist())

    return dataset
76
+
77
+
78
@pipeline_step
def simplify_mutations(
    dataset: pd.DataFrame,
    mutation_column: str = "mut_info",
    mutation_sep: str = ":",
) -> pd.DataFrame:
    """
    Drop no-change tokens (e.g. ``L47L``) from every mutation string.

    Each cell of ``mutation_column`` is split on ``mutation_sep``. Tokens
    whose wild-type and mutant symbols are equal are removed, tokens that do
    not look like ``<symbol><digits><symbol>`` are kept untouched, and the
    remaining tokens are re-joined with ``","``. A cell with no remaining
    token becomes ``"WT"``. Missing values and blank strings are left as they
    are, and no row is ever removed.

    Parameters
    ----------
    dataset : pd.DataFrame
        Dataset holding the mutation strings
    mutation_column : str, default="mut_info"
        Column containing the mutation strings
    mutation_sep : str, default=":"
        Separator between single mutations in the input strings

    Returns
    -------
    pd.DataFrame
        Copy of the dataset with the simplified mutation strings

    Raises
    ------
    ValueError
        If ``mutation_column`` is not a column of the dataset

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     "name": ["protein1", "protein1", "protein1"],
    ...     "mut_info": ["L47L:D51D:I52I", "L47A:D51D:I52A", "L47_:D51D:I52I"],
    ... })
    >>> df = simplify_mutations(df)
    >>> df["mut_info"].tolist()
    ['WT', 'L47A,I52A', 'L47_']
    """
    if mutation_column not in dataset.columns:
        raise ValueError(f"the {mutation_column} is not in the dataset")

    result = dataset.copy()

    # Shape of a single well-formed mutation token; used only to detect
    # no-change tokens, not to reject anything.
    token_pattern = re.compile(r"^([A-Z\*_])(\d+)([A-Z\*_])$")

    def _simplify_cell(cell):
        if pd.isna(cell):
            return cell

        text = str(cell).strip()
        if text == "":
            return text

        kept_tokens = []
        for raw_token in text.split(mutation_sep):
            token = raw_token.strip()
            if token == "":
                continue
            parsed = token_pattern.match(token)
            if parsed is None:
                # Unknown format: keep the token as it is.
                kept_tokens.append(token)
            elif parsed.group(1) != parsed.group(3):
                kept_tokens.append("".join(parsed.groups()))

        if kept_tokens:
            return ",".join(kept_tokens)
        return "WT"

    result[mutation_column] = result[mutation_column].apply(_simplify_cell)

    return result