tidymut 0.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. tidymut-0.0.0/.gitignore +45 -0
  2. tidymut-0.0.0/LICENSE +28 -0
  3. tidymut-0.0.0/PKG-INFO +273 -0
  4. tidymut-0.0.0/README.md +206 -0
  5. tidymut-0.0.0/pyproject.toml +48 -0
  6. tidymut-0.0.0/setup.cfg +4 -0
  7. tidymut-0.0.0/test/test_dataset.py +664 -0
  8. tidymut-0.0.0/test/test_mutation.py +746 -0
  9. tidymut-0.0.0/test/test_pipeline.py +1257 -0
  10. tidymut-0.0.0/test/test_sequence.py +577 -0
  11. tidymut-0.0.0/tidymut/__init__.py +26 -0
  12. tidymut-0.0.0/tidymut/cleaners/__init__.py +25 -0
  13. tidymut-0.0.0/tidymut/cleaners/base_config.py +208 -0
  14. tidymut-0.0.0/tidymut/cleaners/basic_cleaners.py +1165 -0
  15. tidymut-0.0.0/tidymut/cleaners/human_domainome_cleaner.py +417 -0
  16. tidymut-0.0.0/tidymut/cleaners/human_domainome_custom_cleaners.py +363 -0
  17. tidymut-0.0.0/tidymut/cleaners/k50_cleaner.py +283 -0
  18. tidymut-0.0.0/tidymut/cleaners/protein_gym_cleaner.py +318 -0
  19. tidymut-0.0.0/tidymut/cleaners/protein_gym_custom_cleaners.py +203 -0
  20. tidymut-0.0.0/tidymut/core/__init__.py +43 -0
  21. tidymut-0.0.0/tidymut/core/alphabet.py +124 -0
  22. tidymut-0.0.0/tidymut/core/codon.py +84 -0
  23. tidymut-0.0.0/tidymut/core/constants.py +78 -0
  24. tidymut-0.0.0/tidymut/core/dataset.py +1498 -0
  25. tidymut-0.0.0/tidymut/core/mutation.py +797 -0
  26. tidymut-0.0.0/tidymut/core/pipeline.py +1010 -0
  27. tidymut-0.0.0/tidymut/core/sequence.py +733 -0
  28. tidymut-0.0.0/tidymut/core/types.py +14 -0
  29. tidymut-0.0.0/tidymut/utils/__init__.py +0 -0
  30. tidymut-0.0.0/tidymut/utils/cleaner_workers.py +296 -0
  31. tidymut-0.0.0/tidymut/utils/dataset_builders.py +293 -0
  32. tidymut-0.0.0/tidymut/utils/mutation_converter.py +51 -0
  33. tidymut-0.0.0/tidymut/utils/sequence_io.py +517 -0
  34. tidymut-0.0.0/tidymut/utils/type_converter.py +313 -0
  35. tidymut-0.0.0/tidymut.egg-info/PKG-INFO +273 -0
  36. tidymut-0.0.0/tidymut.egg-info/SOURCES.txt +37 -0
  37. tidymut-0.0.0/tidymut.egg-info/dependency_links.txt +1 -0
  38. tidymut-0.0.0/tidymut.egg-info/requires.txt +17 -0
  39. tidymut-0.0.0/tidymut.egg-info/top_level.txt +1 -0
@@ -0,0 +1,45 @@
1
+ # Editor temporary/working/backup files #
2
+ #########################################
3
+ .#*
4
+ *\#*\#
5
+ [#]*#
6
+ *~
7
+ *$
8
+ *.bak
9
+ *flymake*
10
+ *.iml
11
+ *.kdev4
12
+ *.log
13
+ *.swp
14
+ *.pdb
15
+ *.zip
16
+ .project
17
+ .pydevproject
18
+ .settings
19
+ .idea
20
+ .vagrant
21
+ .noseids
22
+ .ipynb_checkpoints
23
+ .tags
24
+ .cache/
25
+ .vscode/
26
+
27
+ # Python files #
28
+ ################
29
+ __pycache__/
30
+ # pytest
31
+ /.pytest_cache
32
+ # egg metadata
33
+ *.egg-info
34
+ *.eggs
35
+ # coverage
36
+ .coverage
37
+ coverage.xml
38
+ coverage_html_report
39
+ htmlcov
40
+
41
+ # Documentation related files #
42
+ ###############################
43
+ doc/source/
44
+ doc/make.bat
45
+ doc/Makefile
tidymut-0.0.0/LICENSE ADDED
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2025, Yuxiang Tang.
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
tidymut-0.0.0/PKG-INFO ADDED
@@ -0,0 +1,273 @@
1
+ Metadata-Version: 2.4
2
+ Name: tidymut
3
+ Version: 0.0.0
4
+ Summary: An efficient framework for tidying and standardizing protein mutation data.
5
+ Author-email: Yuxiang Tang <845351766@qq.com>
6
+ License: BSD 3-Clause License
7
+
8
+ Copyright (c) 2025, Yuxiang Tang.
9
+
10
+ Redistribution and use in source and binary forms, with or without
11
+ modification, are permitted provided that the following conditions are met:
12
+
13
+ 1. Redistributions of source code must retain the above copyright notice, this
14
+ list of conditions and the following disclaimer.
15
+
16
+ 2. Redistributions in binary form must reproduce the above copyright notice,
17
+ this list of conditions and the following disclaimer in the documentation
18
+ and/or other materials provided with the distribution.
19
+
20
+ 3. Neither the name of the copyright holder nor the names of its
21
+ contributors may be used to endorse or promote products derived from
22
+ this software without specific prior written permission.
23
+
24
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
27
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
28
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
+ Project-URL: Repository, https://github.com/xulab-research/TidyMut
35
+ Keywords: protein,mutation,tidy,framework,pipeline
36
+ Classifier: Development Status :: 3 - Alpha
37
+ Classifier: License :: OSI Approved :: BSD License
38
+ Classifier: Intended Audience :: Science/Research
39
+ Classifier: Programming Language :: Python :: 3 :: Only
40
+ Classifier: Programming Language :: Python :: 3.10
41
+ Classifier: Programming Language :: Python :: 3.11
42
+ Classifier: Programming Language :: Python :: 3.12
43
+ Classifier: Programming Language :: Python :: 3.13
44
+ Classifier: Operating System :: OS Independent
45
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
46
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
47
+ Classifier: Typing :: Typed
48
+ Requires-Python: >=3.10
49
+ Description-Content-Type: text/markdown
50
+ License-File: LICENSE
51
+ Requires-Dist: joblib>=1.5.0
52
+ Requires-Dist: numpy>=2.1.0
53
+ Requires-Dist: pandas>=2.1.0
54
+ Requires-Dist: tqdm>=4.60.0
55
+ Requires-Dist: python-dateutil>=2.8.2
56
+ Requires-Dist: tzdata>=2022.7
57
+ Provides-Extra: test
58
+ Requires-Dist: pytest>=8.0.0; extra == "test"
59
+ Requires-Dist: pytest-cov>=6.0.0; extra == "test"
60
+ Provides-Extra: dev
61
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
62
+ Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
63
+ Requires-Dist: sphinx>=8.0.0; extra == "dev"
64
+ Requires-Dist: sphinx-autobuild>=2024.10.0; extra == "dev"
65
+ Requires-Dist: sphinx_rtd_theme>=3.0.0; extra == "dev"
66
+ Dynamic: license-file
67
+
68
+ # TidyMut
69
+
70
+ A comprehensive Python package for processing and analyzing biological sequence data with advanced mutation analysis capabilities.
71
+
72
+ ## Overview
73
+
74
+ TidyMut is designed for bioinformaticians, computational biologists, and researchers working with genetic sequence data. The package streamlines the complex process of cleaning, processing, and analyzing DNA and protein sequences, with specialized tools for mutation analysis and large-scale dataset handling.
75
+
76
+ ### Key Capabilities
77
+
78
+ - **Sequence Data Processing**: Comprehensive support for DNA and protein sequence operations including complementation, transcription, translation, and validation
79
+ - **Advanced Mutation Analysis**: Specialized tools for detecting, analyzing, and characterizing genetic mutations with statistical insights
80
+ - **Intelligent Data Cleaning**: Automated preprocessing pipelines that handle common data quality issues in biological datasets
81
+ - **Flexible Pipeline Architecture**: Modular design allowing custom workflow creation for specific research needs
82
+ - **High-Performance Processing**: Optimized for handling large-scale sequence datasets efficiently
83
+
84
+ ## Installation
85
+
86
+ ### Requirements
87
+ - Python 3.10+
88
+ - pandas
89
+
90
+ ### Install via pip
91
+ ```bash
92
+ pip install tidymut
93
+ ```
94
+
95
+ ### Development Installation
96
+ ```bash
97
+ git clone https://github.com/xulab-research/TidyMut.git tidymut
98
+ cd tidymut
99
+ pip install -e .
100
+ ```
101
+
102
+ ## Quick Start
103
+
104
+ ### Processing K50 Dataset
105
+
106
+ Here's a complete example demonstrating TidyMut's capabilities with the K50 mutation dataset:
107
+
108
+ ```python
109
+ from tidymut import k50_cleaner
110
+
111
+
112
+ # Create K50 cleaning pipeline using TidyMut's default pipeline
113
+ # Download from: https://zenodo.org/records/7992926
114
+ # File: `Tsuboyama2023_Dataset2_Dataset3_20230416.csv` in `Processed_K50_dG_datasets.zip`
115
+ k50_cleaning_pipeline = k50_cleaner.create_k50_cleaner(
116
+ "path/to/Tsuboyama2023_Dataset2_Dataset3_20230416.csv"
117
+ )
118
+
119
+ # Clean and process the dataset
120
+ k50_dataset = k50_cleaner.clean_k50_dataset(k50_cleaning_pipeline)
121
+
122
+ # Save the processed dataset
123
+ k50_dataset.save("output/cleaned_k50_data")
124
+
125
+ # Access processed data
126
+ print(f"Dataset contains {len(k50_dataset)} sequences")
127
+ print(f"Mutation types identified: {k50_dataset.mutation_summary()}")
128
+ ```
129
+
130
+ ### Basic Sequence Operations
131
+
132
+ ```python
133
+ from tidymut.core.sequence import DNASequence, ProteinSequence
134
+
135
+ # DNA sequence analysis
136
+ dna = DNASequence("ATGCGATCGTAGC")
137
+ print(f"Complement: {dna.complement()}")
138
+ print(f"Reverse complement: {dna.reverse_complement()}")
139
+ print(f"Translation: {dna.translate()}")
140
+ ```
141
+
142
+ ## Core Features
143
+
144
+ ### Sequence Data Manipulation
145
+ - **Sequence Validation**: Automatic detection and correction of common sequence errors
146
+ - **Format Conversion**: Seamless conversion between different sequence formats
147
+ - **Batch Processing**: Efficient handling of large sequence collections
148
+
149
+ ### Mutation Analysis
150
+ - **Mutation Detection**: Automated identification of point mutations, insertions, and deletions
151
+ - **Statistical Analysis**: Comprehensive mutation frequency and distribution statistics
152
+ - **Visualization Tools**: Built-in plotting functions for mutation landscapes
153
+
154
+ ### Data Cleaning & Preprocessing
155
+ - **Standardization**: Consistent sequence formatting and annotation
156
+ - **Duplicate Removal**: Intelligent handling of redundant sequences
157
+
158
+ ### Pipeline Architecture
159
+ - **Modular Design**: Mix and match processing components
160
+ - **Parallel Processing**: Multi-core support for large datasets
161
+ - **Progress Tracking**: Real-time processing status and logging
162
+
163
+ ## Examples and Use Cases
164
+
165
+ ### Comparative Mutation Analysis
166
+ ```python
167
+ from tidymut.analysis import MutationComparator  # NOTE: illustrative only; `tidymut.analysis` is not included in the 0.0.0 release
168
+
169
+ comparator = MutationComparator()
170
+ comparator.add_dataset("wildtype", wt_sequences)
171
+ comparator.add_dataset("variant", variant_sequences)
172
+
173
+ results = comparator.compare_mutation_profiles()
174
+ comparator.plot_comparison(results)
175
+ ```
176
+
177
+ ### Custom Processing Pipeline
178
+ ```python
179
+ import pandas as pd
180
+ from typing import Tuple
181
+
182
+ from tidymut.cleaners.basic_cleaners import (
183
+ extract_and_rename_columns,
184
+ filter_and_clean_data,
185
+ convert_data_types,
186
+ validate_mutations,
187
+ infer_wildtype_sequences,
188
+ convert_to_mutation_dataset_format,
189
+ )
190
+ from tidymut.core.dataset import MutationDataset
191
+ from tidymut.core.pipeline import Pipeline, create_pipeline
192
+
193
+ dataset = pd.read_csv("path/to/Tsuboyama2023_Dataset2_Dataset3_20230416.csv")
194
+
195
+ pipeline = create_pipeline(dataset, "k50_cleaner")
196
+ clean_result = (
197
+ pipeline.then(
198
+ extract_and_rename_columns,
199
+ column_mapping={
200
+ "WT_name": "name",
201
+ "aa_seq": "mut_seq",
202
+ "mut_type": "mut_info",
203
+ "ddG_ML": "ddG",
204
+ },
205
+ )
206
+ .then(filter_and_clean_data, filters={"ddG": lambda x: x != "-"})
207
+ .then(convert_data_types, type_conversions={"ddG": "float"})
208
+ .then(
209
+ validate_mutations,
210
+ mutation_column="mut_info",
211
+ mutation_sep="_",
212
+ is_zero_based=False,
213
+ num_workers=16,
214
+ )
215
+ .then(
216
+ infer_wildtype_sequences,
217
+ label_columns=["ddG"],
218
+ handle_multiple_wt="error",
219
+ is_zero_based=True,
220
+ num_workers=16,
221
+ )
222
+ .then(
223
+ convert_to_mutation_dataset_format,
224
+ name_column="name",
225
+ mutation_column="mut_info",
226
+ mutated_sequence_column="mut_seq",
227
+ score_column="ddG",
228
+ is_zero_based=True,
229
+ )
230
+ )
231
+ k50_dataset_df, k50_ref_seq = clean_result.data
232
+ k50_dataset = MutationDataset.from_dataframe(k50_dataset_df, k50_ref_seq)
233
+
234
+ # Get execution summary
235
+ execution_info = pipeline.get_execution_summary()
236
+
237
+ # Access artifacts
238
+ artifacts = pipeline.artifacts
239
+
240
+ # Save pipeline state
241
+ pipeline.save_structured_data("k50_cleaner_pipeline.pkl")
242
+ ```
243
+
244
+ ## Contributing
245
+
246
+ We welcome contributions! Please see our [Contributing Guidelines](CONTRIBUTING.md) for details on:
247
+ - Code style and standards
248
+ - Testing requirements
249
+ - Pull request process
250
+ - Issue reporting
251
+
252
+ ## Citation
253
+
254
+ If you use TidyMut in your research, please cite:
255
+
256
+ ```bibtex
257
+ @software{tidymut,
258
+ title={TidyMut: A Python Package for Biological Sequence Data Processing},
259
+ author={Yuxiang Tang and Contributors},
260
+ year={2025},
261
+ url={https://github.com/xulab-research/tidymut}
262
+ }
263
+ ```
264
+
265
+ ## License
266
+
267
+ This project is licensed under the BSD 3-Clause License - see the [LICENSE](LICENSE) file for details.
268
+
269
+ ## Support
270
+
271
+ - **Issues**: [GitHub Issues](https://github.com/xulab-research/tidymut/issues)
272
+ - **Discussions**: [GitHub Discussions](https://github.com/xulab-research/tidymut/discussions)
273
+ - **Email**: 845351766@qq.com
@@ -0,0 +1,206 @@
1
+ # TidyMut
2
+
3
+ A comprehensive Python package for processing and analyzing biological sequence data with advanced mutation analysis capabilities.
4
+
5
+ ## Overview
6
+
7
+ TidyMut is designed for bioinformaticians, computational biologists, and researchers working with genetic sequence data. The package streamlines the complex process of cleaning, processing, and analyzing DNA and protein sequences, with specialized tools for mutation analysis and large-scale dataset handling.
8
+
9
+ ### Key Capabilities
10
+
11
+ - **Sequence Data Processing**: Comprehensive support for DNA and protein sequence operations including complementation, transcription, translation, and validation
12
+ - **Advanced Mutation Analysis**: Specialized tools for detecting, analyzing, and characterizing genetic mutations with statistical insights
13
+ - **Intelligent Data Cleaning**: Automated preprocessing pipelines that handle common data quality issues in biological datasets
14
+ - **Flexible Pipeline Architecture**: Modular design allowing custom workflow creation for specific research needs
15
+ - **High-Performance Processing**: Optimized for handling large-scale sequence datasets efficiently
16
+
17
+ ## Installation
18
+
19
+ ### Requirements
20
+ - Python 3.10+
21
+ - pandas
22
+
23
+ ### Install via pip
24
+ ```bash
25
+ pip install tidymut
26
+ ```
27
+
28
+ ### Development Installation
29
+ ```bash
30
+ git clone https://github.com/xulab-research/TidyMut.git tidymut
31
+ cd tidymut
32
+ pip install -e .
33
+ ```
34
+
35
+ ## Quick Start
36
+
37
+ ### Processing K50 Dataset
38
+
39
+ Here's a complete example demonstrating TidyMut's capabilities with the K50 mutation dataset:
40
+
41
+ ```python
42
+ from tidymut import k50_cleaner
43
+
44
+
45
+ # Create K50 cleaning pipeline using TidyMut's default pipeline
46
+ # Download from: https://zenodo.org/records/7992926
47
+ # File: `Tsuboyama2023_Dataset2_Dataset3_20230416.csv` in `Processed_K50_dG_datasets.zip`
48
+ k50_cleaning_pipeline = k50_cleaner.create_k50_cleaner(
49
+ "path/to/Tsuboyama2023_Dataset2_Dataset3_20230416.csv"
50
+ )
51
+
52
+ # Clean and process the dataset
53
+ k50_dataset = k50_cleaner.clean_k50_dataset(k50_cleaning_pipeline)
54
+
55
+ # Save the processed dataset
56
+ k50_dataset.save("output/cleaned_k50_data")
57
+
58
+ # Access processed data
59
+ print(f"Dataset contains {len(k50_dataset)} sequences")
60
+ print(f"Mutation types identified: {k50_dataset.mutation_summary()}")
61
+ ```
62
+
63
+ ### Basic Sequence Operations
64
+
65
+ ```python
66
+ from tidymut.core.sequence import DNASequence, ProteinSequence
67
+
68
+ # DNA sequence analysis
69
+ dna = DNASequence("ATGCGATCGTAGC")
70
+ print(f"Complement: {dna.complement()}")
71
+ print(f"Reverse complement: {dna.reverse_complement()}")
72
+ print(f"Translation: {dna.translate()}")
73
+ ```
74
+
75
+ ## Core Features
76
+
77
+ ### Sequence Data Manipulation
78
+ - **Sequence Validation**: Automatic detection and correction of common sequence errors
79
+ - **Format Conversion**: Seamless conversion between different sequence formats
80
+ - **Batch Processing**: Efficient handling of large sequence collections
81
+
82
+ ### Mutation Analysis
83
+ - **Mutation Detection**: Automated identification of point mutations, insertions, and deletions
84
+ - **Statistical Analysis**: Comprehensive mutation frequency and distribution statistics
85
+ - **Visualization Tools**: Built-in plotting functions for mutation landscapes
86
+
87
+ ### Data Cleaning & Preprocessing
88
+ - **Standardization**: Consistent sequence formatting and annotation
89
+ - **Duplicate Removal**: Intelligent handling of redundant sequences
90
+
91
+ ### Pipeline Architecture
92
+ - **Modular Design**: Mix and match processing components
93
+ - **Parallel Processing**: Multi-core support for large datasets
94
+ - **Progress Tracking**: Real-time processing status and logging
95
+
96
+ ## Examples and Use Cases
97
+
98
+ ### Comparative Mutation Analysis
99
+ ```python
100
+ from tidymut.analysis import MutationComparator  # NOTE: illustrative only; `tidymut.analysis` is not included in the 0.0.0 release
101
+
102
+ comparator = MutationComparator()
103
+ comparator.add_dataset("wildtype", wt_sequences)
104
+ comparator.add_dataset("variant", variant_sequences)
105
+
106
+ results = comparator.compare_mutation_profiles()
107
+ comparator.plot_comparison(results)
108
+ ```
109
+
110
+ ### Custom Processing Pipeline
111
+ ```python
112
+ import pandas as pd
113
+ from typing import Tuple
114
+
115
+ from tidymut.cleaners.basic_cleaners import (
116
+ extract_and_rename_columns,
117
+ filter_and_clean_data,
118
+ convert_data_types,
119
+ validate_mutations,
120
+ infer_wildtype_sequences,
121
+ convert_to_mutation_dataset_format,
122
+ )
123
+ from tidymut.core.dataset import MutationDataset
124
+ from tidymut.core.pipeline import Pipeline, create_pipeline
125
+
126
+ dataset = pd.read_csv("path/to/Tsuboyama2023_Dataset2_Dataset3_20230416.csv")
127
+
128
+ pipeline = create_pipeline(dataset, "k50_cleaner")
129
+ clean_result = (
130
+ pipeline.then(
131
+ extract_and_rename_columns,
132
+ column_mapping={
133
+ "WT_name": "name",
134
+ "aa_seq": "mut_seq",
135
+ "mut_type": "mut_info",
136
+ "ddG_ML": "ddG",
137
+ },
138
+ )
139
+ .then(filter_and_clean_data, filters={"ddG": lambda x: x != "-"})
140
+ .then(convert_data_types, type_conversions={"ddG": "float"})
141
+ .then(
142
+ validate_mutations,
143
+ mutation_column="mut_info",
144
+ mutation_sep="_",
145
+ is_zero_based=False,
146
+ num_workers=16,
147
+ )
148
+ .then(
149
+ infer_wildtype_sequences,
150
+ label_columns=["ddG"],
151
+ handle_multiple_wt="error",
152
+ is_zero_based=True,
153
+ num_workers=16,
154
+ )
155
+ .then(
156
+ convert_to_mutation_dataset_format,
157
+ name_column="name",
158
+ mutation_column="mut_info",
159
+ mutated_sequence_column="mut_seq",
160
+ score_column="ddG",
161
+ is_zero_based=True,
162
+ )
163
+ )
164
+ k50_dataset_df, k50_ref_seq = clean_result.data
165
+ k50_dataset = MutationDataset.from_dataframe(k50_dataset_df, k50_ref_seq)
166
+
167
+ # Get execution summary
168
+ execution_info = pipeline.get_execution_summary()
169
+
170
+ # Access artifacts
171
+ artifacts = pipeline.artifacts
172
+
173
+ # Save pipeline state
174
+ pipeline.save_structured_data("k50_cleaner_pipeline.pkl")
175
+ ```
176
+
177
+ ## Contributing
178
+
179
+ We welcome contributions! Please see our [Contributing Guidelines](CONTRIBUTING.md) for details on:
180
+ - Code style and standards
181
+ - Testing requirements
182
+ - Pull request process
183
+ - Issue reporting
184
+
185
+ ## Citation
186
+
187
+ If you use TidyMut in your research, please cite:
188
+
189
+ ```bibtex
190
+ @software{tidymut,
191
+ title={TidyMut: A Python Package for Biological Sequence Data Processing},
192
+ author={Yuxiang Tang and Contributors},
193
+ year={2025},
194
+ url={https://github.com/xulab-research/tidymut}
195
+ }
196
+ ```
197
+
198
+ ## License
199
+
200
+ This project is licensed under the BSD 3-Clause License - see the [LICENSE](LICENSE) file for details.
201
+
202
+ ## Support
203
+
204
+ - **Issues**: [GitHub Issues](https://github.com/xulab-research/tidymut/issues)
205
+ - **Discussions**: [GitHub Discussions](https://github.com/xulab-research/tidymut/discussions)
206
+ - **Email**: 845351766@qq.com
@@ -0,0 +1,48 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77.0.3", "setuptools_scm>=8.2.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tidymut"
7
+ dynamic = ["version"]
8
+ description = "An efficient framework for tidying and standardizing protein mutation data."
9
+ readme = "README.md"
10
+ authors = [{ name = "Yuxiang Tang", email = "845351766@qq.com" }]
11
+ license = { file = "LICENSE" }
12
+ requires-python = ">=3.10"
13
+ dependencies = [
14
+ "joblib>=1.5.0",
15
+ "numpy>=2.1.0",
16
+ "pandas>=2.1.0",
17
+ "tqdm>=4.60.0",
18
+ "python-dateutil>=2.8.2",
19
+ "tzdata>=2022.7",
20
+ ]
21
+ keywords = ["protein", "mutation", "tidy", "framework", "pipeline"]
22
+ classifiers = [
23
+ "Development Status :: 3 - Alpha",
24
+ "License :: OSI Approved :: BSD License",
25
+ "Intended Audience :: Science/Research",
26
+ "Programming Language :: Python :: 3 :: Only",
27
+ "Programming Language :: Python :: 3.10",
28
+ "Programming Language :: Python :: 3.11",
29
+ "Programming Language :: Python :: 3.12",
30
+ "Programming Language :: Python :: 3.13",
31
+ "Operating System :: OS Independent",
32
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
33
+ "Topic :: Software Development :: Libraries :: Python Modules",
34
+ "Typing :: Typed",
35
+ ]
36
+
37
+ [project.urls]
38
+ Repository = "https://github.com/xulab-research/TidyMut"
39
+
40
+ [project.optional-dependencies]
41
+ test = ["pytest>=8.0.0", "pytest-cov>=6.0.0"]
42
+ dev = [
43
+ "pytest>=8.0.0",
44
+ "pytest-cov>=6.0.0",
45
+ "sphinx>=8.0.0",
46
+ "sphinx-autobuild>=2024.10.0",
47
+ "sphinx_rtd_theme>=3.0.0"
48
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+