tidymut 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tidymut-0.0.0/.gitignore +45 -0
- tidymut-0.0.0/LICENSE +28 -0
- tidymut-0.0.0/PKG-INFO +273 -0
- tidymut-0.0.0/README.md +206 -0
- tidymut-0.0.0/pyproject.toml +48 -0
- tidymut-0.0.0/setup.cfg +4 -0
- tidymut-0.0.0/test/test_dataset.py +664 -0
- tidymut-0.0.0/test/test_mutation.py +746 -0
- tidymut-0.0.0/test/test_pipeline.py +1257 -0
- tidymut-0.0.0/test/test_sequence.py +577 -0
- tidymut-0.0.0/tidymut/__init__.py +26 -0
- tidymut-0.0.0/tidymut/cleaners/__init__.py +25 -0
- tidymut-0.0.0/tidymut/cleaners/base_config.py +208 -0
- tidymut-0.0.0/tidymut/cleaners/basic_cleaners.py +1165 -0
- tidymut-0.0.0/tidymut/cleaners/human_domainome_cleaner.py +417 -0
- tidymut-0.0.0/tidymut/cleaners/human_domainome_custom_cleaners.py +363 -0
- tidymut-0.0.0/tidymut/cleaners/k50_cleaner.py +283 -0
- tidymut-0.0.0/tidymut/cleaners/protein_gym_cleaner.py +318 -0
- tidymut-0.0.0/tidymut/cleaners/protein_gym_custom_cleaners.py +203 -0
- tidymut-0.0.0/tidymut/core/__init__.py +43 -0
- tidymut-0.0.0/tidymut/core/alphabet.py +124 -0
- tidymut-0.0.0/tidymut/core/codon.py +84 -0
- tidymut-0.0.0/tidymut/core/constants.py +78 -0
- tidymut-0.0.0/tidymut/core/dataset.py +1498 -0
- tidymut-0.0.0/tidymut/core/mutation.py +797 -0
- tidymut-0.0.0/tidymut/core/pipeline.py +1010 -0
- tidymut-0.0.0/tidymut/core/sequence.py +733 -0
- tidymut-0.0.0/tidymut/core/types.py +14 -0
- tidymut-0.0.0/tidymut/utils/__init__.py +0 -0
- tidymut-0.0.0/tidymut/utils/cleaner_workers.py +296 -0
- tidymut-0.0.0/tidymut/utils/dataset_builders.py +293 -0
- tidymut-0.0.0/tidymut/utils/mutation_converter.py +51 -0
- tidymut-0.0.0/tidymut/utils/sequence_io.py +517 -0
- tidymut-0.0.0/tidymut/utils/type_converter.py +313 -0
- tidymut-0.0.0/tidymut.egg-info/PKG-INFO +273 -0
- tidymut-0.0.0/tidymut.egg-info/SOURCES.txt +37 -0
- tidymut-0.0.0/tidymut.egg-info/dependency_links.txt +1 -0
- tidymut-0.0.0/tidymut.egg-info/requires.txt +17 -0
- tidymut-0.0.0/tidymut.egg-info/top_level.txt +1 -0
tidymut-0.0.0/.gitignore
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Editor temporary/working/backup files #
|
|
2
|
+
#########################################
|
|
3
|
+
.#*
|
|
4
|
+
*\#*\#
|
|
5
|
+
[#]*#
|
|
6
|
+
*~
|
|
7
|
+
*$
|
|
8
|
+
*.bak
|
|
9
|
+
*flymake*
|
|
10
|
+
*.iml
|
|
11
|
+
*.kdev4
|
|
12
|
+
*.log
|
|
13
|
+
*.swp
|
|
14
|
+
*.pdb
|
|
15
|
+
*.zip
|
|
16
|
+
.project
|
|
17
|
+
.pydevproject
|
|
18
|
+
.settings
|
|
19
|
+
.idea
|
|
20
|
+
.vagrant
|
|
21
|
+
.noseids
|
|
22
|
+
.ipynb_checkpoints
|
|
23
|
+
.tags
|
|
24
|
+
.cache/
|
|
25
|
+
.vscode/
|
|
26
|
+
|
|
27
|
+
# Python files #
|
|
28
|
+
################
|
|
29
|
+
__pycache__/
|
|
30
|
+
# pytest
|
|
31
|
+
/.pytest_cache
|
|
32
|
+
# egg metadata
|
|
33
|
+
*.egg-info
|
|
34
|
+
*.eggs
|
|
35
|
+
# coverage
|
|
36
|
+
.coverage
|
|
37
|
+
coverage.xml
|
|
38
|
+
coverage_html_report
|
|
39
|
+
htmlcov
|
|
40
|
+
|
|
41
|
+
# Documentation related files #
|
|
42
|
+
###############################
|
|
43
|
+
doc/source/
|
|
44
|
+
doc/make.bat
|
|
45
|
+
doc/Makefile
|
tidymut-0.0.0/LICENSE
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025, Yuxiang Tang.
|
|
4
|
+
|
|
5
|
+
Redistribution and use in source and binary forms, with or without
|
|
6
|
+
modification, are permitted provided that the following conditions are met:
|
|
7
|
+
|
|
8
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
9
|
+
list of conditions and the following disclaimer.
|
|
10
|
+
|
|
11
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
12
|
+
this list of conditions and the following disclaimer in the documentation
|
|
13
|
+
and/or other materials provided with the distribution.
|
|
14
|
+
|
|
15
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
16
|
+
contributors may be used to endorse or promote products derived from
|
|
17
|
+
this software without specific prior written permission.
|
|
18
|
+
|
|
19
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
20
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
21
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
22
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
23
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
24
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
25
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
26
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
27
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
28
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
tidymut-0.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tidymut
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: An efficient framework for tidying and standardizing protein mutation data.
|
|
5
|
+
Author-email: Yuxiang Tang <845351766@qq.com>
|
|
6
|
+
License: BSD 3-Clause License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025, Yuxiang Tang.
|
|
9
|
+
|
|
10
|
+
Redistribution and use in source and binary forms, with or without
|
|
11
|
+
modification, are permitted provided that the following conditions are met:
|
|
12
|
+
|
|
13
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
14
|
+
list of conditions and the following disclaimer.
|
|
15
|
+
|
|
16
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
17
|
+
this list of conditions and the following disclaimer in the documentation
|
|
18
|
+
and/or other materials provided with the distribution.
|
|
19
|
+
|
|
20
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
21
|
+
contributors may be used to endorse or promote products derived from
|
|
22
|
+
this software without specific prior written permission.
|
|
23
|
+
|
|
24
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
25
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
26
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
27
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
28
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
29
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
30
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
31
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
32
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
33
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
34
|
+
Project-URL: Repository, https://github.com/xulab-research/TidyMut
|
|
35
|
+
Keywords: protein,mutation,tidy,framework,pipeline
|
|
36
|
+
Classifier: Development Status :: 3 - Alpha
|
|
37
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
38
|
+
Classifier: Intended Audience :: Science/Research
|
|
39
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
43
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
44
|
+
Classifier: Operating System :: OS Independent
|
|
45
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
46
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
47
|
+
Classifier: Typing :: Typed
|
|
48
|
+
Requires-Python: >=3.10
|
|
49
|
+
Description-Content-Type: text/markdown
|
|
50
|
+
License-File: LICENSE
|
|
51
|
+
Requires-Dist: joblib>=1.5.0
|
|
52
|
+
Requires-Dist: numpy>=2.1.0
|
|
53
|
+
Requires-Dist: pandas>=2.1.0
|
|
54
|
+
Requires-Dist: tqdm>=4.60.0
|
|
55
|
+
Requires-Dist: python-dateutil>=2.8.2
|
|
56
|
+
Requires-Dist: tzdata>=2022.7
|
|
57
|
+
Provides-Extra: test
|
|
58
|
+
Requires-Dist: pytest>=8.0.0; extra == "test"
|
|
59
|
+
Requires-Dist: pytest-cov>=6.0.0; extra == "test"
|
|
60
|
+
Provides-Extra: dev
|
|
61
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
62
|
+
Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
|
|
63
|
+
Requires-Dist: sphinx>=8.0.0; extra == "dev"
|
|
64
|
+
Requires-Dist: sphinx-autobuild>=2024.10.0; extra == "dev"
|
|
65
|
+
Requires-Dist: sphinx_rtd_theme>=3.0.0; extra == "dev"
|
|
66
|
+
Dynamic: license-file
|
|
67
|
+
|
|
68
|
+
# TidyMut
|
|
69
|
+
|
|
70
|
+
A comprehensive Python package for processing and analyzing biological sequence data with advanced mutation analysis capabilities.
|
|
71
|
+
|
|
72
|
+
## Overview
|
|
73
|
+
|
|
74
|
+
TidyMut is designed for bioinformaticians, computational biologists, and researchers working with genetic sequence data. The package streamlines the complex process of cleaning, processing, and analyzing DNA and protein sequences, with specialized tools for mutation analysis and large-scale dataset handling.
|
|
75
|
+
|
|
76
|
+
### Key Capabilities
|
|
77
|
+
|
|
78
|
+
- **Sequence Data Processing**: Comprehensive support for DNA and protein sequence operations including complementation, transcription, translation, and validation
|
|
79
|
+
- **Advanced Mutation Analysis**: Specialized tools for detecting, analyzing, and characterizing genetic mutations with statistical insights
|
|
80
|
+
- **Intelligent Data Cleaning**: Automated preprocessing pipelines that handle common data quality issues in biological datasets
|
|
81
|
+
- **Flexible Pipeline Architecture**: Modular design allowing custom workflow creation for specific research needs
|
|
82
|
+
- **High-Performance Processing**: Optimized for handling large-scale sequence datasets efficiently
|
|
83
|
+
|
|
84
|
+
## Installation
|
|
85
|
+
|
|
86
|
+
### Requirements
|
|
87
|
+
- Python 3.10+
|
|
88
|
+
- pandas
|
|
89
|
+
|
|
90
|
+
### Install via pip
|
|
91
|
+
```bash
|
|
92
|
+
pip install tidymut
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Development Installation
|
|
96
|
+
```bash
|
|
97
|
+
git clone https://github.com/xulab-research/TidyMut.git tidymut
|
|
98
|
+
cd tidymut
|
|
99
|
+
pip install -e .
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Quick Start
|
|
103
|
+
|
|
104
|
+
### Processing K50 Dataset
|
|
105
|
+
|
|
106
|
+
Here's a complete example demonstrating TidyMut's capabilities with the K50 mutation dataset:
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from tidymut import k50_cleaner
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# Create K50 cleaning pipeline using TidyMut's default pipeline
|
|
113
|
+
# Download from: https://zenodo.org/records/7992926
|
|
114
|
+
# File: `Tsuboyama2023_Dataset2_Dataset3_20230416.csv` in `Processed_K50_dG_datasets.zip`
|
|
115
|
+
k50_cleaning_pipeline = k50_cleaner.create_k50_cleaner(
|
|
116
|
+
"path/to/Tsuboyama2023_Dataset2_Dataset3_20230416.csv"
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
# Clean and process the dataset
|
|
120
|
+
k50_dataset = k50_cleaner.clean_k50_dataset(k50_cleaning_pipeline)
|
|
121
|
+
|
|
122
|
+
# Save the processed dataset
|
|
123
|
+
k50_dataset.save("output/cleaned_k50_data")
|
|
124
|
+
|
|
125
|
+
# Access processed data
|
|
126
|
+
print(f"Dataset contains {len(k50_dataset)} sequences")
|
|
127
|
+
print(f"Mutation types identified: {k50_dataset.mutation_summary()}")
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Basic Sequence Operations
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from tidymut.core.sequence import DNASequence, ProteinSequence
|
|
134
|
+
|
|
135
|
+
# DNA sequence analysis
|
|
136
|
+
dna = DNASequence("ATGCGATCGTAGC")
|
|
137
|
+
print(f"Complement: {dna.complement()}")
|
|
138
|
+
print(f"Reverse complement: {dna.reverse_complement()}")
|
|
139
|
+
print(f"Translation: {dna.translate()}")
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Core Features
|
|
143
|
+
|
|
144
|
+
### Sequence Data Manipulation
|
|
145
|
+
- **Sequence Validation**: Automatic detection and correction of common sequence errors
|
|
146
|
+
- **Format Conversion**: Seamless conversion between different sequence formats
|
|
147
|
+
- **Batch Processing**: Efficient handling of large sequence collections
|
|
148
|
+
|
|
149
|
+
### Mutation Analysis
|
|
150
|
+
- **Mutation Detection**: Automated identification of point mutations, insertions, and deletions
|
|
151
|
+
- **Statistical Analysis**: Comprehensive mutation frequency and distribution statistics
|
|
152
|
+
- **Visualization Tools**: Built-in plotting functions for mutation landscapes
|
|
153
|
+
|
|
154
|
+
### Data Cleaning & Preprocessing
|
|
155
|
+
- **Standardization**: Consistent sequence formatting and annotation
|
|
156
|
+
- **Duplicate Removal**: Intelligent handling of redundant sequences
|
|
157
|
+
|
|
158
|
+
### Pipeline Architecture
|
|
159
|
+
- **Modular Design**: Mix and match processing components
|
|
160
|
+
- **Parallel Processing**: Multi-core support for large datasets
|
|
161
|
+
- **Progress Tracking**: Real-time processing status and logging
|
|
162
|
+
|
|
163
|
+
## Examples and Use Cases
|
|
164
|
+
|
|
165
|
+
### Comparative Mutation Analysis
|
|
166
|
+
```python
|
|
167
|
+
from tidymut.analysis import MutationComparator
|
|
168
|
+
|
|
169
|
+
comparator = MutationComparator()
|
|
170
|
+
comparator.add_dataset("wildtype", wt_sequences)
|
|
171
|
+
comparator.add_dataset("variant", variant_sequences)
|
|
172
|
+
|
|
173
|
+
results = comparator.compare_mutation_profiles()
|
|
174
|
+
comparator.plot_comparison(results)
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Custom Processing Pipeline
|
|
178
|
+
```python
|
|
179
|
+
import pandas as pd
|
|
180
|
+
from typing import Tuple
|
|
181
|
+
|
|
182
|
+
from tidymut.cleaners.basic_cleaners import (
|
|
183
|
+
extract_and_rename_columns,
|
|
184
|
+
filter_and_clean_data,
|
|
185
|
+
convert_data_types,
|
|
186
|
+
validate_mutations,
|
|
187
|
+
infer_wildtype_sequences,
|
|
188
|
+
convert_to_mutation_dataset_format,
|
|
189
|
+
)
|
|
190
|
+
from tidymut.core.dataset import MutationDataset
|
|
191
|
+
from tidymut.core.pipeline import Pipeline, create_pipeline
|
|
192
|
+
|
|
193
|
+
dataset = pd.read_csv("path/to/Tsuboyama2023_Dataset2_Dataset3_20230416.csv")
|
|
194
|
+
|
|
195
|
+
pipeline = create_pipeline(dataset, "k50_cleaner")
|
|
196
|
+
clean_result = (
|
|
197
|
+
pipeline.then(
|
|
198
|
+
extract_and_rename_columns,
|
|
199
|
+
column_mapping={
|
|
200
|
+
"WT_name": "name",
|
|
201
|
+
"aa_seq": "mut_seq",
|
|
202
|
+
"mut_type": "mut_info",
|
|
203
|
+
"ddG_ML": "ddG",
|
|
204
|
+
},
|
|
205
|
+
)
|
|
206
|
+
.then(filter_and_clean_data, filters={"ddG": lambda x: x != "-"})
|
|
207
|
+
.then(convert_data_types, type_conversions={"ddG": "float"})
|
|
208
|
+
.then(
|
|
209
|
+
validate_mutations,
|
|
210
|
+
mutation_column="mut_info",
|
|
211
|
+
mutation_sep="_",
|
|
212
|
+
is_zero_based=False,
|
|
213
|
+
num_workers=16,
|
|
214
|
+
)
|
|
215
|
+
.then(
|
|
216
|
+
infer_wildtype_sequences,
|
|
217
|
+
label_columns=["ddG"],
|
|
218
|
+
handle_multiple_wt="error",
|
|
219
|
+
is_zero_based=True,
|
|
220
|
+
num_workers=16,
|
|
221
|
+
)
|
|
222
|
+
.then(
|
|
223
|
+
convert_to_mutation_dataset_format,
|
|
224
|
+
name_column="name",
|
|
225
|
+
mutation_column="mut_info",
|
|
226
|
+
mutated_sequence_column="mut_seq",
|
|
227
|
+
score_column="ddG",
|
|
228
|
+
is_zero_based=True,
|
|
229
|
+
)
|
|
230
|
+
)
|
|
231
|
+
k50_dataset_df, k50_ref_seq = clean_result.data
|
|
232
|
+
k50_dataset = MutationDataset.from_dataframe(k50_dataset_df, k50_ref_seq)
|
|
233
|
+
|
|
234
|
+
# Get execution summary
|
|
235
|
+
execution_info = pipeline.get_execution_summary()
|
|
236
|
+
|
|
237
|
+
# Access artifacts
|
|
238
|
+
artifacts = pipeline.artifacts
|
|
239
|
+
|
|
240
|
+
# Save pipeline state
|
|
241
|
+
pipeline.save_structured_data("k50_cleaner_pipeline.pkl")
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
## Contributing
|
|
245
|
+
|
|
246
|
+
We welcome contributions! Please see our [Contributing Guidelines](CONTRIBUTING.md) for details on:
|
|
247
|
+
- Code style and standards
|
|
248
|
+
- Testing requirements
|
|
249
|
+
- Pull request process
|
|
250
|
+
- Issue reporting
|
|
251
|
+
|
|
252
|
+
## Citation
|
|
253
|
+
|
|
254
|
+
If you use TidyMut in your research, please cite:
|
|
255
|
+
|
|
256
|
+
```bibtex
|
|
257
|
+
@software{tidymut,
|
|
258
|
+
title={TidyMut: A Python Package for Biological Sequence Data Processing},
|
|
259
|
+
author={Yuxiang Tang and Contributors},
|
|
260
|
+
year={2025},
|
|
261
|
+
url={https://github.com/xulab-research/tidymut}
|
|
262
|
+
}
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
## License
|
|
266
|
+
|
|
267
|
+
This project is licensed under the BSD 3-Clause License - see the [LICENSE](LICENSE) file for details.
|
|
268
|
+
|
|
269
|
+
## Support
|
|
270
|
+
|
|
271
|
+
- **Issues**: [GitHub Issues](https://github.com/xulab-research/tidymut/issues)
|
|
272
|
+
- **Discussions**: [GitHub Discussions](https://github.com/xulab-research/tidymut/discussions)
|
|
273
|
+
- **Email**:
|
tidymut-0.0.0/README.md
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
# TidyMut
|
|
2
|
+
|
|
3
|
+
A comprehensive Python package for processing and analyzing biological sequence data with advanced mutation analysis capabilities.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
TidyMut is designed for bioinformaticians, computational biologists, and researchers working with genetic sequence data. The package streamlines the complex process of cleaning, processing, and analyzing DNA and protein sequences, with specialized tools for mutation analysis and large-scale dataset handling.
|
|
8
|
+
|
|
9
|
+
### Key Capabilities
|
|
10
|
+
|
|
11
|
+
- **Sequence Data Processing**: Comprehensive support for DNA and protein sequence operations including complementation, transcription, translation, and validation
|
|
12
|
+
- **Advanced Mutation Analysis**: Specialized tools for detecting, analyzing, and characterizing genetic mutations with statistical insights
|
|
13
|
+
- **Intelligent Data Cleaning**: Automated preprocessing pipelines that handle common data quality issues in biological datasets
|
|
14
|
+
- **Flexible Pipeline Architecture**: Modular design allowing custom workflow creation for specific research needs
|
|
15
|
+
- **High-Performance Processing**: Optimized for handling large-scale sequence datasets efficiently
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
### Requirements
|
|
20
|
+
- Python 3.10+
|
|
21
|
+
- pandas
|
|
22
|
+
|
|
23
|
+
### Install via pip
|
|
24
|
+
```bash
|
|
25
|
+
pip install tidymut
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Development Installation
|
|
29
|
+
```bash
|
|
30
|
+
git clone https://github.com/xulab-research/TidyMut.git tidymut
|
|
31
|
+
cd tidymut
|
|
32
|
+
pip install -e .
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Quick Start
|
|
36
|
+
|
|
37
|
+
### Processing K50 Dataset
|
|
38
|
+
|
|
39
|
+
Here's a complete example demonstrating TidyMut's capabilities with the K50 mutation dataset:
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from tidymut import k50_cleaner
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Create K50 cleaning pipeline using TidyMut's default pipeline
|
|
46
|
+
# Download from: https://zenodo.org/records/7992926
|
|
47
|
+
# File: `Tsuboyama2023_Dataset2_Dataset3_20230416.csv` in `Processed_K50_dG_datasets.zip`
|
|
48
|
+
k50_cleaning_pipeline = k50_cleaner.create_k50_cleaner(
|
|
49
|
+
"path/to/Tsuboyama2023_Dataset2_Dataset3_20230416.csv"
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Clean and process the dataset
|
|
53
|
+
k50_dataset = k50_cleaner.clean_k50_dataset(k50_cleaning_pipeline)
|
|
54
|
+
|
|
55
|
+
# Save the processed dataset
|
|
56
|
+
k50_dataset.save("output/cleaned_k50_data")
|
|
57
|
+
|
|
58
|
+
# Access processed data
|
|
59
|
+
print(f"Dataset contains {len(k50_dataset)} sequences")
|
|
60
|
+
print(f"Mutation types identified: {k50_dataset.mutation_summary()}")
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Basic Sequence Operations
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from tidymut.core.sequence import DNASequence, ProteinSequence
|
|
67
|
+
|
|
68
|
+
# DNA sequence analysis
|
|
69
|
+
dna = DNASequence("ATGCGATCGTAGC")
|
|
70
|
+
print(f"Complement: {dna.complement()}")
|
|
71
|
+
print(f"Reverse complement: {dna.reverse_complement()}")
|
|
72
|
+
print(f"Translation: {dna.translate()}")
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Core Features
|
|
76
|
+
|
|
77
|
+
### Sequence Data Manipulation
|
|
78
|
+
- **Sequence Validation**: Automatic detection and correction of common sequence errors
|
|
79
|
+
- **Format Conversion**: Seamless conversion between different sequence formats
|
|
80
|
+
- **Batch Processing**: Efficient handling of large sequence collections
|
|
81
|
+
|
|
82
|
+
### Mutation Analysis
|
|
83
|
+
- **Mutation Detection**: Automated identification of point mutations, insertions, and deletions
|
|
84
|
+
- **Statistical Analysis**: Comprehensive mutation frequency and distribution statistics
|
|
85
|
+
- **Visualization Tools**: Built-in plotting functions for mutation landscapes
|
|
86
|
+
|
|
87
|
+
### Data Cleaning & Preprocessing
|
|
88
|
+
- **Standardization**: Consistent sequence formatting and annotation
|
|
89
|
+
- **Duplicate Removal**: Intelligent handling of redundant sequences
|
|
90
|
+
|
|
91
|
+
### Pipeline Architecture
|
|
92
|
+
- **Modular Design**: Mix and match processing components
|
|
93
|
+
- **Parallel Processing**: Multi-core support for large datasets
|
|
94
|
+
- **Progress Tracking**: Real-time processing status and logging
|
|
95
|
+
|
|
96
|
+
## Examples and Use Cases
|
|
97
|
+
|
|
98
|
+
### Comparative Mutation Analysis
|
|
99
|
+
```python
|
|
100
|
+
from tidymut.analysis import MutationComparator
|
|
101
|
+
|
|
102
|
+
comparator = MutationComparator()
|
|
103
|
+
comparator.add_dataset("wildtype", wt_sequences)
|
|
104
|
+
comparator.add_dataset("variant", variant_sequences)
|
|
105
|
+
|
|
106
|
+
results = comparator.compare_mutation_profiles()
|
|
107
|
+
comparator.plot_comparison(results)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Custom Processing Pipeline
|
|
111
|
+
```python
|
|
112
|
+
import pandas as pd
|
|
113
|
+
from typing import Tuple
|
|
114
|
+
|
|
115
|
+
from tidymut.cleaners.basic_cleaners import (
|
|
116
|
+
extract_and_rename_columns,
|
|
117
|
+
filter_and_clean_data,
|
|
118
|
+
convert_data_types,
|
|
119
|
+
validate_mutations,
|
|
120
|
+
infer_wildtype_sequences,
|
|
121
|
+
convert_to_mutation_dataset_format,
|
|
122
|
+
)
|
|
123
|
+
from tidymut.core.dataset import MutationDataset
|
|
124
|
+
from tidymut.core.pipeline import Pipeline, create_pipeline
|
|
125
|
+
|
|
126
|
+
dataset = pd.read_csv("path/to/Tsuboyama2023_Dataset2_Dataset3_20230416.csv")
|
|
127
|
+
|
|
128
|
+
pipeline = create_pipeline(dataset, "k50_cleaner")
|
|
129
|
+
clean_result = (
|
|
130
|
+
pipeline.then(
|
|
131
|
+
extract_and_rename_columns,
|
|
132
|
+
column_mapping={
|
|
133
|
+
"WT_name": "name",
|
|
134
|
+
"aa_seq": "mut_seq",
|
|
135
|
+
"mut_type": "mut_info",
|
|
136
|
+
"ddG_ML": "ddG",
|
|
137
|
+
},
|
|
138
|
+
)
|
|
139
|
+
.then(filter_and_clean_data, filters={"ddG": lambda x: x != "-"})
|
|
140
|
+
.then(convert_data_types, type_conversions={"ddG": "float"})
|
|
141
|
+
.then(
|
|
142
|
+
validate_mutations,
|
|
143
|
+
mutation_column="mut_info",
|
|
144
|
+
mutation_sep="_",
|
|
145
|
+
is_zero_based=False,
|
|
146
|
+
num_workers=16,
|
|
147
|
+
)
|
|
148
|
+
.then(
|
|
149
|
+
infer_wildtype_sequences,
|
|
150
|
+
label_columns=["ddG"],
|
|
151
|
+
handle_multiple_wt="error",
|
|
152
|
+
is_zero_based=True,
|
|
153
|
+
num_workers=16,
|
|
154
|
+
)
|
|
155
|
+
.then(
|
|
156
|
+
convert_to_mutation_dataset_format,
|
|
157
|
+
name_column="name",
|
|
158
|
+
mutation_column="mut_info",
|
|
159
|
+
mutated_sequence_column="mut_seq",
|
|
160
|
+
score_column="ddG",
|
|
161
|
+
is_zero_based=True,
|
|
162
|
+
)
|
|
163
|
+
)
|
|
164
|
+
k50_dataset_df, k50_ref_seq = clean_result.data
|
|
165
|
+
k50_dataset = MutationDataset.from_dataframe(k50_dataset_df, k50_ref_seq)
|
|
166
|
+
|
|
167
|
+
# Get execution summary
|
|
168
|
+
execution_info = pipeline.get_execution_summary()
|
|
169
|
+
|
|
170
|
+
# Access artifacts
|
|
171
|
+
artifacts = pipeline.artifacts
|
|
172
|
+
|
|
173
|
+
# Save pipeline state
|
|
174
|
+
pipeline.save_structured_data("k50_cleaner_pipeline.pkl")
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Contributing
|
|
178
|
+
|
|
179
|
+
We welcome contributions! Please see our [Contributing Guidelines](CONTRIBUTING.md) for details on:
|
|
180
|
+
- Code style and standards
|
|
181
|
+
- Testing requirements
|
|
182
|
+
- Pull request process
|
|
183
|
+
- Issue reporting
|
|
184
|
+
|
|
185
|
+
## Citation
|
|
186
|
+
|
|
187
|
+
If you use TidyMut in your research, please cite:
|
|
188
|
+
|
|
189
|
+
```bibtex
|
|
190
|
+
@software{tidymut,
|
|
191
|
+
title={TidyMut: A Python Package for Biological Sequence Data Processing},
|
|
192
|
+
author={Yuxiang Tang and Contributors},
|
|
193
|
+
year={2025},
|
|
194
|
+
url={https://github.com/xulab-research/tidymut}
|
|
195
|
+
}
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## License
|
|
199
|
+
|
|
200
|
+
This project is licensed under the BSD 3-Clause License - see the [LICENSE](LICENSE) file for details.
|
|
201
|
+
|
|
202
|
+
## Support
|
|
203
|
+
|
|
204
|
+
- **Issues**: [GitHub Issues](https://github.com/xulab-research/tidymut/issues)
|
|
205
|
+
- **Discussions**: [GitHub Discussions](https://github.com/xulab-research/tidymut/discussions)
|
|
206
|
+
- **Email**:
|
tidymut-0.0.0/pyproject.toml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77.0.3", "setuptools_scm>=8.2.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tidymut"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "An efficient framework for tidying and standardizing protein mutation data."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [{ name = "Yuxiang Tang", email = "845351766@qq.com" }]
|
|
11
|
+
license = { file = "LICENSE" }
|
|
12
|
+
requires-python = ">=3.10"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"joblib>=1.5.0",
|
|
15
|
+
"numpy>=2.1.0",
|
|
16
|
+
"pandas>=2.1.0",
|
|
17
|
+
"tqdm>=4.60.0",
|
|
18
|
+
"python-dateutil>=2.8.2",
|
|
19
|
+
"tzdata>=2022.7",
|
|
20
|
+
]
|
|
21
|
+
keywords = ["protein", "mutation", "tidy", "framework", "pipeline"]
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Development Status :: 3 - Alpha",
|
|
24
|
+
"License :: OSI Approved :: BSD License",
|
|
25
|
+
"Intended Audience :: Science/Research",
|
|
26
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
27
|
+
"Programming Language :: Python :: 3.10",
|
|
28
|
+
"Programming Language :: Python :: 3.11",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
"Programming Language :: Python :: 3.13",
|
|
31
|
+
"Operating System :: OS Independent",
|
|
32
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
33
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
34
|
+
"Typing :: Typed",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Repository = "https://github.com/xulab-research/TidyMut"
|
|
39
|
+
|
|
40
|
+
[project.optional-dependencies]
|
|
41
|
+
test = ["pytest>=8.0.0", "pytest-cov>=6.0.0"]
|
|
42
|
+
dev = [
|
|
43
|
+
"pytest>=8.0.0",
|
|
44
|
+
"pytest-cov>=6.0.0",
|
|
45
|
+
"sphinx>=8.0.0",
|
|
46
|
+
"sphinx-autobuild>=2024.10.0",
|
|
47
|
+
"sphinx_rtd_theme>=3.0.0"
|
|
48
|
+
]
|
tidymut-0.0.0/setup.cfg
ADDED