mutcleaner 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. mutcleaner-0.1.0/.github/workflows/docs.yml +56 -0
  2. mutcleaner-0.1.0/.github/workflows/publish.yml +60 -0
  3. mutcleaner-0.1.0/.gitignore +65 -0
  4. mutcleaner-0.1.0/CONTRIBUTING.md +151 -0
  5. mutcleaner-0.1.0/LICENSE +28 -0
  6. mutcleaner-0.1.0/PKG-INFO +296 -0
  7. mutcleaner-0.1.0/README.md +251 -0
  8. mutcleaner-0.1.0/doc/Makefile +20 -0
  9. mutcleaner-0.1.0/doc/changelog/CHANGELOG_0.1.0.md +11 -0
  10. mutcleaner-0.1.0/doc/make.bat +35 -0
  11. mutcleaner-0.1.0/doc/requirements.in +7 -0
  12. mutcleaner-0.1.0/doc/requirements.txt +99 -0
  13. mutcleaner-0.1.0/doc/source/api/index.rst +12 -0
  14. mutcleaner-0.1.0/doc/source/conf.py +48 -0
  15. mutcleaner-0.1.0/doc/source/index.rst +12 -0
  16. mutcleaner-0.1.0/doc/source/user_guide/cleaners.md +503 -0
  17. mutcleaner-0.1.0/doc/source/user_guide/index.rst +10 -0
  18. mutcleaner-0.1.0/doc/source/user_guide/save_data.md +87 -0
  19. mutcleaner-0.1.0/mutcleaner/__init__.py +58 -0
  20. mutcleaner-0.1.0/mutcleaner/cleaners/__init__.py +108 -0
  21. mutcleaner-0.1.0/mutcleaner/cleaners/antitoxin_pard3_cleaner.py +341 -0
  22. mutcleaner-0.1.0/mutcleaner/cleaners/antitoxin_pard3_custom_cleaners.py +137 -0
  23. mutcleaner-0.1.0/mutcleaner/cleaners/archstabms_1e10_cleaner.py +286 -0
  24. mutcleaner-0.1.0/mutcleaner/cleaners/archstabms_1e10_custom_cleaners.py +69 -0
  25. mutcleaner-0.1.0/mutcleaner/cleaners/base_config.py +207 -0
  26. mutcleaner-0.1.0/mutcleaner/cleaners/basic_cleaners.py +2329 -0
  27. mutcleaner-0.1.0/mutcleaner/cleaners/cdna_proteolysis_cleaner.py +332 -0
  28. mutcleaner-0.1.0/mutcleaner/cleaners/cdna_proteolysis_custom_cleaners.py +283 -0
  29. mutcleaner-0.1.0/mutcleaner/cleaners/ctxm_cleaner.py +304 -0
  30. mutcleaner-0.1.0/mutcleaner/cleaners/ddg_dtm_cleaners.py +347 -0
  31. mutcleaner-0.1.0/mutcleaner/cleaners/human_domainome_custom_cleaners.py +481 -0
  32. mutcleaner-0.1.0/mutcleaner/cleaners/human_domainome_sup2_cleaner.py +356 -0
  33. mutcleaner-0.1.0/mutcleaner/cleaners/human_myoglobin_cleaner.py +322 -0
  34. mutcleaner-0.1.0/mutcleaner/cleaners/human_myoglobin_custom_cleaners.py +79 -0
  35. mutcleaner-0.1.0/mutcleaner/cleaners/proteingym_dms_substitutions_cleaner.py +329 -0
  36. mutcleaner-0.1.0/mutcleaner/cleaners/proteingym_dms_substitutions_custom_cleaners.py +203 -0
  37. mutcleaner-0.1.0/mutcleaner/cleaners/rbd_ace2_cleaner.py +364 -0
  38. mutcleaner-0.1.0/mutcleaner/cleaners/rbd_antibody_cleaner.py +367 -0
  39. mutcleaner-0.1.0/mutcleaner/cleaners/rbd_custom_cleaners.py +190 -0
  40. mutcleaner-0.1.0/mutcleaner/cleaners/trpb_cleaner.py +298 -0
  41. mutcleaner-0.1.0/mutcleaner/core/__init__.py +43 -0
  42. mutcleaner-0.1.0/mutcleaner/core/alphabet.py +124 -0
  43. mutcleaner-0.1.0/mutcleaner/core/codon.py +83 -0
  44. mutcleaner-0.1.0/mutcleaner/core/constants.py +134 -0
  45. mutcleaner-0.1.0/mutcleaner/core/dataset.py +1546 -0
  46. mutcleaner-0.1.0/mutcleaner/core/mutation.py +739 -0
  47. mutcleaner-0.1.0/mutcleaner/core/pipeline.py +1031 -0
  48. mutcleaner-0.1.0/mutcleaner/core/sequence.py +774 -0
  49. mutcleaner-0.1.0/mutcleaner/core/types.py +27 -0
  50. mutcleaner-0.1.0/mutcleaner/utils/__init__.py +39 -0
  51. mutcleaner-0.1.0/mutcleaner/utils/cleaner_workers.py +391 -0
  52. mutcleaner-0.1.0/mutcleaner/utils/data_source.py +381 -0
  53. mutcleaner-0.1.0/mutcleaner/utils/dataset_builders.py +296 -0
  54. mutcleaner-0.1.0/mutcleaner/utils/label_resolvers.py +262 -0
  55. mutcleaner-0.1.0/mutcleaner/utils/mutation_converter.py +51 -0
  56. mutcleaner-0.1.0/mutcleaner/utils/raw_data_downloader.py +743 -0
  57. mutcleaner-0.1.0/mutcleaner/utils/sequence_io.py +517 -0
  58. mutcleaner-0.1.0/mutcleaner/utils/type_converter.py +313 -0
  59. mutcleaner-0.1.0/mutcleaner.egg-info/PKG-INFO +296 -0
  60. mutcleaner-0.1.0/mutcleaner.egg-info/SOURCES.txt +70 -0
  61. mutcleaner-0.1.0/mutcleaner.egg-info/dependency_links.txt +1 -0
  62. mutcleaner-0.1.0/mutcleaner.egg-info/requires.txt +26 -0
  63. mutcleaner-0.1.0/mutcleaner.egg-info/top_level.txt +1 -0
  64. mutcleaner-0.1.0/pyproject.toml +78 -0
  65. mutcleaner-0.1.0/setup.cfg +4 -0
  66. mutcleaner-0.1.0/tests/test_dataset.py +795 -0
  67. mutcleaner-0.1.0/tests/test_mutation.py +746 -0
  68. mutcleaner-0.1.0/tests/test_pipeline.py +1257 -0
  69. mutcleaner-0.1.0/tests/test_sequence.py +577 -0
  70. mutcleaner-0.1.0/tools/changelog.py +545 -0
  71. mutcleaner-0.1.0/tools/generate_changelog.sh +38 -0
  72. mutcleaner-0.1.0/tools/release.sh +72 -0
@@ -0,0 +1,56 @@
1
+ name: Deploy Sphinx Docs to GitHub Pages
2
+
3
+ on:
4
+ push:
5
+ branches: ["main"]
6
+ workflow_dispatch:
7
+
8
+ permissions:
9
+ contents: read
10
+ pages: write
11
+ id-token: write
12
+
13
+ concurrency:
14
+ group: "pages"
15
+ cancel-in-progress: true
16
+
17
+ jobs:
18
+ build:
19
+ runs-on: ubuntu-latest
20
+
21
+ steps:
22
+ - name: Checkout repository
23
+ uses: actions/checkout@v4
24
+
25
+ - name: Set up Python
26
+ uses: actions/setup-python@v5
27
+ with:
28
+ python-version: "3.13"
29
+
30
+ - name: Install dependencies
31
+ run: |
32
+ python -m pip install --upgrade pip
33
+ pip install -e ".[dev]"
34
+
35
+ - name: Build Sphinx documentation
36
+ run: |
37
+ cd doc
38
+ make html
39
+
40
+ - name: Upload artifact
41
+ uses: actions/upload-pages-artifact@v3
42
+ with:
43
+ path: "doc/build/html"
44
+
45
+ deploy:
46
+ environment:
47
+ name: github-pages
48
+ url: ${{ steps.deployment.outputs.page_url }}
49
+
50
+ runs-on: ubuntu-latest
51
+ needs: build
52
+
53
+ steps:
54
+ - name: Deploy to GitHub Pages
55
+ id: deployment
56
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,60 @@
1
+ name: Upload Python Package
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ jobs:
11
+ release-build:
12
+ runs-on: ubuntu-latest
13
+
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.13"
20
+
21
+ - name: Build release distributions
22
+ run: |
23
+ # NOTE: put your own distribution build steps here.
24
+ python -m pip install --upgrade pip build twine
25
+ python -m build
26
+
27
+ - name: Upload distributions
28
+ uses: actions/upload-artifact@v4
29
+ with:
30
+ name: release-dists
31
+ path: dist/
32
+
33
+ pypi-publish:
34
+ runs-on: ubuntu-latest
35
+ needs:
36
+ - release-build
37
+ permissions:
38
+ # IMPORTANT: this permission is mandatory for trusted publishing
39
+ id-token: write
40
+
41
+ environment:
42
+ name: pypi
43
+ url: https://pypi.org/p/mutcleaner
44
+ #
45
+ # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
46
+ # ALTERNATIVE: exactly, uncomment the following line instead:
47
+ # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
48
+
49
+ steps:
50
+ - name: Retrieve release distributions
51
+ uses: actions/download-artifact@v4
52
+ with:
53
+ name: release-dists
54
+ path: dist/
55
+
56
+ - name: Publish release distributions to PyPI
57
+ uses: pypa/gh-action-pypi-publish@release/v1
58
+ with:
59
+ packages-dir: dist/
60
+ password: ${{ secrets.PYPI_API_TOKEN }}
@@ -0,0 +1,65 @@
1
+ # Editor temporary/working/backup files #
2
+ #########################################
3
+ .#*
4
+ *\#*\#
5
+ [#]*#
6
+ *~
7
+ *$
8
+ *.bak
9
+ *flymake*
10
+ *.iml
11
+ *.kdev4
12
+ *.log
13
+ *.swp
14
+ *.pdb
15
+ *.zip
16
+ .project
17
+ .pydevproject
18
+ .settings
19
+ .idea
20
+ .vagrant
21
+ .noseids
22
+ .ipynb_checkpoints
23
+ .tags
24
+ .cache/
25
+ .vscode/
26
+
27
+ # Python files #
28
+ ################
29
+ __pycache__/
30
+ # pytest
31
+ /.pytest_cache
32
+ # egg metadata
33
+ *.egg-info
34
+ *.eggs
35
+ # coverage
36
+ .coverage
37
+ coverage.xml
38
+ coverage_html_report
39
+ htmlcov
40
+
41
+ # Build artifacts #
42
+ ###################
43
+ dist/
44
+
45
+ # Database files #
46
+ ##################
47
+ tests/*.pkl
48
+
49
+ # Doc files #
50
+ #############
51
+ doc/build/
52
+ **/.doctrees/
53
+ doc/source/api/generated/
54
+ doc/source/generated/
55
+
56
+ # Datasets and outputs #
57
+ ########################
58
+ dataset/
59
+ clean_dataset/
60
+ outputs/
61
+ validation/
62
+
63
+ # Logs #
64
+ ########
65
+ logs/
@@ -0,0 +1,151 @@
1
+ # Contributing Guide
2
+
3
+ Thanks for considering contributing to this project! To keep things clean and maintainable, please follow the guidelines below.
4
+
5
+ ## Commit Message Convention
6
+
7
+ We use the [Conventional Commits](https://www.conventionalcommits.org/) standard:
8
+
9
+ ```
10
+ <type>(<scope>): <summary>
11
+
12
+ <body>
13
+ ```
14
+
15
+ - `type`: The category of the change (see below)
16
+ - `scope`: The part of the codebase the change affects (optional but recommended)
17
+ - `summary`: A short, imperative sentence describing the change
18
+ - `body`: (optional) A detailed explanation of the change and its reasoning
19
+
20
+ ### Common Types
21
+
22
+ | Type | Description |
23
+ | -------- | --------------------------------------------------- |
24
+ | feat | A new feature |
25
+ | fix | A bug fix |
26
+ | docs | Documentation changes only |
27
+ | style | Code style changes (formatting, etc.) |
28
+ | refactor | Code changes that neither fix bugs nor add features |
29
+ | test | Adding or updating tests |
30
+ | chore | Maintenance tasks (build process, tools, etc.) |
31
+ | perf | Performance improvements |
32
+ | ci | Changes to CI/CD configuration |
33
+ | build | Changes to build system or dependencies |
34
+ | revert | Reverting a previous commit |
35
+
36
+ ### Example
37
+
38
+ ```
39
+ feat(cli): add --dry-run flag to simulate deletion
40
+
41
+ This adds a new `--dry-run` option to the CLI. When enabled, the command will simulate
42
+ file deletions without actually removing them. Useful for debugging large batch runs.
43
+
44
+ Closes #42
45
+ ```
46
+
47
+ ## Changelog Policy
48
+
49
+ Generate a `changelog.md` file every time we publish a new release:
50
+
51
+ - Before **each release** (beta or stable), please update `tools/generate_changelog.sh` and run:
52
+
53
+ ```bash
54
+ cd path/to/MutCleaner
55
+ bash tools/generate_changelog.sh
56
+ ```
57
+
58
+ To avoid cluttering the `main` branch with release artifacts,
59
+ changelog updates are typically made on a `release/*` branch.
60
+ If desired, you can manually merge only the `changelog.md` back
61
+ into `main` using `git checkout release/* -- CHANGELOG.md`.
62
+
63
+ ## Tips
64
+
65
+ - Use `git commit -m "type(scope): summary" -m "body"` for multi-line messages.
66
+ - Use `git add -p` to split large commits into atomic ones.
67
+ - Use `git tag vX.Y.Z` to prepare for releases.
68
+
69
+ ## Squash Commit Guidelines
70
+
71
+ When using squash-and-merge (especially via GitHub UI), all individual commits in a feature branch are combined into a single commit. Please follow this format for the final squash commit message:
72
+
73
+ ```text
74
+ <type>(<scope>): <summary>
75
+
76
+ <body>
77
+ ```
78
+
79
+ - Follow the same Conventional Commits format.
80
+ - The body can include bullet points summarizing the key changes if the branch includes multiple related edits.
81
+
82
+ ### Example
83
+
84
+ ```text
85
+ feat(parser): add support for multi-mutation parsing
86
+
87
+ - Added helper to parse comma-separated mutations
88
+ - Integrated fallback validator
89
+ - Updated tests for multi-mutation strings
90
+ ```
91
+
92
+ You can edit the squash message in the GitHub UI before confirming the merge.
93
+
94
+ ## Full Release Workflow
95
+
96
+ Here is a complete step-by-step workflow for submitting code from a feature branch, preparing a release, and generating a changelog:
97
+
98
+ ```bash
99
+ # >>> Fix bugs or Add feats >>>
100
+ # Check your branch before start
101
+ git branch
102
+ # Develop on feature branch
103
+ git checkout -b feature/my-feature
104
+ # edit files...
105
+ git commit -m "feat(core): ..."
106
+ git push origin feature/my-feature
107
+
108
+ # For Manager: open a pull request to merge the feature branch into main
109
+ # After code review and testing, use squash and merge in GitHub UI.
110
+ # Or you can do it in command line:
111
+ git checkout main
112
+ git pull origin main
113
+
114
+ ## squash merge
115
+ git merge --squash feature/my-feature
116
+
117
+ ## squash commit
118
+ git commit -m "See 'Squash Commit Guidelines' for details."
119
+ git push origin main
120
+
121
+ # After squash-and-merge, sync your local main branch
122
+ git checkout main
123
+ git pull origin main
124
+
125
+ # >>> The instructions below are for maintainers preparing a release >>>
126
+ # Generate changelog when a new version is ready
127
+ ## See Changelog Policy for details
128
+ bash tools/generate_changelog.sh
129
+
130
+ ## Commit the changelog
131
+ git checkout main
132
+ git add path/to/changelog.md # in doc/changelog/
133
+ git commit -m "docs(changelog): update main after version release." # replace with real version
134
+
135
+ ## tag the release
136
+ git tag version-tag # replace with real version
137
+ git push origin version-tag # replace with real version
138
+ git tag -l
139
+ git show v0.1.0-beta
140
+ ```
141
+
142
+ ## Resources
143
+
144
+ - [Conventional Commits](https://www.conventionalcommits.org/)
145
+ - [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
146
+ - [Semantic Versioning](https://semver.org/)
147
+ - [GitHub Releases Guide](https://docs.github.com/en/repositories/releasing-projects-on-github)
148
+
149
+ ---
150
+
151
+ Thank you for contributing! 🙌
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2026, Yuxiang Tang and Ziyu Shi.
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,296 @@
1
+ Metadata-Version: 2.4
2
+ Name: mutcleaner
3
+ Version: 0.1.0
4
+ Summary: An efficient framework for cleaning, standardizing, and processing biological mutation data.
5
+ Author: Yuxiang Tang, Ziyu Shi
6
+ License-Expression: BSD-3-Clause
7
+ Project-URL: Repository, https://github.com/xulab-research/MutCleaner
8
+ Keywords: mutation,sequence,bioinformatics,data-cleaning,standardization,protein,codon,DNA,RNA,DMS
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Programming Language :: Python :: 3 :: Only
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
15
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
16
+ Classifier: Typing :: Typed
17
+ Requires-Python: >=3.13
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: joblib>=1.5.0
21
+ Requires-Dist: numpy>=2.1.0
22
+ Requires-Dist: pandas>=2.2.0
23
+ Requires-Dist: tqdm>=4.60.0
24
+ Requires-Dist: requests>=2.30
25
+ Requires-Dist: openpyxl>=3.1.0
26
+ Provides-Extra: test
27
+ Requires-Dist: pytest>=8.0.0; extra == "test"
28
+ Requires-Dist: pytest-cov>=6.0.0; extra == "test"
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
31
+ Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
32
+ Requires-Dist: sphinx>=8.0.0; extra == "dev"
33
+ Requires-Dist: sphinx-autobuild>=2024.10.0; extra == "dev"
34
+ Requires-Dist: pydata-sphinx-theme; extra == "dev"
35
+ Requires-Dist: twine>=6.0.0; extra == "dev"
36
+ Requires-Dist: gitpython; extra == "dev"
37
+ Requires-Dist: pygithub; extra == "dev"
38
+ Requires-Dist: jinja2; extra == "dev"
39
+ Requires-Dist: numpydoc; extra == "dev"
40
+ Requires-Dist: myst_parser; extra == "dev"
41
+ Requires-Dist: sphinx_copybutton; extra == "dev"
42
+ Requires-Dist: sphinx_design; extra == "dev"
43
+ Requires-Dist: sphinx_autodoc_typehints; extra == "dev"
44
+ Dynamic: license-file
45
+
46
+ # MutCleaner
47
+
48
+ [![License badge](https://img.shields.io/badge/License-BSD_3--Clause-yellow?logo=opensourceinitiative&logoColor=white)](https://github.com/xulab-research/MutCleaner/blob/main/LICENSE)
49
+ [![PyPI version badge](https://img.shields.io/pypi/v/mutcleaner?logo=python&logoColor=white&color=orange)](https://pypi.org/project/mutcleaner/)
50
+ [![Docs](https://github.com/xulab-research/MutCleaner/actions/workflows/docs.yml/badge.svg)](https://xulab-research.github.io/MutCleaner/)
51
+
52
+ MutCleaner is an extensible Python toolkit for cleaning and standardizing biological mutation datasets, integrating dataset-specific cleaning pipelines with core abstractions for protein, nucleotide, and codon-level mutation representations.
53
+
54
+ * **Documentation**: https://xulab-research.github.io/MutCleaner/
55
+ * **Cleaning Examples**: https://xulab-research.github.io/MutCleaner/user_guide/cleaners.html
56
+
57
+ ## Overview
58
+
59
+ MutCleaner is an extensible Python toolkit for cleaning, standardizing, and analyzing biological mutation datasets. It currently focuses on protein variant data while providing core abstractions for DNA, RNA, protein sequences, and codon-level mutation representations.
60
+
61
+ The package combines dataset-specific cleaning pipelines with reusable sequence and mutation utilities, enabling reproducible preprocessing of large-scale mutational datasets for downstream bioinformatics and machine learning analyses.
62
+
63
+ ### Key Capabilities
64
+
65
+ - **Mutation dataset cleaning and standardization**: Harmonize mutation annotations, sequences, labels, and metadata across heterogeneous biological mutation datasets.
66
+ - **Sequence representation and validation**: Utilities for DNA, RNA, and protein sequences, including validation, transcription, reverse transcription, translation, and mutation application.
67
+ - **Mutation parsing and transformation**: Tools for parsing amino-acid and codon-level mutations, inferring mutations from sequences, applying mutations to reference sequences, and converting codon mutations into amino-acid changes.
68
+ - **Modular pipeline architecture**: A composable pipeline interface for building reproducible dataset-cleaning workflows.
69
+ - **Parallel and scalable dataset processing**: Multi-core utilities for mutation validation, mutation application, and sequence-based mutation inference, supporting efficient processing of large tabular mutation datasets.
70
+
71
+ ## Installation
72
+
73
+ ### Requirements
74
+
75
+ - Python 3.13+
76
+ - Dependencies are automatically installed via pip.
77
+
78
+ ### Install via pip
79
+
80
+ ```bash
81
+ pip install mutcleaner
82
+ ```
83
+
84
+ ### Development Installation
85
+
86
+ ```bash
87
+ git clone https://github.com/xulab-research/MutCleaner.git MutCleaner
88
+ cd MutCleaner
89
+ pip install -e .
90
+ ```
91
+
92
+ To install development dependencies for testing and documentation:
93
+
94
+ ```bash
95
+ pip install -e ".[dev]"
96
+ ```
97
+
98
+ ## Quick Start
99
+
100
+ See the [Data Cleaners Usage Guide](https://xulab-research.github.io/MutCleaner/user_guide/cleaners.html) for more examples.
101
+
102
+ ### Supported Datasets
103
+
104
+ | Dataset Name | Reference | File |
105
+ | --- | --- | --- |
106
+ | Human Domainome Dataset | [Site-saturation mutagenesis of 500 human protein domains](https://doi.org/10.1038/s41586-024-08370-4) | [SupplementaryTable2.txt](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/Human_Domainome_Dataset/SupplementaryTable2.txt), [wild_type.fasta](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/Human_Domainome_Dataset/wild_type.fasta) |
107
+ | ProteinGym DMS Substitutions Dataset | [ProteinGym: Large-Scale Benchmarks for Protein Design and Fitness Prediction](https://doi.org/10.1101/2023.12.07.570727) | [DMS_ProteinGym_substitutions.zip](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/ProteinGym_DMS_Substitutions_Dataset/DMS_ProteinGym_substitutions.zip) |
108
+ | cDNA Proteolysis Dataset | [Mega-scale experimental analysis of protein folding stability in biology and design](https://doi.org/10.1038/s41586-023-06328-6) | [Tsuboyama2023_Dataset2_Dataset3_20230416.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/cDNA_Proteolysis_Dataset/Tsuboyama2023_Dataset2_Dataset3_20230416.csv) |
109
+ | ΔΔG Dataset | [Improving the prediction of protein stability changes upon mutations by geometric learning and a pre-training strategy](https://doi.org/10.1038/s43588-024-00716-2) | [M1261.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94%CE%94G_Dataset/M1261.csv), [S461.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94%CE%94G_Dataset/S461.csv), [S669.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94%CE%94G_Dataset/S669.csv), [S783.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94%CE%94G_Dataset/S783.csv), [S8754.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94%CE%94G_Dataset/S8754.csv) |
110
+ | ΔTm Dataset | [Improving the prediction of protein stability changes upon mutations by geometric learning and a pre-training strategy](https://doi.org/10.1038/s43588-024-00716-2) | [S4346.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94Tm_Dataset/S4346.csv), [S571.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94Tm_Dataset/S571.csv) |
111
+ | ArchStabMS1E10 Epistasis Dataset | [The genetic architecture of protein stability](https://doi.org/10.1038/s41586-024-07966-0) | [ArchStabMS1E10_Epistasis_Dataset.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/ArchStabMS1E10_Epistasis_Dataset/ArchStabMS1E10_Epistasis_Dataset.csv) |
112
+ | Antitoxin ParD3 Epistasis Dataset | [Protein design using structure-based residue preferences](https://doi.org/10.1038/s41467-024-45621-4) | [Antitoxin_ParD3_Epistasis_Dataset.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/Antitoxin_ParD3_Epistasis_Dataset/Antitoxin_ParD3_Epistasis_Dataset.csv) |
113
+ | TrpB Epistasis Dataset | [A combinatorially complete epistatic fitness landscape in an enzyme active site](https://doi.org/10.1073/pnas.2400439121) | [TrpB_Epistasis_Dataset.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/TrpB_Epistasis_Dataset/TrpB_Epistasis_Dataset.csv) |
114
+ | Human Myoglobin Epistasis Dataset | [Decoding Stability and Epistasis in Human Myoglobin by Deep Mutational Scanning and Codon-level Machine Learning](https://doi.org/10.1101/2024.02.24.581358) | [Human_Myoglobin_Epistasis_Dataset.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/Human_Myoglobin_Epistasis_Dataset/Human_Myoglobin_Epistasis_Dataset.csv) |
115
+ | CTXM Epistasis Dataset | [Network of epistatic interactions in an enzyme active site revealed by DMS](https://doi.org/10.1073/pnas.2313513121) | [CTXM_Cefotaxime_Epistasis_Dataset.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/CTXM_Epistasis_Dataset/CTXM_Cefotaxime_Epistasis_Dataset.csv), [CTXM_Ampicillin_Epistasis_Dataset.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/CTXM_Epistasis_Dataset/CTXM_Ampicillin_Epistasis_Dataset.csv) |
116
+ | RBD ACE2 Dataset | [Shifting mutational constraints in the SARS-CoV-2 receptor-binding domain during viral evolution](https://doi.org/10.1126/science.abo7896), [Deep mutational scans for ACE2 binding, RBD expression, and antibody escape in the SARS-CoV-2 Omicron BA.1 and BA.2 receptor-binding domains](https://doi.org/10.1371/journal.ppat.1010951), [Deep mutational scans of XBB.1.5 and BQ.1.1 reveal ongoing epistatic drift during SARS-CoV-2 evolution](https://doi.org/10.1371/journal.ppat.1011901), [Deep mutational scanning of SARS-CoV-2 Omicron BA.2.86 and epistatic emergence of the KP.3 variant](https://doi.org/10.1093/ve/veae067) | [SARS-CoV-2-RBD_DMS_variants_bc_binding.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_ACE2_Dataset/SARS-CoV-2-RBD_DMS_variants_bc_binding.csv), [SARS-CoV-2-RBD_Delta_bc_binding.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_ACE2_Dataset/SARS-CoV-2-RBD_Delta_bc_binding.csv), [SARS-CoV-2-RBD_DMS_Omicron_bc_binding.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_ACE2_Dataset/SARS-CoV-2-RBD_DMS_Omicron_bc_binding.csv), [SARS-CoV-2-RBD_DMS_Omicron-XBB-BQ_bc_binding.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_ACE2_Dataset/SARS-CoV-2-RBD_DMS_Omicron-XBB-BQ_bc_binding.csv), [SARS-CoV-2-RBD_DMS_Omicron-EG5-FLip-BA286_bc_binding.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_ACE2_Dataset/SARS-CoV-2-RBD_DMS_Omicron-EG5-FLip-BA286_bc_binding.csv) |
117
+ | RBD Antibody Dataset | [Antibodies elicited by mRNA-1273 vaccination bind more broadly to the receptor binding domain than do those from SARS-CoV-2 infection](https://doi.org/10.1126/scitranslmed.abi9915), [Comprehensive mapping of mutations in the SARS-CoV-2 receptor-binding domain that affect recognition by polyclonal human plasma antibodies](https://doi.org/10.1016/j.chom.2021.02.003), [Mapping mutations to the SARS-CoV-2 RBD that escape binding by different classes of antibodies](https://doi.org/10.1038/s41467-021-24435-8), [Genetic and structural basis for SARS-CoV-2 variant neutralization by a two-antibody cocktail](https://doi.org/10.1038/s41564-021-00972-2), [SARS-CoV-2 RBD antibodies that maximize breadth and resistance to escape](https://doi.org/10.1038/s41586-021-03807-6), [Prospective mapping of viral mutations that escape antibodies used to treat COVID-19](https://doi.org/10.1126/science.abf9302) | [SARS-CoV-2-RBD_MAP_Moderna_scores.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_Antibody_Dataset/SARS-CoV-2-RBD_MAP_Moderna_scores.csv), [SARS-CoV-2-RBD_MAP_HAARVI_sera_scores.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_Antibody_Dataset/SARS-CoV-2-RBD_MAP_HAARVI_sera_scores.csv), [SARS-CoV-2-RBD_MAP_Rockefeller_scores.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_Antibody_Dataset/SARS-CoV-2-RBD_MAP_Rockefeller_scores.csv), [SARS-CoV-2-RBD_MAP_AZ_Abs_scores.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_Antibody_Dataset/SARS-CoV-2-RBD_MAP_AZ_Abs_scores.csv), [SARS-CoV-2-RBD_MAP_Vir_mAbs_scores.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_Antibody_Dataset/SARS-CoV-2-RBD_MAP_Vir_mAbs_scores.csv), [SARS-CoV-2-RBD_MAP_clinical_Abs_scores.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_Antibody_Dataset/SARS-CoV-2-RBD_MAP_clinical_Abs_scores.csv) |
118
+
119
+
120
+
121
+ ### Processing cDNA Proteolysis Dataset
122
+
123
+ Here's a complete example demonstrating MutCleaner's capabilities with the cDNA Proteolysis mutation dataset:
124
+
125
+ ```python
126
+ from mutcleaner import cdna_proteolysis_cleaner
127
+ from mutcleaner import download_cdna_proteolysis_source_file
128
+
129
+ # Create a cDNA Proteolysis cleaning pipeline using MutCleaner's default pipeline.
130
+ cdna_proteolysis_filepath = download_cdna_proteolysis_source_file(
131
+ "dir_path",
132
+ "file_name",
133
+ )["filename"]
134
+
135
+ cdna_proteolysis_cleaning_pipeline = cdna_proteolysis_cleaner.create_cdna_proteolysis_cleaner(
136
+ cdna_proteolysis_filepath,
137
+ )
138
+
139
+ # Clean and process the dataset.
140
+ cdna_proteolysis_cleaning_pipeline, cdna_proteolysis_dataset = (
141
+ cdna_proteolysis_cleaner.clean_cdna_proteolysis_dataset(
142
+ cdna_proteolysis_cleaning_pipeline,
143
+ )
144
+ )
145
+
146
+ # Save the processed dataset.
147
+ cdna_proteolysis_dataset.save("output/cleaned_cdna_proteolysis_data")
148
+ ```
149
+
150
+ ### Basic Sequence Operations
151
+
152
+ ```python
153
+ from mutcleaner.core.sequence import DNASequence
154
+
155
+ # DNA sequence analysis.
156
+ dna = DNASequence("ATGCGATCGTAA")
157
+
158
+ print(f"Reverse complement: {dna.reverse_complement()}")
159
+ print(f"Transcription: {dna.transcribe()}")
160
+ print(f"Translation: {dna.translate()}")
161
+ ```
162
+
163
+ ## Core Features
164
+
165
+ ### Sequence Data Manipulation
166
+
167
+ - **Sequence validation**: Validate DNA, RNA, and protein sequences against predefined alphabets.
168
+ - **Sequence transformation**: Support transcription, reverse transcription, translation, and reverse-complement operations.
169
+ - **Batch processing**: Process large tabular mutation datasets through reusable cleaning utilities.
170
+
171
+ ### Mutation Analysis
172
+
173
+ - **Mutation parsing**: Parse amino-acid and codon-level mutation annotations.
174
+ - **Mutation inference**: Infer mutation annotations by comparing reference and mutated sequences.
175
+ - **Mutation transformation**: Apply mutation annotations to reference sequences and convert codon-level mutations into amino-acid changes.
176
+
177
+ ### Data Cleaning and Preprocessing
178
+
179
+ - **Standardization**: Harmonize mutation names, sequences, labels, and metadata across heterogeneous datasets.
180
+ - **Duplicate handling**: Remove or aggregate redundant mutation records according to dataset-specific rules.
181
+ - **Dataset-specific cleaners**: Provide reusable cleaning pipelines for commonly used mutation datasets.
182
+
183
+ ### Pipeline Architecture
184
+
185
+ - **Modular design**: Compose cleaning workflows from reusable processing components.
186
+ - **Parallel processing**: Use multi-core processing for mutation validation, mutation application, and sequence-based mutation inference.
187
+ - **Progress tracking**: Monitor long-running cleaning tasks with progress bars and structured execution summaries.
188
+
189
+ ## Examples and Use Cases
190
+
191
+ ### Custom Processing Pipeline
192
+
193
+ ```python
194
+ import pandas as pd
195
+
196
+ from mutcleaner.cleaners.basic_cleaners import (
197
+ extract_and_rename_columns,
198
+ filter_and_clean_data,
199
+ convert_data_types,
200
+ validate_mutations,
201
+ convert_to_mutation_dataset_format,
202
+ )
203
+ from mutcleaner.cleaners.cdna_proteolysis_custom_cleaners import (
204
+ validate_wt_sequence,
205
+ average_labels_by_name,
206
+ subtract_labels_by_wt,
207
+ )
208
+ from mutcleaner.core.dataset import MutationDataset
209
+ from mutcleaner.core.pipeline import create_pipeline
210
+
211
+ dataset = pd.read_csv("path/to/Tsuboyama2023_Dataset2_Dataset3_20230416.csv")
212
+
213
+ pipeline = create_pipeline(dataset, "cdna_proteolysis_cleaner")
214
+ clean_result = (
215
+ pipeline.then(
216
+ extract_and_rename_columns,
217
+ column_mapping={
218
+ "WT_name": "name",
219
+ "aa_seq": "mut_seq",
220
+ "mut_type": "mut_info",
221
+ "ddG_ML": "ddG",
222
+ },
223
+ )
224
+ .then(filter_and_clean_data, filters={"ddG": lambda x: x != "-"})
225
+ .then(convert_data_types, type_conversions={"ddG": "float"})
226
+ .then(
227
+ validate_mutations,
228
+ mutation_column="mut_info",
229
+ mutation_sep="_",
230
+ is_zero_based=False,
231
+ num_workers=16,
232
+ )
233
+ .then(
234
+ average_labels_by_name,
235
+ name_columns=("name", "mut_info"),
236
+ label_columns="ddG",
237
+ )
238
+ .then(
239
+ validate_wt_sequence,
240
+ name_column="name",
241
+ mutation_column="mut_info",
242
+ sequence_column="mut_seq",
243
+ wt_identifier="wt",
244
+ num_workers=16,
245
+ )
246
+ .then(
247
+ subtract_labels_by_wt,
248
+ name_column="name",
249
+ label_columns="ddG",
250
+ mutation_column="mut_info",
251
+ in_place=True,
252
+ )
253
+ .then(
254
+ convert_to_mutation_dataset_format,
255
+ name_column="name",
256
+ mutation_column="mut_info",
257
+ mutated_sequence_column="mut_seq",
258
+ score_column="ddG",
259
+ is_zero_based=True,
260
+ )
261
+ )
262
+
263
+ cdna_proteolysis_dataset_df, cdna_proteolysis_ref_seq = clean_result.data
264
+ cdna_proteolysis_dataset = MutationDataset.from_dataframe(
265
+ cdna_proteolysis_dataset_df,
266
+ cdna_proteolysis_ref_seq,
267
+ )
268
+
269
+ execution_info = pipeline.get_execution_summary()
270
+ artifacts = pipeline.artifacts
271
+ pipeline.save_structured_data("cdna_proteolysis_cleaner_pipeline.pkl")
272
+ ```
273
+
274
+ ## Citation
275
+
276
+ If you use MutCleaner in your research, please cite:
277
+
278
+ ```bibtex
279
+ @software{mutcleaner,
280
+ title={
281
+ MutCleaner: An efficient framework for cleaning, standardizing, and processing biological mutation data.
282
+ },
283
+ author={Yuxiang Tang and Ziyu Shi},
284
+ year={2026},
285
+ url={https://github.com/xulab-research/MutCleaner}
286
+ }
287
+ ```
288
+
289
+ ## License
290
+
291
+ This project is licensed under the BSD 3-Clause License. See the [LICENSE](LICENSE) file for details.
292
+
293
+ ## Support
294
+
295
+ - **Issues**: [GitHub Issues](https://github.com/xulab-research/MutCleaner/issues)
296
+ - **Discussions**: [GitHub Discussions](https://github.com/xulab-research/MutCleaner/discussions)