PyPI - mutcleaner - Versions diffs - 0.1.0__tar.gz - Mend

mutcleaner 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

mutcleaner-0.1.0/.github/workflows/docs.yml +56 -0
mutcleaner-0.1.0/.github/workflows/publish.yml +60 -0
mutcleaner-0.1.0/.gitignore +65 -0
mutcleaner-0.1.0/CONTRIBUTING.md +151 -0
mutcleaner-0.1.0/LICENSE +28 -0
mutcleaner-0.1.0/PKG-INFO +296 -0
mutcleaner-0.1.0/README.md +251 -0
mutcleaner-0.1.0/doc/Makefile +20 -0
mutcleaner-0.1.0/doc/changelog/CHANGELOG_0.1.0.md +11 -0
mutcleaner-0.1.0/doc/make.bat +35 -0
mutcleaner-0.1.0/doc/requirements.in +7 -0
mutcleaner-0.1.0/doc/requirements.txt +99 -0
mutcleaner-0.1.0/doc/source/api/index.rst +12 -0
mutcleaner-0.1.0/doc/source/conf.py +48 -0
mutcleaner-0.1.0/doc/source/index.rst +12 -0
mutcleaner-0.1.0/doc/source/user_guide/cleaners.md +503 -0
mutcleaner-0.1.0/doc/source/user_guide/index.rst +10 -0
mutcleaner-0.1.0/doc/source/user_guide/save_data.md +87 -0
mutcleaner-0.1.0/mutcleaner/__init__.py +58 -0
mutcleaner-0.1.0/mutcleaner/cleaners/__init__.py +108 -0
mutcleaner-0.1.0/mutcleaner/cleaners/antitoxin_pard3_cleaner.py +341 -0
mutcleaner-0.1.0/mutcleaner/cleaners/antitoxin_pard3_custom_cleaners.py +137 -0
mutcleaner-0.1.0/mutcleaner/cleaners/archstabms_1e10_cleaner.py +286 -0
mutcleaner-0.1.0/mutcleaner/cleaners/archstabms_1e10_custom_cleaners.py +69 -0
mutcleaner-0.1.0/mutcleaner/cleaners/base_config.py +207 -0
mutcleaner-0.1.0/mutcleaner/cleaners/basic_cleaners.py +2329 -0
mutcleaner-0.1.0/mutcleaner/cleaners/cdna_proteolysis_cleaner.py +332 -0
mutcleaner-0.1.0/mutcleaner/cleaners/cdna_proteolysis_custom_cleaners.py +283 -0
mutcleaner-0.1.0/mutcleaner/cleaners/ctxm_cleaner.py +304 -0
mutcleaner-0.1.0/mutcleaner/cleaners/ddg_dtm_cleaners.py +347 -0
mutcleaner-0.1.0/mutcleaner/cleaners/human_domainome_custom_cleaners.py +481 -0
mutcleaner-0.1.0/mutcleaner/cleaners/human_domainome_sup2_cleaner.py +356 -0
mutcleaner-0.1.0/mutcleaner/cleaners/human_myoglobin_cleaner.py +322 -0
mutcleaner-0.1.0/mutcleaner/cleaners/human_myoglobin_custom_cleaners.py +79 -0
mutcleaner-0.1.0/mutcleaner/cleaners/proteingym_dms_substitutions_cleaner.py +329 -0
mutcleaner-0.1.0/mutcleaner/cleaners/proteingym_dms_substitutions_custom_cleaners.py +203 -0
mutcleaner-0.1.0/mutcleaner/cleaners/rbd_ace2_cleaner.py +364 -0
mutcleaner-0.1.0/mutcleaner/cleaners/rbd_antibody_cleaner.py +367 -0
mutcleaner-0.1.0/mutcleaner/cleaners/rbd_custom_cleaners.py +190 -0
mutcleaner-0.1.0/mutcleaner/cleaners/trpb_cleaner.py +298 -0
mutcleaner-0.1.0/mutcleaner/core/__init__.py +43 -0
mutcleaner-0.1.0/mutcleaner/core/alphabet.py +124 -0
mutcleaner-0.1.0/mutcleaner/core/codon.py +83 -0
mutcleaner-0.1.0/mutcleaner/core/constants.py +134 -0
mutcleaner-0.1.0/mutcleaner/core/dataset.py +1546 -0
mutcleaner-0.1.0/mutcleaner/core/mutation.py +739 -0
mutcleaner-0.1.0/mutcleaner/core/pipeline.py +1031 -0
mutcleaner-0.1.0/mutcleaner/core/sequence.py +774 -0
mutcleaner-0.1.0/mutcleaner/core/types.py +27 -0
mutcleaner-0.1.0/mutcleaner/utils/__init__.py +39 -0
mutcleaner-0.1.0/mutcleaner/utils/cleaner_workers.py +391 -0
mutcleaner-0.1.0/mutcleaner/utils/data_source.py +381 -0
mutcleaner-0.1.0/mutcleaner/utils/dataset_builders.py +296 -0
mutcleaner-0.1.0/mutcleaner/utils/label_resolvers.py +262 -0
mutcleaner-0.1.0/mutcleaner/utils/mutation_converter.py +51 -0
mutcleaner-0.1.0/mutcleaner/utils/raw_data_downloader.py +743 -0
mutcleaner-0.1.0/mutcleaner/utils/sequence_io.py +517 -0
mutcleaner-0.1.0/mutcleaner/utils/type_converter.py +313 -0
mutcleaner-0.1.0/mutcleaner.egg-info/PKG-INFO +296 -0
mutcleaner-0.1.0/mutcleaner.egg-info/SOURCES.txt +70 -0
mutcleaner-0.1.0/mutcleaner.egg-info/dependency_links.txt +1 -0
mutcleaner-0.1.0/mutcleaner.egg-info/requires.txt +26 -0
mutcleaner-0.1.0/mutcleaner.egg-info/top_level.txt +1 -0
mutcleaner-0.1.0/pyproject.toml +78 -0
mutcleaner-0.1.0/setup.cfg +4 -0
mutcleaner-0.1.0/tests/test_dataset.py +795 -0
mutcleaner-0.1.0/tests/test_mutation.py +746 -0
mutcleaner-0.1.0/tests/test_pipeline.py +1257 -0
mutcleaner-0.1.0/tests/test_sequence.py +577 -0
mutcleaner-0.1.0/tools/changelog.py +545 -0
mutcleaner-0.1.0/tools/generate_changelog.sh +38 -0
mutcleaner-0.1.0/tools/release.sh +72 -0

mutcleaner-0.1.0/.github/workflows/docs.yml ADDED Viewed

@@ -0,0 +1,56 @@
+name: Deploy Sphinx Docs to GitHub Pages
+on:
+  push:
+    branches: ["main"]
+  workflow_dispatch:
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+concurrency:
+  group: "pages"
+  cancel-in-progress: true
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+      - name: Build Sphinx documentation
+        run: |
+          cd doc
+          make html
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: "doc/build/html"
+  deploy:
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    runs-on: ubuntu-latest
+    needs: build
+    steps:
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4

mutcleaner-0.1.0/.github/workflows/publish.yml ADDED Viewed

@@ -0,0 +1,60 @@
+name: Upload Python Package
+on:
+  release:
+    types: [published]
+permissions:
+  contents: read
+jobs:
+  release-build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+      - name: Build release distributions
+        run: |
+          # Install the build tooling, then build the sdist and wheel into dist/.
+          python -m pip install --upgrade pip build twine
+          python -m build
+      - name: Upload distributions
+        uses: actions/upload-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+  pypi-publish:
+    runs-on: ubuntu-latest
+    needs:
+      - release-build
+    permissions:
+      # IMPORTANT: this permission is mandatory for trusted publishing
+      id-token: write
+    environment:
+      name: pypi
+      url: https://pypi.org/p/mutcleaner
+      #
+      # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
+      # ALTERNATIVE: exactly, uncomment the following line instead:
+      # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
+    steps:
+      - name: Retrieve release distributions
+        uses: actions/download-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+      - name: Publish release distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          packages-dir: dist/
+          password: ${{ secrets.PYPI_API_TOKEN }}

mutcleaner-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,65 @@
+# Editor temporary/working/backup files #
+#########################################
+.#*
+*\#*\#
+[#]*#
+*~
+*$
+*.bak
+*flymake*
+*.iml
+*.kdev4
+*.log
+*.swp
+*.pdb
+*.zip
+.project
+.pydevproject
+.settings
+.idea
+.vagrant
+.noseids
+.ipynb_checkpoints
+.tags
+.cache/
+.vscode/
+# Python files #
+################
+__pycache__/
+# pytest
+/.pytest_cache
+# egg metadata
+*.egg-info
+*.eggs
+# coverage
+.coverage
+coverage.xml
+coverage_html_report
+htmlcov
+# Build artifacts #
+###################
+dist/
+# Database files #
+##################
+tests/*.pkl
+# Doc files #
+#############
+doc/build/
+**/.doctrees/
+doc/source/api/generated/
+doc/source/generated/
+# Datasets and outputs #
+########################
+dataset/
+clean_dataset/
+outputs/
+validation/
+# Logs #
+########
+logs/

mutcleaner-0.1.0/CONTRIBUTING.md ADDED Viewed

@@ -0,0 +1,151 @@
+# Contributing Guide
+Thanks for considering contributing to this project! To keep things clean and maintainable, please follow the guidelines below.
+## Commit Message Convention
+We use the [Conventional Commits](https://www.conventionalcommits.org/) standard:
+```
+<type>(<scope>): <summary>
+<body>
+```
+- `type`: The category of the change (see below)
+- `scope`: The part of the codebase the change affects (optional but recommended)
+- `summary`: A short, imperative sentence describing the change
+- `body`: (optional) A detailed explanation of the change and its reasoning
+### Common Types
+| Type     | Description                                         |
+| -------- | --------------------------------------------------- |
+| feat     | A new feature                                       |
+| fix      | A bug fix                                           |
+| docs     | Documentation changes only                          |
+| style    | Code style changes (formatting, etc.)               |
+| refactor | Code changes that neither fix bugs nor add features |
+| test     | Adding or updating tests                            |
+| chore    | Maintenance tasks (build process, tools, etc.)      |
+| perf     | Performance improvements                            |
+| ci       | Changes to CI/CD configuration                      |
+| build    | Changes to build system or dependencies             |
+| revert   | Reverting a previous commit                         |
+### Example
+```
+feat(cli): add --dry-run flag to simulate deletion
+This adds a new `--dry-run` option to the CLI. When enabled, the command will simulate
+file deletions without actually removing them. Useful for debugging large batch runs.
+Closes #42
+```
+## Changelog Policy
+Generate a `changelog.md` file every time we publish a new release:
+- Before **each release** (beta or stable), please update `tools/generate_changelog.sh` and run:
+```bash
+cd path/to/MutCleaner
+bash tools/generate_changelog.sh
+```
+To avoid cluttering the `main` branch with release artifacts,
+changelog updates are typically made on a `release/*` branch.
+If desired, you can manually merge only the `changelog.md` back
+into `main` using `git checkout release/* -- CHANGELOG.md`.
+## Tips
+- Use `git commit -m "type(scope): summary" -m "body"` for multi-line messages.
+- Use `git add -p` to split large commits into atomic ones.
+- Use `git tag vX.Y.Z` to prepare for releases.
+## Squash Commit Guidelines
+When using squash-and-merge (especially via GitHub UI), all individual commits in a feature branch are combined into a single commit. Please follow this format for the final squash commit message:
+```text
+<type>(<scope>): <summary>
+<body>
+```
+- Follow the same Conventional Commits format.
+- The body can include bullet points summarizing the key changes if the branch includes multiple related edits.
+### Example
+```text
+feat(parser): add support for multi-mutation parsing
+- Added helper to parse comma-separated mutations
+- Integrated fallback validator
+- Updated tests for multi-mutation strings
+```
+You can edit the squash message in the GitHub UI before confirming the merge.
+## Full Release Workflow
+Here is a complete step-by-step workflow for submitting code from a feature branch, preparing a release, and generating a changelog:
+```bash
+# >>> Fix bugs or Add feats >>>
+# Check your branch before starting
+git branch
+# Develop on feature branch
+git checkout -b feature/my-feature
+# edit files...
+git commit -m "feat(core): ..."
+git push origin feature/my-feature
+# For Manager: open a pull request to merge the feature branch into main
+# After code review and testing, use squash and merge in GitHub UI.
+# Or you can do it in command line:
+git checkout main
+git pull origin main
+## squash merge
+git merge --squash feature/my-feature
+## squash commit
+git commit -m "See 'Squash Commit Guidelines' for details."
+git push origin main
+# After squash-and-merge, sync your local main branch
+git checkout main
+git pull origin main
+# >>> The instructions below are for maintainers preparing a release >>>
+# Generate changelog when a new version is ready
+## See Changelog Policy for details
+bash tools/generate_changelog.sh
+## Commit the changelog
+git checkout main
+git add path/to/changelog.md  # in doc/changelog/
+git commit -m "docs(changelog): update main after version release."  # replace with real version
+## tag the release
+git tag version-tag  # replace with real version
+git push origin version-tag  # replace with real version
+git tag -l
+git show v0.1.0-beta
+```
+## Resources
+- [Conventional Commits](https://www.conventionalcommits.org/)
+- [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
+- [Semantic Versioning](https://semver.org/)
+- [GitHub Releases Guide](https://docs.github.com/en/repositories/releasing-projects-on-github)
+---
+Thank you for contributing! 🙌

mutcleaner-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,28 @@
+BSD 3-Clause License
+Copyright (c) 2026, Yuxiang Tang and Ziyu Shi.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

mutcleaner-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,296 @@
+Metadata-Version: 2.4
+Name: mutcleaner
+Version: 0.1.0
+Summary: An efficient framework for cleaning, standardizing, and processing biological mutation data.
+Author: Yuxiang Tang, Ziyu Shi
+License-Expression: BSD-3-Clause
+Project-URL: Repository, https://github.com/xulab-research/MutCleaner
+Keywords: mutation,sequence,bioinformatics,data-cleaning,standardization,protein,codon,DNA,RNA,DMS
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Operating System :: OS Independent
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Typing :: Typed
+Requires-Python: >=3.13
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: joblib>=1.5.0
+Requires-Dist: numpy>=2.1.0
+Requires-Dist: pandas>=2.2.0
+Requires-Dist: tqdm>=4.60.0
+Requires-Dist: requests>=2.30
+Requires-Dist: openpyxl>=3.1.0
+Provides-Extra: test
+Requires-Dist: pytest>=8.0.0; extra == "test"
+Requires-Dist: pytest-cov>=6.0.0; extra == "test"
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
+Requires-Dist: sphinx>=8.0.0; extra == "dev"
+Requires-Dist: sphinx-autobuild>=2024.10.0; extra == "dev"
+Requires-Dist: pydata-sphinx-theme; extra == "dev"
+Requires-Dist: twine>=6.0.0; extra == "dev"
+Requires-Dist: gitpython; extra == "dev"
+Requires-Dist: pygithub; extra == "dev"
+Requires-Dist: jinja2; extra == "dev"
+Requires-Dist: numpydoc; extra == "dev"
+Requires-Dist: myst_parser; extra == "dev"
+Requires-Dist: sphinx_copybutton; extra == "dev"
+Requires-Dist: sphinx_design; extra == "dev"
+Requires-Dist: sphinx_autodoc_typehints; extra == "dev"
+Dynamic: license-file
+# MutCleaner
+[![License badge](https://img.shields.io/badge/License-BSD_3--Clause-yellow?logo=opensourceinitiative&logoColor=white)](https://github.com/xulab-research/MutCleaner/blob/main/LICENSE)
+[![PyPI version badge](https://img.shields.io/pypi/v/mutcleaner?logo=python&logoColor=white&color=orange)](https://pypi.org/project/mutcleaner/)
+[![Docs](https://github.com/xulab-research/MutCleaner/actions/workflows/docs.yml/badge.svg)](https://xulab-research.github.io/MutCleaner/)
+MutCleaner is an extensible Python toolkit for cleaning and standardizing biological mutation datasets, integrating dataset-specific cleaning pipelines with core abstractions for protein, nucleotide, and codon-level mutation representations.
+* **Documentation**: https://xulab-research.github.io/MutCleaner/
+* **Cleaning Examples**: https://xulab-research.github.io/MutCleaner/user_guide/cleaners.html
+## Overview
+MutCleaner is an extensible Python toolkit for cleaning, standardizing, and analyzing biological mutation datasets. It currently focuses on protein variant data while providing core abstractions for DNA, RNA, protein sequences, and codon-level mutation representations.
+The package combines dataset-specific cleaning pipelines with reusable sequence and mutation utilities, enabling reproducible preprocessing of large-scale mutational datasets for downstream bioinformatics and machine learning analyses.
+### Key Capabilities
+- **Mutation dataset cleaning and standardization**: Harmonize mutation annotations, sequences, labels, and metadata across heterogeneous biological mutation datasets.
+- **Sequence representation and validation**: Utilities for DNA, RNA, and protein sequences, including validation, transcription, reverse transcription, translation, and mutation application.
+- **Mutation parsing and transformation**: Tools for parsing amino-acid and codon-level mutations, inferring mutations from sequences, applying mutations to reference sequences, and converting codon mutations into amino-acid changes.
+- **Modular pipeline architecture**: A composable pipeline interface for building reproducible dataset-cleaning workflows.
+- **Parallel and scalable dataset processing**: Multi-core utilities for mutation validation, mutation application, and sequence-based mutation inference, supporting efficient processing of large tabular mutation datasets.
+## Installation
+### Requirements
+- Python 3.13+
+- Dependencies are automatically installed via pip.
+### Install via pip
+```bash
+pip install mutcleaner
+```
+### Development Installation
+```bash
+git clone https://github.com/xulab-research/MutCleaner.git MutCleaner
+cd MutCleaner
+pip install -e .
+```
+To install development dependencies for testing and documentation:
+```bash
+pip install -e ".[dev]"
+```
+## Quick Start
+See the [Data Cleaners Usage Guide](https://xulab-research.github.io/MutCleaner/user_guide/cleaners.html) for more examples.
+### Supported Datasets
+| Dataset Name | Reference | File |
+| --- | --- | --- |
+| Human Domainome Dataset | [Site-saturation mutagenesis of 500 human protein domains](https://doi.org/10.1038/s41586-024-08370-4) | [SupplementaryTable2.txt](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/Human_Domainome_Dataset/SupplementaryTable2.txt), [wild_type.fasta](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/Human_Domainome_Dataset/wild_type.fasta) |
+| ProteinGym DMS Substitutions Dataset | [ProteinGym: Large-Scale Benchmarks for Protein Design and Fitness Prediction](https://doi.org/10.1101/2023.12.07.570727) | [DMS_ProteinGym_substitutions.zip](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/ProteinGym_DMS_Substitutions_Dataset/DMS_ProteinGym_substitutions.zip) |
+| cDNA Proteolysis Dataset | [Mega-scale experimental analysis of protein folding stability in biology and design](https://doi.org/10.1038/s41586-023-06328-6) | [Tsuboyama2023_Dataset2_Dataset3_20230416.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/cDNA_Proteolysis_Dataset/Tsuboyama2023_Dataset2_Dataset3_20230416.csv) |
+| ΔΔG Dataset | [Improving the prediction of protein stability changes upon mutations by geometric learning and a pre-training strategy](https://doi.org/10.1038/s43588-024-00716-2) | [M1261.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94%CE%94G_Dataset/M1261.csv), [S461.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94%CE%94G_Dataset/S461.csv), [S669.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94%CE%94G_Dataset/S669.csv), [S783.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94%CE%94G_Dataset/S783.csv), [S8754.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94%CE%94G_Dataset/S8754.csv) |
+| ΔTm Dataset | [Improving the prediction of protein stability changes upon mutations by geometric learning and a pre-training strategy](https://doi.org/10.1038/s43588-024-00716-2) | [S4346.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94Tm_Dataset/S4346.csv), [S571.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/%CE%94Tm_Dataset/S571.csv) |
+| ArchStabMS1E10 Epistasis Dataset | [The genetic architecture of protein stability](https://doi.org/10.1038/s41586-024-07966-0) | [ArchStabMS1E10_Epistasis_Dataset.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/ArchStabMS1E10_Epistasis_Dataset/ArchStabMS1E10_Epistasis_Dataset.csv) |
+| Antitoxin ParD3 Epistasis Dataset | [Protein design using structure-based residue preferences](https://doi.org/10.1038/s41467-024-45621-4) | [Antitoxin_ParD3_Epistasis_Dataset.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/Antitoxin_ParD3_Epistasis_Dataset/Antitoxin_ParD3_Epistasis_Dataset.csv) |
+| TrpB Epistasis Dataset | [A combinatorially complete epistatic fitness landscape in an enzyme active site](https://doi.org/10.1073/pnas.2400439121) | [TrpB_Epistasis_Dataset.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/TrpB_Epistasis_Dataset/TrpB_Epistasis_Dataset.csv) |
+| Human Myoglobin Epistasis Dataset | [Decoding Stability and Epistasis in Human Myoglobin by Deep Mutational Scanning and Codon-level Machine Learning](https://doi.org/10.1101/2024.02.24.581358) | [Human_Myoglobin_Epistasis_Dataset.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/Human_Myoglobin_Epistasis_Dataset/Human_Myoglobin_Epistasis_Dataset.csv) |
+| CTXM Epistasis Dataset | [Network of epistatic interactions in an enzyme active site revealed by DMS](https://doi.org/10.1073/pnas.2313513121) | [CTXM_Cefotaxime_Epistasis_Dataset.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/CTXM_Epistasis_Dataset/CTXM_Cefotaxime_Epistasis_Dataset.csv), [CTXM_Ampicillin_Epistasis_Dataset.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/CTXM_Epistasis_Dataset/CTXM_Ampicillin_Epistasis_Dataset.csv) |
+| RBD ACE2 Dataset | [Shifting mutational constraints in the SARS-CoV-2 receptor-binding domain during viral evolution](https://doi.org/10.1126/science.abo7896), [Deep mutational scans for ACE2 binding, RBD expression, and antibody escape in the SARS-CoV-2 Omicron BA.1 and BA.2 receptor-binding domains](https://doi.org/10.1371/journal.ppat.1010951), [Deep mutational scans of XBB.1.5 and BQ.1.1 reveal ongoing epistatic drift during SARS-CoV-2 evolution](https://doi.org/10.1371/journal.ppat.1011901), [Deep mutational scanning of SARS-CoV-2 Omicron BA.2.86 and epistatic emergence of the KP.3 variant](https://doi.org/10.1093/ve/veae067) | [SARS-CoV-2-RBD_DMS_variants_bc_binding.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_ACE2_Dataset/SARS-CoV-2-RBD_DMS_variants_bc_binding.csv), [SARS-CoV-2-RBD_Delta_bc_binding.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_ACE2_Dataset/SARS-CoV-2-RBD_Delta_bc_binding.csv), [SARS-CoV-2-RBD_DMS_Omicron_bc_binding.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_ACE2_Dataset/SARS-CoV-2-RBD_DMS_Omicron_bc_binding.csv), [SARS-CoV-2-RBD_DMS_Omicron-XBB-BQ_bc_binding.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_ACE2_Dataset/SARS-CoV-2-RBD_DMS_Omicron-XBB-BQ_bc_binding.csv), [SARS-CoV-2-RBD_DMS_Omicron-EG5-FLip-BA286_bc_binding.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_ACE2_Dataset/SARS-CoV-2-RBD_DMS_Omicron-EG5-FLip-BA286_bc_binding.csv) |
+| RBD Antibody Dataset | [Antibodies elicited by mRNA-1273 vaccination bind more broadly to the receptor binding domain than do those from SARS-CoV-2 infection](https://doi.org/10.1126/scitranslmed.abi9915), [Comprehensive mapping of mutations in the SARS-CoV-2 receptor-binding domain that affect recognition by polyclonal human plasma antibodies](https://doi.org/10.1016/j.chom.2021.02.003), [Mapping mutations to the SARS-CoV-2 RBD that escape binding by different classes of antibodies](https://doi.org/10.1038/s41467-021-24435-8), [Genetic and structural basis for SARS-CoV-2 variant neutralization by a two-antibody cocktail](https://doi.org/10.1038/s41564-021-00972-2), [SARS-CoV-2 RBD antibodies that maximize breadth and resistance to escape](https://doi.org/10.1038/s41586-021-03807-6), [Prospective mapping of viral mutations that escape antibodies used to treat COVID-19](https://doi.org/10.1126/science.abf9302) | [SARS-CoV-2-RBD_MAP_Moderna_scores.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_Antibody_Dataset/SARS-CoV-2-RBD_MAP_Moderna_scores.csv), [SARS-CoV-2-RBD_MAP_HAARVI_sera_scores.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_Antibody_Dataset/SARS-CoV-2-RBD_MAP_HAARVI_sera_scores.csv), [SARS-CoV-2-RBD_MAP_Rockefeller_scores.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_Antibody_Dataset/SARS-CoV-2-RBD_MAP_Rockefeller_scores.csv), [SARS-CoV-2-RBD_MAP_AZ_Abs_scores.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_Antibody_Dataset/SARS-CoV-2-RBD_MAP_AZ_Abs_scores.csv), [SARS-CoV-2-RBD_MAP_Vir_mAbs_scores.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_Antibody_Dataset/SARS-CoV-2-RBD_MAP_Vir_mAbs_scores.csv), [SARS-CoV-2-RBD_MAP_clinical_Abs_scores.csv](https://huggingface.co/datasets/xulab-research/MutCleaner/blob/main/RBD_Antibody_Dataset/SARS-CoV-2-RBD_MAP_clinical_Abs_scores.csv) |
+### Processing cDNA Proteolysis Dataset
+Here's a complete example demonstrating MutCleaner's capabilities with the cDNA Proteolysis mutation dataset:
+```python
+from mutcleaner import cdna_proteolysis_cleaner
+from mutcleaner import download_cdna_proteolysis_source_file
+# Create a cDNA Proteolysis cleaning pipeline using MutCleaner's default pipeline.
+cdna_proteolysis_filepath = download_cdna_proteolysis_source_file(
+    "dir_path",
+    "file_name",
+)["filename"]
+cdna_proteolysis_cleaning_pipeline = cdna_proteolysis_cleaner.create_cdna_proteolysis_cleaner(
+    cdna_proteolysis_filepath,
+)
+# Clean and process the dataset.
+cdna_proteolysis_cleaning_pipeline, cdna_proteolysis_dataset = (
+    cdna_proteolysis_cleaner.clean_cdna_proteolysis_dataset(
+        cdna_proteolysis_cleaning_pipeline,
+    )
+)
+# Save the processed dataset.
+cdna_proteolysis_dataset.save("output/cleaned_cdna_proteolysis_data")
+```
+### Basic Sequence Operations
+```python
+from mutcleaner.core.sequence import DNASequence
+# DNA sequence analysis.
+dna = DNASequence("ATGCGATCGTAA")
+print(f"Reverse complement: {dna.reverse_complement()}")
+print(f"Transcription: {dna.transcribe()}")
+print(f"Translation: {dna.translate()}")
+```
+## Core Features
+### Sequence Data Manipulation
+- **Sequence validation**: Validate DNA, RNA, and protein sequences against predefined alphabets.
+- **Sequence transformation**: Support transcription, reverse transcription, translation, and reverse-complement operations.
+- **Batch processing**: Process large tabular mutation datasets through reusable cleaning utilities.
+### Mutation Analysis
+- **Mutation parsing**: Parse amino-acid and codon-level mutation annotations.
+- **Mutation inference**: Infer mutation annotations by comparing reference and mutated sequences.
+- **Mutation transformation**: Apply mutation annotations to reference sequences and convert codon-level mutations into amino-acid changes.
+### Data Cleaning and Preprocessing
+- **Standardization**: Harmonize mutation names, sequences, labels, and metadata across heterogeneous datasets.
+- **Duplicate handling**: Remove or aggregate redundant mutation records according to dataset-specific rules.
+- **Dataset-specific cleaners**: Provide reusable cleaning pipelines for commonly used mutation datasets.
+### Pipeline Architecture
+- **Modular design**: Compose cleaning workflows from reusable processing components.
+- **Parallel processing**: Use multi-core processing for mutation validation, mutation application, and sequence-based mutation inference.
+- **Progress tracking**: Monitor long-running cleaning tasks with progress bars and structured execution summaries.
+## Examples and Use Cases
+### Custom Processing Pipeline
+```python
+import pandas as pd
+from mutcleaner.cleaners.basic_cleaners import (
+    extract_and_rename_columns,
+    filter_and_clean_data,
+    convert_data_types,
+    validate_mutations,
+    convert_to_mutation_dataset_format,
+)
+from mutcleaner.cleaners.cdna_proteolysis_custom_cleaners import (
+    validate_wt_sequence,
+    average_labels_by_name,
+    subtract_labels_by_wt,
+)
+from mutcleaner.core.dataset import MutationDataset
+from mutcleaner.core.pipeline import create_pipeline
+dataset = pd.read_csv("path/to/Tsuboyama2023_Dataset2_Dataset3_20230416.csv")
+pipeline = create_pipeline(dataset, "cdna_proteolysis_cleaner")
+clean_result = (
+    pipeline.then(
+        extract_and_rename_columns,
+        column_mapping={
+            "WT_name": "name",
+            "aa_seq": "mut_seq",
+            "mut_type": "mut_info",
+            "ddG_ML": "ddG",
+        },
+    )
+    .then(filter_and_clean_data, filters={"ddG": lambda x: x != "-"})
+    .then(convert_data_types, type_conversions={"ddG": "float"})
+    .then(
+        validate_mutations,
+        mutation_column="mut_info",
+        mutation_sep="_",
+        is_zero_based=False,
+        num_workers=16,
+    )
+    .then(
+        average_labels_by_name,
+        name_columns=("name", "mut_info"),
+        label_columns="ddG",
+    )
+    .then(
+        validate_wt_sequence,
+        name_column="name",
+        mutation_column="mut_info",
+        sequence_column="mut_seq",
+        wt_identifier="wt",
+        num_workers=16,
+    )
+    .then(
+        subtract_labels_by_wt,
+        name_column="name",
+        label_columns="ddG",
+        mutation_column="mut_info",
+        in_place=True,
+    )
+    .then(
+        convert_to_mutation_dataset_format,
+        name_column="name",
+        mutation_column="mut_info",
+        mutated_sequence_column="mut_seq",
+        score_column="ddG",
+        is_zero_based=True,
+    )
+)
+cdna_proteolysis_dataset_df, cdna_proteolysis_ref_seq = clean_result.data
+cdna_proteolysis_dataset = MutationDataset.from_dataframe(
+    cdna_proteolysis_dataset_df,
+    cdna_proteolysis_ref_seq,
+)
+execution_info = pipeline.get_execution_summary()
+artifacts = pipeline.artifacts
+pipeline.save_structured_data("cdna_proteolysis_cleaner_pipeline.pkl")
+```
+## Citation
+If you use MutCleaner in your research, please cite:
+```bibtex
+@software{mutcleaner,
+  title={
+    MutCleaner: An efficient framework for cleaning, standardizing, and processing biological mutation data.
+  },
+  author={Yuxiang Tang and Ziyu Shi},
+  year={2026},
+  url={https://github.com/xulab-research/MutCleaner}
+}
+```
+## License
+This project is licensed under the BSD 3-Clause License. See the [LICENSE](LICENSE) file for details.
+## Support
+- **Issues**: [GitHub Issues](https://github.com/xulab-research/MutCleaner/issues)
+- **Discussions**: [GitHub Discussions](https://github.com/xulab-research/MutCleaner/discussions)