mpath-pseudotime 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mpath_pseudotime-0.2.0/.github/workflows/ci.yml +29 -0
- mpath_pseudotime-0.2.0/.github/workflows/release-please.yml +45 -0
- mpath_pseudotime-0.2.0/.github/workflows/release.yml +115 -0
- mpath_pseudotime-0.2.0/.github/workflows/tests.yml +40 -0
- mpath_pseudotime-0.2.0/.gitignore +18 -0
- mpath_pseudotime-0.2.0/.pre-commit-config.yaml +9 -0
- mpath_pseudotime-0.2.0/.release-please-manifest.json +3 -0
- mpath_pseudotime-0.2.0/CHANGELOG.md +14 -0
- mpath_pseudotime-0.2.0/Dockerfile +25 -0
- mpath_pseudotime-0.2.0/LICENSE +21 -0
- mpath_pseudotime-0.2.0/PKG-INFO +286 -0
- mpath_pseudotime-0.2.0/README.md +251 -0
- mpath_pseudotime-0.2.0/examples/nextflow/README.md +48 -0
- mpath_pseudotime-0.2.0/examples/nextflow/main.nf +127 -0
- mpath_pseudotime-0.2.0/examples/nextflow/nextflow.config +42 -0
- mpath_pseudotime-0.2.0/examples/pipeline.md +144 -0
- mpath_pseudotime-0.2.0/matlab/NascMatur_PCA_scatterhistogram.m +232 -0
- mpath_pseudotime-0.2.0/matlab/README.md +16 -0
- mpath_pseudotime-0.2.0/pyproject.toml +67 -0
- mpath_pseudotime-0.2.0/release-please-config.json +12 -0
- mpath_pseudotime-0.2.0/src/mpath/__init__.py +3 -0
- mpath_pseudotime-0.2.0/src/mpath/cli.py +154 -0
- mpath_pseudotime-0.2.0/src/mpath/io.py +417 -0
- mpath_pseudotime-0.2.0/src/mpath/metrics.py +392 -0
- mpath_pseudotime-0.2.0/src/mpath/pca.py +249 -0
- mpath_pseudotime-0.2.0/tests/_make_fixtures.py +93 -0
- mpath_pseudotime-0.2.0/tests/conftest.py +30 -0
- mpath_pseudotime-0.2.0/tests/data/mature_metrics.csv +31 -0
- mpath_pseudotime-0.2.0/tests/data/modkit_calls.tsv +259 -0
- mpath_pseudotime-0.2.0/tests/data/nascent_metrics.csv +31 -0
- mpath_pseudotime-0.2.0/tests/data/wgbs.bed +120 -0
- mpath_pseudotime-0.2.0/tests/test_io.py +177 -0
- mpath_pseudotime-0.2.0/tests/test_metrics.py +94 -0
- mpath_pseudotime-0.2.0/tests/test_pca.py +66 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
# Fast gate on every push/PR. Only linting runs here -- the (heavier) test
|
|
4
|
+
# matrix lives in tests.yml and is deliberately gated to the release-please PR
|
|
5
|
+
# so we don't pay for it on every push.
|
|
6
|
+
on:
|
|
7
|
+
push:
|
|
8
|
+
branches: [main]
|
|
9
|
+
pull_request:
|
|
10
|
+
workflow_dispatch:
|
|
11
|
+
|
|
12
|
+
concurrency:
|
|
13
|
+
group: ci-${{ github.ref }}
|
|
14
|
+
cancel-in-progress: true
|
|
15
|
+
|
|
16
|
+
permissions:
|
|
17
|
+
contents: read
|
|
18
|
+
|
|
19
|
+
jobs:
|
|
20
|
+
lint:
|
|
21
|
+
name: ruff
|
|
22
|
+
runs-on: ubuntu-22.04
|
|
23
|
+
steps:
|
|
24
|
+
- uses: actions/checkout@v5
|
|
25
|
+
- uses: actions/setup-python@v6
|
|
26
|
+
with:
|
|
27
|
+
python-version: "3.12"
|
|
28
|
+
- run: pip install ruff
|
|
29
|
+
- run: ruff check
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
name: release-please
|
|
2
|
+
|
|
3
|
+
# Maintains a rolling "release PR" that bumps the version (in
|
|
4
|
+
# src/mpath/__init__.py, the single source of truth) and writes CHANGELOG.md from
|
|
5
|
+
# Conventional Commit messages (feat:, fix:, ...). Merging that PR creates the
|
|
6
|
+
# GitHub Release + v* tag and then *calls* release.yml directly.
|
|
7
|
+
#
|
|
8
|
+
# We call release.yml via `uses:` (workflow_call) instead of relying on the
|
|
9
|
+
# pushed tag: a tag pushed by the default GITHUB_TOKEN does NOT trigger other
|
|
10
|
+
# workflows, so the call keeps publishing reliable with no PAT.
|
|
11
|
+
on:
|
|
12
|
+
push:
|
|
13
|
+
branches: [main]
|
|
14
|
+
workflow_dispatch:
|
|
15
|
+
|
|
16
|
+
permissions:
|
|
17
|
+
contents: write
|
|
18
|
+
pull-requests: write
|
|
19
|
+
|
|
20
|
+
jobs:
|
|
21
|
+
release-please:
|
|
22
|
+
runs-on: ubuntu-22.04
|
|
23
|
+
outputs:
|
|
24
|
+
release_created: ${{ steps.rp.outputs.release_created }}
|
|
25
|
+
tag_name: ${{ steps.rp.outputs.tag_name }}
|
|
26
|
+
steps:
|
|
27
|
+
- uses: googleapis/release-please-action@v5
|
|
28
|
+
id: rp
|
|
29
|
+
with:
|
|
30
|
+
config-file: release-please-config.json
|
|
31
|
+
manifest-file: .release-please-manifest.json
|
|
32
|
+
|
|
33
|
+
publish:
|
|
34
|
+
needs: release-please
|
|
35
|
+
if: needs.release-please.outputs.release_created == 'true'
|
|
36
|
+
permissions:
|
|
37
|
+
contents: write # attach artifacts to the GitHub Release
|
|
38
|
+
id-token: write # PyPI Trusted Publishing (OIDC) + provenance attestation
|
|
39
|
+
attestations: write
|
|
40
|
+
packages: write # push the Docker image to GHCR
|
|
41
|
+
uses: ./.github/workflows/release.yml
|
|
42
|
+
with:
|
|
43
|
+
publish: true
|
|
44
|
+
tag: ${{ needs.release-please.outputs.tag_name }}
|
|
45
|
+
secrets: inherit
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
name: build & publish
|
|
2
|
+
|
|
3
|
+
# Builds the pure-Python wheel (py3-none-any) + sdist and publishes them.
|
|
4
|
+
# Triggered three ways:
|
|
5
|
+
# - workflow_call (from release-please.yml, with publish: true) -> build + publish
|
|
6
|
+
# - tag push vX.Y.Z (manual `git tag`) -> build + publish
|
|
7
|
+
#   - workflow_dispatch -> build only (dry run) unless the `publish` input is set to true
|
|
8
|
+
on:
|
|
9
|
+
push:
|
|
10
|
+
tags: ["v*"]
|
|
11
|
+
workflow_dispatch:
|
|
12
|
+
inputs:
|
|
13
|
+
publish:
|
|
14
|
+
description: "Publish to PyPI (otherwise build-only dry run)"
|
|
15
|
+
type: boolean
|
|
16
|
+
default: false
|
|
17
|
+
workflow_call:
|
|
18
|
+
inputs:
|
|
19
|
+
publish:
|
|
20
|
+
type: boolean
|
|
21
|
+
default: false
|
|
22
|
+
tag:
|
|
23
|
+
description: "Tag to attach built artifacts to on the GitHub Release"
|
|
24
|
+
type: string
|
|
25
|
+
default: ""
|
|
26
|
+
|
|
27
|
+
permissions:
|
|
28
|
+
contents: read
|
|
29
|
+
|
|
30
|
+
jobs:
|
|
31
|
+
build:
|
|
32
|
+
name: build wheel + sdist
|
|
33
|
+
runs-on: ubuntu-22.04
|
|
34
|
+
permissions:
|
|
35
|
+
contents: read
|
|
36
|
+
id-token: write # build provenance attestation
|
|
37
|
+
attestations: write
|
|
38
|
+
steps:
|
|
39
|
+
- uses: actions/checkout@v5
|
|
40
|
+
- uses: actions/setup-python@v6
|
|
41
|
+
with:
|
|
42
|
+
python-version: "3.12"
|
|
43
|
+
- name: Build
|
|
44
|
+
run: |
|
|
45
|
+
pip install build
|
|
46
|
+
python -m build
|
|
47
|
+
- name: Attest build provenance
|
|
48
|
+
uses: actions/attest-build-provenance@v4
|
|
49
|
+
with:
|
|
50
|
+
subject-path: dist/*
|
|
51
|
+
- uses: actions/upload-artifact@v6
|
|
52
|
+
with:
|
|
53
|
+
name: dist
|
|
54
|
+
path: dist
|
|
55
|
+
|
|
56
|
+
publish:
|
|
57
|
+
name: publish to PyPI
|
|
58
|
+
needs: build
|
|
59
|
+
if: ${{ startsWith(github.ref, 'refs/tags/') || inputs.publish }}
|
|
60
|
+
runs-on: ubuntu-22.04
|
|
61
|
+
environment: pypi
|
|
62
|
+
permissions:
|
|
63
|
+
contents: write # attach artifacts to the GitHub Release
|
|
64
|
+
id-token: write # PyPI Trusted Publishing (OIDC) -- no API token/secret needed
|
|
65
|
+
steps:
|
|
66
|
+
- uses: actions/download-artifact@v7
|
|
67
|
+
with:
|
|
68
|
+
name: dist
|
|
69
|
+
path: dist
|
|
70
|
+
- name: Attach wheel + sdist to the GitHub Release
|
|
71
|
+
uses: softprops/action-gh-release@v3
|
|
72
|
+
with:
|
|
73
|
+
tag_name: ${{ inputs.tag || github.ref_name }}
|
|
74
|
+
files: dist/*
|
|
75
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
76
|
+
with:
|
|
77
|
+
packages-dir: dist
|
|
78
|
+
# Disable PEP 740 attestations: this runs as a reusable workflow called
|
|
79
|
+
# by release-please.yml, so the attestation's signing identity wouldn't
|
|
80
|
+
# match the PyPI Trusted Publisher and gets rejected. Trusted-publishing
|
|
81
|
+
# AUTH is unaffected; GitHub provenance is still recorded in the build job.
|
|
82
|
+
attestations: false
|
|
83
|
+
|
|
84
|
+
docker:
|
|
85
|
+
name: build & push image (GHCR)
|
|
86
|
+
needs: [publish]
|
|
87
|
+
if: ${{ startsWith(github.ref, 'refs/tags/') || inputs.publish }}
|
|
88
|
+
runs-on: ubuntu-22.04
|
|
89
|
+
permissions:
|
|
90
|
+
contents: read
|
|
91
|
+
packages: write # push to GitHub Container Registry
|
|
92
|
+
steps:
|
|
93
|
+
- uses: actions/checkout@v5
|
|
94
|
+
- name: Resolve version from tag
|
|
95
|
+
id: ver
|
|
96
|
+
run: |
|
|
97
|
+
TAG="${{ inputs.tag || github.ref_name }}"
|
|
98
|
+
echo "version=${TAG#v}" >> "$GITHUB_OUTPUT"
|
|
99
|
+
- uses: docker/setup-qemu-action@v4
|
|
100
|
+
- uses: docker/setup-buildx-action@v4
|
|
101
|
+
- uses: docker/login-action@v4
|
|
102
|
+
with:
|
|
103
|
+
registry: ghcr.io
|
|
104
|
+
username: ${{ github.actor }}
|
|
105
|
+
password: ${{ secrets.GITHUB_TOKEN }}
|
|
106
|
+
- uses: docker/build-push-action@v7
|
|
107
|
+
with:
|
|
108
|
+
context: .
|
|
109
|
+
platforms: linux/amd64,linux/arm64
|
|
110
|
+
push: true
|
|
111
|
+
build-args: |
|
|
112
|
+
VERSION=${{ steps.ver.outputs.version }}
|
|
113
|
+
tags: |
|
|
114
|
+
ghcr.io/${{ github.repository_owner }}/mpath:${{ steps.ver.outputs.version }}
|
|
115
|
+
ghcr.io/${{ github.repository_owner }}/mpath:latest
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
name: tests
|
|
2
|
+
|
|
3
|
+
# The multi-Python test matrix runs ONLY on the release-please PR (and on manual
|
|
4
|
+
# dispatch), not on every push. release-please keeps an open PR from a branch
|
|
5
|
+
# named `release-please--branches--<base>`; gating on that head ref means the
|
|
6
|
+
# suite runs only for that PR -- right before a release is cut -- instead of on every
|
|
7
|
+
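# commit. Flip the `if:` below to broaden coverage if that ever feels too thin.
# NOTE(review): PRs opened with the default GITHUB_TOKEN do not trigger
# `pull_request` workflows -- confirm this gate actually fires for the release PR.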
|
|
8
|
+
on:
|
|
9
|
+
pull_request:
|
|
10
|
+
workflow_dispatch:
|
|
11
|
+
|
|
12
|
+
permissions:
|
|
13
|
+
contents: read
|
|
14
|
+
|
|
15
|
+
jobs:
|
|
16
|
+
test:
|
|
17
|
+
name: py${{ matrix.python }} ${{ matrix.numpy && format('/ {0}', matrix.numpy) || '' }}
|
|
18
|
+
if: ${{ github.event_name == 'workflow_dispatch' || startsWith(github.head_ref, 'release-please--branches--') }}
|
|
19
|
+
runs-on: ubuntu-22.04
|
|
20
|
+
strategy:
|
|
21
|
+
fail-fast: false
|
|
22
|
+
matrix:
|
|
23
|
+
python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
|
24
|
+
include:
|
|
25
|
+
# One extra cell to prove the package runs on the older numpy 1.x line.
|
|
26
|
+
- { python: "3.10", numpy: "numpy==1.26.*" }
|
|
27
|
+
steps:
|
|
28
|
+
- uses: actions/checkout@v5
|
|
29
|
+
- uses: actions/setup-python@v6
|
|
30
|
+
with:
|
|
31
|
+
python-version: ${{ matrix.python }}
|
|
32
|
+
- name: Pin numpy (compat cell only)
|
|
33
|
+
if: ${{ matrix.numpy }}
|
|
34
|
+
run: pip install "${{ matrix.numpy }}"
|
|
35
|
+
- name: Install package + test deps
|
|
36
|
+
run: pip install -e ".[dev]"
|
|
37
|
+
- name: Show resolved versions
|
|
38
|
+
run: python -c "import numpy, pandas, polars; print('numpy', numpy.__version__, '| pandas', pandas.__version__, '| polars', polars.__version__)"
|
|
39
|
+
- name: Run tests
|
|
40
|
+
run: pytest
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Mirrors CI's fast checks so issues surface at commit time, not on push.
|
|
2
|
+
# Optional -- install once with `pre-commit install` (pip install pre-commit).
|
|
3
|
+
# Anyone who skips it is still covered by CI.
|
|
4
|
+
repos:
|
|
5
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
6
|
+
rev: v0.6.9
|
|
7
|
+
hooks:
|
|
8
|
+
- id: ruff
|
|
9
|
+
args: [--fix]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.2.0](https://github.com/downinglab/mpath/compare/v0.1.0...v0.2.0) (2026-06-29)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Features
|
|
7
|
+
|
|
8
|
+
* add CI, pipeline, and standardize to Python ([eda22ee](https://github.com/downinglab/mpath/commit/eda22ee5c60d748d85cd4e759b7e57413ce0179f))
|
|
9
|
+
|
|
10
|
+
## Changelog
|
|
11
|
+
|
|
12
|
+
All notable changes to this project are documented here. This file is maintained
|
|
13
|
+
automatically by [release-please](https://github.com/googleapis/release-please)
|
|
14
|
+
from [Conventional Commit](https://www.conventionalcommits.org/) messages.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Reproducible, version-pinned image for running MPATH -- the artifact that
|
|
2
|
+
# Docker/Singularity-based pipelines (Nextflow, Snakemake, ...) consume.
|
|
3
|
+
#
|
|
4
|
+
# MPATH is a pure-Python package, so this just installs it from PyPI. The
|
|
5
|
+
# upstream preprocessing tools (dorado, modkit, samtools) are NOT included --
|
|
6
|
+
# they are large and version-sensitive; run them in their own images/steps and
|
|
7
|
+
# feed the resulting modkit calls + WGBS bed into `mpath metrics` here.
|
|
8
|
+
#
|
|
9
|
+
# Built and pushed to ghcr.io/<owner>/mpath:<version> by the release workflow.
|
|
10
|
+
# To build locally:
|
|
11
|
+
# docker build --build-arg VERSION=0.1.0 -t mpath:0.1.0 .
|
|
12
|
+
FROM python:3.12-slim
|
|
13
|
+
|
|
14
|
+
LABEL org.opencontainers.image.source="https://github.com/downinglab/mpath"
|
|
15
|
+
LABEL org.opencontainers.image.description="MPATH: methylation pseudotime analysis for Nanopore long reads"
|
|
16
|
+
LABEL org.opencontainers.image.licenses="MIT"
|
|
17
|
+
|
|
18
|
+
# Pin at build time. The release workflow passes the tag's version; an empty
|
|
19
|
+
# VERSION (local builds) installs the latest release from PyPI.
|
|
20
|
+
ARG VERSION=
|
|
21
|
+
RUN pip install --no-cache-dir "mpath-pseudotime${VERSION:+==${VERSION}}"
|
|
22
|
+
|
|
23
|
+
# Default to the CLI so `docker run <image> metrics ...` / `... pca ...` just work.
|
|
24
|
+
ENTRYPOINT ["mpath"]
|
|
25
|
+
CMD ["--help"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Downing Lab
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mpath-pseudotime
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: MPATH: methylation pseudotime analysis for Oxford Nanopore long reads
|
|
5
|
+
Project-URL: Homepage, https://github.com/downinglab/mpath
|
|
6
|
+
Project-URL: Repository, https://github.com/downinglab/mpath
|
|
7
|
+
Project-URL: Issues, https://github.com/downinglab/mpath/issues
|
|
8
|
+
Author: Nandor Laszik
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: bioinformatics,epigenetics,methylation,nanopore,pseudotime
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Requires-Dist: joblib
|
|
23
|
+
Requires-Dist: matplotlib
|
|
24
|
+
Requires-Dist: numpy
|
|
25
|
+
Requires-Dist: pandas
|
|
26
|
+
Requires-Dist: polars
|
|
27
|
+
Requires-Dist: scikit-learn
|
|
28
|
+
Requires-Dist: scipy
|
|
29
|
+
Requires-Dist: tqdm
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: build; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
33
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
# MPATH
|
|
37
|
+
|
|
38
|
+
Methylation pseudotime analysis for deciphering epigenetic cross-talk across sub-cell-cycle timescales.
|
|
39
|
+
|
|
40
|
+
MPATH calculates pseudotime for long-read methylation data from Oxford Nanopore
|
|
41
|
+
sequencing. The full flow of data is:
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
Nanopore BAM (MM/ML tags)
|
|
45
|
+
-> modkit extract calls (per-CpG methylation calls)
|
|
46
|
+
-> mpath metrics (read-level metrics; merges in the WGBS reference)
|
|
47
|
+
-> mpath pca fit / apply (PCA separation of nascent vs mature reads)
|
|
48
|
+
-> pseudotime score
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install mpath-pseudotime
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
This installs the `mpath` command-line tool and the `mpath` Python package. No
|
|
58
|
+
MATLAB required — the PCA has been ported to Python. (The original MATLAB PCA
|
|
59
|
+
script is retained, unchanged, under [`matlab/`](matlab/) for reproducibility of
|
|
60
|
+
the published figures.)
|
|
61
|
+
|
|
62
|
+
Requires Python ≥ 3.9. Dependencies (numpy, pandas, polars, scipy, joblib, tqdm,
|
|
63
|
+
matplotlib, scikit-learn) are installed automatically.
|
|
64
|
+
|
|
65
|
+
## Preliminary setup
|
|
66
|
+
|
|
67
|
+
Before MPATH, process the raw Nanopore data into per-CpG methylation calls. Exact
|
|
68
|
+
syntax depends on your dorado / modkit / samtools versions.
|
|
69
|
+
|
|
70
|
+
1. **Basecall + align** with a model that emits 5mCG tags, then sort:
|
|
71
|
+
```bash
|
|
72
|
+
dorado basecaller hac input.pod5 --modified-bases 5mCG_5hmCG > calls.bam
|
|
73
|
+
dorado aligner hg38.fa calls.bam > reads_aligned.bam
|
|
74
|
+
samtools sort reads_aligned.bam > reads_aligned_sorted.bam
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
2. **Remove chimeric reads** and index:
|
|
78
|
+
```bash
|
|
79
|
+
samtools view -b -F0x900 reads_aligned_sorted.bam > reads_nonchimeric.bam
|
|
80
|
+
samtools index reads_nonchimeric.bam
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
3. **Extract read-level methylation calls** with modkit (developed against
|
|
84
|
+
modkit v0.4.x). The `--read-calls` output is the file MPATH consumes; it can be
|
|
85
|
+
large, so gzip it (MPATH reads `.gz` directly):
|
|
86
|
+
```bash
|
|
87
|
+
modkit extract reads_nonchimeric.bam extract_full.tsv \
|
|
88
|
+
--read-calls calls_cpg.tsv \
|
|
89
|
+
--ref hg38.fa \
|
|
90
|
+
--include-bed CG_motifs.bed # optional: restrict to CpG-motif positions
|
|
91
|
+
pigz calls_cpg.tsv
|
|
92
|
+
```
|
|
93
|
+
(On newer modkit the equivalent is `modkit extract calls reads_nonchimeric.bam calls_cpg.tsv`.)
|
|
94
|
+
The output has a header row; MPATH resolves the columns it needs (`read_id`,
|
|
95
|
+
`chrom`, `ref_position`, `call_code`, `ref_strand`, and `fail` if present) **by
|
|
96
|
+
name**, so added/reordered columns across modkit versions are fine.
|
|
97
|
+
|
|
98
|
+
4. **(Optional) BrdU filtering.** For BrdU-labeled data, filter to BrdU-positive
|
|
99
|
+
reads first. BrdU can be called with [DNAscent](https://github.com/MBoemo/DNAscent);
|
|
100
|
+
average the per-read BrdU probabilities to score each read.
|
|
101
|
+
|
|
102
|
+
5. **Obtain a WGBS reference bed** — the expected methylation ratio for each CpG in
|
|
103
|
+
your cell type. You supply your own. The expected form is a simple tab-separated
|
|
104
|
+
bed; the common 4-column `chrom, start, end, ratio` works out of the box:
|
|
105
|
+
```
|
|
106
|
+
chrom start end ratio
|
|
107
|
+
chr1 1061 1062 0.823
|
|
108
|
+
```
|
|
109
|
+
The ratio column is selected with `-wgbs_column` (0-based; default `3`), and the
|
|
110
|
+
`start` must use the **same genome build** as modkit's `ref_position`. You do
|
|
111
|
+
**not** need to know the file's exact coordinate convention (0- vs 1-based, which
|
|
112
|
+
base of the CpG dyad/strand it anchors on) — MPATH **auto-probes** it (see below).
|
|
113
|
+
The one thing it can't infer is the ratio *scale* (`0.5` could be a ratio or a
|
|
114
|
+
rounded percent): ratios must be 0–1, or pass `--wgbs-scale 0-100` for percentages
|
|
115
|
+
(MPATH warns if your values look like percentages).
|
|
116
|
+
|
|
117
|
+
> **Why no manual `input.bed` anymore?** Earlier versions required hand-merging the
|
|
118
|
+
> methylation calls and the WGBS ratios into a single bed. `mpath metrics` now does
|
|
119
|
+
> that merge internally (joining on `chrom` + position), so you pass the modkit
|
|
120
|
+
> calls and the WGBS bed directly. (If you'd rather do the intersection yourself —
|
|
121
|
+
> e.g. with `bedtools` in a pipeline — feed a pre-merged bed via `-path_input_bed`;
|
|
122
|
+
> see *Pre-merged input* below.)
|
|
123
|
+
|
|
124
|
+
### Coordinate auto-probe + intersection QC
|
|
125
|
+
|
|
126
|
+
WGBS beds vary in convention and you usually can't tell from the file. So instead
|
|
127
|
+
of asking you to know, `mpath metrics` reads a sample of CpGs and **measures** the
|
|
128
|
+
match rate for each candidate coordinate offset (`-1/0/+1`) and strand-collapse
|
|
129
|
+
mode, then picks the best and prints what it found:
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
WGBS alignment probe (offset, collapse_strands -> match):
|
|
133
|
+
offset=+0 collapse_strands=True -> 100.0%
|
|
134
|
+
offset=+0 collapse_strands=False -> 53.3%
|
|
135
|
+
offset=-1 collapse_strands=False -> 46.7%
|
|
136
|
+
...
|
|
137
|
+
chosen: offset=+0, collapse_strands=True (100.0%)
|
|
138
|
+
WGBS intersection: matched 43630/43630 CpGs (100.0%); 0/609 reads had no WGBS overlap.
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
The correct convention reveals itself as a sharp jump in the matched fraction
|
|
142
|
+
(this is self-verifying, unlike the ratio scale). If even the best alignment
|
|
143
|
+
matches poorly, MPATH **warns** (it never aborts) and names the likely cause —
|
|
144
|
+
chromosome naming (`chr1` vs `1`), genome build, or the wrong ratio column/scale.
|
|
145
|
+
Force a specific convention with `--wgbs-offset` / `--wgbs-collapse` if you ever
|
|
146
|
+
need to override the probe.
|
|
147
|
+
|
|
148
|
+
### Pre-merged input (escape hatch)
|
|
149
|
+
|
|
150
|
+
If you prefer to do the CpG↔WGBS intersection yourself (e.g. `bedtools intersect`
|
|
151
|
+
inside a pipeline), skip the merge entirely and pass a pre-merged 7-column bed —
|
|
152
|
+
`chrom, start, stop, strand, read_id, methylation(0/1), wgbs_ratio`, grouped by
|
|
153
|
+
`read_id`:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
mpath metrics -path_input_bed merged.bed -path_output_csv metrics.csv -p 8
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
In this mode MPATH does no merge, probe, or QC — it just computes metrics on what
|
|
160
|
+
you provide. `-path_input_bed` is mutually exclusive with `-path_calls`/`-path_wgbs`.
|
|
161
|
+
|
|
162
|
+
## Metrics: `mpath metrics`
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
mpath metrics \
|
|
166
|
+
-path_calls calls_cpg.tsv.gz \
|
|
167
|
+
-path_wgbs wgbs.bed \
|
|
168
|
+
-wgbs_column 3 \
|
|
169
|
+
-path_output_csv metrics.csv \
|
|
170
|
+
-p 8 -min_cpgs 3 -bin_limits 0,100,1000,5000,10000 --use_full_matrix
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
For each read, MPATH compares all CpG pairs to produce a simple matching
|
|
174
|
+
coefficient (SMC), a uniformity score, and a Pearson correlation — overall, per
|
|
175
|
+
genomic-distance bin, and for nearest-neighbour CpGs.
|
|
176
|
+
|
|
177
|
+
### Arguments
|
|
178
|
+
|
|
179
|
+
| name | description | type | required | default |
|
|
180
|
+
|-----------------------|------------------------------------------------------|------|----------|-----------------------|
|
|
181
|
+
| `-path_calls` | modkit read-calls TSV (`.tsv` or `.tsv.gz`) | str | yes\* | — |
|
|
182
|
+
| `-path_wgbs` | WGBS reference bed (`.bed` or `.bed.gz`) | str | yes\* | — |
|
|
183
|
+
| `-path_input_bed` | pre-merged 7-col bed (alternative to calls+wgbs) | str | yes\* | — |
|
|
184
|
+
| `-path_output_csv` | output metrics CSV | str | yes | — |
|
|
185
|
+
| `-wgbs_column` | 0-based column of the WGBS ratio | int | no | 3 |
|
|
186
|
+
| `-min_cpgs` | minimum CpGs on a read to compute metrics | int | no | 3 |
|
|
187
|
+
| `-bin_limits` | comma-separated distance-bin limits | str | no | 0,100,1000,5000,10000 |
|
|
188
|
+
| `-batch_size` | approx CpGs per batch (controls RAM) | int | no | 1e8 |
|
|
189
|
+
| `-p` | number of parallel processes | int | no | 1 |
|
|
190
|
+
| `--use_full_matrix` | use the full CpG-pair matrix (vs one triangle) | flag | no | off |
|
|
191
|
+
| `--include-hydroxy` | count 5hmC (`h`) calls as methylated | flag | no | off |
|
|
192
|
+
| `--keep-unmatched-wgbs` | keep CpGs absent from the WGBS bed (NaN ratio) | flag | no | off |
|
|
193
|
+
| `--wgbs-scale` | scale of the WGBS ratio: `0-1` or `0-100` | str | no | 0-1 |
|
|
194
|
+
| `--wgbs-offset` | coordinate offset for the join: `auto`/`-1`/`0`/`1` | str | no | auto |
|
|
195
|
+
| `--wgbs-collapse` | map -strand CpGs to + dyad anchor: `auto`/`on`/`off` | str | no | auto |
|
|
196
|
+
|
|
197
|
+
\* Provide **either** `-path_calls` + `-path_wgbs` (on-the-fly merge) **or**
|
|
198
|
+
`-path_input_bed` (pre-merged); the two modes are mutually exclusive.
|
|
199
|
+
|
|
200
|
+
**Notes.** `-bin_limits 0,100,1000,5000,10000` defines bins
|
|
201
|
+
`[0,100], [100,1000], [1000,5000], [5000,10000]`. Larger `-batch_size` and more
|
|
202
|
+
`-p` processes use more RAM (multiplicatively); tune to your machine. By default
|
|
203
|
+
only CpGs present in the WGBS reference are used (inner join); pass
|
|
204
|
+
`--keep-unmatched-wgbs` to keep the rest. `--wgbs-offset`/`--wgbs-collapse` are
|
|
205
|
+
auto-probed (above) unless you force them.
|
|
206
|
+
|
|
207
|
+
### Output
|
|
208
|
+
|
|
209
|
+
A wide CSV: one row per read, with the per-read columns `read_id`,
|
|
210
|
+
`read_wgbs_distance` (mean squared difference of read vs WGBS ratio),
|
|
211
|
+
`read_meth_ratio`, followed by `<metric>_<bin>` columns for each metric
|
|
212
|
+
(`num_pairs`, `smc`, `uniformity`, `pearson_r`, `pearson_p`) and each bin
|
|
213
|
+
(`bin_all`, `bin_0` … `bin_N`, `bin_closest`).
|
|
214
|
+
|
|
215
|
+
> Not all metrics need to feed the PCA. Different combinations may give better
|
|
216
|
+
> nascent/mature separation — choose them with `mpath pca fit --columns`.
|
|
217
|
+
|
|
218
|
+
## PCA: `mpath pca`
|
|
219
|
+
|
|
220
|
+
Fit a PCA on labelled nascent + mature metric tables (run `mpath metrics`
|
|
221
|
+
separately on each):
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
mpath pca fit \
|
|
225
|
+
--nascent nascent_metrics.csv \
|
|
226
|
+
--mature mature_metrics.csv \
|
|
227
|
+
--out-dir pca_out/
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
This writes, into `pca_out/`:
|
|
231
|
+
|
|
232
|
+
- `nascent_scores.csv`, `mature_scores.csv` — input tables with `PCA1…PCAk` appended,
|
|
233
|
+
- `pca_model.npz` — the fitted transform (loadings, mean, feature columns),
|
|
234
|
+
- `coefficients.png`, `explained.png`, `scatter.png`, `scatter_histogram.png` — diagnostics.
|
|
235
|
+
|
|
236
|
+
By default the PCA uses every numeric metric column except identifiers and the
|
|
237
|
+
`num_pairs_*` counts; restrict it with `--columns smc_bin_all,uniformity_bin_all,...`.
|
|
238
|
+
Like MATLAB's `pca`, data is mean-centred and not scaled (pass `--standardize` to
|
|
239
|
+
z-score). PC signs may be flipped relative to MATLAB; the separation is unchanged.
|
|
240
|
+
|
|
241
|
+
### Downstream analysis of unlabelled data
|
|
242
|
+
|
|
243
|
+
Compute metrics for an unlabelled dataset, then project it into the PCA space that
|
|
244
|
+
was fit on the labelled data:
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
mpath pca apply \
|
|
248
|
+
--model pca_out/pca_model.npz \
|
|
249
|
+
--input unlabelled_metrics.csv \
|
|
250
|
+
--out unlabelled_scores.csv
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
## Docker
|
|
254
|
+
|
|
255
|
+
Version-pinned images (`:<version>`, plus a moving `:latest` tag) are published to GHCR for containerized pipelines:
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
docker run --rm -v "$PWD":/data ghcr.io/downinglab/mpath:latest \
|
|
259
|
+
metrics -path_calls /data/methylation_calls.tsv.gz -path_wgbs /data/wgbs.bed \
|
|
260
|
+
-path_output_csv /data/metrics.csv -p 4
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
The image contains MPATH only; run the upstream tools (dorado, modkit, samtools)
|
|
264
|
+
in their own steps. See [`examples/pipeline.md`](examples/pipeline.md) for a full
|
|
265
|
+
worked example, and [`examples/nextflow/`](examples/nextflow/) for a runnable
|
|
266
|
+
reference pipeline (modkit → mpath, version-pinned) that starts from an aligned
|
|
267
|
+
BAM produced by a standard dorado workflow.
|
|
268
|
+
|
|
269
|
+
## Development
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
git clone https://github.com/downinglab/mpath.git
|
|
273
|
+
cd mpath
|
|
274
|
+
pip install -e ".[dev]"
|
|
275
|
+
pytest # test suite
|
|
276
|
+
ruff check # lint
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
Releases are automated with [release-please](https://github.com/googleapis/release-please):
|
|
280
|
+
merging the rolling release PR bumps the version, updates `CHANGELOG.md`, tags the
|
|
281
|
+
release, and publishes to PyPI + GHCR. The multi-Python test matrix runs on that
|
|
282
|
+
release PR.
|
|
283
|
+
|
|
284
|
+
## License
|
|
285
|
+
|
|
286
|
+
MIT — see [LICENSE](LICENSE).
|