mpath-pseudotime 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. mpath_pseudotime-0.2.0/.github/workflows/ci.yml +29 -0
  2. mpath_pseudotime-0.2.0/.github/workflows/release-please.yml +45 -0
  3. mpath_pseudotime-0.2.0/.github/workflows/release.yml +115 -0
  4. mpath_pseudotime-0.2.0/.github/workflows/tests.yml +40 -0
  5. mpath_pseudotime-0.2.0/.gitignore +18 -0
  6. mpath_pseudotime-0.2.0/.pre-commit-config.yaml +9 -0
  7. mpath_pseudotime-0.2.0/.release-please-manifest.json +3 -0
  8. mpath_pseudotime-0.2.0/CHANGELOG.md +14 -0
  9. mpath_pseudotime-0.2.0/Dockerfile +25 -0
  10. mpath_pseudotime-0.2.0/LICENSE +21 -0
  11. mpath_pseudotime-0.2.0/PKG-INFO +286 -0
  12. mpath_pseudotime-0.2.0/README.md +251 -0
  13. mpath_pseudotime-0.2.0/examples/nextflow/README.md +48 -0
  14. mpath_pseudotime-0.2.0/examples/nextflow/main.nf +127 -0
  15. mpath_pseudotime-0.2.0/examples/nextflow/nextflow.config +42 -0
  16. mpath_pseudotime-0.2.0/examples/pipeline.md +144 -0
  17. mpath_pseudotime-0.2.0/matlab/NascMatur_PCA_scatterhistogram.m +232 -0
  18. mpath_pseudotime-0.2.0/matlab/README.md +16 -0
  19. mpath_pseudotime-0.2.0/pyproject.toml +67 -0
  20. mpath_pseudotime-0.2.0/release-please-config.json +12 -0
  21. mpath_pseudotime-0.2.0/src/mpath/__init__.py +3 -0
  22. mpath_pseudotime-0.2.0/src/mpath/cli.py +154 -0
  23. mpath_pseudotime-0.2.0/src/mpath/io.py +417 -0
  24. mpath_pseudotime-0.2.0/src/mpath/metrics.py +392 -0
  25. mpath_pseudotime-0.2.0/src/mpath/pca.py +249 -0
  26. mpath_pseudotime-0.2.0/tests/_make_fixtures.py +93 -0
  27. mpath_pseudotime-0.2.0/tests/conftest.py +30 -0
  28. mpath_pseudotime-0.2.0/tests/data/mature_metrics.csv +31 -0
  29. mpath_pseudotime-0.2.0/tests/data/modkit_calls.tsv +259 -0
  30. mpath_pseudotime-0.2.0/tests/data/nascent_metrics.csv +31 -0
  31. mpath_pseudotime-0.2.0/tests/data/wgbs.bed +120 -0
  32. mpath_pseudotime-0.2.0/tests/test_io.py +177 -0
  33. mpath_pseudotime-0.2.0/tests/test_metrics.py +94 -0
  34. mpath_pseudotime-0.2.0/tests/test_pca.py +66 -0
@@ -0,0 +1,29 @@
1
+ name: CI
2
+
3
+ # Fast gate on every push/PR. Only linting runs here -- the (heavier) test
4
+ # matrix lives in tests.yml and is deliberately gated to the release-please PR
5
+ # so we don't pay for it on every push.
6
+ on:
7
+ push:
8
+ branches: [main]
9
+ pull_request:
10
+ workflow_dispatch:
11
+
12
+ concurrency:
13
+ group: ci-${{ github.ref }}
14
+ cancel-in-progress: true
15
+
16
+ permissions:
17
+ contents: read
18
+
19
+ jobs:
20
+ lint:
21
+ name: ruff
22
+ runs-on: ubuntu-22.04
23
+ steps:
24
+ - uses: actions/checkout@v5
25
+ - uses: actions/setup-python@v6
26
+ with:
27
+ python-version: "3.12"
28
+ - run: pip install ruff
29
+ - run: ruff check
@@ -0,0 +1,45 @@
1
+ name: release-please
2
+
3
+ # Maintains a rolling "release PR" that bumps the version (in
4
+ # src/mpath/__init__.py, the single source of truth) and writes CHANGELOG.md from
5
+ # Conventional Commit messages (feat:, fix:, ...). Merging that PR creates the
6
+ # GitHub Release + v* tag and then *calls* release.yml directly.
7
+ #
8
+ # We call release.yml via `uses:` (workflow_call) instead of relying on the
9
+ # pushed tag: a tag pushed by the default GITHUB_TOKEN does NOT trigger other
10
+ # workflows, so the call keeps publishing reliable with no PAT.
11
+ on:
12
+ push:
13
+ branches: [main]
14
+ workflow_dispatch:
15
+
16
+ permissions:
17
+ contents: write
18
+ pull-requests: write
19
+
20
+ jobs:
21
+ release-please:
22
+ runs-on: ubuntu-22.04
23
+ outputs:
24
+ release_created: ${{ steps.rp.outputs.release_created }}
25
+ tag_name: ${{ steps.rp.outputs.tag_name }}
26
+ steps:
27
+ - uses: googleapis/release-please-action@v5
28
+ id: rp
29
+ with:
30
+ config-file: release-please-config.json
31
+ manifest-file: .release-please-manifest.json
32
+
33
+ publish:
34
+ needs: release-please
35
+ if: needs.release-please.outputs.release_created == 'true'
36
+ permissions:
37
+ contents: write # attach artifacts to the GitHub Release
38
+ id-token: write # PyPI Trusted Publishing (OIDC) + provenance attestation
39
+ attestations: write
40
+ packages: write # push the Docker image to GHCR
41
+ uses: ./.github/workflows/release.yml
42
+ with:
43
+ publish: true
44
+ tag: ${{ needs.release-please.outputs.tag_name }}
45
+ secrets: inherit
@@ -0,0 +1,115 @@
1
+ name: build & publish
2
+
3
+ # Builds the pure-Python wheel (py3-none-any) + sdist and publishes them.
4
+ # Triggered three ways:
5
+ # - workflow_call (from release-please.yml, with publish: true) -> build + publish
6
+ # - tag push vX.Y.Z (manual `git tag`) -> build + publish
7
+ # - workflow_dispatch -> build only (dry run) unless the `publish` input is enabled
8
+ on:
9
+ push:
10
+ tags: ["v*"]
11
+ workflow_dispatch:
12
+ inputs:
13
+ publish:
14
+ description: "Publish to PyPI (otherwise build-only dry run)"
15
+ type: boolean
16
+ default: false
17
+ workflow_call:
18
+ inputs:
19
+ publish:
20
+ type: boolean
21
+ default: false
22
+ tag:
23
+ description: "Tag to attach built artifacts to on the GitHub Release"
24
+ type: string
25
+ default: ""
26
+
27
+ permissions:
28
+ contents: read
29
+
30
+ jobs:
31
+ build:
32
+ name: build wheel + sdist
33
+ runs-on: ubuntu-22.04
34
+ permissions:
35
+ contents: read
36
+ id-token: write # build provenance attestation
37
+ attestations: write
38
+ steps:
39
+ - uses: actions/checkout@v5
40
+ - uses: actions/setup-python@v6
41
+ with:
42
+ python-version: "3.12"
43
+ - name: Build
44
+ run: |
45
+ pip install build
46
+ python -m build
47
+ - name: Attest build provenance
48
+ uses: actions/attest-build-provenance@v4
49
+ with:
50
+ subject-path: dist/*
51
+ - uses: actions/upload-artifact@v6
52
+ with:
53
+ name: dist
54
+ path: dist
55
+
56
+ publish:
57
+ name: publish to PyPI
58
+ needs: build
59
+ if: ${{ startsWith(github.ref, 'refs/tags/') || inputs.publish }}
60
+ runs-on: ubuntu-22.04
61
+ environment: pypi
62
+ permissions:
63
+ contents: write # attach artifacts to the GitHub Release
64
+ id-token: write # PyPI Trusted Publishing (OIDC) -- no API token/secret needed
65
+ steps:
66
+ - uses: actions/download-artifact@v7
67
+ with:
68
+ name: dist
69
+ path: dist
70
+ - name: Attach wheel + sdist to the GitHub Release
71
+ uses: softprops/action-gh-release@v3
72
+ with:
73
+ tag_name: ${{ inputs.tag || github.ref_name }}
74
+ files: dist/*
75
+ - uses: pypa/gh-action-pypi-publish@release/v1
76
+ with:
77
+ packages-dir: dist
78
+ # Disable PEP 740 attestations: this runs as a reusable workflow called
79
+ # by release-please.yml, so the attestation's signing identity wouldn't
80
+ # match the PyPI Trusted Publisher and gets rejected. Trusted-publishing
81
+ # AUTH is unaffected; GitHub provenance is still recorded in the build job.
82
+ attestations: false
83
+
84
+ docker:
85
+ name: build & push image (GHCR)
86
+ needs: [publish]
87
+ if: ${{ startsWith(github.ref, 'refs/tags/') || inputs.publish }}
88
+ runs-on: ubuntu-22.04
89
+ permissions:
90
+ contents: read
91
+ packages: write # push to GitHub Container Registry
92
+ steps:
93
+ - uses: actions/checkout@v5
94
+ - name: Resolve version from tag
95
+ id: ver
96
+ run: |
97
+ TAG="${{ inputs.tag || github.ref_name }}"
98
+ echo "version=${TAG#v}" >> "$GITHUB_OUTPUT"
99
+ - uses: docker/setup-qemu-action@v4
100
+ - uses: docker/setup-buildx-action@v4
101
+ - uses: docker/login-action@v4
102
+ with:
103
+ registry: ghcr.io
104
+ username: ${{ github.actor }}
105
+ password: ${{ secrets.GITHUB_TOKEN }}
106
+ - uses: docker/build-push-action@v7
107
+ with:
108
+ context: .
109
+ platforms: linux/amd64,linux/arm64
110
+ push: true
111
+ build-args: |
112
+ VERSION=${{ steps.ver.outputs.version }}
113
+ tags: |
114
+ ghcr.io/${{ github.repository_owner }}/mpath:${{ steps.ver.outputs.version }}
115
+ ghcr.io/${{ github.repository_owner }}/mpath:latest
@@ -0,0 +1,40 @@
1
+ name: tests
2
+
3
+ # The multi-Python test matrix runs ONLY on the release-please PR (and on manual
4
+ # dispatch), not on every push. release-please keeps an open PR from a branch
5
+ # named `release-please--branches--<base>`; gating on that head ref means the
6
+ # suite runs only when that PR is opened or updated -- i.e. ahead of a release -- instead of on every
7
+ # commit. Flip the `if:` below to broaden coverage if that ever feels too thin.
8
+ on:
9
+ pull_request:
10
+ workflow_dispatch:
11
+
12
+ permissions:
13
+ contents: read
14
+
15
+ jobs:
16
+ test:
17
+ name: py${{ matrix.python }} ${{ matrix.numpy && format('/ {0}', matrix.numpy) || '' }}
18
+ if: ${{ github.event_name == 'workflow_dispatch' || startsWith(github.head_ref, 'release-please--branches--') }}
19
+ runs-on: ubuntu-22.04
20
+ strategy:
21
+ fail-fast: false
22
+ matrix:
23
+ python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
24
+ include:
25
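+ # Proves the package runs on the older numpy 1.x line. NOTE(review): `python: "3.10"` matches an existing cell, so Actions merges this pin into the 3.10 job rather than adding an extra one.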
26
+ - { python: "3.10", numpy: "numpy==1.26.*" }
27
+ steps:
28
+ - uses: actions/checkout@v5
29
+ - uses: actions/setup-python@v6
30
+ with:
31
+ python-version: ${{ matrix.python }}
32
+ - name: Pin numpy (compat cell only)
33
+ if: ${{ matrix.numpy }}
34
+ run: pip install "${{ matrix.numpy }}"
35
+ - name: Install package + test deps
36
+ run: pip install -e ".[dev]"
37
+ - name: Show resolved versions
38
+ run: python -c "import numpy, pandas, polars; print('numpy', numpy.__version__, '| pandas', pandas.__version__, '| polars', polars.__version__)"
39
+ - name: Run tests
40
+ run: pytest
@@ -0,0 +1,18 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .venv/
9
+ venv/
10
+
11
+ # Tooling caches
12
+ .pytest_cache/
13
+ .ruff_cache/
14
+ .mypy_cache/
15
+
16
+ # Editors / OS
17
+ .ipynb_checkpoints/
18
+ .DS_Store
@@ -0,0 +1,9 @@
1
+ # Mirrors CI's fast checks so issues surface at commit time, not on push.
2
+ # Optional -- install once with `pre-commit install` (pip install pre-commit).
3
+ # Anyone who skips it is still covered by CI.
4
+ repos:
5
+ - repo: https://github.com/astral-sh/ruff-pre-commit
6
+ rev: v0.6.9
7
+ hooks:
8
+ - id: ruff
9
+ args: [--fix]
@@ -0,0 +1,3 @@
1
+ {
2
+ ".": "0.2.0"
3
+ }
@@ -0,0 +1,14 @@
1
+ # Changelog
2
+
3
+ ## [0.2.0](https://github.com/downinglab/mpath/compare/v0.1.0...v0.2.0) (2026-06-29)
4
+
5
+
6
+ ### Features
7
+
8
+ * add CI, pipeline, and standardize to Python ([eda22ee](https://github.com/downinglab/mpath/commit/eda22ee5c60d748d85cd4e759b7e57413ce0179f))
9
+
10
+ ## Changelog
11
+
12
+ All notable changes to this project are documented here. This file is maintained
13
+ automatically by [release-please](https://github.com/googleapis/release-please)
14
+ from [Conventional Commit](https://www.conventionalcommits.org/) messages.
@@ -0,0 +1,25 @@
1
+ # Reproducible, version-pinned image for running MPATH -- the artifact that
2
+ # Docker/Singularity-based pipelines (Nextflow, Snakemake, ...) consume.
3
+ #
4
+ # MPATH is a pure-Python package, so this just installs it from PyPI. The
5
+ # upstream preprocessing tools (dorado, modkit, samtools) are NOT included --
6
+ # they are large and version-sensitive; run them in their own images/steps and
7
+ # feed the resulting modkit calls + WGBS bed into `mpath metrics` here.
8
+ #
9
+ # Built and pushed to ghcr.io/<owner>/mpath:<version> by the release workflow.
10
+ # To build locally:
11
+ # docker build --build-arg VERSION=0.1.0 -t mpath:0.1.0 .
12
+ FROM python:3.12-slim
13
+
14
+ LABEL org.opencontainers.image.source="https://github.com/downinglab/mpath"
15
+ LABEL org.opencontainers.image.description="MPATH: methylation pseudotime analysis for Nanopore long reads"
16
+ LABEL org.opencontainers.image.licenses="MIT"
17
+
18
+ # Pin at build time. The release workflow passes the tag's version; an empty
19
+ # VERSION (local builds) installs the latest release from PyPI.
20
+ ARG VERSION=
21
+ RUN pip install --no-cache-dir "mpath-pseudotime${VERSION:+==${VERSION}}"
22
+
23
+ # Default to the CLI so `docker run <image> metrics ...` / `... pca ...` just work.
24
+ ENTRYPOINT ["mpath"]
25
+ CMD ["--help"]
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Downing Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,286 @@
1
+ Metadata-Version: 2.4
2
+ Name: mpath-pseudotime
3
+ Version: 0.2.0
4
+ Summary: MPATH: methylation pseudotime analysis for Oxford Nanopore long reads
5
+ Project-URL: Homepage, https://github.com/downinglab/mpath
6
+ Project-URL: Repository, https://github.com/downinglab/mpath
7
+ Project-URL: Issues, https://github.com/downinglab/mpath/issues
8
+ Author: Nandor Laszik
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: bioinformatics,epigenetics,methylation,nanopore,pseudotime
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
21
+ Requires-Python: >=3.9
22
+ Requires-Dist: joblib
23
+ Requires-Dist: matplotlib
24
+ Requires-Dist: numpy
25
+ Requires-Dist: pandas
26
+ Requires-Dist: polars
27
+ Requires-Dist: scikit-learn
28
+ Requires-Dist: scipy
29
+ Requires-Dist: tqdm
30
+ Provides-Extra: dev
31
+ Requires-Dist: build; extra == 'dev'
32
+ Requires-Dist: pytest; extra == 'dev'
33
+ Requires-Dist: ruff; extra == 'dev'
34
+ Description-Content-Type: text/markdown
35
+
36
+ # MPATH
37
+
38
+ Methylation pseudotime analysis for deciphering epigenetic cross-talk across sub-cell-cycle timescales.
39
+
40
+ MPATH calculates pseudotime for long-read methylation data from Oxford Nanopore
41
+ sequencing. The full flow of data is:
42
+
43
+ ```
44
+ Nanopore BAM (MM/ML tags)
45
+ -> modkit extract calls (per-CpG methylation calls)
46
+ -> mpath metrics (read-level metrics; merges in the WGBS reference)
47
+ -> mpath pca fit / apply (PCA separation of nascent vs mature reads)
48
+ -> pseudotime score
49
+ ```
50
+
51
+ ## Installation
52
+
53
+ ```bash
54
+ pip install mpath-pseudotime
55
+ ```
56
+
57
+ This installs the `mpath` command-line tool and the `mpath` Python package. No
58
+ MATLAB required — the PCA has been ported to Python. (The original MATLAB PCA
59
+ script is retained, unchanged, under [`matlab/`](matlab/) for reproducibility of
60
+ the published figures.)
61
+
62
+ Requires Python ≥ 3.9. Dependencies (numpy, pandas, polars, scipy, joblib, tqdm,
63
+ matplotlib, scikit-learn) are installed automatically.
64
+
65
+ ## Preliminary setup
66
+
67
+ Before MPATH, process the raw Nanopore data into per-CpG methylation calls. Exact
68
+ syntax depends on your dorado / modkit / samtools versions.
69
+
70
+ 1. **Basecall + align** with a model that emits 5mCG tags, then sort:
71
+ ```bash
72
+ dorado basecaller hac input.pod5 --modified-bases 5mCG_5hmCG > calls.bam
73
+ dorado aligner hg19.fa calls.bam > reads_aligned.bam
74
+ samtools sort reads_aligned.bam > reads_aligned_sorted.bam
75
+ ```
76
+
77
+ 2. **Remove chimeric reads** and index:
78
+ ```bash
79
+ samtools view -b -F0x900 reads_aligned_sorted.bam > reads_nonchimeric.bam
80
+ samtools index reads_nonchimeric.bam
81
+ ```
82
+
83
+ 3. **Extract read-level methylation calls** with modkit (developed against
84
+ modkit v0.4.x). The `--read-calls` output is the file MPATH consumes; it can be
85
+ large, so gzip it (MPATH reads `.gz` directly):
86
+ ```bash
87
+ modkit extract reads_nonchimeric.bam extract_full.tsv \
88
+ --read-calls calls_cpg.tsv \
89
+ --ref hg19.fa \
90
+ --include-bed CG_motifs.bed # optional: restrict to CpG-motif positions
91
+ pigz calls_cpg.tsv
92
+ ```
93
+ (On newer modkit the equivalent is `modkit extract calls reads.bam calls_cpg.tsv`.)
94
+ The output has a header row; MPATH resolves the columns it needs (`read_id`,
95
+ `chrom`, `ref_position`, `call_code`, `ref_strand`, and `fail` if present) **by
96
+ name**, so added/reordered columns across modkit versions are fine.
97
+
98
+ 4. **(Optional) BrdU filtering.** For BrdU-labeled data, filter to BrdU-positive
99
+ reads first. BrdU can be called with [DNAscent](https://github.com/MBoemo/DNAscent);
100
+ average the per-read BrdU probabilities to score each read.
101
+
102
+ 5. **Obtain a WGBS reference bed** — the expected methylation ratio for each CpG in
103
+ your cell type. You supply your own. The expected form is a simple tab-separated
104
+ bed; the common 4-column `chrom, start, end, ratio` works out of the box:
105
+ ```
106
+ chrom start end ratio
107
+ chr1 1061 1062 0.823
108
+ ```
109
+ The ratio column is selected with `-wgbs_column` (0-based; default `3`), and the
110
+ `start` must use the **same genome build** as modkit's `ref_position`. You do
111
+ **not** need to know the file's exact coordinate convention (0- vs 1-based, which
112
+ base of the CpG dyad/strand it anchors on) — MPATH **auto-probes** it (see below).
113
+ The one thing it can't infer is the ratio *scale* (`0.5` could be a ratio or a
114
+ rounded percent): ratios must be 0–1, or pass `--wgbs-scale 0-100` for percentages
115
+ (MPATH warns if your values look like percentages).
116
+
117
+ > **Why no manual `input.bed` anymore?** Earlier versions required hand-merging the
118
+ > methylation calls and the WGBS ratios into a single bed. `mpath metrics` now does
119
+ > that merge internally (joining on `chrom` + position), so you pass the modkit
120
+ > calls and the WGBS bed directly. (If you'd rather do the intersection yourself —
121
+ > e.g. with `bedtools` in a pipeline — feed a pre-merged bed via `-path_input_bed`;
122
+ > see *Pre-merged input* below.)
123
+
124
+ ### Coordinate auto-probe + intersection QC
125
+
126
+ WGBS beds vary in convention and you usually can't tell from the file. So instead
127
+ of asking you to know, `mpath metrics` reads a sample of CpGs and **measures** the
128
+ match rate for each candidate coordinate offset (`-1/0/+1`) and strand-collapse
129
+ mode, then picks the best and prints what it found:
130
+
131
+ ```
132
+ WGBS alignment probe (offset, collapse_strands -> match):
133
+ offset=+0 collapse_strands=True -> 100.0%
134
+ offset=+0 collapse_strands=False -> 53.3%
135
+ offset=-1 collapse_strands=False -> 46.7%
136
+ ...
137
+ chosen: offset=+0, collapse_strands=True (100.0%)
138
+ WGBS intersection: matched 43630/43630 CpGs (100.0%); 0/609 reads had no WGBS overlap.
139
+ ```
140
+
141
+ The correct convention reveals itself as a sharp jump in the matched fraction
142
+ (this is self-verifying, unlike the ratio scale). If even the best alignment
143
+ matches poorly, MPATH **warns** (it never aborts) and names the likely cause —
144
+ chromosome naming (`chr1` vs `1`), genome build, or the wrong ratio column/scale.
145
+ Force a specific convention with `--wgbs-offset` / `--wgbs-collapse` if you ever
146
+ need to override the probe.
147
+
148
+ ### Pre-merged input (escape hatch)
149
+
150
+ If you prefer to do the CpG↔WGBS intersection yourself (e.g. `bedtools intersect`
151
+ inside a pipeline), skip the merge entirely and pass a pre-merged 7-column bed —
152
+ `chrom, start, stop, strand, read_id, methylation(0/1), wgbs_ratio`, grouped by
153
+ `read_id`:
154
+
155
+ ```bash
156
+ mpath metrics -path_input_bed merged.bed -path_output_csv metrics.csv -p 8
157
+ ```
158
+
159
+ In this mode MPATH does no merge, probe, or QC — it just computes metrics on what
160
+ you provide. `-path_input_bed` is mutually exclusive with `-path_calls`/`-path_wgbs`.
161
+
162
+ ## Metrics: `mpath metrics`
163
+
164
+ ```bash
165
+ mpath metrics \
166
+ -path_calls calls_cpg.tsv.gz \
167
+ -path_wgbs wgbs.bed \
168
+ -wgbs_column 3 \
169
+ -path_output_csv metrics.csv \
170
+ -p 8 -min_cpgs 3 -bin_limits 0,100,1000,5000,10000 --use_full_matrix
171
+ ```
172
+
173
+ For each read, MPATH compares all CpG pairs to produce a simple matching
174
+ coefficient (SMC), a uniformity score, and a Pearson correlation — overall, per
175
+ genomic-distance bin, and for nearest-neighbour CpGs.
176
+
177
+ ### Arguments
178
+
179
+ | name | description | type | required | default |
180
+ |-----------------------|------------------------------------------------------|------|----------|-----------------------|
181
+ | `-path_calls` | modkit read-calls TSV (`.tsv` or `.tsv.gz`) | str | yes\* | — |
182
+ | `-path_wgbs` | WGBS reference bed (`.bed` or `.bed.gz`) | str | yes\* | — |
183
+ | `-path_input_bed` | pre-merged 7-col bed (alternative to calls+wgbs) | str | yes\* | — |
184
+ | `-path_output_csv` | output metrics CSV | str | yes | — |
185
+ | `-wgbs_column` | 0-based column of the WGBS ratio | int | no | 3 |
186
+ | `-min_cpgs` | minimum CpGs on a read to compute metrics | int | no | 3 |
187
+ | `-bin_limits` | comma-separated distance-bin limits | str | no | 0,100,1000,5000,10000 |
188
+ | `-batch_size` | approx CpGs per batch (controls RAM) | int | no | 1e8 |
189
+ | `-p` | number of parallel processes | int | no | 1 |
190
+ | `--use_full_matrix` | use the full CpG-pair matrix (vs one triangle) | flag | no | off |
191
+ | `--include-hydroxy` | count 5hmC (`h`) calls as methylated | flag | no | off |
192
+ | `--keep-unmatched-wgbs` | keep CpGs absent from the WGBS bed (NaN ratio) | flag | no | off |
193
+ | `--wgbs-scale` | scale of the WGBS ratio: `0-1` or `0-100` | str | no | 0-1 |
194
+ | `--wgbs-offset` | coordinate offset for the join: `auto`/`-1`/`0`/`1` | str | no | auto |
195
+ | `--wgbs-collapse` | map -strand CpGs to + dyad anchor: `auto`/`on`/`off` | str | no | auto |
196
+
197
+ \* Provide **either** `-path_calls` + `-path_wgbs` (on-the-fly merge) **or**
198
+ `-path_input_bed` (pre-merged); the two modes are mutually exclusive.
199
+
200
+ **Notes.** `-bin_limits 0,100,1000,5000,10000` defines bins
201
+ `[0,100], [100,1000], [1000,5000], [5000,10000]`. Larger `-batch_size` and more
202
+ `-p` processes use more RAM (multiplicatively); tune to your machine. By default
203
+ only CpGs present in the WGBS reference are used (inner join); pass
204
+ `--keep-unmatched-wgbs` to keep the rest. `--wgbs-offset`/`--wgbs-collapse` are
205
+ auto-probed (above) unless you force them.
206
+
207
+ ### Output
208
+
209
+ A wide CSV: one row per read, with the per-read columns `read_id`,
210
+ `read_wgbs_distance` (mean squared difference of read vs WGBS ratio),
211
+ `read_meth_ratio`, followed by `<metric>_<bin>` columns for each metric
212
+ (`num_pairs`, `smc`, `uniformity`, `pearson_r`, `pearson_p`) and each bin
213
+ (`bin_all`, `bin_0` … `bin_N`, `bin_closest`).
214
+
215
+ > Not all metrics need to feed the PCA. Different combinations may give better
216
+ > nascent/mature separation — choose them with `mpath pca fit --columns`.
217
+
218
+ ## PCA: `mpath pca`
219
+
220
+ Fit a PCA on labelled nascent + mature metric tables (run `mpath metrics`
221
+ separately on each):
222
+
223
+ ```bash
224
+ mpath pca fit \
225
+ --nascent nascent_metrics.csv \
226
+ --mature mature_metrics.csv \
227
+ --out-dir pca_out/
228
+ ```
229
+
230
+ This writes, into `pca_out/`:
231
+
232
+ - `nascent_scores.csv`, `mature_scores.csv` — input tables with `PCA1…PCAk` appended,
233
+ - `pca_model.npz` — the fitted transform (loadings, mean, feature columns),
234
+ - `coefficients.png`, `explained.png`, `scatter.png`, `scatter_histogram.png` — diagnostics.
235
+
236
+ By default the PCA uses every numeric metric column except identifiers and the
237
+ `num_pairs_*` counts; restrict it with `--columns smc_bin_all,uniformity_bin_all,...`.
238
+ Like MATLAB's `pca`, data is mean-centred and not scaled (pass `--standardize` to
239
+ z-score). PC signs may be flipped relative to MATLAB; the separation is unchanged.
240
+
241
+ ### Downstream analysis of unlabelled data
242
+
243
+ Compute metrics for an unlabelled dataset, then project it into the PCA space that
244
+ was fit on the labelled data:
245
+
246
+ ```bash
247
+ mpath pca apply \
248
+ --model pca_out/pca_model.npz \
249
+ --input unlabelled_metrics.csv \
250
+ --out unlabelled_scores.csv
251
+ ```
252
+
253
+ ## Docker
254
+
255
+ A version-pinned image is published to GHCR for containerized pipelines:
256
+
257
+ ```bash
258
+ docker run --rm -v "$PWD":/data ghcr.io/downinglab/mpath:latest \
259
+ metrics -path_calls /data/methylation_calls.tsv.gz -path_wgbs /data/wgbs.bed \
260
+ -path_output_csv /data/metrics.csv -p 4
261
+ ```
262
+
263
+ The image contains MPATH only; run the upstream tools (dorado, modkit, samtools)
264
+ in their own steps. See [`examples/pipeline.md`](examples/pipeline.md) for a full
265
+ worked example, and [`examples/nextflow/`](examples/nextflow/) for a runnable
266
+ reference pipeline (modkit → mpath, version-pinned) that starts from an aligned
267
+ BAM produced by a standard dorado workflow.
268
+
269
+ ## Development
270
+
271
+ ```bash
272
+ git clone https://github.com/downinglab/mpath.git
273
+ cd mpath
274
+ pip install -e ".[dev]"
275
+ pytest # test suite
276
+ ruff check # lint
277
+ ```
278
+
279
+ Releases are automated with [release-please](https://github.com/googleapis/release-please):
280
+ merging the rolling release PR bumps the version, updates `CHANGELOG.md`, tags the
281
+ release, and publishes to PyPI + GHCR. The multi-Python test matrix runs on that
282
+ release PR.
283
+
284
+ ## License
285
+
286
+ MIT — see [LICENSE](LICENSE).