mpath-pseudotime 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mpath/__init__.py +3 -0
- mpath/cli.py +154 -0
- mpath/io.py +417 -0
- mpath/metrics.py +392 -0
- mpath/pca.py +249 -0
- mpath_pseudotime-0.2.0.dist-info/METADATA +286 -0
- mpath_pseudotime-0.2.0.dist-info/RECORD +10 -0
- mpath_pseudotime-0.2.0.dist-info/WHEEL +4 -0
- mpath_pseudotime-0.2.0.dist-info/entry_points.txt +2 -0
- mpath_pseudotime-0.2.0.dist-info/licenses/LICENSE +21 -0
mpath/__init__.py
ADDED
mpath/cli.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
"""Command-line interface for MPATH.
|
|
4
|
+
|
|
5
|
+
Subcommands::
|
|
6
|
+
|
|
7
|
+
mpath metrics compute read-level metrics from modkit calls + WGBS bed
|
|
8
|
+
mpath pca fit fit a PCA on labelled nascent/mature metric tables
|
|
9
|
+
mpath pca apply project an unlabelled metric table into a fitted PCA space
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
|
|
14
|
+
from . import __version__
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _add_metrics_parser(subparsers):
    """Register the ``metrics`` subcommand and its options on *subparsers*.

    Options are declared in a table and added in table order, so ``--help``
    lists them in that same order. The parsed namespace is routed to
    :func:`_run_metrics` through the ``func`` default.
    """
    metrics_parser = subparsers.add_parser(
        "metrics",
        help="compute read-level methylation metrics",
        description=(
            "Compute read-level metrics from a modkit read-calls TSV + WGBS reference, "
            "or from a pre-merged bed."
        ),
    )

    option_table = [
        # Single-dash long options are kept for continuity with the original scripts.
        # Mode A merges calls + WGBS on the fly; mode B takes a pre-merged bed.
        ("-path_calls", dict(default=None, help="modkit read-calls TSV (optionally .gz)")),
        ("-path_wgbs", dict(default=None, help="WGBS reference bed (optionally .gz)")),
        (
            "-path_input_bed",
            dict(
                default=None,
                help="pre-merged 7-col bed (chrom,start,stop,strand,read_id,meth,wgbs); skips the WGBS merge",
            ),
        ),
        ("-path_output_csv", dict(required=True, help="output wide-format metrics CSV")),
        ("-wgbs_column", dict(default=3, type=int, help="0-based column of the WGBS ratio (default 3)")),
        ("-min_cpgs", dict(default=3, type=int, help="minimum CpGs on a read to compute metrics")),
        ("-bin_limits", dict(default="0,100,1000,5000,10000", help="comma-separated distance-bin limits")),
        ("-batch_size", dict(default=int(1e8), type=int, help="approx CpGs per batch (RAM control)")),
        ("-p", dict(default=1, type=int, help="number of parallel processes")),
        ("--use_full_matrix", dict(action="store_true", help="use the full CpG-pair matrix")),
        ("--include-hydroxy", dict(action="store_true", help="count 5hmC ('h') calls as methylated")),
        (
            "--keep-unmatched-wgbs",
            dict(
                action="store_true",
                help="keep CpGs with no WGBS entry (NaN ratio) instead of dropping them",
            ),
        ),
        # The ratio scale must be stated explicitly: 0-1 vs 0-100 can't be auto-detected.
        (
            "--wgbs-scale",
            dict(
                choices=["0-1", "0-100"],
                default="0-1",
                help="scale of the WGBS ratio column: 0-1 (default) or 0-100 (percentages)",
            ),
        ),
        # Coordinate conventions are auto-probed by default; override only to force one.
        (
            "--wgbs-offset",
            dict(
                choices=["auto", "-1", "0", "1"],
                default="auto",
                help="global coordinate offset for the CpG<->WGBS join (default: auto-probe)",
            ),
        ),
        (
            "--wgbs-collapse",
            dict(
                choices=["auto", "on", "off"],
                default="auto",
                help="map -strand CpGs to the + dyad anchor: on/off, or auto-probe (default)",
            ),
        ),
    ]
    for flag, options in option_table:
        metrics_parser.add_argument(flag, **options)

    metrics_parser.set_defaults(func=_run_metrics)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _run_metrics(args):
    """Run the ``metrics`` subcommand from a parsed argparse namespace.

    The package modules are imported here rather than at module top level.
    The ``--include-hydroxy`` flag selects which call codes count as
    methylated, and ``--keep-unmatched-wgbs`` is inverted into
    ``drop_unmatched_wgbs``.
    """
    from . import io as mpath_io
    from . import metrics

    if args.include_hydroxy:
        methyl_codes = ("m", "h")
    else:
        methyl_codes = mpath_io.DEFAULT_METHYL_CODES

    run_options = dict(
        path_calls=args.path_calls,
        path_wgbs=args.path_wgbs,
        path_input_bed=args.path_input_bed,
        path_output_csv=args.path_output_csv,
        wgbs_column=args.wgbs_column,
        min_cpgs=args.min_cpgs,
        bin_limits=args.bin_limits,
        batch_size=args.batch_size,
        processes=args.p,
        use_full_matrix=args.use_full_matrix,
        methyl_codes=methyl_codes,
        drop_unmatched_wgbs=not args.keep_unmatched_wgbs,
        wgbs_scale=args.wgbs_scale,
        wgbs_offset=args.wgbs_offset,
        wgbs_collapse=args.wgbs_collapse,
    )
    metrics.run_metrics(**run_options)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _add_pca_parser(subparsers):
    """Register the ``pca`` subcommand with its ``fit`` and ``apply`` actions.

    ``fit`` is routed to :func:`_run_pca_fit` and ``apply`` to
    :func:`_run_pca_apply` through each sub-parser's ``func`` default.
    """
    pca_parser = subparsers.add_parser("pca", help="PCA of read-level metrics")
    actions = pca_parser.add_subparsers(dest="pca_command", required=True)

    # --- pca fit -----------------------------------------------------------
    fit_parser = actions.add_parser("fit", help="fit a PCA on labelled nascent + mature tables")
    required_fit_options = (
        ("--nascent", "nascent metrics CSV"),
        ("--mature", "mature metrics CSV"),
        ("--out-dir", "output directory for scores, model and figures"),
    )
    for flag, text in required_fit_options:
        fit_parser.add_argument(flag, required=True, help=text)
    fit_parser.add_argument(
        "--columns",
        default=None,
        help="comma-separated metric columns to use (default: auto)",
    )
    fit_parser.add_argument(
        "--standardize",
        action="store_true",
        help="z-score metrics before PCA (default: off)",
    )
    fit_parser.add_argument(
        "--n-components",
        default=None,
        type=int,
        help="number of PCs to keep (default: all)",
    )
    fit_parser.set_defaults(func=_run_pca_fit)

    # --- pca apply ---------------------------------------------------------
    apply_parser = actions.add_parser("apply", help="project an unlabelled table into a fitted PCA space")
    required_apply_options = (
        ("--model", "pca_model.npz produced by `mpath pca fit`"),
        ("--input", "unlabelled metrics CSV"),
        ("--out", "output CSV with appended PCA scores"),
    )
    for flag, text in required_apply_options:
        apply_parser.add_argument(flag, required=True, help=text)
    apply_parser.set_defaults(func=_run_pca_apply)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _run_pca_fit(args):
    """Run ``mpath pca fit`` from a parsed argparse namespace.

    ``--columns`` is split on commas and each name is stripped of surrounding
    whitespace. When the option is absent or empty, ``None`` is passed on.
    """
    from . import pca

    if args.columns:
        columns = [name.strip() for name in args.columns.split(",")]
    else:
        columns = None

    fit_options = dict(
        path_nascent=args.nascent,
        path_mature=args.mature,
        out_dir=args.out_dir,
        columns=columns,
        standardize=args.standardize,
        n_components=args.n_components,
    )
    pca.fit(**fit_options)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _run_pca_apply(args):
    """Run ``mpath pca apply``: hand the model, input and output paths to ``pca.apply``."""
    from . import pca

    pca.apply(
        path_model=args.model,
        path_input=args.input,
        path_output=args.out,
    )
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def build_parser():
    """Build the top-level ``mpath`` parser with ``--version`` and all subcommands.

    A subcommand is mandatory (``required=True``); its name is stored in the
    namespace as ``command``.
    """
    top = argparse.ArgumentParser(prog="mpath", description="MPATH: methylation pseudotime analysis")
    version_text = f"%(prog)s {__version__}"
    top.add_argument("--version", action="version", version=version_text)

    commands = top.add_subparsers(dest="command", required=True)
    for register in (_add_metrics_parser, _add_pca_parser):
        register(commands)
    return top
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def main(argv=None):
    """Parse *argv* and run the handler of the selected subcommand.

    ``ValueError`` and ``FileNotFoundError`` raised by the handler are treated
    as input/usage problems and reported through ``parser.error`` instead of
    surfacing as a raw traceback.
    """
    cli = build_parser()
    namespace = cli.parse_args(argv)
    try:
        namespace.func(namespace)
    except (ValueError, FileNotFoundError) as exc:
        cli.error(str(exc))
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
if __name__ == "__main__":
|
|
154
|
+
main()
|
mpath/io.py
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Native merge of ``modkit extract calls`` output with a WGBS reference bed.
|
|
5
|
+
|
|
6
|
+
This replaces the old workflow of hand-building a 7-column ``input.bed`` before
|
|
7
|
+
running the metrics. The metrics CLI now consumes the modkit calls TSV and the
|
|
8
|
+
WGBS bed directly:
|
|
9
|
+
|
|
10
|
+
* the modkit calls TSV gives, per CpG call: ``read_id``, ``chrom``,
|
|
11
|
+
``ref_position`` and ``call_code`` (``-`` canonical, ``m`` 5mC, ``h`` 5hmC),
|
|
12
|
+
* the WGBS bed gives the expected methylation ratio for each CpG of the cell
|
|
13
|
+
type, in a user-specified column.
|
|
14
|
+
|
|
15
|
+
The two are joined on ``(chrom, ref_position)``. modkit emits all rows for a
|
|
16
|
+
given read contiguously, so reads are streamed in groups with
|
|
17
|
+
``itertools.groupby`` and yielded in CpG-bounded batches -- whole-genome files
|
|
18
|
+
never have to be held in memory at once. The WGBS reference is stored as
|
|
19
|
+
compact per-chromosome sorted ``int64``/``float64`` arrays and looked up with a
|
|
20
|
+
vectorised ``np.searchsorted`` per read.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import gzip
|
|
24
|
+
import itertools
|
|
25
|
+
|
|
26
|
+
import numpy as np
|
|
27
|
+
import polars as pl
|
|
28
|
+
|
|
29
|
+
# Column names emitted by `modkit extract --read-calls` (modkit >= 0.4) and the
|
|
30
|
+
# equivalent `modkit extract calls`. We resolve them from the header so the code
|
|
31
|
+
# tolerates added/reordered columns across modkit versions.
|
|
32
|
+
COL_READ_ID = "read_id"
|
|
33
|
+
COL_REF_POSITION = "ref_position"
|
|
34
|
+
COL_CHROM = "chrom"
|
|
35
|
+
COL_CALL_CODE = "call_code"
|
|
36
|
+
COL_FAIL = "fail"
|
|
37
|
+
COL_REF_STRAND = "ref_strand"
|
|
38
|
+
|
|
39
|
+
# Default modification codes counted as "methylated". 5hmC (`h`) is excluded by
|
|
40
|
+
# default; pass methyl_codes=("m", "h") to include it.
|
|
41
|
+
DEFAULT_METHYL_CODES = ("m",)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def load_wgbs_reference(path_wgbs, wgbs_column, chrom_column=0, position_column=1, scale="0-1"):
    """Load a WGBS bed into per-chromosome sorted (positions, ratios) arrays.

    The expected input is a simple tab-separated bed (no header), e.g. the common
    ``chrom, start, end, ratio`` 4-column form (ratio at ``wgbs_column=3``).

    Parameters
    ----------
    path_wgbs : str
        Path to the WGBS bed (optionally gzipped).
    wgbs_column : int
        0-based column index holding the expected methylation ratio.
    chrom_column, position_column : int
        0-based column indices of the chromosome and (0-based) start position.
    scale : str
        ``"0-1"`` (ratios already in [0, 1]) or ``"0-100"`` (percentages; divided
        by 100). No auto-detection -- declare the scale explicitly.

    Returns
    -------
    dict[str, tuple[np.ndarray, np.ndarray]]
        Mapping ``chrom -> (positions_sorted_int64, ratios_float64)``. The ratio
        column is cast non-strictly, so values that do not parse as floats are
        expected to come back as NaN rather than raising.

    Raises
    ------
    ValueError
        If ``scale`` is not one of the two supported strings.
    """
    if scale not in ("0-1", "0-100"):
        raise ValueError(f"scale must be '0-1' or '0-100', got {scale!r}")

    needed = sorted({chrom_column, position_column, wgbs_column})
    # polars reads gzip transparently and parses far faster than the csv module.
    frame = pl.read_csv(
        path_wgbs,
        separator="\t",
        has_header=False,
        columns=needed,
        infer_schema_length=0,  # read everything as str, we cast explicitly below
    )
    cols = frame.columns  # named like "column_<idx>" in selection order
    name_for = {idx: cols[i] for i, idx in enumerate(needed)}

    frame = frame.select(
        pl.col(name_for[chrom_column]).alias("chrom"),
        pl.col(name_for[position_column]).cast(pl.Int64).alias("pos"),
        pl.col(name_for[wgbs_column]).cast(pl.Float64, strict=False).alias("ratio"),
    ).sort(["chrom", "pos"])

    divisor = 100.0 if scale == "0-100" else 1.0
    reference = {}
    for chrom, sub in frame.group_by("chrom", maintain_order=True):
        # Depending on the polars version the group key is a scalar or a 1-tuple.
        chrom = chrom[0] if isinstance(chrom, tuple) else chrom
        reference[str(chrom)] = (
            sub["pos"].to_numpy(),
            sub["ratio"].to_numpy() / divisor,
        )

    # A wrong scale silently corrupts read_wgbs_distance (and the position-only
    # intersection QC would not catch it), so flag ratios that look like percents.
    if scale == "0-1":
        # NaN-aware maximum: a plain ``.max()`` turns into NaN as soon as one
        # ratio is NaN, and ``nan > 1.5`` is False, which would silently
        # suppress the warning on exactly the messy files that need it.
        ref_max = 0.0
        for _, ratios in reference.values():
            finite = ratios[~np.isnan(ratios)]
            if finite.size:
                ref_max = max(ref_max, float(finite.max()))
        if ref_max > 1.5:
            print(
                f"WARNING: WGBS ratios reach {ref_max:.1f} (> 1) -- they look like 0-100 "
                "percentages. Pass --wgbs-scale 0-100 if so."
            )
    return reference
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def lookup_wgbs(reference, chrom, positions):
    """Vectorised lookup of WGBS ratios for one read's CpG positions.

    Parameters
    ----------
    reference : dict
        Mapping ``chrom -> (sorted_positions, ratios)`` of equal-length arrays.
    chrom : str
        Chromosome to look up; converted with ``str()`` before the dict lookup.
    positions : array-like of int
        Positions to query (any order).

    Returns
    -------
    np.ndarray
        ``float64`` array aligned with ``positions``; entries with no matching
        position in the reference (including an unknown or empty chromosome)
        are ``np.nan``.
    """
    positions = np.asarray(positions)
    arrays = reference.get(str(chrom))
    if arrays is None:
        return np.full(positions.shape, np.nan, dtype=np.float64)
    ref_pos, ref_ratio = arrays
    if len(ref_pos) == 0:
        # Nothing to match against; indexing an empty array below would raise.
        return np.full(positions.shape, np.nan, dtype=np.float64)

    idx = np.searchsorted(ref_pos, positions)
    # searchsorted returns len(ref_pos) for queries past the last entry; pull
    # those back in range so the equality test below can reject them.
    idx_clipped = np.minimum(idx, len(ref_pos) - 1)
    matched = ref_pos[idx_clipped] == positions
    return np.where(matched, ref_ratio[idx_clipped], np.nan).astype(np.float64)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _open_text(path):
    """Return a text-mode handle for *path*, decompressing when the name ends in ``.gz``."""
    is_gzipped = str(path).endswith(".gz")
    return gzip.open(path, "rt") if is_gzipped else open(path)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _resolve_columns(header_line):
    """Build a ``column name -> 0-based index`` map from a tab-separated header.

    Raises ``ValueError`` when any of the required columns (read id, reference
    position, chromosome, call code) is absent. Optional columns are simply
    present in or missing from the returned dict.
    """
    names = header_line.rstrip("\n").split("\t")

    index = {}
    for position, name in enumerate(names):
        index[name] = position

    required = (COL_READ_ID, COL_REF_POSITION, COL_CHROM, COL_CALL_CODE)
    missing = []
    for wanted in required:
        if wanted not in index:
            missing.append(wanted)

    if not missing:
        return index
    raise ValueError(
        f"modkit calls file is missing required column(s) {missing}. "
        f"Found columns: {names}. Generate it with `modkit extract --read-calls`."
    )
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def new_stats():
    """Create a zeroed intersection-QC accumulator (see :func:`iter_read_batches`).

    Holds four integer counters plus ``calls_chroms``, a fresh set per call.
    ``reads_dropped`` counts reads with zero WGBS-matched CpGs.
    """
    counter_names = ("reads_total", "reads_dropped", "cpgs_total", "cpgs_matched")
    accumulator = dict.fromkeys(counter_names, 0)
    accumulator["calls_chroms"] = set()
    return accumulator
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def iter_read_batches(
    path_calls,
    wgbs_reference,
    batch_size,
    methyl_codes=DEFAULT_METHYL_CODES,
    drop_unmatched_wgbs=True,
    collapse_strands=True,
    position_offset=0,
    stats=None,
):
    """Stream reads from a modkit calls file in CpG-bounded batches.

    Each yielded batch is a list of read records with the keys the metrics code
    expects: ``read_id``, ``meth_values`` (0/1), ``positions``, ``wgbs_values``.
    Rows are grouped with ``itertools.groupby``, so all rows of a read must be
    contiguous in the file. Blank lines are ignored.

    Parameters
    ----------
    path_calls : str
        Path to the modkit read-calls TSV (optionally gzipped).
    wgbs_reference : dict
        Output of :func:`load_wgbs_reference`.
    batch_size : int
        Approximate maximum number of CpGs per batch (reads are never split).
    methyl_codes : tuple[str, ...]
        ``call_code`` values counted as methylated (others are unmethylated).
    drop_unmatched_wgbs : bool
        If True (default), drop CpGs with no WGBS reference entry (inner join);
        otherwise keep them with a NaN WGBS value.
    collapse_strands : bool
        If True (default) and a ``ref_strand`` column is present, a cytosine on
        the reference ``-`` strand (``ref_position`` = ``p``) is matched to the
        ``+`` strand anchor of its CpG dyad (``p - 1``). This is what lets a
        combined/``+``-strand WGBS reference match ``-`` strand read CpGs;
        without it roughly half of all CpGs silently fail to join. Only the
        lookup uses the anchor; the yielded ``positions`` are the unshifted
        ``ref_position`` values.
    position_offset : int
        Global offset added to every position before the WGBS lookup (usually
        chosen by :func:`probe_alignment`). Like the strand collapse, it only
        affects the lookup, not the yielded ``positions``.
    stats : dict, optional
        If given (see :func:`new_stats`), intersection-quality counters are
        accumulated in place for the caller to report.

    Yields
    ------
    list[dict]
        One batch of read records. Reads whose calls all failed, or (with
        ``drop_unmatched_wgbs``) that have no WGBS-matched CpG, are omitted.
    """
    methyl_set = set(methyl_codes)
    with _open_text(path_calls) as fh:
        header = fh.readline()
        if not header:
            return
        col = _resolve_columns(header)
        i_read = col[COL_READ_ID]
        i_pos = col[COL_REF_POSITION]
        i_chrom = col[COL_CHROM]
        i_code = col[COL_CALL_CODE]
        i_fail = col.get(COL_FAIL)
        i_strand = col.get(COL_REF_STRAND)

        # Blank lines (typically a trailing newline) carry no call. Left in,
        # they split to [""] and the column lookups below raise IndexError.
        rows = (line.rstrip("\n").split("\t") for line in fh if line.strip())

        batch = []
        n_cpgs = 0
        for read_id, group in itertools.groupby(rows, key=lambda r: r[i_read]):
            meth = []
            positions = []
            chrom = None
            ref_strand = None
            for r in group:
                # Skip calls that failed modkit's confidence threshold.
                if i_fail is not None and r[i_fail].lower() == "true":
                    continue
                # chrom / ref_strand end up holding the values of the read's
                # last kept row; a read is assumed to sit on one chrom/strand.
                chrom = r[i_chrom]
                if i_strand is not None:
                    ref_strand = r[i_strand]
                positions.append(int(r[i_pos]))
                meth.append(1 if r[i_code] in methyl_set else 0)

            if not positions:
                continue

            positions = np.asarray(positions, dtype=np.int64)
            meth = np.asarray(meth, dtype=np.float64)

            # Compute the WGBS lookup anchor: a global coordinate offset
            # (0-/1-based etc., usually chosen by probe_alignment) plus, when
            # collapsing strands, the -1 shift that maps a -strand C onto its
            # + strand dyad anchor. Distances between CpGs are unaffected
            # (uniform shift).
            anchors = positions + position_offset
            if collapse_strands and ref_strand == "-":
                anchors = anchors - 1
            wgbs = lookup_wgbs(wgbs_reference, chrom, anchors)

            if stats is not None:
                # Counted before any dropping, so the QC reflects the raw join.
                stats["reads_total"] += 1
                stats["cpgs_total"] += positions.size
                matched = int(np.count_nonzero(~np.isnan(wgbs)))
                stats["cpgs_matched"] += matched
                stats["calls_chroms"].add(chrom)
                if matched == 0:
                    stats["reads_dropped"] += 1

            if drop_unmatched_wgbs:
                keep = ~np.isnan(wgbs)
                positions = positions[keep]
                meth = meth[keep]
                wgbs = wgbs[keep]
                if positions.size == 0:
                    continue

            batch.append(
                {
                    "read_id": read_id,
                    "meth_values": meth,
                    "positions": positions,
                    "wgbs_values": wgbs,
                }
            )
            n_cpgs += positions.size

            # Flush only between reads, so a read is never split across batches.
            if n_cpgs >= batch_size:
                yield batch
                batch = []
                n_cpgs = 0

        if batch:
            yield batch
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def report_intersection(stats, wgbs_reference, warn_threshold=0.5):
    """Print the WGBS-intersection QC summary and warn on a poor overlap.

    Never raises on a low match rate; it only prints. The chromosome-naming
    hint is shown when the calls and the reference share no chromosome name.

    Returns the matched fraction in [0, 1] (0.0 if no CpGs were seen).
    """
    n_total = stats["cpgs_total"]
    n_matched = stats["cpgs_matched"]
    if n_total:
        frac = n_matched / n_total
    else:
        frac = 0.0

    summary = (
        f"WGBS intersection: matched {n_matched}/{n_total} CpGs ({frac:.1%}); "
        f"{stats['reads_dropped']}/{stats['reads_total']} reads had no WGBS overlap."
    )
    print(summary)

    # Good enough overlap (or nothing seen at all): no warning.
    if not n_total or not frac < warn_threshold:
        return frac

    seen_in_calls = stats["calls_chroms"]
    seen_in_reference = set(wgbs_reference)
    print(f"WARNING: only {frac:.1%} of CpGs matched the WGBS reference. Likely causes:")
    no_shared_names = not (seen_in_calls & seen_in_reference)
    if seen_in_calls and no_shared_names:
        print(
            f" - chromosome naming mismatch: calls use e.g. {sorted(seen_in_calls)[:3]} "
            f"but the WGBS reference has e.g. {sorted(seen_in_reference)[:3]}"
        )
    for hint in (
        " - wrong ratio column or scale: check -wgbs_column / --wgbs-scale",
        " - genome build / coordinate-system difference between calls and WGBS",
        " - if you forced --wgbs-offset / --wgbs-collapse, try 'auto'",
    ):
        print(hint)
    return frac
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
# Candidate global coordinate offsets the aligner probe tries (0-/1-based and
|
|
310
|
+
# off-by-one dyad/strand-anchor conventions). 0 first so ties prefer no shift.
|
|
311
|
+
CANDIDATE_OFFSETS = (0, -1, 1)
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def probe_alignment(
    path_calls,
    wgbs_reference,
    offsets=CANDIDATE_OFFSETS,
    collapse_options=(True, False),
    sample_size=20000,
):
    """Empirically detect the coordinate convention of a WGBS reference.

    WGBS beds vary in convention (0- vs 1-based, which base of the CpG dyad/strand
    they anchor on) and you usually can't tell from the file. So rather than
    requiring the user to know, we read a sample of CpG calls from the head of
    the calls file and measure the match rate for each
    ``(offset, collapse_strands)`` combination -- the correct convention reveals
    itself as a sharp jump in the matched fraction.

    Parameters
    ----------
    path_calls : str
        Path to the modkit read-calls TSV (optionally gzipped).
    wgbs_reference : dict
        Output of :func:`load_wgbs_reference`.
    offsets : iterable of int
        Global coordinate offsets to try.
    collapse_options : iterable of bool
        Whether to shift ``-`` strand calls by -1 before the lookup.
    sample_size : int
        Number of (non-blank) call rows to read from the head of the file.

    Returns
    -------
    list[tuple[int, bool, float]]
        ``(offset, collapse_strands, matched_fraction)`` sorted best-first;
        ties keep the order in which the combinations were tried. Empty list
        when the file has no header line.
    """
    sample = {}  # chrom -> ([positions], [is_minus])
    n = 0
    with _open_text(path_calls) as fh:
        header = fh.readline()
        if not header:
            return []
        col = _resolve_columns(header)
        i_pos = col[COL_REF_POSITION]
        i_chrom = col[COL_CHROM]
        i_strand = col.get(COL_REF_STRAND)
        for line in fh:
            # A blank line (e.g. trailing newline in a short file) has no
            # columns; skip it instead of failing with IndexError, and do not
            # count it towards the sample.
            if not line.strip():
                continue
            r = line.rstrip("\n").split("\t")
            chrom = r[i_chrom]
            pos_list, minus_list = sample.setdefault(chrom, ([], []))
            pos_list.append(int(r[i_pos]))
            minus_list.append(i_strand is not None and r[i_strand] == "-")
            n += 1
            if n >= sample_size:
                break

    sample = {
        chrom: (np.asarray(p, dtype=np.int64), np.asarray(m, dtype=bool))
        for chrom, (p, m) in sample.items()
    }

    results = []
    for offset in offsets:
        for collapse in collapse_options:
            matched = total = 0
            for chrom, (positions, minus) in sample.items():
                anchors = positions + offset
                if collapse:
                    # Per-row strand shift: -1 for "-" strand calls, 0 otherwise.
                    anchors = anchors - minus.astype(np.int64)
                w = lookup_wgbs(wgbs_reference, chrom, anchors)
                matched += int(np.count_nonzero(~np.isnan(w)))
                total += positions.size
            results.append((offset, collapse, matched / total if total else 0.0))
    # list.sort is stable even with reverse=True, so equal fractions keep the
    # order in which the combinations were tried.
    results.sort(key=lambda t: t[2], reverse=True)
    return results
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
# Column layout of a pre-merged 7-column bed (the escape hatch / legacy format):
|
|
378
|
+
# chrom, start, stop, strand, read_id, methylation(0/1), wgbs_ratio.
|
|
379
|
+
PREMERGED_READ_ID_COLUMN = 4
|
|
380
|
+
PREMERGED_POSITION_COLUMN = 1
|
|
381
|
+
PREMERGED_METH_COLUMN = 5
|
|
382
|
+
PREMERGED_WGBS_COLUMN = 6
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def iter_premerged_batches(path_bed, batch_size):
    """Stream a pre-merged 7-column bed in CpG-bounded batches.

    Column layout: chrom, start, stop, strand, read_id, methylation, wgbs.
    This is the escape hatch for users who do the CpG<->WGBS intersection
    themselves (e.g. with bedtools in a pipeline): no merge or probe happens
    here, rows are only parsed and grouped. Rows must be grouped by
    ``read_id`` and the file must have no header. Blank lines are ignored.

    Parameters
    ----------
    path_bed : str
        Path to the pre-merged bed (optionally gzipped).
    batch_size : int
        Approximate maximum number of CpGs per batch (reads are never split).

    Yields
    ------
    list[dict]
        Read records with keys ``read_id``, ``meth_values``, ``positions`` and
        ``wgbs_values`` (the last three as numpy arrays).

    Raises
    ------
    ValueError
        If a position, methylation or WGBS field cannot be parsed as a number.
    """
    with _open_text(path_bed) as fh:
        # Hand-assembled beds often end with an empty line; left in, it would
        # split to [""] and fail with IndexError on the read_id column.
        rows = (line.rstrip("\n").split("\t") for line in fh if line.strip())
        batch = []
        n_cpgs = 0
        for read_id, group in itertools.groupby(rows, key=lambda r: r[PREMERGED_READ_ID_COLUMN]):
            positions, meth, wgbs = [], [], []
            for r in group:
                positions.append(int(r[PREMERGED_POSITION_COLUMN]))
                meth.append(float(r[PREMERGED_METH_COLUMN]))
                wgbs.append(float(r[PREMERGED_WGBS_COLUMN]))
            batch.append(
                {
                    "read_id": read_id,
                    "meth_values": np.asarray(meth, dtype=np.float64),
                    "positions": np.asarray(positions, dtype=np.int64),
                    "wgbs_values": np.asarray(wgbs, dtype=np.float64),
                }
            )
            n_cpgs += len(positions)
            # Flush only between reads, so a read is never split across batches.
            if n_cpgs >= batch_size:
                yield batch
                batch = []
                n_cpgs = 0
        if batch:
            yield batch
|