mpath-pseudotime 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mpath/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """MPATH: methylation pseudotime analysis for Oxford Nanopore long reads."""
2
+
3
+ __version__ = "0.2.0" # x-release-please-version
mpath/cli.py ADDED
@@ -0,0 +1,154 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """Command-line interface for MPATH.
4
+
5
+ Subcommands::
6
+
7
+ mpath metrics compute read-level metrics from modkit calls + WGBS bed
8
+ mpath pca fit fit a PCA on labelled nascent/mature metric tables
9
+ mpath pca apply project an unlabelled metric table into a fitted PCA space
10
+ """
11
+
12
+ import argparse
13
+
14
+ from . import __version__
15
+
16
+
17
def _add_metrics_parser(subparsers):
    """Attach the ``metrics`` subcommand to ``subparsers``.

    Declares every option of ``mpath metrics`` and stores ``_run_metrics`` as
    the handler under the ``func`` default.
    """
    metrics_parser = subparsers.add_parser(
        "metrics",
        help="compute read-level methylation metrics",
        description="Compute read-level metrics from a modkit read-calls TSV + WGBS reference, "
        "or from a pre-merged bed.",
    )
    option = metrics_parser.add_argument

    # Inputs. Single-dash long options are kept for continuity with the original
    # scripts. Either pass calls + WGBS (merged on the fly) or a pre-merged bed
    # (escape hatch).
    option("-path_calls", default=None, help="modkit read-calls TSV (optionally .gz)")
    option("-path_wgbs", default=None, help="WGBS reference bed (optionally .gz)")
    option(
        "-path_input_bed",
        default=None,
        help="pre-merged 7-col bed (chrom,start,stop,strand,read_id,meth,wgbs); skips the WGBS merge",
    )

    # Output and metric tuning.
    option("-path_output_csv", required=True, help="output wide-format metrics CSV")
    option("-wgbs_column", default=3, type=int, help="0-based column of the WGBS ratio (default 3)")
    option("-min_cpgs", default=3, type=int, help="minimum CpGs on a read to compute metrics")
    option("-bin_limits", default="0,100,1000,5000,10000", help="comma-separated distance-bin limits")
    option("-batch_size", default=int(1e8), type=int, help="approx CpGs per batch (RAM control)")
    option("-p", default=1, type=int, help="number of parallel processes")

    # Boolean switches.
    option("--use_full_matrix", action="store_true", help="use the full CpG-pair matrix")
    option("--include-hydroxy", action="store_true", help="count 5hmC ('h') calls as methylated")
    option(
        "--keep-unmatched-wgbs",
        action="store_true",
        help="keep CpGs with no WGBS entry (NaN ratio) instead of dropping them",
    )

    # The ratio scale is an explicit choice: 0-1 vs 0-100 can't be auto-detected.
    option(
        "--wgbs-scale",
        choices=["0-1", "0-100"],
        default="0-1",
        help="scale of the WGBS ratio column: 0-1 (default) or 0-100 (percentages)",
    )

    # Coordinate conventions are auto-probed unless the user forces a value.
    option(
        "--wgbs-offset",
        choices=["auto", "-1", "0", "1"],
        default="auto",
        help="global coordinate offset for the CpG<->WGBS join (default: auto-probe)",
    )
    option(
        "--wgbs-collapse",
        choices=["auto", "on", "off"],
        default="auto",
        help="map -strand CpGs to the + dyad anchor: on/off, or auto-probe (default)",
    )

    metrics_parser.set_defaults(func=_run_metrics)
68
+
69
+
70
def _run_metrics(args):
    """Handler for ``mpath metrics``: forward parsed options to ``metrics.run_metrics``."""
    # Package modules are imported inside the handler, i.e. only once the
    # subcommand actually runs.
    from . import io as mpath_io
    from . import metrics

    # --include-hydroxy adds 5hmC ('h') to the codes counted as methylated.
    if args.include_hydroxy:
        methyl_codes = ("m", "h")
    else:
        methyl_codes = mpath_io.DEFAULT_METHYL_CODES

    options = dict(
        path_calls=args.path_calls,
        path_wgbs=args.path_wgbs,
        path_input_bed=args.path_input_bed,
        path_output_csv=args.path_output_csv,
        wgbs_column=args.wgbs_column,
        min_cpgs=args.min_cpgs,
        bin_limits=args.bin_limits,
        batch_size=args.batch_size,
        processes=args.p,
        use_full_matrix=args.use_full_matrix,
        methyl_codes=methyl_codes,
        # The CLI flag is phrased as "keep", the library argument as "drop".
        drop_unmatched_wgbs=not args.keep_unmatched_wgbs,
        wgbs_scale=args.wgbs_scale,
        wgbs_offset=args.wgbs_offset,
        wgbs_collapse=args.wgbs_collapse,
    )
    metrics.run_metrics(**options)
92
+
93
+
94
def _add_pca_parser(subparsers):
    """Attach the ``pca`` subcommand and its ``fit`` / ``apply`` actions to ``subparsers``."""
    pca_parser = subparsers.add_parser("pca", help="PCA of read-level metrics")
    actions = pca_parser.add_subparsers(dest="pca_command", required=True)

    # `mpath pca fit`: learn the PCA from labelled tables.
    fit_parser = actions.add_parser("fit", help="fit a PCA on labelled nascent + mature tables")
    fit_opt = fit_parser.add_argument
    fit_opt("--nascent", required=True, help="nascent metrics CSV")
    fit_opt("--mature", required=True, help="mature metrics CSV")
    fit_opt("--out-dir", required=True, help="output directory for scores, model and figures")
    fit_opt("--columns", default=None, help="comma-separated metric columns to use (default: auto)")
    fit_opt("--standardize", action="store_true", help="z-score metrics before PCA (default: off)")
    fit_opt("--n-components", default=None, type=int, help="number of PCs to keep (default: all)")
    fit_parser.set_defaults(func=_run_pca_fit)

    # `mpath pca apply`: project new data with a previously fitted model.
    apply_parser = actions.add_parser("apply", help="project an unlabelled table into a fitted PCA space")
    apply_opt = apply_parser.add_argument
    apply_opt("--model", required=True, help="pca_model.npz produced by `mpath pca fit`")
    apply_opt("--input", required=True, help="unlabelled metrics CSV")
    apply_opt("--out", required=True, help="output CSV with appended PCA scores")
    apply_parser.set_defaults(func=_run_pca_apply)
112
+
113
+
114
def _run_pca_fit(args):
    """Handler for ``mpath pca fit``: forward parsed options to ``pca.fit``."""
    from . import pca

    # --columns is a comma-separated list; an absent/empty value is passed on
    # as None.
    selected = None
    if args.columns:
        selected = [name.strip() for name in args.columns.split(",")]

    pca.fit(
        path_nascent=args.nascent,
        path_mature=args.mature,
        out_dir=args.out_dir,
        columns=selected,
        standardize=args.standardize,
        n_components=args.n_components,
    )
126
+
127
+
128
def _run_pca_apply(args):
    """Handler for ``mpath pca apply``: forward parsed options to ``pca.apply``."""
    from . import pca

    pca.apply(
        path_model=args.model,
        path_input=args.input,
        path_output=args.out,
    )
132
+
133
+
134
def build_parser():
    """Build and return the top-level ``mpath`` argument parser with all subcommands."""
    root = argparse.ArgumentParser(prog="mpath", description="MPATH: methylation pseudotime analysis")
    root.add_argument("--version", action="version", version=f"%(prog)s {__version__}")

    # A subcommand is mandatory; each registrar adds one subcommand tree.
    commands = root.add_subparsers(dest="command", required=True)
    for register in (_add_metrics_parser, _add_pca_parser):
        register(commands)
    return root
141
+
142
+
143
def main(argv=None):
    """CLI entry point: parse ``argv`` and run the selected subcommand handler.

    ``ValueError`` and ``FileNotFoundError`` raised by the handler are reported
    through ``parser.error`` rather than surfacing as a traceback.
    """
    parser = build_parser()
    namespace = parser.parse_args(argv)
    try:
        namespace.func(namespace)
    except (ValueError, FileNotFoundError) as exc:
        # Input/usage problems get a clean argparse-style message.
        parser.error(str(exc))


if __name__ == "__main__":
    main()
mpath/io.py ADDED
@@ -0,0 +1,417 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Native merge of ``modkit extract calls`` output with a WGBS reference bed.
5
+
6
+ This replaces the old workflow of hand-building a 7-column ``input.bed`` before
7
+ running the metrics. The metrics CLI now consumes the modkit calls TSV and the
8
+ WGBS bed directly:
9
+
10
+ * the modkit calls TSV gives, per CpG call: ``read_id``, ``chrom``,
11
+ ``ref_position`` and ``call_code`` (``-`` canonical, ``m`` 5mC, ``h`` 5hmC),
12
+ * the WGBS bed gives the expected methylation ratio for each CpG of the cell
13
+ type, in a user-specified column.
14
+
15
+ The two are joined on ``(chrom, ref_position)``. modkit emits all rows for a
16
+ given read contiguously, so reads are streamed in groups with
17
+ ``itertools.groupby`` and yielded in CpG-bounded batches -- whole-genome files
18
+ never have to be held in memory at once. The WGBS reference is stored as
19
+ compact per-chromosome sorted ``int64``/``float64`` arrays and looked up with a
20
+ vectorised ``np.searchsorted`` per read.
21
+ """
22
+
23
+ import gzip
24
+ import itertools
25
+
26
+ import numpy as np
27
+ import polars as pl
28
+
29
# Header names written by `modkit extract --read-calls` (modkit >= 0.4) and the
# equivalent `modkit extract calls`. Columns are located by name from the header,
# so added or reordered columns in other modkit versions are tolerated.
COL_READ_ID = "read_id"
COL_REF_POSITION = "ref_position"
COL_CHROM = "chrom"
COL_CALL_CODE = "call_code"
COL_FAIL = "fail"
COL_REF_STRAND = "ref_strand"

# Modification codes treated as "methylated" unless the caller says otherwise.
# 5hmC (`h`) is left out by default; pass methyl_codes=("m", "h") to count it.
DEFAULT_METHYL_CODES = ("m",)
42
+
43
+
44
def load_wgbs_reference(path_wgbs, wgbs_column, chrom_column=0, position_column=1, scale="0-1"):
    """Load a WGBS bed into per-chromosome sorted (positions, ratios) arrays.

    The expected input is a simple tab-separated bed (no header), e.g. the common
    ``chrom, start, end, ratio`` 4-column form (ratio at the default
    ``wgbs_column=3``).

    Parameters
    ----------
    path_wgbs : str
        Path to the WGBS bed (optionally gzipped).
    wgbs_column : int
        0-based column index holding the expected methylation ratio.
    chrom_column, position_column : int
        0-based column indices of the chromosome and (0-based) start position.
    scale : str
        ``"0-1"`` (ratios already in [0, 1]) or ``"0-100"`` (percentages; divided
        by 100). No auto-detection -- declare the scale explicitly.

    Returns
    -------
    dict[str, tuple[np.ndarray, np.ndarray]]
        Mapping ``chrom -> (positions_sorted_int64, ratios_float64)``. Ratio
        cells that cannot be parsed as numbers are cast non-strictly and so do
        not raise; they are expected to surface as NaN in the ratio array.

    Raises
    ------
    ValueError
        If ``scale`` is not one of the two supported values.
    """
    if scale not in ("0-1", "0-100"):
        raise ValueError(f"scale must be '0-1' or '0-100', got {scale!r}")

    needed = sorted({chrom_column, position_column, wgbs_column})
    # polars reads gzip transparently and parses far faster than the csv module.
    frame = pl.read_csv(
        path_wgbs,
        separator="\t",
        has_header=False,
        columns=needed,
        infer_schema_length=0,  # read everything as str, we cast explicitly below
    )
    cols = frame.columns  # named like "column_<idx>" in selection order
    name_for = {idx: cols[i] for i, idx in enumerate(needed)}

    frame = frame.select(
        pl.col(name_for[chrom_column]).alias("chrom"),
        pl.col(name_for[position_column]).cast(pl.Int64).alias("pos"),
        pl.col(name_for[wgbs_column]).cast(pl.Float64, strict=False).alias("ratio"),
    ).sort(["chrom", "pos"])

    divisor = 100.0 if scale == "0-100" else 1.0
    reference = {}
    for chrom, sub in frame.group_by("chrom", maintain_order=True):
        # Depending on the polars version the group key is a tuple or a scalar.
        chrom = chrom[0] if isinstance(chrom, tuple) else chrom
        reference[str(chrom)] = (
            sub["pos"].to_numpy(),
            sub["ratio"].to_numpy() / divisor,
        )

    # A wrong scale silently corrupts read_wgbs_distance (and the position-only
    # intersection QC would not catch it), so flag ratios that look like percents.
    if scale == "0-1":
        # NaN-aware maximum: a single NaN ratio must not hide the warning.
        # Chromosomes without any finite ratio are skipped entirely, which also
        # avoids numpy's all-NaN RuntimeWarning.
        ref_max = max(
            (
                float(np.nanmax(ratios))
                for _, ratios in reference.values()
                if len(ratios) and not np.isnan(ratios).all()
            ),
            default=0.0,
        )
        if ref_max > 1.5:
            print(
                f"WARNING: WGBS ratios reach {ref_max:.1f} (> 1) -- they look like 0-100 "
                "percentages. Pass --wgbs-scale 0-100 if so."
            )
    return reference
108
+
109
+
110
def lookup_wgbs(reference, chrom, positions):
    """Vectorised lookup of WGBS ratios for one read's CpG positions.

    Parameters
    ----------
    reference : dict
        Mapping ``chrom -> (sorted positions, ratios)``.
    chrom : str
        Chromosome of the read (converted with ``str`` before the lookup).
    positions : array-like of int
        CpG positions to look up.

    Returns
    -------
    np.ndarray
        ``float64`` array aligned with ``positions``; entries with no matching
        CpG in the reference are ``np.nan``. An unknown chromosome, or one whose
        reference arrays are empty, yields an all-NaN result.
    """
    arrays = reference.get(str(chrom))
    if arrays is None:
        return np.full(len(positions), np.nan, dtype=np.float64)
    ref_pos, ref_ratio = arrays
    if len(ref_pos) == 0:
        # Nothing to match against; indexing an empty array below would raise.
        return np.full(len(positions), np.nan, dtype=np.float64)

    # searchsorted gives the insertion point; a position past the last reference
    # entry maps to len(ref_pos), so cap it to keep the index valid. The equality
    # test then decides whether the candidate is a real hit.
    idx = np.minimum(np.searchsorted(ref_pos, positions), len(ref_pos) - 1)
    matched = ref_pos[idx] == positions
    return np.where(matched, ref_ratio[idx], np.nan).astype(np.float64)
125
+
126
+
127
+ def _open_text(path):
128
+ """Open a possibly-gzipped text file."""
129
+ if str(path).endswith(".gz"):
130
+ return gzip.open(path, "rt")
131
+ return open(path)
132
+
133
+
134
def _resolve_columns(header_line):
    """Map modkit column names to their indices using the header line.

    Parameters
    ----------
    header_line : str
        First line of the calls file (tab-separated column names). A trailing
        ``\\n`` or ``\\r\\n`` is ignored so files with Windows line endings do
        not leave a stray carriage return on the last column name.

    Returns
    -------
    dict[str, int]
        ``column name -> 0-based index`` for every column in the header.

    Raises
    ------
    ValueError
        If any of the required columns (read id, reference position,
        chromosome, call code) is absent.
    """
    names = header_line.rstrip("\r\n").split("\t")
    index = {name: i for i, name in enumerate(names)}
    required = (COL_READ_ID, COL_REF_POSITION, COL_CHROM, COL_CALL_CODE)
    missing = [c for c in required if c not in index]
    if missing:
        raise ValueError(
            f"modkit calls file is missing required column(s) {missing}. "
            f"Found columns: {names}. Generate it with `modkit extract --read-calls`."
        )
    return index
145
+
146
+
147
def new_stats():
    """Return a fresh intersection-QC accumulator (see :func:`iter_read_batches`).

    All counters start at zero; ``reads_dropped`` counts reads with zero
    WGBS-matched CpGs and ``calls_chroms`` collects the chromosomes seen.
    """
    stats = dict.fromkeys(("reads_total", "reads_dropped", "cpgs_total", "cpgs_matched"), 0)
    stats["calls_chroms"] = set()
    return stats
156
+
157
+
158
def iter_read_batches(
    path_calls,
    wgbs_reference,
    batch_size,
    methyl_codes=DEFAULT_METHYL_CODES,
    drop_unmatched_wgbs=True,
    collapse_strands=True,
    position_offset=0,
    stats=None,
):
    """Stream reads from a modkit calls file in CpG-bounded batches.

    Each yielded batch is a list of read records with the keys the metrics code
    expects: ``read_id``, ``meth_values`` (0/1), ``positions``, ``wgbs_values``.
    Rows are grouped with ``itertools.groupby`` on the read id, so all rows of a
    read must be contiguous in the file. Blank lines are skipped and both
    ``\\n`` and ``\\r\\n`` line endings are accepted.

    Parameters
    ----------
    path_calls : str
        Path to the modkit read-calls TSV (optionally gzipped).
    wgbs_reference : dict
        Output of :func:`load_wgbs_reference`.
    batch_size : int
        Approximate maximum number of CpGs per batch (reads are never split).
    methyl_codes : tuple[str, ...]
        ``call_code`` values counted as methylated (others are unmethylated).
    drop_unmatched_wgbs : bool
        If True (default), drop CpGs with no WGBS reference entry (inner join);
        otherwise keep them with a NaN WGBS value.
    collapse_strands : bool
        If True (default) and a ``ref_strand`` column is present, a cytosine on
        the reference ``-`` strand (``ref_position`` = ``p``) is matched to the
        ``+`` strand anchor of its CpG dyad (``p - 1``). This is what lets a
        combined/``+``-strand WGBS reference match ``-`` strand read CpGs;
        without it roughly half of all CpGs silently fail to join. Distances
        between CpGs are unaffected (a uniform shift), so only the lookup uses
        the anchor.
    position_offset : int
        Global offset added to every position before the WGBS lookup (default
        0). The reported ``positions`` are not shifted.
    stats : dict, optional
        If given (see :func:`new_stats`), intersection-quality counters are
        accumulated in place for the caller to report.

    Yields
    ------
    list[dict]
        One batch of read records; a final partial batch is yielded if non-empty.
    """
    methyl_set = set(methyl_codes)
    with _open_text(path_calls) as fh:
        header = fh.readline()
        if not header:
            return
        col = _resolve_columns(header)
        i_read = col[COL_READ_ID]
        i_pos = col[COL_REF_POSITION]
        i_chrom = col[COL_CHROM]
        i_code = col[COL_CALL_CODE]
        i_fail = col.get(COL_FAIL)
        i_strand = col.get(COL_REF_STRAND)

        # Strip CR as well as LF so the last field is clean on CRLF files, and
        # skip empty lines (e.g. a trailing blank line) instead of failing on
        # them with an IndexError.
        stripped_lines = (line.rstrip("\r\n") for line in fh)
        rows = (line.split("\t") for line in stripped_lines if line)

        batch = []
        n_cpgs = 0
        for read_id, group in itertools.groupby(rows, key=lambda r: r[i_read]):
            meth = []
            positions = []
            chrom = None
            ref_strand = None
            for r in group:
                # Skip calls that failed modkit's confidence threshold.
                if i_fail is not None and r[i_fail].lower() == "true":
                    continue
                chrom = r[i_chrom]
                if i_strand is not None:
                    ref_strand = r[i_strand]
                positions.append(int(r[i_pos]))
                meth.append(1 if r[i_code] in methyl_set else 0)

            if not positions:
                continue

            positions = np.asarray(positions, dtype=np.int64)
            meth = np.asarray(meth, dtype=np.float64)

            # Compute the WGBS lookup anchor: a global coordinate offset
            # (0-/1-based etc., usually chosen by probe_alignment) plus, when
            # collapsing strands, the -1 shift that maps a -strand C onto its
            # + strand dyad anchor. A read aligns to one strand, so ref_strand is
            # constant. Distances between CpGs are unaffected (uniform shift).
            anchors = positions + position_offset
            if collapse_strands and ref_strand == "-":
                anchors = anchors - 1
            wgbs = lookup_wgbs(wgbs_reference, chrom, anchors)

            # QC counters are taken before any unmatched CpGs are dropped.
            if stats is not None:
                stats["reads_total"] += 1
                stats["cpgs_total"] += positions.size
                matched = int(np.count_nonzero(~np.isnan(wgbs)))
                stats["cpgs_matched"] += matched
                stats["calls_chroms"].add(chrom)
                if matched == 0:
                    stats["reads_dropped"] += 1

            if drop_unmatched_wgbs:
                keep = ~np.isnan(wgbs)
                positions = positions[keep]
                meth = meth[keep]
                wgbs = wgbs[keep]
                if positions.size == 0:
                    continue

            batch.append(
                {
                    "read_id": read_id,
                    "meth_values": meth,
                    "positions": positions,
                    "wgbs_values": wgbs,
                }
            )
            n_cpgs += positions.size

            # Flush once the CpG budget is reached; the read that crossed the
            # threshold stays in the batch (reads are never split).
            if n_cpgs >= batch_size:
                yield batch
                batch = []
                n_cpgs = 0

        if batch:
            yield batch
280
+
281
+
282
def report_intersection(stats, wgbs_reference, warn_threshold=0.5):
    """Print the WGBS-intersection QC summary; warn (never fail) on poor overlap.

    ``stats`` holds the counters accumulated while streaming reads and
    ``wgbs_reference`` is only used for its chromosome names. When the matched
    fraction is below ``warn_threshold`` a list of likely causes is printed.

    Returns the matched fraction in [0, 1] (0.0 if no CpGs were seen).
    """
    n_total = stats["cpgs_total"]
    n_matched = stats["cpgs_matched"]
    if n_total:
        frac = n_matched / n_total
    else:
        frac = 0.0

    summary = (
        f"WGBS intersection: matched {n_matched}/{n_total} CpGs ({frac:.1%}); "
        f"{stats['reads_dropped']}/{stats['reads_total']} reads had no WGBS overlap."
    )
    print(summary)

    # Nothing more to say when no CpGs were seen or the overlap is acceptable.
    poor_overlap = bool(n_total) and frac < warn_threshold
    if not poor_overlap:
        return frac

    calls_chroms = stats["calls_chroms"]
    ref_chroms = set(wgbs_reference)
    print(f"WARNING: only {frac:.1%} of CpGs matched the WGBS reference. Likely causes:")
    # Only blame chromosome naming when the two name sets share nothing at all.
    if calls_chroms and calls_chroms.isdisjoint(ref_chroms):
        print(
            f" - chromosome naming mismatch: calls use e.g. {sorted(calls_chroms)[:3]} "
            f"but the WGBS reference has e.g. {sorted(ref_chroms)[:3]}"
        )
    for hint in (
        " - wrong ratio column or scale: check -wgbs_column / --wgbs-scale",
        " - genome build / coordinate-system difference between calls and WGBS",
        " - if you forced --wgbs-offset / --wgbs-collapse, try 'auto'",
    ):
        print(hint)
    return frac
307
+
308
+
309
# Global coordinate offsets tried by the alignment probe, covering 0-/1-based
# and off-by-one dyad/strand-anchor conventions. 0 is listed first so that a tie
# between candidates resolves to "no shift".
CANDIDATE_OFFSETS = (0, -1, 1)
312
+
313
+
314
def probe_alignment(
    path_calls,
    wgbs_reference,
    offsets=CANDIDATE_OFFSETS,
    collapse_options=(True, False),
    sample_size=20000,
):
    """Empirically detect the coordinate convention of a WGBS reference.

    WGBS beds vary in convention (0- vs 1-based, which base of the CpG dyad/strand
    they anchor on) and you usually can't tell from the file. So rather than
    requiring the user to know, we read a sample of CpGs from the head of the
    calls file and measure the match rate for each ``(offset, collapse_strands)``
    combination -- the correct convention reveals itself as a sharp jump in the
    matched fraction. Self-verifying, unlike the ratio scale, which is why this is
    probed while ``--wgbs-scale`` stays an explicit flag.

    Parameters
    ----------
    path_calls : str
        Path to the modkit read-calls TSV (optionally gzipped).
    wgbs_reference : dict
        Mapping ``chrom -> (sorted positions, ratios)`` used for the lookups.
    offsets : iterable of int
        Candidate global offsets added to every sampled position.
    collapse_options : iterable of bool
        Whether to additionally shift ``-`` strand positions by -1.
    sample_size : int
        Number of data rows read from the head of the file. Blank lines are
        skipped and do not count towards the sample.

    Returns
    -------
    list[tuple[int, bool, float]]
        ``(offset, collapse_strands, matched_fraction)`` sorted best-first;
        candidates with equal fractions keep their trial order. An empty list
        is returned for a file without a header line.
    """
    sample = {}  # chrom -> ([positions], [is_minus])
    n = 0
    with _open_text(path_calls) as fh:
        header = fh.readline()
        if not header:
            return []
        col = _resolve_columns(header)
        i_pos = col[COL_REF_POSITION]
        i_chrom = col[COL_CHROM]
        i_strand = col.get(COL_REF_STRAND)
        for line in fh:
            # Strip CR too so a trailing strand column compares cleanly on CRLF
            # files, and ignore blank lines instead of raising IndexError.
            line = line.rstrip("\r\n")
            if not line:
                continue
            r = line.split("\t")
            chrom = r[i_chrom]
            pos_list, minus_list = sample.setdefault(chrom, ([], []))
            pos_list.append(int(r[i_pos]))
            minus_list.append(i_strand is not None and r[i_strand] == "-")
            n += 1
            if n >= sample_size:
                break

    sample = {
        chrom: (np.asarray(p, dtype=np.int64), np.asarray(m, dtype=bool))
        for chrom, (p, m) in sample.items()
    }

    results = []
    for offset in offsets:
        for collapse in collapse_options:
            matched = total = 0
            for chrom, (positions, minus) in sample.items():
                anchors = positions + offset
                if collapse:
                    # Subtract 1 only where the row is on the - strand.
                    anchors = anchors - minus.astype(np.int64)
                w = lookup_wgbs(wgbs_reference, chrom, anchors)
                matched += int(np.count_nonzero(~np.isnan(w)))
                total += positions.size
            results.append((offset, collapse, matched / total if total else 0.0))
    # list.sort is stable even with reverse=True, so ties keep candidate order.
    results.sort(key=lambda t: t[2], reverse=True)
    return results
375
+
376
+
377
# 0-based column indices into a pre-merged 7-column bed (the escape hatch /
# legacy format), whose layout is:
# chrom, start, stop, strand, read_id, methylation(0/1), wgbs_ratio.
PREMERGED_READ_ID_COLUMN = 4
PREMERGED_POSITION_COLUMN = 1
PREMERGED_METH_COLUMN = 5
PREMERGED_WGBS_COLUMN = 6
383
+
384
+
385
def iter_premerged_batches(path_bed, batch_size):
    """Stream a pre-merged 7-column bed (chrom, start, stop, strand, read_id,
    methylation, wgbs) in CpG-bounded batches.

    This is the escape hatch for users who do the CpG<->WGBS intersection
    themselves (e.g. with bedtools in a pipeline): MPATH does no merge/probe and
    just computes metrics. Rows must be grouped by ``read_id`` (no header).
    Blank lines are skipped.

    Parameters
    ----------
    path_bed : str
        Path to the pre-merged bed (optionally gzipped).
    batch_size : int
        Approximate maximum number of CpGs per batch (reads are never split).

    Yields
    ------
    list[dict]
        Read records with keys ``read_id``, ``meth_values``, ``positions`` and
        ``wgbs_values``; a final partial batch is yielded if non-empty.
    """
    with _open_text(path_bed) as fh:
        # Drop line endings and ignore empty lines (e.g. a trailing blank line),
        # which would otherwise raise IndexError when picking the read-id column.
        stripped_lines = (line.rstrip("\r\n") for line in fh)
        rows = (line.split("\t") for line in stripped_lines if line)
        batch = []
        n_cpgs = 0
        for read_id, group in itertools.groupby(rows, key=lambda r: r[PREMERGED_READ_ID_COLUMN]):
            positions, meth, wgbs = [], [], []
            for r in group:
                positions.append(int(r[PREMERGED_POSITION_COLUMN]))
                meth.append(float(r[PREMERGED_METH_COLUMN]))
                wgbs.append(float(r[PREMERGED_WGBS_COLUMN]))
            batch.append(
                {
                    "read_id": read_id,
                    "meth_values": np.asarray(meth, dtype=np.float64),
                    "positions": np.asarray(positions, dtype=np.int64),
                    "wgbs_values": np.asarray(wgbs, dtype=np.float64),
                }
            )
            n_cpgs += len(positions)
            # Flush once the CpG budget is reached; the current read stays whole.
            if n_cpgs >= batch_size:
                yield batch
                batch = []
                n_cpgs = 0
        if batch:
            yield batch