methdb 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdb/__init__.py +1 -0
- mdb/build_methyl_mats.py +409 -0
- mdb/create.py +84 -0
- mdb/index.py +107 -0
- mdb/main.py +27 -0
- mdb/merge.py +252 -0
- mdb/ont_bed_parsing.py +328 -0
- mdb/pacbio_bed_parsing.py +241 -0
- mdb/parse_args.py +68 -0
- mdb/pca.py +440 -0
- mdb/strand.py +3 -0
- methdb-0.0.2.dist-info/METADATA +63 -0
- methdb-0.0.2.dist-info/RECORD +17 -0
- methdb-0.0.2.dist-info/WHEEL +5 -0
- methdb-0.0.2.dist-info/entry_points.txt +2 -0
- methdb-0.0.2.dist-info/licenses/LICENSE +21 -0
- methdb-0.0.2.dist-info/top_level.txt +1 -0
mdb/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "v0.0.2"
|
mdb/build_methyl_mats.py
ADDED
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Build CpG-by-sample matrices for 5mC and 5hmC ONLY (no M_sum),
|
|
4
|
+
using a pre-built CpG index (NPZ) and methylation bed-like files.
|
|
5
|
+
|
|
6
|
+
Supports two input formats via --platform:
|
|
7
|
+
- ont (default): modkit pileup BED (no header; 18 columns; 'm' and 'h' rows)
|
|
8
|
+
- pacbio: pb-cpg-tools cpg_scores TSV (has header lines starting with ## and a # header row;
|
|
9
|
+
columns include: chrom, begin, end, ... discretized_mod_score)
|
|
10
|
+
|
|
11
|
+
Outputs:
|
|
12
|
+
<out_prefix>.5mC.npy float32 (fraction 0..1 unless --keep-percent)
|
|
13
|
+
<out_prefix>.5hmC.npy float32 (ONT only; for PacBio will remain NaN)
|
|
14
|
+
|
|
15
|
+
Notes:
|
|
16
|
+
- Full-genome RAM allocation can be huge; for 400 samples this likely won't fit in 50 GB.
|
|
17
|
+
Use chromosome-sharded or memmap approach if needed.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import os
|
|
21
|
+
import sys
|
|
22
|
+
import time
|
|
23
|
+
import argparse
|
|
24
|
+
import logging
|
|
25
|
+
from typing import Dict, Tuple, List, Optional, Callable
|
|
26
|
+
|
|
27
|
+
import numpy as np
|
|
28
|
+
import polars as pl
|
|
29
|
+
from tqdm import tqdm
|
|
30
|
+
|
|
31
|
+
# ---------- ONT (modkit pileup bed, has_header=False => column_1..)
|
|
32
|
+
ONT_COL_CHROM = "column_1"
|
|
33
|
+
ONT_COL_START = "column_2"
|
|
34
|
+
ONT_COL_CODE = "column_4" # "m" or "h"
|
|
35
|
+
ONT_COL_PCT = "column_11" # percent modified
|
|
36
|
+
ONT_COL_COV = "column_10" # Nvalid_cov
|
|
37
|
+
|
|
38
|
+
# ---------- PacBio (pb-cpg-tools cpg_scores, tab-separated with comments + header)
|
|
39
|
+
# Columns (example):
|
|
40
|
+
# chrom begin end mod_score type cov est_mod_count est_unmod_count discretized_mod_score
|
|
41
|
+
PB_COL_CHROM = "chrom"
|
|
42
|
+
PB_COL_START = "begin" # 0-based start
|
|
43
|
+
PB_COL_COV = "cov"
|
|
44
|
+
PB_COL_DMS = "discretized_mod_score" # percent 0..100
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def setup_logging(log_path: Optional[str], verbose: bool = False) -> logging.Logger:
    """Configure and return the ``build_5mc_5hmc`` logger.

    A stderr handler is always attached (DEBUG when ``verbose`` is true,
    INFO otherwise). When ``log_path`` is given, its parent directory is
    created if needed and a DEBUG-level file handler is attached as well.

    Args:
        log_path: Path of the log file, or ``None``/empty for console only.
        verbose: Emit DEBUG records on the console instead of INFO and above.

    Returns:
        The configured logger.
    """
    logger = logging.getLogger("build_5mc_5hmc")
    logger.setLevel(logging.DEBUG)

    # logging.getLogger() hands back the same object on every call, so handlers
    # left over from an earlier setup must be dropped; otherwise each repeated
    # call would make every record appear one more time.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    fmt = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")

    sh = logging.StreamHandler(sys.stderr)
    sh.setLevel(logging.DEBUG if verbose else logging.INFO)
    sh.setFormatter(fmt)
    logger.addHandler(sh)

    if log_path:
        # abspath() guarantees a non-empty dirname even for a bare file name.
        os.makedirs(os.path.dirname(os.path.abspath(log_path)), exist_ok=True)
        fh = logging.FileHandler(log_path)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(fmt)
        logger.addHandler(fh)

    return logger
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def load_index(index_npz: str, logger: logging.Logger):
    """Load a CpG index NPZ and derive per-chromosome row ranges.

    The archive must contain the arrays ``chroms``, ``chrom_offsets`` and
    ``pos0``. Chromosome ``i`` owns the rows from ``chrom_offsets[i]`` up to
    ``chrom_offsets[i + 1]``, or up to the total number of CpGs when there is
    no following offset.

    Args:
        index_npz: Path to the index archive.
        logger: Logger that receives a short summary of the loaded index.

    Returns:
        ``(chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms)`` where
        ``chrom_slices`` maps chromosome name to ``(start_row, end_row)`` and
        ``allowed_chroms`` lists the chromosome names in index order.
    """
    # NOTE(review): allow_pickle=True is needed for an object-dtype `chroms`
    # array, but unpickling can execute arbitrary code; only load index files
    # from a trusted source.
    # The context manager closes the underlying archive once the arrays have
    # been read into memory.
    with np.load(index_npz, allow_pickle=True) as idx:
        chroms = idx["chroms"]
        chrom_offsets = idx["chrom_offsets"]
        pos0 = idx["pos0"]

    n_cpg = int(pos0.shape[0])
    n_offsets = len(chrom_offsets)

    chrom_slices: Dict[str, Tuple[int, int]] = {}
    for i, c in enumerate(chroms):
        start = int(chrom_offsets[i])
        end = int(chrom_offsets[i + 1]) if i + 1 < n_offsets else n_cpg
        chrom_slices[str(c)] = (start, end)

    # Dict keys keep first-insertion order, i.e. the order of `chroms`.
    allowed_chroms = list(chrom_slices)

    logger.info(f"Loaded index: {index_npz}")
    logger.info(f"Index CpGs: {n_cpg:,}")
    logger.info(f"Index chroms ({len(chroms)}): {allowed_chroms}")

    return chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def read_beds_list(beds_list_path: str) -> Tuple[List[str], List[str]]:
    """Parse a list file into parallel lists of sample names and file paths.

    Blank lines and lines starting with ``#`` are ignored. Each remaining line
    is either a single path (the sample name is then the path's base name) or
    tab-separated ``sample<TAB>path``; any further fields are ignored.

    Returns:
        ``(sample_names, file_paths)``, both in file order.
    """
    entries: List[Tuple[str, str]] = []
    with open(beds_list_path, "r") as handle:
        for raw in handle:
            record = raw.strip()
            if record == "" or record.startswith("#"):
                continue
            fields = record.split("\t")
            if len(fields) > 1:
                entries.append((fields[0], fields[1]))
            else:
                only_path = fields[0]
                entries.append((os.path.basename(only_path), only_path))

    sample_names = [sample for sample, _ in entries]
    file_paths = [location for _, location in entries]
    return sample_names, file_paths
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def map_positions_to_rows(
    chrom: str,
    starts_np: np.ndarray,
    *,
    pos0: np.ndarray,
    chrom_slices: Dict[str, Tuple[int, int]],
) -> np.ndarray:
    """Map 0-based CpG start positions on ``chrom`` to global row indices.

    Args:
        chrom: Chromosome name to look up in ``chrom_slices``.
        starts_np: 1-D array of 0-based start positions on ``chrom``.
        pos0: Global array of indexed CpG positions; the slice belonging to
            one chromosome must be sorted in increasing order.
        chrom_slices: Chromosome name -> ``(start_row, end_row)`` into ``pos0``.

    Returns:
        int64 array with one entry per start: the global row of the matching
        CpG, or -1 when the chromosome or the position is not in the index.
    """
    out = np.full(starts_np.shape[0], -1, dtype=np.int64)
    if chrom not in chrom_slices:
        return out

    s, e = chrom_slices[chrom]
    chrom_pos = pos0[s:e]  # sorted increasing within chrom
    n_pos = chrom_pos.shape[0]
    if n_pos == 0:
        # Nothing indexed on this chromosome, so nothing can match.
        return out

    j = np.searchsorted(chrom_pos, starts_np, side="left")
    # searchsorted returns n_pos for starts beyond the last indexed position.
    # Indexing chrom_pos with that value would raise IndexError, so such
    # entries are redirected to slot 0 for the lookup and masked out below.
    in_range = j < n_pos
    safe_j = np.where(in_range, j, 0)
    ok = in_range & (chrom_pos[safe_j] == starts_np)

    out[ok] = s + j[ok]
    return out
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# -------------------- Platform-specific fill functions --------------------
|
|
139
|
+
|
|
140
|
+
def fill_one_sample_ont(
    bed_path: str,
    col_idx: int,
    M_5mC: np.ndarray,
    M_5hmC: np.ndarray,
    *,
    pos0: np.ndarray,
    chrom_slices: Dict[str, Tuple[int, int]],
    allowed_chroms: List[str],
    min_cov: int,
    keep_percent: bool,
    logger: logging.Logger,
) -> Dict[str, int]:
    """Fill column ``col_idx`` of both matrices from one headerless ONT BED file.

    The file is scanned as tab-separated text without a header. Rows are kept
    when their chromosome is in ``allowed_chroms`` and, if ``min_cov`` is
    positive, when the coverage column is at least ``min_cov``. Rows whose
    code column is ``"m"`` are written to ``M_5mC`` and rows whose code is
    ``"h"`` to ``M_5hmC``; the stored value is the percent column, divided by
    100 unless ``keep_percent`` is set. Positions absent from the index are
    skipped.

    Returns:
        Counters: ``rows_after_filters``, ``rows_m``, ``rows_h``,
        ``mapped_m`` and ``mapped_h``.
    """
    started = time.time()

    wanted_cols = [ONT_COL_CHROM, ONT_COL_START, ONT_COL_CODE, ONT_COL_PCT, ONT_COL_COV]
    query = pl.scan_csv(bed_path, separator="\t", has_header=False).select(wanted_cols)
    query = query.filter(pl.col(ONT_COL_CHROM).is_in(allowed_chroms))
    if min_cov > 0:
        query = query.filter(pl.col(ONT_COL_COV) >= min_cov)
    table = query.collect(engine="streaming")

    stats = {
        "rows_after_filters": int(table.height),
        "rows_m": 0,
        "rows_h": 0,
        "mapped_m": 0,
        "mapped_h": 0,
    }
    if table.height == 0:
        logger.warning(f"[{col_idx}] {os.path.basename(bed_path)}: no rows after filters")
        return stats

    # Stored value: raw percent, or the fraction obtained by dividing by 100.
    percent = pl.col(ONT_COL_PCT)
    value_expr = percent if keep_percent else percent / 100.0
    table = table.with_columns(value_expr.cast(pl.Float32).alias("val"))

    # Split by modification code before any matrix is touched.
    kept = [ONT_COL_CHROM, ONT_COL_START, "val"]
    per_code = {
        code: table.filter(pl.col(ONT_COL_CODE) == code).select(kept)
        for code in ("m", "h")
    }
    for code, subset in per_code.items():
        stats[f"rows_{code}"] = int(subset.height)

    def _scatter(subset, target):
        """Write one code's values into `target` and return how many rows mapped."""
        hits = 0
        if subset.height != 0:
            for key, group in subset.group_by(ONT_COL_CHROM, maintain_order=True):
                positions = group[ONT_COL_START].to_numpy().astype(np.int32)
                rows = map_positions_to_rows(
                    key[0], positions, pos0=pos0, chrom_slices=chrom_slices
                )
                found = rows != -1
                if not found.any():
                    continue
                values = group["val"].to_numpy().astype(np.float32)
                target[rows[found], col_idx] = values[found]
                hits += int(found.sum())
        return hits

    mapped_m = _scatter(per_code["m"], M_5mC)
    stats["mapped_m"] = mapped_m
    mapped_h = _scatter(per_code["h"], M_5hmC)
    stats["mapped_h"] = mapped_h

    logger.info(
        f"[{col_idx}] {os.path.basename(bed_path)} "
        f"rows={stats['rows_after_filters']:,} "
        f"m={stats['rows_m']:,}(mapped {mapped_m:,}) "
        f"h={stats['rows_h']:,}(mapped {mapped_h:,}) "
        f"min_cov={min_cov}"
    )
    logger.debug(f"[{col_idx}] done in {time.time()-started:.1f}s")
    return stats
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def fill_one_sample_pacbio(
    bed_path: str,
    col_idx: int,
    M_5mC: np.ndarray,
    M_5hmC: np.ndarray,
    *,
    pos0: np.ndarray,
    chrom_slices: Dict[str, Tuple[int, int]],
    allowed_chroms: List[str],
    min_cov: int,
    keep_percent: bool,
    logger: logging.Logger,
) -> Dict[str, int]:
    """Fill column ``col_idx`` of ``M_5mC`` from one PacBio ``cpg_scores`` TSV.

    The file is scanned as tab-separated text with a header; lines starting
    with ``##`` are skipped as comments and a ``#chrom`` header column is
    renamed to ``chrom``. Rows are kept when their chromosome is in
    ``allowed_chroms`` and, if ``min_cov`` is positive, when ``cov`` is at
    least ``min_cov``. The stored value is ``discretized_mod_score``, divided
    by 100 unless ``keep_percent`` is set. ``M_5hmC`` is accepted only for
    signature parity with the ONT reader and is never written.

    Returns:
        Counters: ``rows_after_filters``, ``mapped_m`` and ``mapped_h``
        (``mapped_h`` is always 0 here).

    Raises:
        ValueError: If a required column is missing from the header.
    """
    t0 = time.time()

    lf = pl.scan_csv(
        bed_path,
        separator="\t",
        has_header=True,
        comment_prefix="##",
    )

    # Resolve the schema once. Reading `LazyFrame.columns` repeatedly
    # re-resolves it on every access and triggers a performance warning on
    # current polars releases.
    columns = lf.collect_schema().names()
    if "#chrom" in columns:
        # Normalise the header name so the rest of the function can use "chrom".
        lf = lf.rename({"#chrom": PB_COL_CHROM})
        columns = [PB_COL_CHROM if name == "#chrom" else name for name in columns]

    needed = [PB_COL_CHROM, PB_COL_START, PB_COL_DMS, PB_COL_COV]
    missing = [c for c in needed if c not in columns]
    if missing:
        raise ValueError(
            f"PacBio file missing columns {missing}. "
            f"Found columns: {columns}"
        )

    lf = lf.select(needed).filter(pl.col(PB_COL_CHROM).is_in(allowed_chroms))
    if min_cov > 0:
        lf = lf.filter(pl.col(PB_COL_COV) >= min_cov)

    df = lf.collect(engine="streaming")

    stats = {
        "rows_after_filters": int(df.height),
        "mapped_m": 0,  # treat as 5mC
        "mapped_h": 0,  # always 0 for pacbio
    }

    if df.height == 0:
        logger.warning(f"[{col_idx}] {os.path.basename(bed_path)}: no rows after filters")
        return stats

    # Stored value: raw percent, or the fraction obtained by dividing by 100.
    score = pl.col(PB_COL_DMS)
    value_expr = score if keep_percent else score / 100.0
    df = df.with_columns(value_expr.cast(pl.Float32).alias("val"))

    mapped = 0
    for key, sub in df.group_by(PB_COL_CHROM, maintain_order=True):
        chrom = key[0]  # group keys arrive as a tuple
        starts = sub[PB_COL_START].to_numpy().astype(np.int32)
        rows = map_positions_to_rows(chrom, starts, pos0=pos0, chrom_slices=chrom_slices)
        ok = rows != -1
        if ok.any():
            vals = sub["val"].to_numpy().astype(np.float32)
            M_5mC[rows[ok], col_idx] = vals[ok]
            mapped += int(ok.sum())

    stats["mapped_m"] = mapped

    logger.info(
        f"[{col_idx}] {os.path.basename(bed_path)} "
        f"rows={stats['rows_after_filters']:,} "
        f"pacbio(mapped {mapped:,}) "
        f"min_cov={min_cov} (5hmC=NaN)"
    )
    logger.debug(f"[{col_idx}] done in {time.time()-t0:.1f}s")
    return stats
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
# -------------------- main --------------------
|
|
317
|
+
|
|
318
|
+
def main(argv=None):
    """Command-line entry point: build the 5mC (and, for ONT, 5hmC) matrices.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which is the behaviour of
            the previous zero-argument signature.

    Writes ``<out_prefix>.5mC.npy`` always and ``<out_prefix>.5hmC.npy`` only
    for the ``ont`` platform. Exits with status 1 when the list file yields
    no paths. Files that do not exist are skipped with a warning and leave
    their column as NaN.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--index", required=True, help="CpG index NPZ")
    ap.add_argument("--beds-list", required=True, help="file list (path or sample<TAB>path)")
    ap.add_argument("--out-prefix", required=True, help="Output prefix for .npy files")
    ap.add_argument("--min-cov", type=int, default=0, help="Minimum coverage filter (Nvalid_cov or cov)")
    ap.add_argument("--keep-percent", action="store_true", help="Store 0..100 instead of 0..1")
    ap.add_argument(
        "--platform",
        choices=["ont", "pacbio"],
        default="ont",
        help="Input format: ont(modkit) or pacbio(pb-cpg-tools cpg_scores)"
    )
    ap.add_argument("--log", default=None, help="Log file path")
    ap.add_argument("--verbose", action="store_true", help="Verbose console logs")
    args = ap.parse_args(argv)

    logger = setup_logging(args.log, verbose=args.verbose)
    logger.info(f"polars {pl.__version__}")
    logger.info(f"platform={args.platform}")

    chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms = load_index(args.index, logger)
    # Sample names are parsed but not used further by this script.
    _sample_names, file_paths = read_beds_list(args.beds_list)
    if not file_paths:
        logger.error("No files found in --beds-list")
        sys.exit(1)

    is_ont = args.platform == "ont"
    n_samples = len(file_paths)
    n_cpg = int(pos0.shape[0])

    logger.info(f"Samples: {n_samples}")
    logger.info(f"Allocating matrices: CpGs={n_cpg:,} x samples={n_samples}")
    logger.info(f"Value mode: {'percent(0..100)' if args.keep_percent else 'fraction(0..1)'}; min_cov={args.min_cov}")

    # Allocate RAM matrices. The PacBio reader never writes 5hmC and the
    # matrix is never saved in that mode, so a zero-row placeholder is passed
    # instead of a second full-size allocation.
    M_5mC = np.full((n_cpg, n_samples), np.nan, dtype=np.float32)
    if is_ont:
        M_5hmC = np.full((n_cpg, n_samples), np.nan, dtype=np.float32)
    else:
        M_5hmC = np.empty((0, n_samples), dtype=np.float32)

    # Choose fill function
    fill_fn: Callable[..., Dict[str, int]] = fill_one_sample_ont if is_ont else fill_one_sample_pacbio

    # Fill one matrix column per listed file
    n_missing = 0
    for j, path in enumerate(tqdm(file_paths, desc="Processing files", unit="file")):
        if not os.path.exists(path):
            logger.warning(f"[{j}] Missing file: {path}")
            n_missing += 1
            continue

        fill_fn(
            path,
            j,
            M_5mC,
            M_5hmC,
            pos0=pos0,
            chrom_slices=chrom_slices,
            allowed_chroms=allowed_chroms,
            min_cov=args.min_cov,
            keep_percent=args.keep_percent,
            logger=logger,
        )

    if n_missing:
        logger.warning(
            f"{n_missing} of {n_samples} input files were missing; their columns are all NaN"
        )

    # Save outputs (stop here; no M_sum)
    out_prefix = args.out_prefix
    os.makedirs(os.path.dirname(os.path.abspath(out_prefix)), exist_ok=True)

    # Always save 5mC
    np.save(out_prefix + ".5mC.npy", M_5mC)
    logger.info(f"Saved: {out_prefix}.5mC.npy")

    # Save 5hmC only for ONT
    if is_ont:
        np.save(out_prefix + ".5hmC.npy", M_5hmC)
        logger.info(f"Saved: {out_prefix}.5hmC.npy")

    # QC summary
    filled_5mc = int(np.isfinite(M_5mC).sum())
    logger.info(f"Filled entries: 5mC={filled_5mc:,}")

    if is_ont:
        filled_5hmc = int(np.isfinite(M_5hmC).sum())
        logger.info(f"Filled entries: 5hmC={filled_5hmc:,}")
    else:
        logger.info("PacBio mode: only 5mC matrix written (no 5hmC signal).")

    logger.info("Done.")


if __name__ == "__main__":
    main()
|
|
409
|
+
|
mdb/create.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import time
|
|
4
|
+
import logging
|
|
5
|
+
import numpy as np
|
|
6
|
+
import polars as pl
|
|
7
|
+
from typing import Dict, Tuple, List, Optional
|
|
8
|
+
from mdb.ont_bed_parsing import ont_bed_parsing
|
|
9
|
+
from mdb.pacbio_bed_parsing import pacbio_bed_parsing
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def load_index(index_npz: str):
    """Load a CpG index NPZ and derive per-chromosome row ranges.

    The archive must contain the arrays ``chroms``, ``chrom_offsets`` and
    ``pos0``. Chromosome ``i`` owns the rows from ``chrom_offsets[i]`` up to
    ``chrom_offsets[i + 1]``, or up to the total number of CpGs when there is
    no following offset.

    Args:
        index_npz: Path to the index archive.

    Returns:
        ``(chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms)`` where
        ``chrom_slices`` maps chromosome name to ``(start_row, end_row)`` and
        ``allowed_chroms`` lists the chromosome names in index order.
    """
    # NOTE(review): allow_pickle=True is needed for an object-dtype `chroms`
    # array, but unpickling can execute arbitrary code; only load index files
    # from a trusted source.
    # The context manager closes the underlying archive once the arrays have
    # been read into memory.
    with np.load(index_npz, allow_pickle=True) as idx:
        chroms = idx["chroms"]
        chrom_offsets = idx["chrom_offsets"]
        pos0 = idx["pos0"]

    n_cpg = int(pos0.shape[0])
    n_offsets = len(chrom_offsets)

    chrom_slices: Dict[str, Tuple[int, int]] = {}
    for i, c in enumerate(chroms):
        start = int(chrom_offsets[i])
        end = int(chrom_offsets[i + 1]) if i + 1 < n_offsets else n_cpg
        chrom_slices[str(c)] = (start, end)

    # Dict keys keep first-insertion order, i.e. the order of `chroms`.
    allowed_chroms = list(chrom_slices)
    return chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def create_main(args, logger=None):
    """Build the per-sample matrix directory for one ONT or PacBio input.

    Reads ``platform``, ``npz``, ``bed``, ``output`` and ``min_coverage`` from
    ``args``, loads the CpG index, creates the output directory, runs the
    platform-specific parser and saves the matrices it returns together with
    a tab-separated ``bed_map.txt``. ``logger`` is accepted but not used;
    progress is reported with ``print``.

    Raises:
        ValueError: If ``args.platform`` is neither ``"ont"`` nor ``"pacbio"``.
    """
    platform, index_npz, bed_path = args.platform, args.npz, args.bed
    output_mdb, min_cov = args.output, args.min_coverage

    if not (platform == "ont" or platform == "pacbio"):
        raise ValueError(f"Unsupported platform: {platform}, supported platforms are 'ont' and 'pacbio'")

    chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms = load_index(index_npz)
    print(f"Loaded index: {index_npz} with {len(chroms)} chromosomes and {pos0.shape[0]} CpGs")

    os.makedirs(output_mdb, exist_ok=True)

    def _dump_bed_map(mapping):
        """Write one `name<TAB>bed` line per entry to bed_map.txt."""
        with open(os.path.join(output_mdb, "bed_map.txt"), "w") as f:
            for file_name, bed_file in mapping.items():
                f.write(f"{file_name}\t{bed_file}\n")

    # Both parsers are called with the same keyword arguments.
    parser_kwargs = dict(
        input_path=bed_path,
        chrom_slices=chrom_slices,
        cpg_pos0=pos0,
        allowed_chroms=allowed_chroms,
        min_cov=min_cov,
    )

    if platform == "pacbio":
        stats, matrices, bed_map = pacbio_bed_parsing(**parser_kwargs)
        np.save(os.path.join(output_mdb, "5mC.npy"), matrices)
        _dump_bed_map(bed_map)
        print(f"Saved 5mC matrix to {os.path.join(output_mdb, '5mC.npy')}")
        return

    # platform == "ont"
    stats, matrices, bed_map = ont_bed_parsing(**parser_kwargs)
    _dump_bed_map(bed_map)

    if len(matrices) == 4:
        # Four matrices: 5mC plus/minus followed by 5hmC plus/minus.
        strand_files = ("5mC_plus.npy", "5mC_minus.npy", "5hmC_plus.npy", "5hmC_minus.npy")
        for file_name, matrix in zip(strand_files, matrices):
            np.save(os.path.join(output_mdb, file_name), matrix)
        print(f"Saved 5mC and 5hmC strand-specific matrices to {os.path.join(output_mdb, '5mC_plus.npy')}, {os.path.join(output_mdb, '5mC_minus.npy')}, {os.path.join(output_mdb, '5hmC_plus.npy')}, and {os.path.join(output_mdb, '5hmC_minus.npy')}")
    else:
        # Any other length: the first two entries are saved as 5mC and 5hmC.
        np.save(os.path.join(output_mdb, "5mC.npy"), matrices[0])
        np.save(os.path.join(output_mdb, "5hmC.npy"), matrices[1])
        print(f"Saved 5mC and 5hmC matrices to {os.path.join(output_mdb, '5mC.npy')} and {os.path.join(output_mdb, '5hmC.npy')}")
|
|
84
|
+
|
mdb/index.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from tqdm.auto import tqdm
|
|
3
|
+
import logging
|
|
4
|
+
import argparse
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def fasta_iter(path):
    """Yield ``(name, sequence)`` for every record of a plain-text FASTA file.

    ``name`` is the first whitespace-delimited token after ``>``; the sequence
    is the concatenation of the record's stripped lines, upper-cased. Lines
    that appear before the first header are discarded.
    """
    current = None
    pieces = []
    with open(path, "r") as handle:
        for raw in handle:
            if not raw.startswith(">"):
                pieces.append(raw.strip())
                continue
            # A new header closes the record collected so far, if any.
            if current is not None:
                yield current, "".join(pieces).upper()
            current = raw[1:].split()[0]
            pieces = []
    if current is not None:
        yield current, "".join(pieces).upper()
|
|
21
|
+
|
|
22
|
+
def find_cpg_pos0(seq: str) -> np.ndarray:
    """Return the 0-based positions of the ``C`` of every ``"CG"`` in ``seq``.

    The search is case-sensitive. The result is an int32 array in increasing
    order (empty when there is no match).
    """
    hits = []
    cursor = 0
    while True:
        cursor = seq.find("CG", cursor)
        if cursor < 0:
            break
        hits.append(cursor)
        cursor += 1  # resume the search just past this match's C
    return np.asarray(hits, dtype=np.int32)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def map_positions_to_rows(
    chrom: str,
    starts_np: np.ndarray,
    *,
    cpg_pos0=None,
    chrom_slices=None,
) -> np.ndarray:
    """Map 0-based CpG start positions on ``chrom`` to global row indices.

    Args:
        chrom: Chromosome name to look up in ``chrom_slices``.
        starts_np: 1-D array-like of 0-based CpG ``C`` positions.
        cpg_pos0: Global array of indexed CpG positions; the slice belonging
            to one chromosome must be sorted in increasing order.
        chrom_slices: Chromosome name -> ``(start_row, end_row)`` into
            ``cpg_pos0``.

        When either keyword is omitted, a module-level variable of the same
        name is used if one exists, which is how the function previously
        located its data.

    Returns:
        int64 array with one entry per start: the global row of the matching
        CpG, or -1 when the chromosome or the position is not in the index.

    Raises:
        ValueError: If ``starts_np`` is not one-dimensional.
        NameError: If the index data is neither passed in nor defined at
            module level.
    """
    starts_np = np.asarray(starts_np)
    if starts_np.ndim != 1:
        raise ValueError("starts_np must be a 1D array of 0-based CpG C positions.")

    # Neither name is defined at module level in this file, so a call without
    # the keywords used to fail with a bare NameError; keep that exception
    # type but say what is missing.
    if chrom_slices is None:
        chrom_slices = globals().get("chrom_slices")
    if cpg_pos0 is None:
        cpg_pos0 = globals().get("cpg_pos0")
    if chrom_slices is None or cpg_pos0 is None:
        raise NameError(
            "map_positions_to_rows needs 'cpg_pos0' and 'chrom_slices'; "
            "pass them as keyword arguments"
        )

    out = np.full(starts_np.shape[0], -1, dtype=np.int64)
    if chrom not in chrom_slices:
        return out

    s, e = chrom_slices[chrom]
    chrom_pos = cpg_pos0[s:e]  # sorted within chrom
    n_pos = chrom_pos.shape[0]
    if n_pos == 0:
        # Nothing indexed on this chromosome, so nothing can match.
        return out

    # Compare in the callers' own integer dtype: down-casting a 64-bit start
    # to the index dtype can wrap around and produce a false match.
    j = np.searchsorted(chrom_pos, starts_np, side="left")
    # searchsorted returns n_pos for starts beyond the last indexed position.
    # Indexing chrom_pos with that value would raise IndexError, so such
    # entries are redirected to slot 0 for the lookup and masked out below.
    in_range = j < n_pos
    safe_j = np.where(in_range, j, 0)
    ok = in_range & (chrom_pos[safe_j] == starts_np)

    out[ok] = s + j[ok]
    return out
|
|
55
|
+
|
|
56
|
+
def index_main(args):
    """Build the CpG index for chr1..chr22 (plus chrX/chrY on request).

    Reads ``reference`` (FASTA path), ``output`` (archive path) and ``sex``
    from ``args``. Every FASTA record whose name is one of the wanted
    chromosomes is scanned for CpG positions, in FASTA order, and the result
    is written with ``np.savez_compressed`` as the arrays ``chroms``,
    ``chrom_offsets``, ``chrom_id`` and ``pos0``.

    Raises:
        RuntimeError: If no CpG was found on any wanted chromosome.
    """
    reference = args.reference
    output = args.output
    sex = args.sex

    matrix_chroms = [f"chr{i}" for i in range(1, 23)]
    # Kept as an equality test so that exactly the values that enabled the
    # sex chromosomes before still do.
    if sex == True:  # noqa: E712
        matrix_chroms += ["chrX", "chrY"]
    wanted = set(matrix_chroms)  # O(1) membership per FASTA record

    chroms = []
    chrom_offsets = []
    cpg_pos0_all = []
    total = 0

    # The bar's total is the number of wanted chromosomes, so it is advanced
    # only when one of them is processed, not once per FASTA record.
    with tqdm(total=len(matrix_chroms), desc="Scanning chromosomes", unit="chrom") as progress:
        for chrom, seq in fasta_iter(reference):
            if chrom not in wanted:
                continue

            pos0_chr = find_cpg_pos0(seq)  # int32, sorted increasing
            chroms.append(chrom)
            chrom_offsets.append(total)  # start index into global arrays
            cpg_pos0_all.append(pos0_chr)
            total += pos0_chr.size
            progress.update(1)

    # finalize offsets as int64 numpy array
    chrom_offsets = np.asarray(chrom_offsets, dtype=np.int64)

    # Build global pos0 array
    if total == 0:
        raise RuntimeError("No CpGs found for the requested chromosomes. Check FASTA contig names.")

    cpg_pos0 = np.concatenate(cpg_pos0_all).astype(np.int32, copy=False)
    n_cpg = int(cpg_pos0.shape[0])

    # Per-row chromosome index: chromosome i is repeated once per CpG it owns.
    chrom_id = np.repeat(
        np.arange(len(cpg_pos0_all), dtype=np.int32),
        [pos0_chr.size for pos0_chr in cpg_pos0_all],
    )

    print("Total CpGs:", n_cpg)
    print("Chromosomes used:", chroms)

    np.savez_compressed(
        output,
        chroms=np.array(chroms, dtype=object),
        chrom_offsets=chrom_offsets,
        chrom_id=chrom_id,
        pos0=cpg_pos0,
    )
    print("Saved: ", output)
|
mdb/main.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import argparse, sys, os
|
|
4
|
+
|
|
5
|
+
from mdb.parse_args import parse_args
|
|
6
|
+
|
|
7
|
+
def main(argv=None):
    """Parse the command line and run the selected sub-command.

    Args:
        argv: Argument list; ``sys.argv[1:]`` is used when ``None``.

    Returns:
        Always 0. A command other than ``index``, ``create``, ``merge`` or
        ``pca`` runs nothing.
    """
    # Each runner imports its module only when called, so a sub-command does
    # not load the modules of the others.
    def _run_index(parsed):
        from mdb.index import index_main
        index_main(parsed)

    def _run_create(parsed):
        from mdb.create import create_main
        create_main(parsed)

    def _run_merge(parsed):
        from mdb.merge import merge_main
        merge_main(parsed)

    def _run_pca(parsed):
        from mdb.pca import pca_main
        pca_main(parsed)

    runners = {
        "index": _run_index,
        "create": _run_create,
        "merge": _run_merge,
        "pca": _run_pca,
    }

    args = parse_args(sys.argv[1:] if argv is None else argv)
    runner = runners.get(args.command)
    if runner is not None:
        runner(args)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
|