methdb 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- methdb-0.0.2/LICENSE +21 -0
- methdb-0.0.2/PKG-INFO +63 -0
- methdb-0.0.2/README.md +37 -0
- methdb-0.0.2/mdb/__init__.py +1 -0
- methdb-0.0.2/mdb/build_methyl_mats.py +409 -0
- methdb-0.0.2/mdb/create.py +84 -0
- methdb-0.0.2/mdb/index.py +107 -0
- methdb-0.0.2/mdb/main.py +27 -0
- methdb-0.0.2/mdb/merge.py +252 -0
- methdb-0.0.2/mdb/ont_bed_parsing.py +328 -0
- methdb-0.0.2/mdb/pacbio_bed_parsing.py +241 -0
- methdb-0.0.2/mdb/parse_args.py +68 -0
- methdb-0.0.2/mdb/pca.py +440 -0
- methdb-0.0.2/mdb/strand.py +3 -0
- methdb-0.0.2/methdb.egg-info/PKG-INFO +63 -0
- methdb-0.0.2/methdb.egg-info/SOURCES.txt +38 -0
- methdb-0.0.2/methdb.egg-info/dependency_links.txt +1 -0
- methdb-0.0.2/methdb.egg-info/entry_points.txt +2 -0
- methdb-0.0.2/methdb.egg-info/requires.txt +4 -0
- methdb-0.0.2/methdb.egg-info/top_level.txt +1 -0
- methdb-0.0.2/setup.cfg +36 -0
- methdb-0.0.2/setup.py +13 -0
methdb-0.0.2/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Yilei Fu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
methdb-0.0.2/PKG-INFO
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: methdb
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: mdb: population-level DNA methylation analysis toolkit
|
|
5
|
+
Home-page: https://github.com/Fu-Yilei/mdb
|
|
6
|
+
Author: Yilei Fu
|
|
7
|
+
Author-email:
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: polars>=0.17.9
|
|
16
|
+
Requires-Dist: numpy>=2.2.0
|
|
17
|
+
Requires-Dist: plotly
|
|
18
|
+
Requires-Dist: tqdm
|
|
19
|
+
Dynamic: author
|
|
20
|
+
Dynamic: description
|
|
21
|
+
Dynamic: description-content-type
|
|
22
|
+
Dynamic: home-page
|
|
23
|
+
Dynamic: license
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
Dynamic: summary
|
|
26
|
+
|
|
27
|
+
# mdb
|
|
28
|
+
A toolkit to create DNA methylation database for cross-sample comparison
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
## Requirement
|
|
32
|
+
**Output format taken from:**
|
|
33
|
+
- Modkit >= v0.6
|
|
34
|
+
- Pb-CpG-tools >= v3.0.0
|
|
35
|
+
|
|
36
|
+
**Recommended DNA methylation pileup parameter:**
|
|
37
|
+
- Use `--combine-strand` in modkit pileup.
|
|
38
|
+
- Use `-m` function to aggregate 5mC and 5hmC for ONT's pileup if merging ONT with PacBio.
|
|
39
|
+
|
|
40
|
+
**mdb features**
|
|
41
|
+
- mdb creates a genome-wide CpG matrix to support massive computing with population-level DNA methylation signals
|
|
42
|
+
- Use `mdb index` to index reference genome
|
|
43
|
+
- Use `mdb create` to create `.mdb` files (a set of `.npy` files) for each methylBED file
|
|
44
|
+
- Use `mdb merge` to merge per sample `.mdb` database to form a CpGxSample matrix
|
|
45
|
+
- Use `mdb pca` to perform PCA on merged matrix
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
usage: mdb [-h] [-v] {index,create,merge,pca} ...
|
|
49
|
+
|
|
50
|
+
DNA methylation database builder for quick population-level analysis.
|
|
51
|
+
|
|
52
|
+
positional arguments:
|
|
53
|
+
{index,create,merge,pca}
|
|
54
|
+
index Index all CpG locations on the reference genome
|
|
55
|
+
create Create single sample-level methylation database
|
|
56
|
+
merge mdb databases from multiple samples into a single database: COMBINE STRAND and HAPLOTYPE
|
|
57
|
+
pca Perform PCA on the merged mdb database
|
|
58
|
+
|
|
59
|
+
options:
|
|
60
|
+
-h, --help show this help message and exit
|
|
61
|
+
-v, --version show program's version number and exit
|
|
62
|
+
|
|
63
|
+
Version v0.0.2
|
methdb-0.0.2/README.md
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# mdb
|
|
2
|
+
A toolkit to create DNA methylation database for cross-sample comparison
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
## Requirement
|
|
6
|
+
**Output format taken from:**
|
|
7
|
+
- Modkit >= v0.6
|
|
8
|
+
- Pb-CpG-tools >= v3.0.0
|
|
9
|
+
|
|
10
|
+
**Recommended DNA methylation pileup parameter:**
|
|
11
|
+
- Use `--combine-strand` in modkit pileup.
|
|
12
|
+
- Use `-m` function to aggregate 5mC and 5hmC for ONT's pileup if merging ONT with PacBio.
|
|
13
|
+
|
|
14
|
+
**mdb features**
|
|
15
|
+
- mdb creates a genome-wide CpG matrix to support massive computing with population-level DNA methylation signals
|
|
16
|
+
- Use `mdb index` to index reference genome
|
|
17
|
+
- Use `mdb create` to create `.mdb` files (a set of `.npy` files) for each methylBED file
|
|
18
|
+
- Use `mdb merge` to merge per sample `.mdb` database to form a CpGxSample matrix
|
|
19
|
+
- Use `mdb pca` to perform PCA on merged matrix
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
usage: mdb [-h] [-v] {index,create,merge,pca} ...
|
|
23
|
+
|
|
24
|
+
DNA methylation database builder for quick population-level analysis.
|
|
25
|
+
|
|
26
|
+
positional arguments:
|
|
27
|
+
{index,create,merge,pca}
|
|
28
|
+
index Index all CpG locations on the reference genome
|
|
29
|
+
create Create single sample-level methylation database
|
|
30
|
+
merge mdb databases from multiple samples into a single database: COMBINE STRAND and HAPLOTYPE
|
|
31
|
+
pca Perform PCA on the merged mdb database
|
|
32
|
+
|
|
33
|
+
options:
|
|
34
|
+
-h, --help show this help message and exit
|
|
35
|
+
-v, --version show program's version number and exit
|
|
36
|
+
|
|
37
|
+
Version v0.0.2
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "v0.0.2"
|
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Build CpG-by-sample matrices for 5mC and 5hmC ONLY (no M_sum),
|
|
4
|
+
using a pre-built CpG index (NPZ) and methylation bed-like files.
|
|
5
|
+
|
|
6
|
+
Supports two input formats via --platform:
|
|
7
|
+
- ont (default): modkit pileup BED (no header; 18 columns; 'm' and 'h' rows)
|
|
8
|
+
- pacbio: pb-cpg-tools cpg_scores TSV (has header lines starting with ## and a # header row;
|
|
9
|
+
columns include: chrom, begin, end, ... discretized_mod_score)
|
|
10
|
+
|
|
11
|
+
Outputs:
|
|
12
|
+
<out_prefix>.5mC.npy float32 (fraction 0..1 unless --keep-percent)
|
|
13
|
+
<out_prefix>.5hmC.npy float32 (ONT only; for PacBio will remain NaN)
|
|
14
|
+
|
|
15
|
+
Notes:
|
|
16
|
+
- Full-genome RAM allocation can be huge; for 400 samples this likely won't fit in 50 GB.
|
|
17
|
+
Use chromosome-sharded or memmap approach if needed.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import os
|
|
21
|
+
import sys
|
|
22
|
+
import time
|
|
23
|
+
import argparse
|
|
24
|
+
import logging
|
|
25
|
+
from typing import Dict, Tuple, List, Optional, Callable
|
|
26
|
+
|
|
27
|
+
import numpy as np
|
|
28
|
+
import polars as pl
|
|
29
|
+
from tqdm import tqdm
|
|
30
|
+
|
|
31
|
+
# ---------- ONT (modkit pileup bed, has_header=False => column_1..)
# Positional names polars assigns when reading a headerless TSV.
ONT_COL_CHROM = "column_1"  # chromosome name
ONT_COL_START = "column_2"  # 0-based CpG start position
ONT_COL_CODE = "column_4"  # "m" or "h"
ONT_COL_PCT = "column_11"  # percent modified
ONT_COL_COV = "column_10"  # Nvalid_cov

# ---------- PacBio (pb-cpg-tools cpg_scores, tab-separated with comments + header)
# Columns (example):
# chrom begin end mod_score type cov est_mod_count est_unmod_count discretized_mod_score
PB_COL_CHROM = "chrom"
PB_COL_START = "begin"  # 0-based start
PB_COL_COV = "cov"
PB_COL_DMS = "discretized_mod_score"  # percent 0..100
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def setup_logging(log_path: Optional[str], verbose: bool = False) -> logging.Logger:
    """Configure and return the module logger.

    Args:
        log_path: Optional path for a DEBUG-level log file; parent
            directories are created as needed.
        verbose: When True, console output is DEBUG instead of INFO.

    Returns:
        The logger named "build_5mc_5hmc" with a stderr handler and, when
        requested, a file handler.

    Note:
        Any handlers left over from a previous call are removed first so
        that calling setup_logging() more than once (tests, notebooks,
        repeated CLI invocations in one process) does not emit every log
        line multiple times.
    """
    logger = logging.getLogger("build_5mc_5hmc")
    logger.setLevel(logging.DEBUG)

    # Reset handlers: logging.getLogger returns the same object each call,
    # so blindly adding handlers would duplicate output on repeated calls.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)

    fmt = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")

    sh = logging.StreamHandler(sys.stderr)
    sh.setLevel(logging.DEBUG if verbose else logging.INFO)
    sh.setFormatter(fmt)
    logger.addHandler(sh)

    if log_path:
        os.makedirs(os.path.dirname(os.path.abspath(log_path)), exist_ok=True)
        fh = logging.FileHandler(log_path)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(fmt)
        logger.addHandler(fh)

    return logger
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def load_index(index_npz: str, logger: logging.Logger):
    """Load a CpG index NPZ and derive per-chromosome slice bounds.

    The NPZ must contain 'chroms' (object array of names), 'chrom_offsets'
    (int64 start offset of each chromosome within pos0), and 'pos0'
    (int32 0-based CpG starts, sorted within each chromosome).

    Returns:
        (chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms) where
        chrom_slices maps chromosome name -> (start, end) row range in pos0
        and allowed_chroms lists the unique chromosome names in index order.
    """
    data = np.load(index_npz, allow_pickle=True)
    chroms = data["chroms"]                # object array
    chrom_offsets = data["chrom_offsets"]  # int64
    pos0 = data["pos0"]                    # int32

    n_cpg = int(pos0.shape[0])
    names = [str(raw) for raw in chroms]
    n_offsets = len(chrom_offsets)

    # A chromosome's block ends where the next one begins; the final block
    # runs to the end of pos0.
    chrom_slices: Dict[str, Tuple[int, int]] = {}
    for i, name in enumerate(names):
        lo = int(chrom_offsets[i])
        hi = int(chrom_offsets[i + 1]) if i + 1 < n_offsets else n_cpg
        chrom_slices[name] = (lo, hi)

    # dict.fromkeys dedupes while keeping first-seen order.
    allowed_chroms = list(dict.fromkeys(names))

    logger.info(f"Loaded index: {index_npz}")
    logger.info(f"Index CpGs: {n_cpg:,}")
    logger.info(f"Index chroms ({len(chroms)}): {allowed_chroms}")

    return chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def read_beds_list(beds_list_path: str) -> Tuple[List[str], List[str]]:
    """Parse a sample manifest into parallel (sample_names, file_paths) lists.

    Each non-empty line that does not start with '#' is one of:
      - path              -> sample name defaults to the path's basename
      - sample<TAB>path   -> explicit sample name
    """
    names: List[str] = []
    paths: List[str] = []
    with open(beds_list_path, "r") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            # Skip blanks and comment lines.
            if not entry or entry.startswith("#"):
                continue
            fields = entry.split("\t")
            if len(fields) >= 2:
                names.append(fields[0])
                paths.append(fields[1])
            else:
                names.append(os.path.basename(fields[0]))
                paths.append(fields[0])
    return names, paths
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def map_positions_to_rows(
    chrom: str,
    starts_np: np.ndarray,
    *,
    pos0: np.ndarray,
    chrom_slices: Dict[str, Tuple[int, int]],
) -> np.ndarray:
    """Map 0-based CpG start positions on a given chrom to global row indices; -1 if not found.

    Args:
        chrom: Chromosome name; unknown chroms map every position to -1.
        starts_np: 0-based start positions to look up.
        pos0: Global CpG start array from the index (sorted within each chrom).
        chrom_slices: chrom -> (start, end) slice bounds into pos0.

    Returns:
        int64 array, same length as starts_np: global row index or -1.

    Fix: the previous version indexed chrom_pos[j] before bounds-masking,
    which raised IndexError whenever a query position was greater than every
    CpG on the chromosome (searchsorted returns j == len(chrom_pos)), or
    when the chromosome slice was empty.
    """
    out = np.full(starts_np.shape[0], -1, dtype=np.int64)
    if chrom not in chrom_slices:
        return out

    s, e = chrom_slices[chrom]
    chrom_pos = pos0[s:e]  # sorted increasing within chrom
    n = chrom_pos.shape[0]
    if n == 0:
        return out

    j = np.searchsorted(chrom_pos, starts_np, side="left")
    # Only compare positions where the insertion point is in range; indexing
    # chrom_pos with j == n would be out of bounds.
    in_range = j < n
    ok = in_range.copy()
    ok[in_range] = chrom_pos[j[in_range]] == starts_np[in_range]

    out[ok] = (s + j[ok]).astype(np.int64)
    return out
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# -------------------- Platform-specific fill functions --------------------
|
|
139
|
+
|
|
140
|
+
def fill_one_sample_ont(
    bed_path: str,
    col_idx: int,
    M_5mC: np.ndarray,
    M_5hmC: np.ndarray,
    *,
    pos0: np.ndarray,
    chrom_slices: Dict[str, Tuple[int, int]],
    allowed_chroms: List[str],
    min_cov: int,
    keep_percent: bool,
    logger: logging.Logger,
) -> Dict[str, int]:
    """ONT modkit pileup BED: has_header=False, 'm' and 'h' rows.

    Streams one sample's modkit pileup BED, keeps only indexed chromosomes
    (and rows meeting min_cov when > 0), then writes per-CpG methylation
    values in place into column `col_idx` of M_5mC ('m' rows) and M_5hmC
    ('h' rows). Positions not present in the index are dropped silently.

    Returns:
        Stats dict with keys: rows_after_filters, rows_m, rows_h,
        mapped_m, mapped_h.
    """
    t0 = time.time()

    # Lazy scan: polars names headerless columns column_1..column_N,
    # matching the ONT_COL_* constants.
    lf = (
        pl.scan_csv(bed_path, separator="\t", has_header=False)
        .select([ONT_COL_CHROM, ONT_COL_START, ONT_COL_CODE, ONT_COL_PCT, ONT_COL_COV])
        .filter(pl.col(ONT_COL_CHROM).is_in(allowed_chroms))
    )
    if min_cov > 0:
        lf = lf.filter(pl.col(ONT_COL_COV) >= min_cov)

    df = lf.collect(engine="streaming")

    stats = {
        "rows_after_filters": int(df.height),
        "rows_m": 0,
        "rows_h": 0,
        "mapped_m": 0,
        "mapped_h": 0,
    }

    if df.height == 0:
        logger.warning(f"[{col_idx}] {os.path.basename(bed_path)}: no rows after filters")
        return stats

    # percent -> value: store raw 0..100 or normalized 0..1 fraction.
    if keep_percent:
        df = df.with_columns(pl.col(ONT_COL_PCT).cast(pl.Float32).alias("val"))
    else:
        df = df.with_columns((pl.col(ONT_COL_PCT) / 100.0).cast(pl.Float32).alias("val"))

    # Split by modification code: 'm' = 5mC, 'h' = 5hmC.
    df_m = df.filter(pl.col(ONT_COL_CODE) == "m").select([ONT_COL_CHROM, ONT_COL_START, "val"])
    df_h = df.filter(pl.col(ONT_COL_CODE) == "h").select([ONT_COL_CHROM, ONT_COL_START, "val"])
    stats["rows_m"] = int(df_m.height)
    stats["rows_h"] = int(df_h.height)

    def _apply(df_sub: pl.DataFrame, mat: np.ndarray, tag: str) -> int:
        # Scatter df_sub's values into mat[:, col_idx], chromosome by
        # chromosome; returns how many rows mapped to an indexed CpG.
        if df_sub.height == 0:
            return 0
        mapped = 0
        for chrom, sub in df_sub.group_by(ONT_COL_CHROM, maintain_order=True):
            # NOTE(review): assumes polars returns group keys as tuples
            # (recent versions do); chrom[0] unpacks the name — confirm
            # against the pinned polars version.
            chrom = chrom[0]
            starts = sub[ONT_COL_START].to_numpy().astype(np.int32)
            rows = map_positions_to_rows(chrom, starts, pos0=pos0, chrom_slices=chrom_slices)
            ok = rows != -1
            if ok.any():
                vals = sub["val"].to_numpy().astype(np.float32)
                mat[rows[ok], col_idx] = vals[ok]
                mapped += int(ok.sum())
        stats[f"mapped_{tag}"] = mapped
        return mapped

    mapped_m = _apply(df_m, M_5mC, "m")
    mapped_h = _apply(df_h, M_5hmC, "h")

    logger.info(
        f"[{col_idx}] {os.path.basename(bed_path)} "
        f"rows={stats['rows_after_filters']:,} "
        f"m={stats['rows_m']:,}(mapped {mapped_m:,}) "
        f"h={stats['rows_h']:,}(mapped {mapped_h:,}) "
        f"min_cov={min_cov}"
    )
    logger.debug(f"[{col_idx}] done in {time.time()-t0:.1f}s")
    return stats
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def fill_one_sample_pacbio(
    bed_path: str,
    col_idx: int,
    M_5mC: np.ndarray,
    M_5hmC: np.ndarray,
    *,
    pos0: np.ndarray,
    chrom_slices: Dict[str, Tuple[int, int]],
    allowed_chroms: List[str],
    min_cov: int,
    keep_percent: bool,
    logger: logging.Logger,
) -> Dict[str, int]:
    """
    PacBio pb-cpg-tools 'cpg_scores' TSV:
      - comment lines start with ## and a header line starts with '#chrom ...'
      - uses discretized_mod_score (0..100) as methylation percent for CpG.
      - 5hmC not available -> M_5hmC column stays NaN.

    This reader:
      - skips comment lines starting with "##"
      - treats the '#chrom ...' line as header (we strip the leading '#')

    Writes values in place into M_5mC[:, col_idx]; M_5hmC is accepted only
    for signature parity with fill_one_sample_ont and is never touched.
    Returns a stats dict: rows_after_filters, mapped_m, mapped_h.
    """
    t0 = time.time()

    # We need to (1) skip lines starting with ##, and (2) parse the header line "#chrom ..."
    # Polars supports 'comment_prefix' for a single prefix; we use it to skip "##" lines.
    # Then we read with has_header=True; the first non-## line is the "#chrom ..." header.
    lf = pl.scan_csv(
        bed_path,
        separator="\t",
        has_header=True,
        comment_prefix="##",
    )

    # Header will create a column named "#chrom" (with leading '#').
    # Normalize it to "chrom" so the rest is consistent.
    # Also enforce the minimal needed columns exist.
    # NOTE(review): LazyFrame.columns is deprecated in newer polars in favor
    # of collect_schema().names() — confirm against the pinned polars version.
    lf = lf.rename({"#chrom": PB_COL_CHROM}) if "#chrom" in lf.columns else lf

    # Keep required columns only
    needed = [PB_COL_CHROM, PB_COL_START, PB_COL_DMS, PB_COL_COV]
    missing = [c for c in needed if c not in lf.columns]
    if missing:
        raise ValueError(
            f"PacBio file missing columns {missing}. "
            f"Found columns: {lf.columns}"
        )

    lf = (
        lf.select(needed)
        .filter(pl.col(PB_COL_CHROM).is_in(allowed_chroms))
    )
    if min_cov > 0:
        lf = lf.filter(pl.col(PB_COL_COV) >= min_cov)

    df = lf.collect(engine="streaming")

    stats = {
        "rows_after_filters": int(df.height),
        "mapped_m": 0,  # treat as 5mC
        "mapped_h": 0,  # always 0 for pacbio
    }

    if df.height == 0:
        logger.warning(f"[{col_idx}] {os.path.basename(bed_path)}: no rows after filters")
        return stats

    # discretized_mod_score is percent 0..100
    if keep_percent:
        df = df.with_columns(pl.col(PB_COL_DMS).cast(pl.Float32).alias("val"))
    else:
        df = df.with_columns((pl.col(PB_COL_DMS) / 100.0).cast(pl.Float32).alias("val"))

    # Scatter values into the 5mC matrix, chromosome by chromosome.
    mapped = 0
    for chrom, sub in df.group_by(PB_COL_CHROM, maintain_order=True):
        # NOTE(review): assumes polars tuple group keys; chrom[0] unpacks the
        # name — same caveat as in fill_one_sample_ont.
        chrom = chrom[0]
        starts = sub[PB_COL_START].to_numpy().astype(np.int32)
        rows = map_positions_to_rows(chrom, starts, pos0=pos0, chrom_slices=chrom_slices)
        ok = rows != -1
        if ok.any():
            vals = sub["val"].to_numpy().astype(np.float32)
            M_5mC[rows[ok], col_idx] = vals[ok]
            mapped += int(ok.sum())

    stats["mapped_m"] = mapped

    logger.info(
        f"[{col_idx}] {os.path.basename(bed_path)} "
        f"rows={stats['rows_after_filters']:,} "
        f"pacbio(mapped {mapped:,}) "
        f"min_cov={min_cov} (5hmC=NaN)"
    )
    logger.debug(f"[{col_idx}] done in {time.time()-t0:.1f}s")
    return stats
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
# -------------------- main --------------------
|
|
317
|
+
|
|
318
|
+
def main():
    """CLI entry point: build CpG-by-sample 5mC/5hmC matrices.

    Pipeline: load the CpG index NPZ, read the sample manifest, allocate
    two (n_cpg x n_samples) float32 matrices initialized to NaN, fill one
    column per input file with the platform-appropriate reader, then save
    the matrices as .npy (5hmC only for ONT input).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--index", required=True, help="CpG index NPZ")
    ap.add_argument("--beds-list", required=True, help="file list (path or sample<TAB>path)")
    ap.add_argument("--out-prefix", required=True, help="Output prefix for .npy files")
    ap.add_argument("--min-cov", type=int, default=0, help="Minimum coverage filter (Nvalid_cov or cov)")
    ap.add_argument("--keep-percent", action="store_true", help="Store 0..100 instead of 0..1")
    ap.add_argument(
        "--platform",
        choices=["ont", "pacbio"],
        default="ont",
        help="Input format: ont(modkit) or pacbio(pb-cpg-tools cpg_scores)"
    )
    ap.add_argument("--log", default=None, help="Log file path")
    ap.add_argument("--verbose", action="store_true", help="Verbose console logs")
    args = ap.parse_args()

    logger = setup_logging(args.log, verbose=args.verbose)
    logger.info(f"polars {pl.__version__}")
    logger.info(f"platform={args.platform}")

    chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms = load_index(args.index, logger)
    # sample_names is currently unused here; column order follows file_paths.
    sample_names, file_paths = read_beds_list(args.beds_list)
    if not file_paths:
        logger.error("No files found in --beds-list")
        sys.exit(1)

    n_samples = len(file_paths)
    n_cpg = int(pos0.shape[0])

    logger.info(f"Samples: {n_samples}")
    logger.info(f"Allocating matrices: CpGs={n_cpg:,} x samples={n_samples}")
    logger.info(f"Value mode: {'percent(0..100)' if args.keep_percent else 'fraction(0..1)'}; min_cov={args.min_cov}")

    # Allocate RAM matrices. NaN marks "no data" so coverage gaps are
    # distinguishable from 0% methylation. ~4 bytes per cell per matrix —
    # see module docstring for the memory caveat on large cohorts.
    M_5mC = np.full((n_cpg, n_samples), np.nan, dtype=np.float32)
    M_5hmC = np.full((n_cpg, n_samples), np.nan, dtype=np.float32)

    # Choose fill function
    if args.platform == "ont":
        fill_fn: Callable[..., Dict[str, int]] = fill_one_sample_ont
    else:
        fill_fn = fill_one_sample_pacbio

    # Fill one column per sample; a missing file leaves its column all-NaN.
    for j, path in enumerate(tqdm(file_paths, desc="Processing files", unit="file")):
        if not os.path.exists(path):
            logger.warning(f"[{j}] Missing file: {path}")
            continue

        fill_fn(
            path,
            j,
            M_5mC,
            M_5hmC,
            pos0=pos0,
            chrom_slices=chrom_slices,
            allowed_chroms=allowed_chroms,
            min_cov=args.min_cov,
            keep_percent=args.keep_percent,
            logger=logger,
        )

    # Save outputs (stop here; no M_sum)
    out_prefix = args.out_prefix
    os.makedirs(os.path.dirname(os.path.abspath(out_prefix)), exist_ok=True)

    # Always save 5mC
    np.save(out_prefix + ".5mC.npy", M_5mC)
    logger.info(f"Saved: {out_prefix}.5mC.npy")

    # Save 5hmC only for ONT (PacBio input carries no 5hmC signal).
    if args.platform == "ont":
        np.save(out_prefix + ".5hmC.npy", M_5hmC)
        logger.info(f"Saved: {out_prefix}.5hmC.npy")

    # QC summary: count non-NaN cells actually filled.
    filled_5mc = int(np.isfinite(M_5mC).sum())
    logger.info(f"Filled entries: 5mC={filled_5mc:,}")

    if args.platform == "ont":
        filled_5hmc = int(np.isfinite(M_5hmC).sum())
        logger.info(f"Filled entries: 5hmC={filled_5hmc:,}")
    else:
        logger.info("PacBio mode: only 5mC matrix written (no 5hmC signal).")

    logger.info("Done.")


if __name__ == "__main__":
    main()
|
|
409
|
+
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import time
|
|
4
|
+
import logging
|
|
5
|
+
import numpy as np
|
|
6
|
+
import polars as pl
|
|
7
|
+
from typing import Dict, Tuple, List, Optional
|
|
8
|
+
from mdb.ont_bed_parsing import ont_bed_parsing
|
|
9
|
+
from mdb.pacbio_bed_parsing import pacbio_bed_parsing
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def load_index(index_npz: str):
    """Load the CpG index NPZ used by `mdb create`.

    Returns:
        (chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms) where
        chrom_slices maps chromosome name -> (start, end) row range in pos0.
    """
    data = np.load(index_npz, allow_pickle=True)
    chroms = data["chroms"]                # object array
    chrom_offsets = data["chrom_offsets"]  # int64
    pos0 = data["pos0"]                    # int32

    total = int(pos0.shape[0])
    n_offsets = len(chrom_offsets)
    names = [str(raw) for raw in chroms]

    # Each chromosome's block ends at the next offset; the last runs to the
    # end of pos0.
    chrom_slices: Dict[str, Tuple[int, int]] = {}
    for i, name in enumerate(names):
        lo = int(chrom_offsets[i])
        hi = int(chrom_offsets[i + 1]) if i + 1 < n_offsets else total
        chrom_slices[name] = (lo, hi)

    # Unique names, first-seen order preserved.
    allowed_chroms = list(dict.fromkeys(names))
    return chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _write_bed_map(output_mdb: str, bed_map) -> None:
    """Persist the file-name -> bed-file mapping next to the matrices."""
    with open(os.path.join(output_mdb, "bed_map.txt"), "w") as f:
        for file_name, bed_file in bed_map.items():
            f.write(f"{file_name}\t{bed_file}\n")


def create_main(args, logger=None):
    """Create a single-sample .mdb database (a directory of .npy matrices).

    Args:
        args: Parsed CLI namespace with .platform ('ont'|'pacbio'), .npz
            (CpG index path), .bed (methylBED path), .output (output dir),
            .min_coverage (int coverage filter).
        logger: Accepted for interface compatibility; currently unused.

    Raises:
        ValueError: If args.platform is not 'ont' or 'pacbio'.

    ONT parsers return either 2 matrices (strand-combined 5mC, 5hmC) or 4
    (strand-specific 5mC/5hmC plus/minus); PacBio returns a single 5mC
    matrix. The bed_map.txt sidecar is written in every case.
    """
    # Inputs
    platform = args.platform
    index_npz = args.npz
    bed_path = args.bed
    output_mdb = args.output
    min_cov = args.min_coverage

    if platform not in ("ont", "pacbio"):
        raise ValueError(f"Unsupported platform: {platform}, supported platforms are 'ont' and 'pacbio'")

    chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms = load_index(index_npz)
    print(f"Loaded index: {index_npz} with {len(chroms)} chromosomes and {pos0.shape[0]} CpGs")

    os.makedirs(output_mdb, exist_ok=True)
    if platform == "ont":
        stats, matrices, bed_map = ont_bed_parsing(
            input_path=bed_path,
            chrom_slices=chrom_slices,
            cpg_pos0=pos0,
            allowed_chroms=allowed_chroms,
            min_cov=min_cov,
        )
        _write_bed_map(output_mdb, bed_map)
        if len(matrices) == 4:
            # Strand-specific output (pileup run without --combine-strand).
            m_5mC_plus, m_5mC_minus, m_5hmC_plus, m_5hmC_minus = matrices
            np.save(os.path.join(output_mdb, "5mC_plus.npy"), m_5mC_plus)
            np.save(os.path.join(output_mdb, "5mC_minus.npy"), m_5mC_minus)
            np.save(os.path.join(output_mdb, "5hmC_plus.npy"), m_5hmC_plus)
            np.save(os.path.join(output_mdb, "5hmC_minus.npy"), m_5hmC_minus)
            print(f"Saved 5mC and 5hmC strand-specific matrices to {os.path.join(output_mdb, '5mC_plus.npy')}, {os.path.join(output_mdb, '5mC_minus.npy')}, {os.path.join(output_mdb, '5hmC_plus.npy')}, and {os.path.join(output_mdb, '5hmC_minus.npy')}")
        else:
            # Strand-combined output: expect exactly (5mC, 5hmC).
            if len(matrices) != 2:
                raise ValueError(
                    f"Expected 2 or 4 matrices from ont_bed_parsing, got {len(matrices)}"
                )
            np.save(os.path.join(output_mdb, "5mC.npy"), matrices[0])
            np.save(os.path.join(output_mdb, "5hmC.npy"), matrices[1])
            print(f"Saved 5mC and 5hmC matrices to {os.path.join(output_mdb, '5mC.npy')} and {os.path.join(output_mdb, '5hmC.npy')}")

    elif platform == "pacbio":
        stats, matrices, bed_map = pacbio_bed_parsing(
            input_path=bed_path,
            chrom_slices=chrom_slices,
            cpg_pos0=pos0,
            allowed_chroms=allowed_chroms,
            min_cov=min_cov,
        )
        np.save(os.path.join(output_mdb, "5mC.npy"), matrices)
        _write_bed_map(output_mdb, bed_map)
        print(f"Saved 5mC matrix to {os.path.join(output_mdb, '5mC.npy')}")
+
|