methdb 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
methdb-0.0.2/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Yilei Fu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
methdb-0.0.2/PKG-INFO ADDED
@@ -0,0 +1,63 @@
1
+ Metadata-Version: 2.4
2
+ Name: methdb
3
+ Version: 0.0.2
4
+ Summary: mdb: population-level DNA methylation analysis toolkit
5
+ Home-page: https://github.com/Fu-Yilei/mdb
6
+ Author: Yilei Fu
7
+ Author-email:
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: polars>=0.17.9
16
+ Requires-Dist: numpy>=2.2.0
17
+ Requires-Dist: plotly
18
+ Requires-Dist: tqdm
19
+ Dynamic: author
20
+ Dynamic: description
21
+ Dynamic: description-content-type
22
+ Dynamic: home-page
23
+ Dynamic: license
24
+ Dynamic: license-file
25
+ Dynamic: summary
26
+
27
+ # mdb
28
+ A toolkit to create DNA methylation database for cross-sample comparison
29
+
30
+
31
+ ## Requirement
32
+ **Output format taken from:**
33
+ - Modkit >= v0.6
34
+ - Pb-CpG-tools >= v3.0.0
35
+
36
+ **Recommended DNA methylation pileup parameter:**
37
+ - Use `--combine-strand` in modkit pileup.
38
+ - Use `-m` function to aggregate 5mC and 5hmC for ONT's pileup if merging ONT with PacBio.
39
+
40
+ **mdb features**
41
+ - mdb creates a genome-wide CpG matrix to support massive computing with population-level DNA methylation signals
42
+ - Use `mdb index` to index reference genome
43
+ - Use `mdb create` to create `.mdb` files (a set of `.npy` files) for each methylBED file
44
+ - Use `mdb merge` to merge per sample `.mdb` database to form a CpGxSample matrix
45
+ - Use `mdb pca` to perform PCA on merged matrix
46
+
47
+
48
+ usage: mdb [-h] [-v] {index,create,merge,pca} ...
49
+
50
+ DNA methylation database builder for quick population-level analysis.
51
+
52
+ positional arguments:
53
+ {index,create,merge,pca}
54
+ index Index all CpG locations on the reference genome
55
+ create Create single sample-level methylation database
56
+ merge mdb databases from multiple samples into a single database: COMBINE STRAND and HAPLOTYPE
57
+ pca Perform PCA on the merged mdb database
58
+
59
+ options:
60
+ -h, --help show this help message and exit
61
+ -v, --version show program's version number and exit
62
+
63
+ Version v0.0.2
methdb-0.0.2/README.md ADDED
@@ -0,0 +1,37 @@
1
+ # mdb
2
+ A toolkit to create DNA methylation database for cross-sample comparison
3
+
4
+
5
+ ## Requirement
6
+ **Output format taken from:**
7
+ - Modkit >= v0.6
8
+ - Pb-CpG-tools >= v3.0.0
9
+
10
+ **Recommended DNA methylation pileup parameter:**
11
+ - Use `--combine-strand` in modkit pileup.
12
+ - Use `-m` function to aggregate 5mC and 5hmC for ONT's pileup if merging ONT with PacBio.
13
+
14
+ **mdb features**
15
+ - mdb creates a genome-wide CpG matrix to support massive computing with population-level DNA methylation signals
16
+ - Use `mdb index` to index reference genome
17
+ - Use `mdb create` to create `.mdb` files (a set of `.npy` files) for each methylBED file
18
+ - Use `mdb merge` to merge per sample `.mdb` database to form a CpGxSample matrix
19
+ - Use `mdb pca` to perform PCA on merged matrix
20
+
21
+
22
+ usage: mdb [-h] [-v] {index,create,merge,pca} ...
23
+
24
+ DNA methylation database builder for quick population-level analysis.
25
+
26
+ positional arguments:
27
+ {index,create,merge,pca}
28
+ index Index all CpG locations on the reference genome
29
+ create Create single sample-level methylation database
30
+ merge mdb databases from multiple samples into a single database: COMBINE STRAND and HAPLOTYPE
31
+ pca Perform PCA on the merged mdb database
32
+
33
+ options:
34
+ -h, --help show this help message and exit
35
+ -v, --version show program's version number and exit
36
+
37
+ Version v0.0.2
@@ -0,0 +1 @@
1
+ __version__ = "v0.0.2"
@@ -0,0 +1,409 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Build CpG-by-sample matrices for 5mC and 5hmC ONLY (no M_sum),
4
+ using a pre-built CpG index (NPZ) and methylation bed-like files.
5
+
6
+ Supports two input formats via --platform:
7
+ - ont (default): modkit pileup BED (no header; 18 columns; 'm' and 'h' rows)
8
+ - pacbio: pb-cpg-tools cpg_scores TSV (has header lines starting with ## and a # header row;
9
+ columns include: chrom, begin, end, ... discretized_mod_score)
10
+
11
+ Outputs:
12
+ <out_prefix>.5mC.npy float32 (fraction 0..1 unless --keep-percent)
13
+ <out_prefix>.5hmC.npy float32 (ONT only; for PacBio will remain NaN)
14
+
15
+ Notes:
16
+ - Full-genome RAM allocation can be huge; for 400 samples this likely won't fit in 50 GB.
17
+ Use chromosome-sharded or memmap approach if needed.
18
+ """
19
+
20
+ import os
21
+ import sys
22
+ import time
23
+ import argparse
24
+ import logging
25
+ from typing import Dict, Tuple, List, Optional, Callable
26
+
27
+ import numpy as np
28
+ import polars as pl
29
+ from tqdm import tqdm
30
+
31
+ # ---------- ONT (modkit pileup bed, has_header=False => column_1..)
32
+ ONT_COL_CHROM = "column_1"
33
+ ONT_COL_START = "column_2"
34
+ ONT_COL_CODE = "column_4" # "m" or "h"
35
+ ONT_COL_PCT = "column_11" # percent modified
36
+ ONT_COL_COV = "column_10" # Nvalid_cov
37
+
38
+ # ---------- PacBio (pb-cpg-tools cpg_scores, tab-separated with comments + header)
39
+ # Columns (example):
40
+ # chrom begin end mod_score type cov est_mod_count est_unmod_count discretized_mod_score
41
+ PB_COL_CHROM = "chrom"
42
+ PB_COL_START = "begin" # 0-based start
43
+ PB_COL_COV = "cov"
44
+ PB_COL_DMS = "discretized_mod_score" # percent 0..100
45
+
46
+
47
def setup_logging(log_path: Optional[str], verbose: bool = False) -> logging.Logger:
    """Create (or return) the configured "build_5mc_5hmc" logger.

    Parameters
    ----------
    log_path : optional log-file path; parent directories are created.
    verbose  : when True the console handler emits DEBUG records,
               otherwise INFO and above.

    The console handler writes to stderr; the file handler (if any)
    always records at DEBUG level.
    """
    logger = logging.getLogger("build_5mc_5hmc")
    logger.setLevel(logging.DEBUG)

    # Guard against duplicate handlers when called more than once (e.g.
    # from tests or an interactive session); without this every record
    # would be emitted once per call.
    if logger.handlers:
        return logger

    fmt = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")

    sh = logging.StreamHandler(sys.stderr)
    sh.setLevel(logging.DEBUG if verbose else logging.INFO)
    sh.setFormatter(fmt)
    logger.addHandler(sh)

    if log_path:
        os.makedirs(os.path.dirname(os.path.abspath(log_path)), exist_ok=True)
        fh = logging.FileHandler(log_path)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(fmt)
        logger.addHandler(fh)

    return logger
66
+
67
+
68
def load_index(index_npz: str, logger: logging.Logger):
    """Load a pre-built CpG index NPZ and derive per-chromosome slices.

    The archive must contain ``chroms`` (object array of names),
    ``chrom_offsets`` (int64 start offset of each chromosome within
    ``pos0``) and ``pos0`` (int32 0-based CpG start positions).

    Returns (chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms)
    where chrom_slices maps each chromosome name to its half-open
    (start, end) slice of pos0.
    """
    archive = np.load(index_npz, allow_pickle=True)
    chroms = archive["chroms"]
    chrom_offsets = archive["chrom_offsets"]
    pos0 = archive["pos0"]

    total = int(pos0.shape[0])
    n_offsets = len(chrom_offsets)

    chrom_slices: Dict[str, Tuple[int, int]] = {}
    for i, raw_name in enumerate(chroms):
        name = str(raw_name)
        begin = int(chrom_offsets[i])
        # Last chromosome runs to the end of pos0.
        end = total if i + 1 >= n_offsets else int(chrom_offsets[i + 1])
        chrom_slices[name] = (begin, end)

    index_of: Dict[str, int] = {}
    for i, raw_name in enumerate(chroms):
        index_of[str(raw_name)] = i
    allowed_chroms = list(index_of)

    logger.info(f"Loaded index: {index_npz}")
    logger.info(f"Index CpGs: {total:,}")
    logger.info(f"Index chroms ({len(chroms)}): {allowed_chroms}")

    return chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms
90
+
91
+
92
def read_beds_list(beds_list_path: str) -> Tuple[List[str], List[str]]:
    """Parse a samples list file into (sample_names, file_paths).

    Each non-empty, non-comment ('#'-prefixed) line is either:
      - path               -> sample name is the file's basename
      - sample<TAB>path    -> explicit sample name

    Order follows the file.
    """
    names: List[str] = []
    paths: List[str] = []
    with open(beds_list_path, "r") as handle:
        for raw in handle:
            entry = raw.strip()
            if not entry or entry.startswith("#"):
                continue
            fields = entry.split("\t")
            if len(fields) >= 2:
                names.append(fields[0])
                paths.append(fields[1])
            else:
                # Bare path: fall back to the basename as the sample name.
                names.append(os.path.basename(fields[0]))
                paths.append(fields[0])
    return names, paths
115
+
116
+
117
def map_positions_to_rows(
    chrom: str,
    starts_np: np.ndarray,
    *,
    pos0: np.ndarray,
    chrom_slices: Dict[str, Tuple[int, int]],
) -> np.ndarray:
    """Map 0-based CpG start positions on a given chrom to global row indices; -1 if not found.

    Parameters
    ----------
    chrom        : chromosome name; must be a key of chrom_slices to match.
    starts_np    : 0-based start coordinates to look up.
    pos0         : global CpG start array, sorted increasing within each chrom.
    chrom_slices : chrom -> (start, end) half-open slice into pos0.

    Returns an int64 array of the same length as starts_np holding the
    global row index of every exact match, or -1 where the position is
    not a known CpG (or the chromosome is absent from the index).
    """
    if chrom not in chrom_slices:
        return np.full(starts_np.shape[0], -1, dtype=np.int64)

    s, e = chrom_slices[chrom]
    chrom_pos = pos0[s:e]  # sorted increasing within chrom
    n = chrom_pos.shape[0]

    out = np.full(starts_np.shape[0], -1, dtype=np.int64)
    if n == 0:
        # Empty chromosome slice: nothing can match.
        return out

    j = np.searchsorted(chrom_pos, starts_np, side="left")
    # searchsorted returns n for queries past the last CpG; indexing
    # chrom_pos[j] directly would then raise IndexError (numpy evaluates
    # both operands of `&`).  Clamp first; the in-range mask `j < n`
    # keeps clamped entries from producing false matches.
    j_safe = np.minimum(j, n - 1)
    ok = (j < n) & (chrom_pos[j_safe] == starts_np)

    out[ok] = (s + j[ok]).astype(np.int64)
    return out
136
+
137
+
138
+ # -------------------- Platform-specific fill functions --------------------
139
+
140
def fill_one_sample_ont(
    bed_path: str,
    col_idx: int,
    M_5mC: np.ndarray,
    M_5hmC: np.ndarray,
    *,
    pos0: np.ndarray,
    chrom_slices: Dict[str, Tuple[int, int]],
    allowed_chroms: List[str],
    min_cov: int,
    keep_percent: bool,
    logger: logging.Logger,
) -> Dict[str, int]:
    """ONT modkit pileup BED: has_header=False, 'm' and 'h' rows.

    Reads one headerless modkit pileup BED and writes its 5mC ('m') and
    5hmC ('h') values into column ``col_idx`` of ``M_5mC`` / ``M_5hmC``
    IN PLACE.  Rows on chromosomes outside ``allowed_chroms`` or below
    ``min_cov`` coverage are dropped; positions absent from the CpG index
    are skipped (their matrix cells stay NaN).

    Values are stored as fractions 0..1 unless ``keep_percent`` is True
    (then raw 0..100 percents).

    Returns a stats dict with keys: rows_after_filters, rows_m, rows_h,
    mapped_m, mapped_h.
    """
    t0 = time.time()

    # Lazy scan; polars names headerless columns column_1..column_N,
    # hence the ONT_COL_* constants.
    lf = (
        pl.scan_csv(bed_path, separator="\t", has_header=False)
        .select([ONT_COL_CHROM, ONT_COL_START, ONT_COL_CODE, ONT_COL_PCT, ONT_COL_COV])
        .filter(pl.col(ONT_COL_CHROM).is_in(allowed_chroms))
    )
    if min_cov > 0:
        lf = lf.filter(pl.col(ONT_COL_COV) >= min_cov)

    df = lf.collect(engine="streaming")

    stats = {
        "rows_after_filters": int(df.height),
        "rows_m": 0,
        "rows_h": 0,
        "mapped_m": 0,
        "mapped_h": 0,
    }

    if df.height == 0:
        logger.warning(f"[{col_idx}] {os.path.basename(bed_path)}: no rows after filters")
        return stats

    # percent -> value (fraction 0..1 unless keep_percent)
    if keep_percent:
        df = df.with_columns(pl.col(ONT_COL_PCT).cast(pl.Float32).alias("val"))
    else:
        df = df.with_columns((pl.col(ONT_COL_PCT) / 100.0).cast(pl.Float32).alias("val"))

    # Split by modification code: 'm' = 5mC, 'h' = 5hmC.
    df_m = df.filter(pl.col(ONT_COL_CODE) == "m").select([ONT_COL_CHROM, ONT_COL_START, "val"])
    df_h = df.filter(pl.col(ONT_COL_CODE) == "h").select([ONT_COL_CHROM, ONT_COL_START, "val"])
    stats["rows_m"] = int(df_m.height)
    stats["rows_h"] = int(df_h.height)

    def _apply(df_sub: pl.DataFrame, mat: np.ndarray, tag: str):
        # Scatter df_sub's values into mat[:, col_idx]; returns the
        # number of rows whose position was found in the CpG index.
        if df_sub.height == 0:
            return 0
        mapped = 0
        for chrom, sub in df_sub.group_by(ONT_COL_CHROM, maintain_order=True):
            # group_by yields tuple keys — unwrap the chromosome name.
            # NOTE(review): this assumes a polars version whose group_by
            # keys are tuples; confirm against the pinned polars release.
            chrom = chrom[0]
            starts = sub[ONT_COL_START].to_numpy().astype(np.int32)
            rows = map_positions_to_rows(chrom, starts, pos0=pos0, chrom_slices=chrom_slices)
            ok = rows != -1
            if ok.any():
                vals = sub["val"].to_numpy().astype(np.float32)
                mat[rows[ok], col_idx] = vals[ok]
            mapped += int(ok.sum())
        stats[f"mapped_{tag}"] = mapped
        return mapped

    mapped_m = _apply(df_m, M_5mC, "m")
    mapped_h = _apply(df_h, M_5hmC, "h")

    logger.info(
        f"[{col_idx}] {os.path.basename(bed_path)} "
        f"rows={stats['rows_after_filters']:,} "
        f"m={stats['rows_m']:,}(mapped {mapped_m:,}) "
        f"h={stats['rows_h']:,}(mapped {mapped_h:,}) "
        f"min_cov={min_cov}"
    )
    logger.debug(f"[{col_idx}] done in {time.time()-t0:.1f}s")
    return stats
217
+
218
+
219
def fill_one_sample_pacbio(
    bed_path: str,
    col_idx: int,
    M_5mC: np.ndarray,
    M_5hmC: np.ndarray,
    *,
    pos0: np.ndarray,
    chrom_slices: Dict[str, Tuple[int, int]],
    allowed_chroms: List[str],
    min_cov: int,
    keep_percent: bool,
    logger: logging.Logger,
) -> Dict[str, int]:
    """
    PacBio pb-cpg-tools 'cpg_scores' TSV:
      - comment lines start with ## and a header line starts with '#chrom ...'
      - uses discretized_mod_score (0..100) as methylation percent for CpG.
      - 5hmC not available -> M_5hmC column stays NaN.

    This reader:
      - skips comment lines starting with "##"
      - treats the '#chrom ...' line as header (we strip the leading '#')

    Fills column ``col_idx`` of ``M_5mC`` IN PLACE.  ``M_5hmC`` is
    accepted only for signature parity with the ONT filler and is never
    written.  Returns a stats dict with keys: rows_after_filters,
    mapped_m, mapped_h (always 0).

    Raises ValueError if a required column is missing from the file.
    """
    t0 = time.time()

    # We need to (1) skip lines starting with ##, and (2) parse the header line "#chrom ..."
    # Polars supports 'comment_prefix' for a single prefix; we use it to skip "##" lines.
    # Then we read with has_header=True; the first non-## line is the "#chrom ..." header.
    lf = pl.scan_csv(
        bed_path,
        separator="\t",
        has_header=True,
        comment_prefix="##",
    )

    # Header will create a column named "#chrom" (with leading '#').
    # Normalize it to "chrom" so the rest is consistent.
    # Also enforce the minimal needed columns exist.
    # NOTE(review): reading .columns off a LazyFrame resolves the schema
    # eagerly — presumably acceptable since the file is collected below;
    # verify against the pinned polars version.
    lf = lf.rename({"#chrom": PB_COL_CHROM}) if "#chrom" in lf.columns else lf

    # Keep required columns only
    needed = [PB_COL_CHROM, PB_COL_START, PB_COL_DMS, PB_COL_COV]
    missing = [c for c in needed if c not in lf.columns]
    if missing:
        raise ValueError(
            f"PacBio file missing columns {missing}. "
            f"Found columns: {lf.columns}"
        )

    lf = (
        lf.select(needed)
        .filter(pl.col(PB_COL_CHROM).is_in(allowed_chroms))
    )
    if min_cov > 0:
        lf = lf.filter(pl.col(PB_COL_COV) >= min_cov)

    df = lf.collect(engine="streaming")

    stats = {
        "rows_after_filters": int(df.height),
        "mapped_m": 0,  # treat as 5mC
        "mapped_h": 0,  # always 0 for pacbio
    }

    if df.height == 0:
        logger.warning(f"[{col_idx}] {os.path.basename(bed_path)}: no rows after filters")
        return stats

    # discretized_mod_score is percent 0..100
    if keep_percent:
        df = df.with_columns(pl.col(PB_COL_DMS).cast(pl.Float32).alias("val"))
    else:
        df = df.with_columns((pl.col(PB_COL_DMS) / 100.0).cast(pl.Float32).alias("val"))

    # Scatter values into M_5mC[:, col_idx], one chromosome at a time;
    # positions missing from the CpG index are skipped (rows == -1).
    mapped = 0
    for chrom, sub in df.group_by(PB_COL_CHROM, maintain_order=True):
        chrom = chrom[0]  # group_by yields tuple keys — unwrap the name
        starts = sub[PB_COL_START].to_numpy().astype(np.int32)
        rows = map_positions_to_rows(chrom, starts, pos0=pos0, chrom_slices=chrom_slices)
        ok = rows != -1
        if ok.any():
            vals = sub["val"].to_numpy().astype(np.float32)
            M_5mC[rows[ok], col_idx] = vals[ok]
        mapped += int(ok.sum())

    stats["mapped_m"] = mapped

    logger.info(
        f"[{col_idx}] {os.path.basename(bed_path)} "
        f"rows={stats['rows_after_filters']:,} "
        f"pacbio(mapped {mapped:,}) "
        f"min_cov={min_cov} (5hmC=NaN)"
    )
    logger.debug(f"[{col_idx}] done in {time.time()-t0:.1f}s")
    return stats
314
+
315
+
316
+ # -------------------- main --------------------
317
+
318
def main():
    """CLI entry point: build CpG-by-sample 5mC/5hmC matrices.

    Workflow: load the CpG index NPZ, read the samples list, allocate
    NaN-initialized float32 matrices of shape (n_cpg, n_samples), fill
    one column per input file with the platform-specific filler, then
    save <out_prefix>.5mC.npy (always) and <out_prefix>.5hmC.npy
    (ONT only) and log a fill-rate summary.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--index", required=True, help="CpG index NPZ")
    ap.add_argument("--beds-list", required=True, help="file list (path or sample<TAB>path)")
    ap.add_argument("--out-prefix", required=True, help="Output prefix for .npy files")
    ap.add_argument("--min-cov", type=int, default=0, help="Minimum coverage filter (Nvalid_cov or cov)")
    ap.add_argument("--keep-percent", action="store_true", help="Store 0..100 instead of 0..1")
    ap.add_argument(
        "--platform",
        choices=["ont", "pacbio"],
        default="ont",
        help="Input format: ont(modkit) or pacbio(pb-cpg-tools cpg_scores)"
    )
    ap.add_argument("--log", default=None, help="Log file path")
    ap.add_argument("--verbose", action="store_true", help="Verbose console logs")
    args = ap.parse_args()

    logger = setup_logging(args.log, verbose=args.verbose)
    logger.info(f"polars {pl.__version__}")
    logger.info(f"platform={args.platform}")

    chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms = load_index(args.index, logger)
    # sample_names is currently unused below; columns are ordered by file.
    sample_names, file_paths = read_beds_list(args.beds_list)
    if not file_paths:
        logger.error("No files found in --beds-list")
        sys.exit(1)

    n_samples = len(file_paths)
    n_cpg = int(pos0.shape[0])

    logger.info(f"Samples: {n_samples}")
    logger.info(f"Allocating matrices: CpGs={n_cpg:,} x samples={n_samples}")
    logger.info(f"Value mode: {'percent(0..100)' if args.keep_percent else 'fraction(0..1)'}; min_cov={args.min_cov}")

    # Allocate RAM matrices; NaN marks "no data" (see the module
    # docstring for the memory caveat on large cohorts).
    M_5mC = np.full((n_cpg, n_samples), np.nan, dtype=np.float32)
    M_5hmC = np.full((n_cpg, n_samples), np.nan, dtype=np.float32)

    # Choose fill function
    if args.platform == "ont":
        fill_fn: Callable[..., Dict[str, int]] = fill_one_sample_ont
    else:
        fill_fn = fill_one_sample_pacbio

    # Fill one matrix column per sample file; a missing file leaves its
    # column entirely NaN.
    for j, path in enumerate(tqdm(file_paths, desc="Processing files", unit="file")):
        if not os.path.exists(path):
            logger.warning(f"[{j}] Missing file: {path}")
            continue

        fill_fn(
            path,
            j,
            M_5mC,
            M_5hmC,
            pos0=pos0,
            chrom_slices=chrom_slices,
            allowed_chroms=allowed_chroms,
            min_cov=args.min_cov,
            keep_percent=args.keep_percent,
            logger=logger,
        )

    # Save outputs (stop here; no M_sum)
    out_prefix = args.out_prefix
    os.makedirs(os.path.dirname(os.path.abspath(out_prefix)), exist_ok=True)

    # Always save 5mC
    np.save(out_prefix + ".5mC.npy", M_5mC)
    logger.info(f"Saved: {out_prefix}.5mC.npy")

    # Save 5hmC only for ONT
    if args.platform == "ont":
        np.save(out_prefix + ".5hmC.npy", M_5hmC)
        logger.info(f"Saved: {out_prefix}.5hmC.npy")

    # QC summary: count of non-NaN entries actually filled.
    filled_5mc = int(np.isfinite(M_5mC).sum())
    logger.info(f"Filled entries: 5mC={filled_5mc:,}")

    if args.platform == "ont":
        filled_5hmc = int(np.isfinite(M_5hmC).sum())
        logger.info(f"Filled entries: 5hmC={filled_5hmc:,}")
    else:
        logger.info("PacBio mode: only 5mC matrix written (no 5hmC signal).")

    logger.info("Done.")
405
+
406
+
407
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
409
+
@@ -0,0 +1,84 @@
1
+ import os
2
+ import sys
3
+ import time
4
+ import logging
5
+ import numpy as np
6
+ import polars as pl
7
+ from typing import Dict, Tuple, List, Optional
8
+ from mdb.ont_bed_parsing import ont_bed_parsing
9
+ from mdb.pacbio_bed_parsing import pacbio_bed_parsing
10
+
11
+
12
def load_index(index_npz: str):
    """Load the CpG index NPZ and derive lookup structures.

    The archive must contain ``chroms`` (object array of names),
    ``chrom_offsets`` (int64 start offset of each chromosome within
    ``pos0``) and ``pos0`` (int32 0-based CpG start positions).

    Returns (chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms)
    where chrom_slices maps each chromosome name to its half-open
    (start, end) slice of pos0.
    """
    archive = np.load(index_npz, allow_pickle=True)
    chroms = archive["chroms"]
    chrom_offsets = archive["chrom_offsets"]
    pos0 = archive["pos0"]

    total = int(pos0.shape[0])
    last = len(chrom_offsets) - 1

    chrom_slices: Dict[str, Tuple[int, int]] = {}
    for i, raw_name in enumerate(chroms):
        begin = int(chrom_offsets[i])
        # The final chromosome extends to the end of pos0.
        end = total if i == last else int(chrom_offsets[i + 1])
        chrom_slices[str(raw_name)] = (begin, end)

    index_of: Dict[str, int] = {}
    for i, raw_name in enumerate(chroms):
        index_of[str(raw_name)] = i
    allowed_chroms = list(index_of)
    return chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms
29
+
30
+
31
+
32
def _write_bed_map(output_mdb, bed_map):
    """Write the sample-name -> bed-file mapping as bed_map.txt (TSV)."""
    with open(os.path.join(output_mdb, "bed_map.txt"), "w") as f:
        for file_name, bed_file in bed_map.items():
            f.write(f"{file_name}\t{bed_file}\n")


def create_main(args, logger=None):
    """Create a single-sample methylation database (`mdb create`).

    Reads from the argparse namespace ``args``:
      platform     : "ont" or "pacbio"
      npz          : CpG index NPZ produced by `mdb index`
      bed          : input methylation bed/TSV file
      output       : output .mdb directory (created if missing)
      min_coverage : minimum coverage filter

    Writes .npy matrices plus bed_map.txt into the output directory.
    ONT input yields either a (5mC, 5hmC) pair or four strand-specific
    matrices (depending on what ont_bed_parsing returns); PacBio yields
    a single 5mC matrix.

    Raises ValueError for an unsupported platform.
    """
    platform = args.platform
    index_npz = args.npz
    bed_path = args.bed
    output_mdb = args.output
    min_cov = args.min_coverage

    if platform not in ("ont", "pacbio"):
        raise ValueError(f"Unsupported platform: {platform}, supported platforms are 'ont' and 'pacbio'")

    chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms = load_index(index_npz)
    print(f"Loaded index: {index_npz} with {len(chroms)} chromosomes and {pos0.shape[0]} CpGs")

    os.makedirs(output_mdb, exist_ok=True)
    if platform == "ont":
        stats, matrices, bed_map = ont_bed_parsing(
            input_path=bed_path,
            chrom_slices=chrom_slices,
            cpg_pos0=pos0,
            allowed_chroms=allowed_chroms,
            min_cov=min_cov,
        )
        _write_bed_map(output_mdb, bed_map)
        if len(matrices) == 4:
            # Strand-specific parsing: one matrix per (mod, strand) pair.
            m_5mC_plus, m_5mC_minus, m_5hmC_plus, m_5hmC_minus = matrices
            np.save(os.path.join(output_mdb, "5mC_plus.npy"), m_5mC_plus)
            np.save(os.path.join(output_mdb, "5mC_minus.npy"), m_5mC_minus)
            np.save(os.path.join(output_mdb, "5hmC_plus.npy"), m_5hmC_plus)
            np.save(os.path.join(output_mdb, "5hmC_minus.npy"), m_5hmC_minus)
            print(f"Saved 5mC and 5hmC strand-specific matrices to {os.path.join(output_mdb, '5mC_plus.npy')}, {os.path.join(output_mdb, '5mC_minus.npy')}, {os.path.join(output_mdb, '5hmC_plus.npy')}, and {os.path.join(output_mdb, '5hmC_minus.npy')}")
        else:
            # Combined-strand parsing: (5mC, 5hmC) matrix pair.
            np.save(os.path.join(output_mdb, "5mC.npy"), matrices[0])
            np.save(os.path.join(output_mdb, "5hmC.npy"), matrices[1])
            print(f"Saved 5mC and 5hmC matrices to {os.path.join(output_mdb, '5mC.npy')} and {os.path.join(output_mdb, '5hmC.npy')}")
    else:
        # platform == "pacbio": a single combined 5mC matrix (no 5hmC signal).
        stats, matrices, bed_map = pacbio_bed_parsing(
            input_path=bed_path,
            chrom_slices=chrom_slices,
            cpg_pos0=pos0,
            allowed_chroms=allowed_chroms,
            min_cov=min_cov,
        )
        np.save(os.path.join(output_mdb, "5mC.npy"), matrices)
        _write_bed_map(output_mdb, bed_map)
        print(f"Saved 5mC matrix to {os.path.join(output_mdb, '5mC.npy')}")