methdb 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdb/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "v0.0.2"
@@ -0,0 +1,409 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Build CpG-by-sample matrices for 5mC and 5hmC ONLY (no M_sum),
4
+ using a pre-built CpG index (NPZ) and methylation bed-like files.
5
+
6
+ Supports two input formats via --platform:
7
+ - ont (default): modkit pileup BED (no header; 18 columns; 'm' and 'h' rows)
8
+ - pacbio: pb-cpg-tools cpg_scores TSV (has header lines starting with ## and a # header row;
9
+ columns include: chrom, begin, end, ... discretized_mod_score)
10
+
11
+ Outputs:
12
+ <out_prefix>.5mC.npy float32 (fraction 0..1 unless --keep-percent)
13
+ <out_prefix>.5hmC.npy float32 (ONT only; not written in PacBio mode, which carries no 5hmC signal)
14
+
15
+ Notes:
16
+ - Full-genome RAM allocation can be huge; for 400 samples this likely won't fit in 50 GB.
17
+ Use chromosome-sharded or memmap approach if needed.
18
+ """
19
+
20
+ import os
21
+ import sys
22
+ import time
23
+ import argparse
24
+ import logging
25
+ from typing import Dict, Tuple, List, Optional, Callable
26
+
27
+ import numpy as np
28
+ import polars as pl
29
+ from tqdm import tqdm
30
+
31
+ # ---------- ONT (modkit pileup bed, has_header=False => column_1..)
32
+ ONT_COL_CHROM = "column_1"
33
+ ONT_COL_START = "column_2"
34
+ ONT_COL_CODE = "column_4" # "m" or "h"
35
+ ONT_COL_PCT = "column_11" # percent modified
36
+ ONT_COL_COV = "column_10" # Nvalid_cov
37
+
38
+ # ---------- PacBio (pb-cpg-tools cpg_scores, tab-separated with comments + header)
39
+ # Columns (example):
40
+ # chrom begin end mod_score type cov est_mod_count est_unmod_count discretized_mod_score
41
+ PB_COL_CHROM = "chrom"
42
+ PB_COL_START = "begin" # 0-based start
43
+ PB_COL_COV = "cov"
44
+ PB_COL_DMS = "discretized_mod_score" # percent 0..100
45
+
46
+
47
def setup_logging(log_path: Optional[str], verbose: bool = False) -> logging.Logger:
    """Configure and return the ``build_5mc_5hmc`` logger.

    A stderr handler is always attached (DEBUG when ``verbose`` is set,
    INFO otherwise). When ``log_path`` is given, its parent directory is
    created if needed and a DEBUG-level file handler is attached as well.

    Args:
        log_path: Path of the log file, or ``None``/empty for console only.
        verbose: Emit DEBUG records on the console instead of INFO and above.

    Returns:
        The configured logger.
    """
    logger = logging.getLogger("build_5mc_5hmc")
    logger.setLevel(logging.DEBUG)

    # logging.getLogger returns the same object for the same name, so handlers
    # left over from an earlier call must be dropped; otherwise every message
    # would be emitted once per call made so far.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    fmt = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")

    console = logging.StreamHandler(sys.stderr)
    console.setLevel(logging.DEBUG if verbose else logging.INFO)
    console.setFormatter(fmt)
    logger.addHandler(console)

    if log_path:
        os.makedirs(os.path.dirname(os.path.abspath(log_path)), exist_ok=True)
        to_file = logging.FileHandler(log_path)
        to_file.setLevel(logging.DEBUG)
        to_file.setFormatter(fmt)
        logger.addHandler(to_file)

    return logger
66
+
67
+
68
def load_index(index_npz: str, logger: logging.Logger):
    """Load a CpG index NPZ and derive per-chromosome row ranges.

    The archive must contain the arrays ``chroms``, ``chrom_offsets`` and
    ``pos0``. ``chrom_offsets[i]`` is the first global row of chromosome
    ``i``; its last row is bounded by the next offset, or by the total
    number of positions for the final chromosome.

    Args:
        index_npz: Path to the ``.npz`` index file.
        logger: Logger that receives a short summary of the loaded index.

    Returns:
        ``(chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms)`` where
        ``chrom_slices`` maps chromosome name to a half-open ``(start, end)``
        row range and ``allowed_chroms`` lists the chromosome names in index
        order.
    """
    # NOTE(security): allow_pickle=True lets NumPy unpickle object arrays,
    # which can execute arbitrary code. Only load index files you trust.
    # The context manager closes the underlying zip file once the arrays
    # have been read into memory.
    with np.load(index_npz, allow_pickle=True) as idx:
        chroms = idx["chroms"]                # object array
        chrom_offsets = idx["chrom_offsets"]  # int64
        pos0 = idx["pos0"]                    # int32

    n_cpg = int(pos0.shape[0])
    n_offsets = len(chrom_offsets)
    chrom_slices: Dict[str, Tuple[int, int]] = {}
    for i, name in enumerate(chroms):
        start = int(chrom_offsets[i])
        end = int(chrom_offsets[i + 1]) if i + 1 < n_offsets else n_cpg
        chrom_slices[str(name)] = (start, end)

    # Dict keys keep insertion order, so this is the index order of chroms.
    allowed_chroms = list(chrom_slices)

    logger.info(f"Loaded index: {index_npz}")
    logger.info(f"Index CpGs: {n_cpg:,}")
    logger.info(f"Index chroms ({len(chroms)}): {allowed_chroms}")

    return chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms
90
+
91
+
92
def read_beds_list(beds_list_path: str) -> Tuple[List[str], List[str]]:
    """Parse a sample list file into parallel name and path lists.

    Blank lines and lines starting with ``#`` are ignored. Each remaining
    line is either a single path (the sample name is then the path's
    basename) or ``sample<TAB>path`` (columns after the second are ignored).

    Returns:
        ``(sample_names, file_paths)`` in file order.
    """
    sample_names: List[str] = []
    file_paths: List[str] = []

    with open(beds_list_path, "r") as handle:
        for raw in handle:
            entry = raw.strip()
            if entry == "" or entry.startswith("#"):
                continue

            fields = entry.split("\t")
            if len(fields) > 1:
                label, location = fields[0], fields[1]
            else:
                location = fields[0]
                label = os.path.basename(location)

            sample_names.append(label)
            file_paths.append(location)

    return sample_names, file_paths
115
+
116
+
117
def map_positions_to_rows(
    chrom: str,
    starts_np: np.ndarray,
    *,
    pos0: np.ndarray,
    chrom_slices: Dict[str, Tuple[int, int]],
) -> np.ndarray:
    """Map 0-based CpG start positions on one chromosome to global row indices.

    Args:
        chrom: Chromosome name to look up in ``chrom_slices``.
        starts_np: 1-D array of 0-based start positions to resolve.
        pos0: Global array of indexed positions; the slice belonging to each
            chromosome must be sorted in increasing order.
        chrom_slices: Chromosome name -> half-open ``(start, end)`` row range
            into ``pos0``.

    Returns:
        int64 array, same length as ``starts_np``, holding the global row of
        each position, or -1 where the chromosome is unknown or the position
        is not in the index.
    """
    out = np.full(starts_np.shape[0], -1, dtype=np.int64)
    if chrom not in chrom_slices:
        return out

    s, e = chrom_slices[chrom]
    chrom_pos = pos0[s:e]  # sorted increasing within chrom
    n_pos = chrom_pos.shape[0]
    if n_pos == 0:
        return out

    j = np.searchsorted(chrom_pos, starts_np, side="left")

    # searchsorted returns n_pos for queries beyond the last indexed position.
    # Indexing chrom_pos with that value raises IndexError, so out-of-range
    # slots are redirected to 0 for the lookup and masked out afterwards.
    in_range = j < n_pos
    probe = np.where(in_range, j, 0)
    ok = in_range & (chrom_pos[probe] == starts_np)

    out[ok] = s + j[ok]
    return out
136
+
137
+
138
+ # -------------------- Platform-specific fill functions --------------------
139
+
140
def fill_one_sample_ont(
    bed_path: str,
    col_idx: int,
    M_5mC: np.ndarray,
    M_5hmC: np.ndarray,
    *,
    pos0: np.ndarray,
    chrom_slices: Dict[str, Tuple[int, int]],
    allowed_chroms: List[str],
    min_cov: int,
    keep_percent: bool,
    logger: logging.Logger,
) -> Dict[str, int]:
    """Fill column ``col_idx`` of both matrices from one ONT modkit pileup BED.

    The file is read as tab-separated text without a header. Rows are kept
    when their chromosome is in ``allowed_chroms`` and, if ``min_cov > 0``,
    their coverage column is at least ``min_cov``. Rows whose code column is
    ``"m"`` are written into ``M_5mC`` and rows with code ``"h"`` into
    ``M_5hmC``, at the global row resolved by ``map_positions_to_rows``.
    Stored values are the percent column as-is when ``keep_percent`` is set,
    otherwise divided by 100.

    Returns:
        Counters: ``rows_after_filters``, ``rows_m``, ``rows_h``,
        ``mapped_m`` and ``mapped_h``.
    """
    started = time.time()
    sample_label = os.path.basename(bed_path)

    query = pl.scan_csv(bed_path, separator="\t", has_header=False)
    query = query.select([ONT_COL_CHROM, ONT_COL_START, ONT_COL_CODE, ONT_COL_PCT, ONT_COL_COV])
    query = query.filter(pl.col(ONT_COL_CHROM).is_in(allowed_chroms))
    if min_cov > 0:
        query = query.filter(pl.col(ONT_COL_COV) >= min_cov)
    table = query.collect(engine="streaming")

    stats = {
        "rows_after_filters": int(table.height),
        "rows_m": 0,
        "rows_h": 0,
        "mapped_m": 0,
        "mapped_h": 0,
    }
    if table.height == 0:
        logger.warning(f"[{col_idx}] {sample_label}: no rows after filters")
        return stats

    # Percent column -> stored value (raw percent, or fraction 0..1).
    pct = pl.col(ONT_COL_PCT)
    value_expr = pct if keep_percent else pct / 100.0
    table = table.with_columns(value_expr.cast(pl.Float32).alias("val"))

    kept_cols = [ONT_COL_CHROM, ONT_COL_START, "val"]
    by_code = {
        code: table.filter(pl.col(ONT_COL_CODE) == code).select(kept_cols)
        for code in ("m", "h")
    }
    stats["rows_m"] = int(by_code["m"].height)
    stats["rows_h"] = int(by_code["h"].height)

    def _scatter(subset: pl.DataFrame, target: np.ndarray) -> int:
        """Write one modification's values into ``target``; return rows placed."""
        placed = 0
        if subset.height == 0:
            return placed
        for key, group in subset.group_by(ONT_COL_CHROM, maintain_order=True):
            positions = group[ONT_COL_START].to_numpy().astype(np.int32)
            rows = map_positions_to_rows(key[0], positions, pos0=pos0, chrom_slices=chrom_slices)
            hit = rows != -1
            if not hit.any():
                continue
            values = group["val"].to_numpy().astype(np.float32)
            target[rows[hit], col_idx] = values[hit]
            placed += int(hit.sum())
        return placed

    mapped_m = _scatter(by_code["m"], M_5mC)
    stats["mapped_m"] = mapped_m
    mapped_h = _scatter(by_code["h"], M_5hmC)
    stats["mapped_h"] = mapped_h

    logger.info(
        f"[{col_idx}] {sample_label} "
        f"rows={stats['rows_after_filters']:,} "
        f"m={stats['rows_m']:,}(mapped {mapped_m:,}) "
        f"h={stats['rows_h']:,}(mapped {mapped_h:,}) "
        f"min_cov={min_cov}"
    )
    logger.debug(f"[{col_idx}] done in {time.time()-started:.1f}s")
    return stats
217
+
218
+
219
def fill_one_sample_pacbio(
    bed_path: str,
    col_idx: int,
    M_5mC: np.ndarray,
    M_5hmC: np.ndarray,
    *,
    pos0: np.ndarray,
    chrom_slices: Dict[str, Tuple[int, int]],
    allowed_chroms: List[str],
    min_cov: int,
    keep_percent: bool,
    logger: logging.Logger,
) -> Dict[str, int]:
    """Fill column ``col_idx`` of ``M_5mC`` from a pb-cpg-tools cpg_scores TSV.

    The file is read as tab-separated text with a header row; lines starting
    with ``##`` are skipped as comments. A header column named ``#chrom`` is
    renamed to ``chrom``. Rows are kept when their chromosome is in
    ``allowed_chroms`` and, if ``min_cov > 0``, ``cov >= min_cov``. The
    stored value is ``discretized_mod_score`` as-is when ``keep_percent`` is
    set, otherwise divided by 100. ``M_5hmC`` is accepted for signature
    parity with the ONT reader but is never read or written here.

    Returns:
        Counters: ``rows_after_filters``, ``mapped_m`` and ``mapped_h``
        (``mapped_h`` is always 0).

    Raises:
        ValueError: If any of the required columns is missing from the file.
    """
    t0 = time.time()

    # comment_prefix drops the "##" metadata lines, so the first remaining
    # line ("#chrom ...") becomes the header row.
    lf = pl.scan_csv(
        bed_path,
        separator="\t",
        has_header=True,
        comment_prefix="##",
    )

    # Resolve the schema once. LazyFrame.columns re-resolves it on every
    # access and emits a PerformanceWarning on polars 1.x.
    columns = lf.collect_schema().names()
    if "#chrom" in columns:
        lf = lf.rename({"#chrom": PB_COL_CHROM})
        columns = [PB_COL_CHROM if name == "#chrom" else name for name in columns]

    needed = [PB_COL_CHROM, PB_COL_START, PB_COL_DMS, PB_COL_COV]
    missing = [c for c in needed if c not in columns]
    if missing:
        raise ValueError(
            f"PacBio file missing columns {missing}. "
            f"Found columns: {columns}"
        )

    lf = (
        lf.select(needed)
          .filter(pl.col(PB_COL_CHROM).is_in(allowed_chroms))
    )
    if min_cov > 0:
        lf = lf.filter(pl.col(PB_COL_COV) >= min_cov)

    df = lf.collect(engine="streaming")

    stats = {
        "rows_after_filters": int(df.height),
        "mapped_m": 0,  # treat as 5mC
        "mapped_h": 0,  # always 0 for pacbio
    }

    if df.height == 0:
        logger.warning(f"[{col_idx}] {os.path.basename(bed_path)}: no rows after filters")
        return stats

    # discretized_mod_score is a percent (0..100).
    score = pl.col(PB_COL_DMS)
    value_expr = score if keep_percent else score / 100.0
    df = df.with_columns(value_expr.cast(pl.Float32).alias("val"))

    mapped = 0
    for chrom, sub in df.group_by(PB_COL_CHROM, maintain_order=True):
        chrom = chrom[0]  # group keys arrive as a tuple
        starts = sub[PB_COL_START].to_numpy().astype(np.int32)
        rows = map_positions_to_rows(chrom, starts, pos0=pos0, chrom_slices=chrom_slices)
        ok = rows != -1
        if ok.any():
            vals = sub["val"].to_numpy().astype(np.float32)
            M_5mC[rows[ok], col_idx] = vals[ok]
            mapped += int(ok.sum())

    stats["mapped_m"] = mapped

    logger.info(
        f"[{col_idx}] {os.path.basename(bed_path)} "
        f"rows={stats['rows_after_filters']:,} "
        f"pacbio(mapped {mapped:,}) "
        f"min_cov={min_cov} (5hmC=NaN)"
    )
    logger.debug(f"[{col_idx}] done in {time.time()-t0:.1f}s")
    return stats
314
+
315
+
316
+ # -------------------- main --------------------
317
+
318
def main():
    """Command-line entry point: build and save the CpG-by-sample matrices.

    Parses the arguments, loads the CpG index and the sample list, fills one
    matrix column per input file with the platform-specific reader, then
    writes ``<out_prefix>.5mC.npy`` (always) and ``<out_prefix>.5hmC.npy``
    (ONT only) and logs how many entries were filled. Input files that do
    not exist are skipped with a warning and their column stays NaN. Exits
    with status 1 when the sample list contains no files.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--index", required=True, help="CpG index NPZ")
    ap.add_argument("--beds-list", required=True, help="file list (path or sample<TAB>path)")
    ap.add_argument("--out-prefix", required=True, help="Output prefix for .npy files")
    ap.add_argument("--min-cov", type=int, default=0, help="Minimum coverage filter (Nvalid_cov or cov)")
    ap.add_argument("--keep-percent", action="store_true", help="Store 0..100 instead of 0..1")
    ap.add_argument(
        "--platform",
        choices=["ont", "pacbio"],
        default="ont",
        help="Input format: ont(modkit) or pacbio(pb-cpg-tools cpg_scores)"
    )
    ap.add_argument("--log", default=None, help="Log file path")
    ap.add_argument("--verbose", action="store_true", help="Verbose console logs")
    args = ap.parse_args()

    logger = setup_logging(args.log, verbose=args.verbose)
    logger.info(f"polars {pl.__version__}")
    logger.info(f"platform={args.platform}")

    _chroms, _chrom_offsets, pos0, chrom_slices, allowed_chroms = load_index(args.index, logger)
    _sample_names, file_paths = read_beds_list(args.beds_list)
    if not file_paths:
        logger.error("No files found in --beds-list")
        sys.exit(1)

    # Create the output directory before the expensive parsing so that an
    # unwritable destination fails immediately rather than after all files
    # have been processed.
    out_prefix = args.out_prefix
    os.makedirs(os.path.dirname(os.path.abspath(out_prefix)), exist_ok=True)

    is_ont = args.platform == "ont"
    n_samples = len(file_paths)
    n_cpg = int(pos0.shape[0])

    logger.info(f"Samples: {n_samples}")
    logger.info(f"Allocating matrices: CpGs={n_cpg:,} x samples={n_samples}")
    logger.info(f"Value mode: {'percent(0..100)' if args.keep_percent else 'fraction(0..1)'}; min_cov={args.min_cov}")

    # Allocate RAM matrices. The PacBio reader never touches the 5hmC matrix
    # and it is never saved in that mode, so a zero-row placeholder is passed
    # instead of a second full-size allocation.
    M_5mC = np.full((n_cpg, n_samples), np.nan, dtype=np.float32)
    if is_ont:
        M_5hmC = np.full((n_cpg, n_samples), np.nan, dtype=np.float32)
    else:
        M_5hmC = np.empty((0, n_samples), dtype=np.float32)

    # Choose fill function
    fill_fn: Callable[..., Dict[str, int]] = (
        fill_one_sample_ont if is_ont else fill_one_sample_pacbio
    )

    # Fill: one matrix column per input file, in list order.
    for j, path in enumerate(tqdm(file_paths, desc="Processing files", unit="file")):
        if not os.path.exists(path):
            logger.warning(f"[{j}] Missing file: {path}")
            continue

        fill_fn(
            path,
            j,
            M_5mC,
            M_5hmC,
            pos0=pos0,
            chrom_slices=chrom_slices,
            allowed_chroms=allowed_chroms,
            min_cov=args.min_cov,
            keep_percent=args.keep_percent,
            logger=logger,
        )

    # Always save 5mC
    np.save(out_prefix + ".5mC.npy", M_5mC)
    logger.info(f"Saved: {out_prefix}.5mC.npy")

    # Save 5hmC only for ONT
    if is_ont:
        np.save(out_prefix + ".5hmC.npy", M_5hmC)
        logger.info(f"Saved: {out_prefix}.5hmC.npy")

    # QC summary
    filled_5mc = int(np.isfinite(M_5mC).sum())
    logger.info(f"Filled entries: 5mC={filled_5mc:,}")

    if is_ont:
        filled_5hmc = int(np.isfinite(M_5hmC).sum())
        logger.info(f"Filled entries: 5hmC={filled_5hmc:,}")
    else:
        logger.info("PacBio mode: only 5mC matrix written (no 5hmC signal).")

    logger.info("Done.")


if __name__ == "__main__":
    main()
409
+
mdb/create.py ADDED
@@ -0,0 +1,84 @@
1
+ import os
2
+ import sys
3
+ import time
4
+ import logging
5
+ import numpy as np
6
+ import polars as pl
7
+ from typing import Dict, Tuple, List, Optional
8
+ from mdb.ont_bed_parsing import ont_bed_parsing
9
+ from mdb.pacbio_bed_parsing import pacbio_bed_parsing
10
+
11
+
12
def load_index(index_npz: str):
    """Load a CpG index NPZ and derive per-chromosome row ranges.

    The archive must contain the arrays ``chroms``, ``chrom_offsets`` and
    ``pos0``. ``chrom_offsets[i]`` is the first global row of chromosome
    ``i``; its last row is bounded by the next offset, or by the total
    number of positions for the final chromosome.

    Args:
        index_npz: Path to the ``.npz`` index file.

    Returns:
        ``(chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms)`` where
        ``chrom_slices`` maps chromosome name to a half-open ``(start, end)``
        row range and ``allowed_chroms`` lists the chromosome names in index
        order.
    """
    # NOTE(security): allow_pickle=True lets NumPy unpickle object arrays,
    # which can execute arbitrary code. Only load index files you trust.
    # The context manager closes the underlying zip file once the arrays
    # have been read into memory.
    with np.load(index_npz, allow_pickle=True) as idx:
        chroms = idx["chroms"]                # object array
        chrom_offsets = idx["chrom_offsets"]  # int64
        pos0 = idx["pos0"]                    # int32

    n_cpg = int(pos0.shape[0])
    n_offsets = len(chrom_offsets)
    chrom_slices: Dict[str, Tuple[int, int]] = {}
    for i, name in enumerate(chroms):
        start = int(chrom_offsets[i])
        end = int(chrom_offsets[i + 1]) if i + 1 < n_offsets else n_cpg
        chrom_slices[str(name)] = (start, end)

    # Dict keys keep insertion order, so this is the index order of chroms.
    allowed_chroms = list(chrom_slices)
    return chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms
29
+
30
+
31
+
32
def create_main(args, logger=None):
    """Build the methylation matrices for one platform and save them to a directory.

    Reads ``platform``, ``npz``, ``bed``, ``output`` and ``min_coverage``
    from ``args``, loads the CpG index, runs the platform-specific parser and
    writes the resulting ``.npy`` matrices together with ``bed_map.txt`` into
    the output directory (created if missing). ``logger`` is accepted but not
    used.

    Raises:
        ValueError: If ``args.platform`` is neither ``"ont"`` nor ``"pacbio"``.
    """
    platform = args.platform
    index_npz = args.npz
    bed_path = args.bed
    output_mdb = args.output
    min_cov = args.min_coverage

    if platform != "ont" and platform != "pacbio":
        raise ValueError(f"Unsupported platform: {platform}, supported platforms are 'ont' and 'pacbio'")

    chroms, chrom_offsets, pos0, chrom_slices, allowed_chroms = load_index(index_npz)
    print(f"Loaded index: {index_npz} with {len(chroms)} chromosomes and {pos0.shape[0]} CpGs")

    os.makedirs(output_mdb, exist_ok=True)

    def _target(file_name):
        """Return the path of ``file_name`` inside the output directory."""
        return os.path.join(output_mdb, file_name)

    def _write_bed_map(mapping):
        """Write one ``name<TAB>bed file`` line per entry of ``mapping``."""
        with open(_target("bed_map.txt"), "w") as f:
            for file_name, bed_file in mapping.items():
                f.write(f"{file_name}\t{bed_file}\n")

    parser_kwargs = dict(
        input_path=bed_path,
        chrom_slices=chrom_slices,
        cpg_pos0=pos0,
        allowed_chroms=allowed_chroms,
        min_cov=min_cov,
    )

    if platform == "pacbio":
        stats, matrices, bed_map = pacbio_bed_parsing(**parser_kwargs)
        np.save(_target("5mC.npy"), matrices)
        _write_bed_map(bed_map)
        print(f"Saved 5mC matrix to {_target('5mC.npy')}")
        return

    # platform == "ont"
    stats, matrices, bed_map = ont_bed_parsing(**parser_kwargs)
    _write_bed_map(bed_map)

    if len(matrices) == 4:
        # Four matrices: strand-specific 5mC/5hmC, in plus/minus order.
        m_5mC_plus, m_5mC_minus, m_5hmC_plus, m_5hmC_minus = matrices
        np.save(_target("5mC_plus.npy"), m_5mC_plus)
        np.save(_target("5mC_minus.npy"), m_5mC_minus)
        np.save(_target("5hmC_plus.npy"), m_5hmC_plus)
        np.save(_target("5hmC_minus.npy"), m_5hmC_minus)
        print(f"Saved 5mC and 5hmC strand-specific matrices to {_target('5mC_plus.npy')}, {_target('5mC_minus.npy')}, {_target('5hmC_plus.npy')}, and {_target('5hmC_minus.npy')}")
    else:
        np.save(_target("5mC.npy"), matrices[0])
        np.save(_target("5hmC.npy"), matrices[1])
        print(f"Saved 5mC and 5hmC matrices to {_target('5mC.npy')} and {_target('5hmC.npy')}")
84
+
mdb/index.py ADDED
@@ -0,0 +1,107 @@
1
+ import numpy as np
2
+ from tqdm.auto import tqdm
3
+ import logging
4
+ import argparse
5
+
6
+
7
def fasta_iter(path):
    """Yield ``(name, sequence)`` for every record of a FASTA file.

    The name is the first whitespace-delimited token after ``>``. The
    sequence is the record's lines stripped, concatenated and upper-cased.
    Any lines before the first header are discarded.
    """
    current = None
    pieces = []
    with open(path, "r") as handle:
        for raw in handle:
            if not raw.startswith(">"):
                pieces.append(raw.strip())
                continue
            # New header: flush the record collected so far, if any.
            if current is not None:
                yield current, "".join(pieces).upper()
            current = raw[1:].split()[0]
            pieces = []
    if current is None:
        return
    yield current, "".join(pieces).upper()
21
+
22
def find_cpg_pos0(seq: str) -> np.ndarray:
    """Return the 0-based positions of the C of every "CG" in ``seq`` as int32."""
    hits = []
    cursor = -1
    while True:
        # Resume the scan one character past the previous hit.
        cursor = seq.find("CG", cursor + 1)
        if cursor < 0:
            break
        hits.append(cursor)
    return np.asarray(hits, dtype=np.int32)
30
+
31
+
32
+
33
def map_positions_to_rows(
    chrom: str,
    starts_np: np.ndarray,
    *,
    cpg_pos0=None,
    chrom_slices=None,
) -> np.ndarray:
    """Map 0-based CpG start positions on one chromosome to global row indices.

    Args:
        chrom: Chromosome name to look up in ``chrom_slices``.
        starts_np: 1-D array-like of 0-based CpG C positions.
        cpg_pos0: Global array of indexed positions; the slice belonging to
            each chromosome must be sorted in increasing order.
        chrom_slices: Chromosome name -> half-open ``(start, end)`` row range
            into ``cpg_pos0``.

        When ``cpg_pos0`` or ``chrom_slices`` is omitted, a module-level
        variable of the same name is used if one has been defined. The module
        itself defines neither, so callers should pass both explicitly.

    Returns:
        int64 array with the global row of each position, or -1 where the
        chromosome is unknown or the position is not in the index.

    Raises:
        ValueError: If ``starts_np`` is not one-dimensional.
        NameError: If the index arrays are neither passed nor defined at
            module level.
    """
    starts_np = np.asarray(starts_np)
    if starts_np.ndim != 1:
        raise ValueError("starts_np must be a 1D array of 0-based CpG C positions.")

    # Backward-compatible fallback for callers that inject module globals.
    module_ns = globals()
    if chrom_slices is None:
        chrom_slices = module_ns.get("chrom_slices")
    if cpg_pos0 is None:
        cpg_pos0 = module_ns.get("cpg_pos0")
    if chrom_slices is None or cpg_pos0 is None:
        raise NameError(
            "map_positions_to_rows needs 'cpg_pos0' and 'chrom_slices'; "
            "pass them as keyword arguments."
        )

    out = np.full(starts_np.shape[0], -1, dtype=np.int64)
    if chrom not in chrom_slices:
        return out

    s, e = chrom_slices[chrom]
    chrom_pos = cpg_pos0[s:e]  # sorted within chrom
    n_pos = chrom_pos.shape[0]
    if n_pos == 0:
        return out

    # Compare in the index dtype so searchsorted and the equality test agree.
    queries = starts_np.astype(chrom_pos.dtype, copy=False)
    j = np.searchsorted(chrom_pos, queries, side="left")

    # searchsorted returns n_pos for queries beyond the last indexed position.
    # Indexing chrom_pos with that value raises IndexError, so out-of-range
    # slots are redirected to 0 for the lookup and masked out afterwards.
    in_range = j < n_pos
    probe = np.where(in_range, j, 0)
    ok = in_range & (chrom_pos[probe] == queries)

    out[ok] = s + j[ok]
    return out
55
+
56
def index_main(args):
    """Scan a reference FASTA and save a CpG position index as a compressed NPZ.

    Reads ``reference``, ``output`` and ``sex`` from ``args``. Only contigs
    named ``chr1`` .. ``chr22`` are indexed, plus ``chrX`` and ``chrY`` when
    ``args.sex`` equals ``True``. Chromosomes are stored in the order they
    appear in the FASTA.

    The archive holds ``chroms`` (names), ``chrom_offsets`` (first global row
    of each chromosome), ``chrom_id`` (per-row chromosome index) and ``pos0``
    (0-based CpG positions, concatenated per chromosome).

    Raises:
        RuntimeError: If no CpG was found on any requested chromosome.
    """
    reference = args.reference
    output = args.output
    sex = args.sex

    matrix_chroms = [f"chr{i}" for i in range(1, 23)]
    if sex == True:  # noqa: E712 - keep the original equality test on args.sex
        matrix_chroms += ["chrX", "chrY"]
    wanted = set(matrix_chroms)

    chroms = []
    chrom_offsets = []
    cpg_pos0_all = []
    total = 0

    # The bar counts target chromosomes only. The FASTA may hold many other
    # contigs, and advancing on each of those would overshoot the total.
    with tqdm(total=len(matrix_chroms), desc="Scanning chromosomes", unit="chrom") as progress:
        for chrom, seq in fasta_iter(reference):
            if chrom not in wanted:
                continue

            pos0_chr = find_cpg_pos0(seq)  # int32, sorted increasing
            chroms.append(chrom)
            chrom_offsets.append(total)    # start index into global arrays
            cpg_pos0_all.append(pos0_chr)
            total += pos0_chr.size
            progress.update(1)

    # finalize offsets as int64 numpy array
    chrom_offsets = np.asarray(chrom_offsets, dtype=np.int64)

    if total == 0:
        raise RuntimeError("No CpGs found for the requested chromosomes. Check FASTA contig names.")

    # Build global pos0 array
    cpg_pos0 = np.concatenate(cpg_pos0_all).astype(np.int32, copy=False)
    n_cpg = int(cpg_pos0.shape[0])

    # Build global chrom_id array (per-row chromosome index): chromosome i is
    # repeated once for each of its CpG positions.
    sizes = [pos0_chr.size for pos0_chr in cpg_pos0_all]
    chrom_id = np.repeat(np.arange(len(sizes), dtype=np.int32), sizes)

    print("Total CpGs:", n_cpg)
    print("Chromosomes used:", chroms)

    np.savez_compressed(
        output,
        chroms=np.array(chroms, dtype=object),
        chrom_offsets=chrom_offsets,
        chrom_id=chrom_id,
        pos0=cpg_pos0,
    )
    print("Saved: ", output)
mdb/main.py ADDED
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import argparse, sys, os
4
+
5
+ from mdb.parse_args import parse_args
6
+
7
def main(argv=None):
    """Parse the command line and dispatch to the selected sub-command.

    ``argv`` defaults to ``sys.argv[1:]``. Each sub-command module is
    imported only when its command is selected. Always returns 0; a command
    other than index/create/merge/pca results in no action.
    """
    cli_args = sys.argv[1:] if argv is None else argv
    args = parse_args(cli_args)
    command = args.command

    if command == "index":
        from mdb.index import index_main
        index_main(args)
        return 0
    if command == "create":
        from mdb.create import create_main
        create_main(args)
        return 0
    if command == "merge":
        from mdb.merge import merge_main
        merge_main(args)
        return 0
    if command == "pca":
        from mdb.pca import pca_main
        pca_main(args)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))