minimap2 0.2.24.6 → 0.2.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -3
- data/ext/minimap2/Makefile +6 -2
- data/ext/minimap2/NEWS.md +38 -0
- data/ext/minimap2/README.md +9 -3
- data/ext/minimap2/align.c +5 -3
- data/ext/minimap2/cookbook.md +2 -2
- data/ext/minimap2/format.c +7 -4
- data/ext/minimap2/kalloc.c +20 -1
- data/ext/minimap2/kalloc.h +13 -2
- data/ext/minimap2/ksw2.h +1 -0
- data/ext/minimap2/ksw2_extd2_sse.c +1 -1
- data/ext/minimap2/ksw2_exts2_sse.c +79 -40
- data/ext/minimap2/ksw2_extz2_sse.c +1 -1
- data/ext/minimap2/lchain.c +15 -16
- data/ext/minimap2/main.c +13 -6
- data/ext/minimap2/map.c +0 -5
- data/ext/minimap2/minimap.h +40 -31
- data/ext/minimap2/minimap2.1 +19 -5
- data/ext/minimap2/misc/paftools.js +545 -24
- data/ext/minimap2/options.c +1 -1
- data/ext/minimap2/pyproject.toml +2 -0
- data/ext/minimap2/python/mappy.pyx +3 -1
- data/ext/minimap2/seed.c +1 -1
- data/ext/minimap2/setup.py +32 -22
- data/ext/minimap2.patch +3 -3
- data/lib/minimap2/aligner.rb +4 -0
- data/lib/minimap2/ffi/constants.rb +90 -88
- data/lib/minimap2/version.rb +2 -2
- metadata +4 -3
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env k8
|
2
2
|
|
3
|
-
var paftools_version = '2.
|
3
|
+
var paftools_version = '2.25-r1173';
|
4
4
|
|
5
5
|
/*****************************
|
6
6
|
***** Library functions *****
|
@@ -1532,22 +1532,24 @@ function paf_view(args)
|
|
1532
1532
|
|
1533
1533
|
function paf_gff2bed(args)
|
1534
1534
|
{
|
1535
|
-
var c, fn_ucsc_fai = null, is_short = false, keep_gff = false, print_junc = false, output_gene = false;
|
1536
|
-
while ((c = getopt(args, "u:
|
1535
|
+
var c, fn_ucsc_fai = null, is_short = false, keep_gff = false, print_junc = false, output_gene = false, ens_canon_only = false;
|
1536
|
+
while ((c = getopt(args, "u:sgjGe")) != null) {
|
1537
1537
|
if (c == 'u') fn_ucsc_fai = getopt.arg;
|
1538
1538
|
else if (c == 's') is_short = true;
|
1539
1539
|
else if (c == 'g') keep_gff = true;
|
1540
1540
|
else if (c == 'j') print_junc = true;
|
1541
1541
|
else if (c == 'G') output_gene = true;
|
1542
|
+
else if (c == 'e') ens_canon_only = true;
|
1542
1543
|
}
|
1543
1544
|
|
1544
1545
|
if (getopt.ind == args.length) {
|
1545
1546
|
print("Usage: paftools.js gff2bed [options] <in.gff>");
|
1546
1547
|
print("Options:");
|
1547
|
-
print(" -j
|
1548
|
-
print(" -s
|
1548
|
+
print(" -j output junction BED");
|
1549
|
+
print(" -s print names in the short form");
|
1549
1550
|
print(" -u FILE hg38.fa.fai for chr name conversion");
|
1550
|
-
print(" -
|
1551
|
+
print(" -e only show transcript tagged with 'Ensembl_canonical'");
|
1552
|
+
print(" -g output GFF (used with -u)");
|
1551
1553
|
exit(1);
|
1552
1554
|
}
|
1553
1555
|
|
@@ -1606,7 +1608,7 @@ function paf_gff2bed(args)
|
|
1606
1608
|
print(a[0][0], st, en, name, 1000, a[0][3], cds_st, cds_en, color, a.length, sizes.join(",") + ",", starts.join(",") + ",");
|
1607
1609
|
}
|
1608
1610
|
|
1609
|
-
var re_gtf = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name) "([^"]+)";/g;
|
1611
|
+
var re_gtf = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name|tag) "([^"]+)";/g;
|
1610
1612
|
var re_gff3 = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name)=([^;]+)/g;
|
1611
1613
|
var re_gtf_gene = /\b(gene_id|gene_type|gene_name) "([^;]+)";/g;
|
1612
1614
|
var re_gff3_gene = /\b(gene_id|gene_type|source_gene|gene_biotype|gene_name)=([^;]+);/g;
|
@@ -1646,13 +1648,14 @@ function paf_gff2bed(args)
|
|
1646
1648
|
if (t[2] != "CDS" && t[2] != "exon") continue;
|
1647
1649
|
t[3] = parseInt(t[3]) - 1;
|
1648
1650
|
t[4] = parseInt(t[4]);
|
1649
|
-
var id = null, type = "", name = "N/A", biotype = "", m, tname = "N/A";
|
1651
|
+
var id = null, type = "", name = "N/A", biotype = "", m, tname = "N/A", ens_canonical = false;
|
1650
1652
|
while ((m = re_gtf.exec(t[8])) != null) {
|
1651
1653
|
if (m[1] == "transcript_id") id = m[2];
|
1652
1654
|
else if (m[1] == "transcript_type") type = m[2];
|
1653
1655
|
else if (m[1] == "transcript_biotype" || m[1] == "gbkey") biotype = m[2];
|
1654
1656
|
else if (m[1] == "gene_name" || m[1] == "gene_id") name = m[2];
|
1655
1657
|
else if (m[1] == "transcript_name") tname = m[2];
|
1658
|
+
else if (m[1] == "tag" && m[2] == "Ensembl_canonical") ens_canonical = true;
|
1656
1659
|
}
|
1657
1660
|
while ((m = re_gff3.exec(t[8])) != null) {
|
1658
1661
|
if (m[1] == "transcript_id") id = m[2];
|
@@ -1661,6 +1664,7 @@ function paf_gff2bed(args)
|
|
1661
1664
|
else if (m[1] == "gene_name" || m[1] == "gene_id") name = m[2];
|
1662
1665
|
else if (m[1] == "transcript_name") tname = m[2];
|
1663
1666
|
}
|
1667
|
+
if (ens_canon_only && !ens_canonical) continue;
|
1664
1668
|
if (type == "" && biotype != "") type = biotype;
|
1665
1669
|
if (id == null) throw Error("No transcript_id");
|
1666
1670
|
if (id != last_id) {
|
@@ -2341,12 +2345,15 @@ function paf_pbsim2fq(args)
|
|
2341
2345
|
|
2342
2346
|
function paf_junceval(args)
|
2343
2347
|
{
|
2344
|
-
var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false;
|
2345
|
-
while ((c = getopt(args, "l:
|
2348
|
+
var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false, aa = false, is_bed = false;
|
2349
|
+
while ((c = getopt(args, "l:epcab1")) != null) {
|
2346
2350
|
if (c == 'l') l_fuzzy = parseInt(getopt.arg);
|
2347
2351
|
else if (c == 'e') print_err_only = print_ovlp = true;
|
2348
2352
|
else if (c == 'p') print_ovlp = true;
|
2349
2353
|
else if (c == 'c') chr_only = true;
|
2354
|
+
else if (c == 'a') aa = true;
|
2355
|
+
else if (c == 'b') is_bed = true;
|
2356
|
+
else if (c == '1') first_only = true;
|
2350
2357
|
}
|
2351
2358
|
|
2352
2359
|
if (args.length - getopt.ind < 1) {
|
@@ -2356,6 +2363,9 @@ function paf_junceval(args)
|
|
2356
2363
|
print(" -p print overlapping introns");
|
2357
2364
|
print(" -e print erroreous overlapping introns");
|
2358
2365
|
print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/");
|
2366
|
+
print(" -a miniprot PAF as input");
|
2367
|
+
print(" -b BED as input");
|
2368
|
+
print(" -1 only process the first alignment of each query");
|
2359
2369
|
exit(1);
|
2360
2370
|
}
|
2361
2371
|
|
@@ -2409,13 +2419,17 @@ function paf_junceval(args)
|
|
2409
2419
|
|
2410
2420
|
file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
|
2411
2421
|
var last_qname = null;
|
2412
|
-
var re_cigar = /(\d+)([MIDNSHP=
|
2422
|
+
var re_cigar = /(\d+)([MIDNSHP=XFGUV])/g;
|
2413
2423
|
while (file.readline(buf) >= 0) {
|
2414
2424
|
var m, t = buf.toString().split("\t");
|
2415
|
-
var ctg_name = null, cigar = null, pos = null, qname
|
2425
|
+
var ctg_name = null, cigar = null, pos = null, qname;
|
2416
2426
|
|
2417
2427
|
if (t[0].charAt(0) == '@') continue;
|
2418
|
-
if (t[
|
2428
|
+
if (t[0] == "##PAF") t.shift();
|
2429
|
+
qname = t[0];
|
2430
|
+
if (is_bed) {
|
2431
|
+
ctg_name = t[0], pos = parseInt(t[1]), cigar == null;
|
2432
|
+
} else if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
|
2419
2433
|
ctg_name = t[5], pos = parseInt(t[7]);
|
2420
2434
|
var type = 'P';
|
2421
2435
|
for (i = 12; i < t.length; ++i) {
|
@@ -2445,12 +2459,43 @@ function paf_junceval(args)
|
|
2445
2459
|
}
|
2446
2460
|
|
2447
2461
|
var intron = [];
|
2448
|
-
|
2449
|
-
|
2450
|
-
|
2451
|
-
|
2452
|
-
|
2453
|
-
|
2462
|
+
if (is_bed) {
|
2463
|
+
intron.push([pos, parseInt(t[2])]);
|
2464
|
+
} else if (aa) {
|
2465
|
+
var tmp_junc = [], tmp = 0;
|
2466
|
+
while ((m = re_cigar.exec(cigar)) != null) {
|
2467
|
+
var len = parseInt(m[1]), op = m[2];
|
2468
|
+
if (op == 'N') {
|
2469
|
+
tmp_junc.push([tmp, tmp + len]);
|
2470
|
+
tmp += len;
|
2471
|
+
} else if (op == 'U') {
|
2472
|
+
tmp_junc.push([tmp + 1, tmp + len - 2]);
|
2473
|
+
tmp += len;
|
2474
|
+
} else if (op == 'V') {
|
2475
|
+
tmp_junc.push([tmp + 2, tmp + len - 1]);
|
2476
|
+
tmp += len;
|
2477
|
+
} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') {
|
2478
|
+
tmp += len * 3;
|
2479
|
+
} else if (op == 'F' || op == 'G') {
|
2480
|
+
tmp += len;
|
2481
|
+
}
|
2482
|
+
}
|
2483
|
+
if (t[4] == '+') {
|
2484
|
+
for (var i = 0; i < tmp_junc.length; ++i)
|
2485
|
+
intron.push([pos + tmp_junc[i][0], pos + tmp_junc[i][1]]);
|
2486
|
+
} else if (t[4] == '-') {
|
2487
|
+
var glen = parseInt(t[8]) - parseInt(t[7]);
|
2488
|
+
for (var i = tmp_junc.length - 1; i >= 0; --i)
|
2489
|
+
intron.push([pos + (glen - tmp_junc[i][1]), pos + (glen - tmp_junc[i][0])]);
|
2490
|
+
}
|
2491
|
+
} else {
|
2492
|
+
while ((m = re_cigar.exec(cigar)) != null) {
|
2493
|
+
var len = parseInt(m[1]), op = m[2];
|
2494
|
+
if (op == 'N') {
|
2495
|
+
intron.push([pos, pos + len]);
|
2496
|
+
pos += len;
|
2497
|
+
} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
|
2498
|
+
}
|
2454
2499
|
}
|
2455
2500
|
if (intron.length == 0) {
|
2456
2501
|
++n_sgl;
|
@@ -2509,6 +2554,276 @@ function paf_junceval(args)
|
|
2509
2554
|
}
|
2510
2555
|
}
|
2511
2556
|
|
2557
|
+
/*
 * exoneval: compare the exons implied by alignments with annotated exons.
 *
 * args[getopt.ind] is the annotation GTF; the next argument, if present and
 * not '-', is the alignment file (otherwise File() is opened without a name).
 * Alignment lines are classified as BED (-b), PAF (5th column is +, - or *)
 * or SAM (everything else). For every alignment the aligned exon intervals
 * are derived and looked up in the annotation; an exon is "correct" when
 * both ends are within -l bases of an overlapping annotated exon.
 *
 * Output is either per-exon records (-p/-e) or a summary, optionally
 * followed by base-level Sn/Sp (-s).
 */
function paf_exoneval(args) // adapted from paf_junceval()
{
	var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false, aa = false, is_bed = false, use_cds = false, eval_base = false;
	while ((c = getopt(args, "l:epcab1ds")) != null) {
		if (c == 'l') l_fuzzy = parseInt(getopt.arg);
		else if (c == 'e') print_err_only = print_ovlp = true;
		else if (c == 'p') print_ovlp = true;
		else if (c == 'c') chr_only = true;
		else if (c == 'a') aa = true, use_cds = true;
		else if (c == 'b') is_bed = true;
		else if (c == '1') first_only = true;
		else if (c == 'd') use_cds = true;
		else if (c == 's') eval_base = true;
	}

	if (args.length - getopt.ind < 1) {
		print("Usage: paftools.js exoneval [options] <gene.gtf> <aln.sam>");
		print("Options:");
		print(" -l INT tolerance of junction positions (0 for exact) [0]");
		print(" -d evaluate coding regions only (exon regions by default)");
		print(" -a miniprot PAF as input (force -d)");
		print(" -p print overlapping exons");
		print(" -e print erroreous overlapping exons");
		print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/");
		print(" -1 only process the first alignment of each query");
		print(" -b BED as input");
		print(" -s compute base Sn and Sp (more memory)");
		exit(1);
	}

	var file, buf = new Bytes();

	// Pass 1: collect exon (or CDS) intervals per transcript from the GTF.
	warn("Reading reference GTF...");
	var tr = {};
	file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		if (t[0].charAt(0) == '#') continue;
		if (use_cds) {
			if (t[2] != "cds" && t[2] != "CDS") continue;
		} else {
			if (t[2] != 'exon') continue;
		}
		var st = parseInt(t[3]) - 1; // GTF start is 1-based; intervals here are 0-based, half-open
		var en = parseInt(t[4]);
		if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue;
		var tid = m[1];
		// record layout: [contig, strand, min start, max end, list of [start, end]]
		if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []];
		tr[tid][4].push([st, en]); // this keeps transcript
	}
	file.close();

	// Flatten the transcripts into one exon list per contig.
	var anno = {};
	for (var tid in tr) { // traverse each transcript
		var t = tr[tid];
		Interval.sort(t[4]);
		t[2] = t[4][0][0];
		t[3] = t[4][t[4].length - 1][1];
		if (anno[t[0]] == null) anno[t[0]] = [];
		var s = t[4];
		for (var i = 0; i < s.length; ++i) // traverse each exon
			anno[t[0]].push([s[i][0], s[i][1]]);
	}
	tr = null;

	for (var chr in anno) { // index exons
		var e = anno[chr];
		if (e.length == 0) continue;
		Interval.sort(e);
		var k = 0;
		for (var i = 1; i < e.length; ++i) // dedup
			if (e[i][0] != e[k][0] || e[i][1] != e[k][1])
				e[++k] = e[i].slice(0);
		e.length = k + 1;
		Interval.index_end(e);
	}

	var n_pri = 0, n_unmapped = 0, n_mapped = 0;
	var n_exon = 0, n_exon_hit = 0, n_exon_novel = 0;

	file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
	var last_qname = null, qexon = {};
	var re_cigar = /(\d+)([MIDNSHP=XFGUV])/g;

	// Pass 2: derive exons from each alignment and look them up.
	warn("Evaluating alignments...");
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		var ctg_name = null, cigar = null, pos = null, qname;

		if (t[0].charAt(0) == '@') continue;
		if (t[0] == "##PAF") t.shift();
		qname = t[0];
		if (is_bed) {
			// BED input carries no CIGAR; cigar intentionally stays null
			ctg_name = t[0], pos = parseInt(t[1]);
		} else if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
			ctg_name = t[5], pos = parseInt(t[7]);
			var type = 'P';
			for (var i = 12; i < t.length; ++i) {
				if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) {
					if (m[1] == 'tp:A') type = m[2];
					else cigar = m[2];
				}
			}
			if (type == 'S') continue; // secondary
		} else { // SAM
			ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5];
			var flag = parseInt(t[1]);
			if (flag&0x100) continue; // secondary
		}

		if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue;
		if (first_only && last_qname == qname) continue;
		if (ctg_name == '*') { // unmapped
			++n_unmapped;
			continue;
		} else {
			++n_pri;
			if (last_qname != qname) {
				++n_mapped;
				last_qname = qname;
			}
		}

		var exon = [];
		if (is_bed) { // BED
			exon.push([pos, parseInt(t[2])]);
		} else if (aa) {
			// Protein-to-genome CIGAR: M/X/=/D lengths are multiplied by 3 below,
			// F/G/N/U/V lengths are added as-is. Offsets are relative to the
			// alignment start and are placed on the contig afterwards.
			var tmp_exon = [], tmp = 0, tmp_st = 0;
			while ((m = re_cigar.exec(cigar)) != null) {
				var len = parseInt(m[1]), op = m[2];
				if (op == 'N') {
					tmp_exon.push([tmp_st, tmp]);
					tmp_st = tmp + len, tmp += len;
				} else if (op == 'U') {
					tmp_exon.push([tmp_st, tmp + 1]);
					tmp_st = tmp + len - 2, tmp += len;
				} else if (op == 'V') {
					tmp_exon.push([tmp_st, tmp + 2]);
					tmp_st = tmp + len - 1, tmp += len;
				} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') {
					tmp += len * 3;
				} else if (op == 'F' || op == 'G') {
					tmp += len;
				}
			}
			tmp_exon.push([tmp_st, tmp]);
			if (t[4] == '+') {
				for (var i = 0; i < tmp_exon.length; ++i)
					exon.push([pos + tmp_exon[i][0], pos + tmp_exon[i][1]]);
			} else if (t[4] == '-') { // For protein-to-genome alignment, the coordinates are on the query strand. Need to flip them.
				var glen = parseInt(t[8]) - parseInt(t[7]);
				for (var i = tmp_exon.length - 1; i >= 0; --i)
					exon.push([pos + (glen - tmp_exon[i][1]), pos + (glen - tmp_exon[i][0])]);
			}
		} else {
			// Nucleotide CIGAR: an N operation closes the current exon.
			var tmp_st = pos;
			while ((m = re_cigar.exec(cigar)) != null) {
				var len = parseInt(m[1]), op = m[2];
				if (op == 'N') {
					exon.push([tmp_st, pos]);
					tmp_st = pos + len, pos += len;
				} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
			}
			exon.push([tmp_st, pos]);
		}
		n_exon += exon.length;

		var chr = anno[ctg_name];
		if (chr != null) {
			for (var i = 0; i < exon.length; ++i) {
				if (eval_base) {
					// NOTE(review): exons on contigs absent from the annotation never
					// reach qexon, so they are not part of the base-level Sp
					// denominator - confirm this is intended.
					if (qexon[ctg_name] == null) qexon[ctg_name] = [];
					qexon[ctg_name].push([exon[i][0], exon[i][1]]);
				}
				var o = Interval.find_ovlp(chr, exon[i][0], exon[i][1]);
				if (o.length > 0) {
					var hit = false;
					for (var j = 0; j < o.length; ++j) {
						var st_diff = exon[i][0] - o[j][0];
						var en_diff = exon[i][1] - o[j][1];
						if (st_diff < 0) st_diff = -st_diff;
						if (en_diff < 0) en_diff = -en_diff;
						if (st_diff <= l_fuzzy && en_diff <= l_fuzzy)
							++n_exon_hit, hit = true;
						if (hit) break;
					}
					if (print_ovlp) {
						var type = hit? 'C' : 'P';
						if (hit && print_err_only) continue;
						var x = '[';
						for (var j = 0; j < o.length; ++j) {
							if (j) x += ', ';
							x += '(' + o[j][0] + "," + o[j][1] + ')';
						}
						x += ']';
						print(type, qname, i+1, ctg_name, exon[i][0], exon[i][1], x);
					}
				} else {
					++n_exon_novel;
					if (print_ovlp)
						print('N', qname, i+1, ctg_name, exon[i][0], exon[i][1]);
				}
			}
		} else {
			n_exon_novel += exon.length;
		}
	}
	file.close();

	buf.destroy();

	if (!print_ovlp) {
		print("# unmapped reads: " + n_unmapped);
		print("# mapped reads: " + n_mapped);
		print("# primary alignments: " + n_pri);
		print("# predicted exons: " + n_exon);
		print("# non-overlapping exons: " + n_exon_novel);
		print("# correct exons: " + n_exon_hit + " (" + (n_exon_hit / n_exon * 100).toFixed(2) + "%)");
	}

	// Replace every per-contig list in ex by its sorted, merged and indexed form.
	function merge_and_index(ex) {
		for (var chr in ex) {
			var a = [], e = ex[chr]; // e is local here; it must not clobber the caller's e
			if (e.length == 0) continue; // nothing to merge
			Interval.sort(e);
			var st = e[0][0], en = e[0][1];
			for (var i = 1; i < e.length; ++i) { // merge
				if (e[i][0] > en) {
					a.push([st, en]);
					st = e[i][0], en = e[i][1];
				} else {
					en = en > e[i][1]? en : e[i][1];
				}
			}
			a.push([st, en]);
			Interval.index_end(a);
			ex[chr] = a;
		}
	}

	// Return [total bases in a1, bases of a1 also covered by a0].
	function cal_sn(a0, a1) {
		var tot = 0, cov = 0;
		for (var chr in a1) {
			var e0 = a0[chr], e1 = a1[chr];
			for (var i = 0; i < e1.length; ++i)
				tot += e1[i][1] - e1[i][0];
			if (e0 == null) continue;
			for (var i = 0; i < e1.length; ++i) {
				var o = Interval.find_ovlp(e0, e1[i][0], e1[i][1]);
				for (var j = 0; j < o.length; ++j) { // this only works when there are no overlaps between intervals
					var st = e1[i][0] > o[j][0]? e1[i][0] : o[j][0];
					var en = e1[i][1] < o[j][1]? e1[i][1] : o[j][1];
					cov += en - st;
				}
			}
		}
		return [tot, cov];
	}

	if (eval_base) {
		warn("Computing base Sn and Sp...");
		merge_and_index(qexon);
		merge_and_index(anno);
		var sn = cal_sn(qexon, anno);
		var sp = cal_sn(anno, qexon);
		print("Base Sn: " + sn[1] + " / " + sn[0] + " = " + (sn[1] / sn[0] * 100).toFixed(2) + "%");
		print("Base Sp: " + sp[1] + " / " + sp[0] + " = " + (sp[1] / sp[0] * 100).toFixed(2) + "%");
	}
}
|
2826
|
+
|
2512
2827
|
// evaluate overlap sensitivity
|
2513
2828
|
function paf_ov_eval(args)
|
2514
2829
|
{
|
@@ -2704,6 +3019,23 @@ function paf_misjoin(args)
|
|
2704
3019
|
return len < (en - st) * cen_ratio? false : true;
|
2705
3020
|
}
|
2706
3021
|
|
3022
|
+
// Return true when position x lies inside one of the half-open intervals
// [start, end) stored under cen[chr]; false when the contig has no entry
// or no interval contains x.
function test_cen_point(cen, chr, x) {
	var regions = cen[chr];
	if (regions == null) return false;
	return regions.some(function(r) { return r[0] <= x && x < r[1]; });
}
|
3030
|
+
|
3031
|
+
if (show_err || show_long) {
|
3032
|
+
print("C\tJ inter-chromosomal misjoin");
|
3033
|
+
print("C\tj inter-chromosomal misjoin with both breakpoints ending in centromeres");
|
3034
|
+
print("C\tG long gap on the reference genome");
|
3035
|
+
print("C\tg long gap on the reference genome with both breakpoints ending in centromeres");
|
3036
|
+
print("C\tM closed inversion");
|
3037
|
+
print("C");
|
3038
|
+
}
|
2707
3039
|
function process(a) {
|
2708
3040
|
var k = 0;
|
2709
3041
|
for (var i = 0; i < a.length; ++i) {
|
@@ -2716,14 +3048,17 @@ function paf_misjoin(args)
|
|
2716
3048
|
a = a.sort(function(x,y){return x[2]-y[2]});
|
2717
3049
|
if (show_long) for (var i = 0; i < a.length; ++i) print(a[i].join("\t"));
|
2718
3050
|
for (var i = 1; i < a.length; ++i) {
|
2719
|
-
var ov = [false, false];
|
3051
|
+
var ov = [false, false], end_cen = [false, false];
|
2720
3052
|
ov[0] = test_cen(cen, a[i-1][5], a[i-1][7], a[i-1][8]);
|
2721
3053
|
ov[1] = test_cen(cen, a[i][5], a[i][7], a[i][8]);
|
3054
|
+
end_cen[0] = test_cen_point(cen, a[i-1][5], a[i-1][4] == '+'? a[i-1][8] : a[i-1][7]);
|
3055
|
+
end_cen[1] = test_cen_point(cen, a[i][5], a[i][4] == '+'? a[i][7] : a[i][8]);
|
2722
3056
|
if (a[i-1][5] != a[i][5]) { // different chr
|
2723
3057
|
if (ov[0] || ov[1]) ++n_diff[1];
|
2724
3058
|
else if (show_err) {
|
2725
|
-
|
2726
|
-
print(
|
3059
|
+
var label = end_cen[0] && end_cen[1]? 'j' : 'J';
|
3060
|
+
print(label, a[i-1].slice(0, 12).join("\t"));
|
3061
|
+
print(label, a[i].slice(0, 12).join("\t"));
|
2727
3062
|
}
|
2728
3063
|
++n_diff[0];
|
2729
3064
|
} else if (a[i-1][4] == a[i][4]) { // a gap
|
@@ -2733,8 +3068,9 @@ function paf_misjoin(args)
|
|
2733
3068
|
if (gap > max_gap) {
|
2734
3069
|
if (ov[0] || ov[1]) ++n_gap[1];
|
2735
3070
|
else if (show_err) {
|
2736
|
-
|
2737
|
-
print(
|
3071
|
+
var label = end_cen[0] && end_cen[1]? 'g' : 'G';
|
3072
|
+
print(label, a[i-1].slice(0, 12).join("\t"));
|
3073
|
+
print(label, a[i].slice(0, 12).join("\t"));
|
2738
3074
|
}
|
2739
3075
|
++n_gap[0];
|
2740
3076
|
}
|
@@ -3084,6 +3420,183 @@ function paf_pafcmp(args)
|
|
3084
3420
|
buf.destroy();
|
3085
3421
|
}
|
3086
3422
|
|
3423
|
+
/*
 * longcs2seq: rebuild sequences from the long-form cs:Z tag of PAF lines.
 *
 * For each line that carries a cs:Z tag, prints a FASTA-style record: the
 * header is ">name_start_end" built from the target columns (6, 8, 9) or,
 * with -q, from the query columns (1, 3, 4), followed by the reconstructed
 * sequence. Lines without a cs tag are skipped.
 *
 * Throws Error("Long cs is required") when a ':' (short-form match) operation
 * is seen, and an Error when a '*' operation has fewer than two bases.
 */
function paf_longcs2seq(args) {
	var c, opt = { query:false };
	while ((c = getopt(args, "q")) != null)
		if (c == 'q') opt.query = true;
	if (args.length == getopt.ind) {
		print("Usage: paftools.js longcs2seq [-q] <long-cs.paf>");
		return;
	}
	var re_cs = /([:=*+-])(\d+|[A-Za-z]+)/g;
	var buf = new Bytes();
	var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	try {
		while (file.readline(buf) >= 0) {
			var m, cs = null, t = buf.toString().split("\t");
			for (var i = 12; i < t.length; ++i)
				if ((m = /^cs:Z:(\S+)/.exec(t[i])) != null) {
					cs = m[1];
					break;
				}
			if (cs == null) continue;
			var ts = "", qs = ""; // target and query sequences under construction
			re_cs.lastIndex = 0; // the regex is global and shared across lines
			// NOTE(review): operations other than : = * + - (e.g. '~') are not matched
			// by re_cs and are therefore silently skipped.
			while ((m = re_cs.exec(cs)) != null) {
				if (m[1] == "=") ts += m[2], qs += m[2]; // identical bases go to both
				else if (m[1] == "+") qs += m[2].toUpperCase(); // present in the query only
				else if (m[1] == "-") ts += m[2].toUpperCase(); // present in the target only
				else if (m[1] == "*") { // substitution: first base to target, second to query
					if (m[2].length < 2) throw Error("Malformed substitution in cs: *" + m[2]);
					ts += m[2][0].toUpperCase(), qs += m[2][1].toUpperCase();
				} else if (m[1] == ":") throw Error("Long cs is required");
			}
			if (opt.query) {
				print(">" + t[0] + "_" + t[2] + "_" + t[3]);
				print(qs);
			} else {
				print(">" + t[5] + "_" + t[7] + "_" + t[8]);
				print(ts);
			}
		}
	} finally {
		// release the handles even when a malformed line makes us throw
		file.close();
		buf.destroy();
	}
}
|
3461
|
+
|
3462
|
+
/*
 * paf2gff: convert PAF lines with cg:Z and AS:i tags into GFF-like records.
 *
 * For every mapped line one 'transcript' record is printed, followed by its
 * start_codon / exon-or-CDS / stop_codon records. With -a the CIGAR is read
 * as a protein-to-genome CIGAR: M, =, X and D lengths are multiplied by 3 and
 * the records of '-' strand hits are flipped onto the contig.
 *
 * Throws when the cg:Z or AS:i tag is missing, or when the reference span
 * implied by the CIGAR differs from column 9 minus column 8.
 */
function paf_paf2gff(args) {
	var c, opt = { aa:false };
	var re_cigar = /(\d+)([A-Z=])/g;
	while ((c = getopt(args, "a")) != null) {
		if (c == 'a') opt.aa = true;
	}
	if (args.length == getopt.ind) {
		print("Usage: paftools.js paf2gff [-a] <in.paf>");
		return;
	}
	var buf = new Bytes();
	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var hid = 1, last_name = null; // hid: 1-based index of the hit among consecutive lines of one query
	try {
		while (file.readline(buf) >= 0) {
			var m, t = buf.toString().split("\t");
			if (t[5] == '*') continue; // skip unmapped lines

			if (t[0] != last_name) last_name = t[0], hid = 1;
			else ++hid;
			for (var i = 1; i <= 3; ++i) t[i] = parseInt(t[i]);
			for (var i = 6; i <= 11; ++i) t[i] = parseInt(t[i]);
			var cigar = null, score = null, np = null, dist_stop = null, dist_start = null;
			for (var i = 12; i < t.length; ++i) {
				if ((m = /^(cg:Z|AS:i|np:i|da:i|do:i):(\S+)/.exec(t[i])) != null) {
					if (m[1] == 'cg:Z') cigar = m[2];
					else if (m[1] == 'AS:i') score = parseInt(m[2]);
					else if (m[1] == 'np:i') np = parseInt(m[2]);
					else if (m[1] == 'do:i') dist_stop = parseInt(m[2]);
					else if (m[1] == 'da:i') dist_start = parseInt(m[2]);
				}
			}
			if (cigar == null) throw Error("failed to find the cg:Z tag");
			if (score == null) throw Error("failed to find the AS:i tag");

			// st/en are offsets from the alignment start; a collects records as
			// [contig, source, feature, start, end, score, strand, phase, frameshift]
			var st = 0, en = 0, phase = 0, pseudo = false, fs = 0, a = [];
			if (dist_start != null && dist_start == 0)
				a.push([t[5], 'paf2gff', 'start_codon', 0, 3, 0, t[4], '.', 0]);
			re_cigar.lastIndex = 0; // the regex is global and shared across lines
			while ((m = re_cigar.exec(cigar)) != null) {
				var len = parseInt(m[1]);
				if (m[2] == 'M' || m[2] == '=' || m[2] == 'X' || m[2] == 'D') {
					// '=' and 'X' advance exactly like 'M'; without them an extended
					// CIGAR could never satisfy the consistency check below
					en += opt.aa? len * 3 : len;
				} else if (m[2] == 'F' || m[2] == 'G' || m[2] == 'R') {
					en += len, pseudo = true, fs = 1;
				} else if (m[2] == 'N') {
					a.push([t[5], 'paf2gff', 'exon', st, en, 0, t[4], phase, fs]);
					st = en + len, en += len, phase = 0, fs = 0;
				} else if (m[2] == 'U') { // ...xGT...AGxx...
					a.push([t[5], 'paf2gff', 'exon', st, en + 1, 0, t[4], phase, fs]);
					st = en + len - 2, en += len, phase = 2, fs = 0;
				} else if (m[2] == 'V') { // ...xxGT...AGx...
					a.push([t[5], 'paf2gff', 'exon', st, en + 2, 0, t[4], phase, fs]);
					st = en + len - 1, en += len, phase = 1, fs = 0;
				}
			}
			a.push([t[5], 'paf2gff', 'exon', st, en, 0, t[4], phase, fs]);
			if (en != t[8] - t[7]) throw Error("inconsistent cigar");
			if (dist_stop != null && dist_stop == 0)
				a.push([t[5], 'paf2gff', 'stop_codon', en, en + 3, 0, t[4], '.', 0]);
			var type = pseudo? 'pseudogene' : 'protein_coding';
			var attr = ['transcript_id=' + t[0] + '#' + hid, 'transcript_type=' + type].join(";");
			var trans_attr = 'identity=' + (t[9] / t[10]).toFixed(4);
			if (np != null) trans_attr += ';positive=' + (np * 3 / t[10]).toFixed(4);
			trans_attr += ';aa_start=' + t[2];
			// NOTE(review): this is query length minus query end, not the end itself - confirm intended
			trans_attr += ';aa_end=' + (t[1] - t[3]);
			if (dist_start != null && dist_start >= 0) trans_attr += ';dist_start_codon=' + dist_start;
			if (dist_stop != null && dist_stop >= 0) trans_attr += ';dist_stop_codon=' + dist_stop;
			var trans_st = t[7], trans_en = t[8];
			if (dist_stop != null && dist_stop == 0) {
				// widen the transcript by the 3-base stop codon on the appropriate side
				if (t[4] == '-') trans_st -= 3;
				else trans_en += 3;
			}
			print([t[5], 'paf2gff', 'transcript', trans_st + 1, trans_en, score, t[4], '.', attr + ';' + trans_attr].join("\t"));
			if (opt.aa && t[4] == '-') {
				// mirror the offsets within the aligned span and reverse the record order
				var b = [], len = t[8] - t[7];
				for (var i = a.length - 1; i >= 0; --i) {
					var x = len - a[i][3];
					a[i][3] = len - a[i][4];
					a[i][4] = x;
					//a[i][7] = a[i][7] == 0? 0 : 3 - a[i][7]; // not sure if this line is needed
					b.push(a[i]);
				}
				a = b;
			}
			for (var i = 0; i < a.length; ++i) {
				if (!pseudo && a[i][2] == "exon") a[i][2] = "CDS";
				a[i][3] += t[7] + 1; // offset -> 1-based contig start
				a[i][4] += t[7];
				a[i][8] = attr + ";frameshift=" + a[i][8];
				print(a[i].join("\t"));
			}
		}
	} finally {
		// release the handles even when a malformed line makes us throw
		file.close();
		buf.destroy();
	}
}
|
3556
|
+
|
3557
|
+
/*
 * gff2junc: print the gaps between consecutive features that share a Parent.
 *
 * Reads a GFF3 file, keeps the lines whose type (column 3) equals the -f
 * feature case-insensitively ("CDS" by default) and groups consecutive kept
 * lines by their Parent= attribute. For each group of two or more features,
 * sorted by start, one line per adjacent pair is printed:
 *   contig, end of previous feature, 0-based start of next feature,
 *   Parent value, 0, strand
 * Lines of one Parent are only grouped while they are contiguous in the input.
 */
function paf_gff2junc(args) {
	var c, feat = "CDS";
	while ((c = getopt(args, "f:")) != null) {
		if (c == 'f') feat = getopt.arg;
	}
	if (getopt.ind == args.length) {
		print("Usage: paftools.js gff2junc [-f feature] <in.gff3>");
		return;
	}
	var feat_lc = feat.toLowerCase(); // loop-invariant
	var buf = new Bytes();
	var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);

	// a: records of one Parent. Each record is the GFF line with the Parent value
	// prepended, so index 1 is the contig, 4 the 0-based start, 5 the end, 7 the strand.
	function process_a(a) {
		if (a.length < 2) return;
		a = a.sort(function(x, y) { return x[4] - y[4] });
		for (var i = 1; i < a.length; ++i)
			print([a[i][1], a[i-1][5], a[i][4], a[i][0], 0, a[i][7]].join("\t"));
	}

	var a = [];
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		if (t[0][0] == '#') continue;
		if (t.length < 3) continue; // blank or truncated line: there is no type column to test
		if (t[2].toLowerCase() != feat_lc) continue;
		//print(t.join("\t"));
		if ((m = /\bParent=([^;]+)/.exec(t[8])) == null) {
			warn("Can't find Parent");
			continue;
		}
		t[3] = parseInt(t[3]) - 1; // 1-based start -> 0-based
		t[4] = parseInt(t[4]);
		t.unshift(m[1]);
		if (a.length > 0 && a[0][0] != m[1]) {
			// a new Parent begins: flush the previous group first
			process_a(a);
			a.length = 0;
			a.push(t);
		} else a.push(t);
	}
	process_a(a);
	file.close();
	buf.destroy();
}
|
3599
|
+
|
3087
3600
|
/*************************
|
3088
3601
|
***** main function *****
|
3089
3602
|
*************************/
|
@@ -3098,6 +3611,9 @@ function main(args)
|
|
3098
3611
|
print(" sam2paf convert SAM to PAF");
|
3099
3612
|
print(" delta2paf convert MUMmer's delta to PAF");
|
3100
3613
|
print(" gff2bed convert GTF/GFF3 to BED12");
|
3614
|
+
print(" gff2junc convert GFF3 to junction BED");
|
3615
|
+
print(" longcs2seq convert long-cs PAF to sequences");
|
3616
|
+
// print(" paf2gff convert PAF to GFF3 (tested for miniprot only)");
|
3101
3617
|
print("");
|
3102
3618
|
print(" stat collect basic mapping information in PAF/SAM");
|
3103
3619
|
print(" asmstat collect basic assembly information");
|
@@ -3115,6 +3631,7 @@ function main(args)
|
|
3115
3631
|
print(" mason2fq convert mason2-simulated SAM to FASTQ");
|
3116
3632
|
print(" pbsim2fq convert PBSIM-simulated MAF to FASTQ");
|
3117
3633
|
print(" junceval evaluate splice junction consistency with known annotations");
|
3634
|
+
print(" exoneval evaluate exon-level consistency with known annotations");
|
3118
3635
|
print(" ov-eval evaluate read overlap sensitivity using read-to-ref mapping");
|
3119
3636
|
exit(1);
|
3120
3637
|
}
|
@@ -3125,6 +3642,7 @@ function main(args)
|
|
3125
3642
|
else if (cmd == 'delta2paf') paf_delta2paf(args);
|
3126
3643
|
else if (cmd == 'splice2bed') paf_splice2bed(args);
|
3127
3644
|
else if (cmd == 'gff2bed') paf_gff2bed(args);
|
3645
|
+
else if (cmd == 'gff2junc') paf_gff2junc(args);
|
3128
3646
|
else if (cmd == 'stat') paf_stat(args);
|
3129
3647
|
else if (cmd == 'asmstat') paf_asmstat(args);
|
3130
3648
|
else if (cmd == 'asmgene') paf_asmgene(args);
|
@@ -3138,10 +3656,13 @@ function main(args)
|
|
3138
3656
|
else if (cmd == 'mason2fq') paf_mason2fq(args);
|
3139
3657
|
else if (cmd == 'pbsim2fq') paf_pbsim2fq(args);
|
3140
3658
|
else if (cmd == 'junceval') paf_junceval(args);
|
3659
|
+
else if (cmd == 'exoneval') paf_exoneval(args);
|
3141
3660
|
else if (cmd == 'ov-eval') paf_ov_eval(args);
|
3142
3661
|
else if (cmd == 'vcfstat') paf_vcfstat(args);
|
3143
3662
|
else if (cmd == 'sveval') paf_sveval(args);
|
3144
3663
|
else if (cmd == 'vcfsel') paf_vcfsel(args);
|
3664
|
+
else if (cmd == 'longcs2seq') paf_longcs2seq(args);
|
3665
|
+
else if (cmd == 'paf2gff') paf_paf2gff(args);
|
3145
3666
|
else if (cmd == 'version') print(paftools_version);
|
3146
3667
|
else throw Error("unrecognized command: " + cmd);
|
3147
3668
|
}
|