minimap2 0.2.25.0 → 0.2.25.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -3
- data/ext/Rakefile +2 -2
- data/ext/minimap2/Makefile +6 -2
- data/ext/minimap2/NEWS.md +38 -0
- data/ext/minimap2/README.md +9 -3
- data/ext/minimap2/align.c +5 -3
- data/ext/minimap2/cookbook.md +2 -2
- data/ext/minimap2/format.c +7 -4
- data/ext/minimap2/kalloc.c +20 -1
- data/ext/minimap2/kalloc.h +13 -2
- data/ext/minimap2/ksw2.h +1 -0
- data/ext/minimap2/ksw2_extd2_sse.c +1 -1
- data/ext/minimap2/ksw2_exts2_sse.c +79 -40
- data/ext/minimap2/ksw2_extz2_sse.c +1 -1
- data/ext/minimap2/lchain.c +15 -16
- data/ext/minimap2/main.c +13 -6
- data/ext/minimap2/map.c +0 -5
- data/ext/minimap2/minimap.h +40 -31
- data/ext/minimap2/minimap2.1 +19 -5
- data/ext/minimap2/misc/paftools.js +545 -24
- data/ext/minimap2/options.c +1 -1
- data/ext/minimap2/pyproject.toml +2 -0
- data/ext/minimap2/python/mappy.pyx +3 -1
- data/ext/minimap2/seed.c +1 -1
- data/ext/minimap2/setup.py +32 -22
- data/lib/minimap2/version.rb +1 -1
- metadata +4 -3
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env k8
|
2
2
|
|
3
|
-
var paftools_version = '2.
|
3
|
+
var paftools_version = '2.25-r1173';
|
4
4
|
|
5
5
|
/*****************************
|
6
6
|
***** Library functions *****
|
@@ -1532,22 +1532,24 @@ function paf_view(args)
|
|
1532
1532
|
|
1533
1533
|
function paf_gff2bed(args)
|
1534
1534
|
{
|
1535
|
-
var c, fn_ucsc_fai = null, is_short = false, keep_gff = false, print_junc = false, output_gene = false;
|
1536
|
-
while ((c = getopt(args, "u:
|
1535
|
+
var c, fn_ucsc_fai = null, is_short = false, keep_gff = false, print_junc = false, output_gene = false, ens_canon_only = false;
|
1536
|
+
while ((c = getopt(args, "u:sgjGe")) != null) {
|
1537
1537
|
if (c == 'u') fn_ucsc_fai = getopt.arg;
|
1538
1538
|
else if (c == 's') is_short = true;
|
1539
1539
|
else if (c == 'g') keep_gff = true;
|
1540
1540
|
else if (c == 'j') print_junc = true;
|
1541
1541
|
else if (c == 'G') output_gene = true;
|
1542
|
+
else if (c == 'e') ens_canon_only = true;
|
1542
1543
|
}
|
1543
1544
|
|
1544
1545
|
if (getopt.ind == args.length) {
|
1545
1546
|
print("Usage: paftools.js gff2bed [options] <in.gff>");
|
1546
1547
|
print("Options:");
|
1547
|
-
print(" -j
|
1548
|
-
print(" -s
|
1548
|
+
print(" -j output junction BED");
|
1549
|
+
print(" -s print names in the short form");
|
1549
1550
|
print(" -u FILE hg38.fa.fai for chr name conversion");
|
1550
|
-
print(" -
|
1551
|
+
print(" -e only show transcript tagged with 'Ensembl_canonical'");
|
1552
|
+
print(" -g output GFF (used with -u)");
|
1551
1553
|
exit(1);
|
1552
1554
|
}
|
1553
1555
|
|
@@ -1606,7 +1608,7 @@ function paf_gff2bed(args)
|
|
1606
1608
|
print(a[0][0], st, en, name, 1000, a[0][3], cds_st, cds_en, color, a.length, sizes.join(",") + ",", starts.join(",") + ",");
|
1607
1609
|
}
|
1608
1610
|
|
1609
|
-
var re_gtf = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name) "([^"]+)";/g;
|
1611
|
+
var re_gtf = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name|tag) "([^"]+)";/g;
|
1610
1612
|
var re_gff3 = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name)=([^;]+)/g;
|
1611
1613
|
var re_gtf_gene = /\b(gene_id|gene_type|gene_name) "([^;]+)";/g;
|
1612
1614
|
var re_gff3_gene = /\b(gene_id|gene_type|source_gene|gene_biotype|gene_name)=([^;]+);/g;
|
@@ -1646,13 +1648,14 @@ function paf_gff2bed(args)
|
|
1646
1648
|
if (t[2] != "CDS" && t[2] != "exon") continue;
|
1647
1649
|
t[3] = parseInt(t[3]) - 1;
|
1648
1650
|
t[4] = parseInt(t[4]);
|
1649
|
-
var id = null, type = "", name = "N/A", biotype = "", m, tname = "N/A";
|
1651
|
+
var id = null, type = "", name = "N/A", biotype = "", m, tname = "N/A", ens_canonical = false;
|
1650
1652
|
while ((m = re_gtf.exec(t[8])) != null) {
|
1651
1653
|
if (m[1] == "transcript_id") id = m[2];
|
1652
1654
|
else if (m[1] == "transcript_type") type = m[2];
|
1653
1655
|
else if (m[1] == "transcript_biotype" || m[1] == "gbkey") biotype = m[2];
|
1654
1656
|
else if (m[1] == "gene_name" || m[1] == "gene_id") name = m[2];
|
1655
1657
|
else if (m[1] == "transcript_name") tname = m[2];
|
1658
|
+
else if (m[1] == "tag" && m[2] == "Ensembl_canonical") ens_canonical = true;
|
1656
1659
|
}
|
1657
1660
|
while ((m = re_gff3.exec(t[8])) != null) {
|
1658
1661
|
if (m[1] == "transcript_id") id = m[2];
|
@@ -1661,6 +1664,7 @@ function paf_gff2bed(args)
|
|
1661
1664
|
else if (m[1] == "gene_name" || m[1] == "gene_id") name = m[2];
|
1662
1665
|
else if (m[1] == "transcript_name") tname = m[2];
|
1663
1666
|
}
|
1667
|
+
if (ens_canon_only && !ens_canonical) continue;
|
1664
1668
|
if (type == "" && biotype != "") type = biotype;
|
1665
1669
|
if (id == null) throw Error("No transcript_id");
|
1666
1670
|
if (id != last_id) {
|
@@ -2341,12 +2345,15 @@ function paf_pbsim2fq(args)
|
|
2341
2345
|
|
2342
2346
|
function paf_junceval(args)
|
2343
2347
|
{
|
2344
|
-
var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false;
|
2345
|
-
while ((c = getopt(args, "l:
|
2348
|
+
var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false, aa = false, is_bed = false;
|
2349
|
+
while ((c = getopt(args, "l:epcab1")) != null) {
|
2346
2350
|
if (c == 'l') l_fuzzy = parseInt(getopt.arg);
|
2347
2351
|
else if (c == 'e') print_err_only = print_ovlp = true;
|
2348
2352
|
else if (c == 'p') print_ovlp = true;
|
2349
2353
|
else if (c == 'c') chr_only = true;
|
2354
|
+
else if (c == 'a') aa = true;
|
2355
|
+
else if (c == 'b') is_bed = true;
|
2356
|
+
else if (c == '1') first_only = true;
|
2350
2357
|
}
|
2351
2358
|
|
2352
2359
|
if (args.length - getopt.ind < 1) {
|
@@ -2356,6 +2363,9 @@ function paf_junceval(args)
|
|
2356
2363
|
print(" -p print overlapping introns");
|
2357
2364
|
print(" -e print erroreous overlapping introns");
|
2358
2365
|
print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/");
|
2366
|
+
print(" -a miniprot PAF as input");
|
2367
|
+
print(" -b BED as input");
|
2368
|
+
print(" -1 only process the first alignment of each query");
|
2359
2369
|
exit(1);
|
2360
2370
|
}
|
2361
2371
|
|
@@ -2409,13 +2419,17 @@ function paf_junceval(args)
|
|
2409
2419
|
|
2410
2420
|
file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
|
2411
2421
|
var last_qname = null;
|
2412
|
-
var re_cigar = /(\d+)([MIDNSHP=
|
2422
|
+
var re_cigar = /(\d+)([MIDNSHP=XFGUV])/g;
|
2413
2423
|
while (file.readline(buf) >= 0) {
|
2414
2424
|
var m, t = buf.toString().split("\t");
|
2415
|
-
var ctg_name = null, cigar = null, pos = null, qname
|
2425
|
+
var ctg_name = null, cigar = null, pos = null, qname;
|
2416
2426
|
|
2417
2427
|
if (t[0].charAt(0) == '@') continue;
|
2418
|
-
if (t[
|
2428
|
+
if (t[0] == "##PAF") t.shift();
|
2429
|
+
qname = t[0];
|
2430
|
+
if (is_bed) {
|
2431
|
+
ctg_name = t[0], pos = parseInt(t[1]), cigar == null;
|
2432
|
+
} else if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
|
2419
2433
|
ctg_name = t[5], pos = parseInt(t[7]);
|
2420
2434
|
var type = 'P';
|
2421
2435
|
for (i = 12; i < t.length; ++i) {
|
@@ -2445,12 +2459,43 @@ function paf_junceval(args)
|
|
2445
2459
|
}
|
2446
2460
|
|
2447
2461
|
var intron = [];
|
2448
|
-
|
2449
|
-
|
2450
|
-
|
2451
|
-
|
2452
|
-
|
2453
|
-
|
2462
|
+
if (is_bed) {
|
2463
|
+
intron.push([pos, parseInt(t[2])]);
|
2464
|
+
} else if (aa) {
|
2465
|
+
var tmp_junc = [], tmp = 0;
|
2466
|
+
while ((m = re_cigar.exec(cigar)) != null) {
|
2467
|
+
var len = parseInt(m[1]), op = m[2];
|
2468
|
+
if (op == 'N') {
|
2469
|
+
tmp_junc.push([tmp, tmp + len]);
|
2470
|
+
tmp += len;
|
2471
|
+
} else if (op == 'U') {
|
2472
|
+
tmp_junc.push([tmp + 1, tmp + len - 2]);
|
2473
|
+
tmp += len;
|
2474
|
+
} else if (op == 'V') {
|
2475
|
+
tmp_junc.push([tmp + 2, tmp + len - 1]);
|
2476
|
+
tmp += len;
|
2477
|
+
} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') {
|
2478
|
+
tmp += len * 3;
|
2479
|
+
} else if (op == 'F' || op == 'G') {
|
2480
|
+
tmp += len;
|
2481
|
+
}
|
2482
|
+
}
|
2483
|
+
if (t[4] == '+') {
|
2484
|
+
for (var i = 0; i < tmp_junc.length; ++i)
|
2485
|
+
intron.push([pos + tmp_junc[i][0], pos + tmp_junc[i][1]]);
|
2486
|
+
} else if (t[4] == '-') {
|
2487
|
+
var glen = parseInt(t[8]) - parseInt(t[7]);
|
2488
|
+
for (var i = tmp_junc.length - 1; i >= 0; --i)
|
2489
|
+
intron.push([pos + (glen - tmp_junc[i][1]), pos + (glen - tmp_junc[i][0])]);
|
2490
|
+
}
|
2491
|
+
} else {
|
2492
|
+
while ((m = re_cigar.exec(cigar)) != null) {
|
2493
|
+
var len = parseInt(m[1]), op = m[2];
|
2494
|
+
if (op == 'N') {
|
2495
|
+
intron.push([pos, pos + len]);
|
2496
|
+
pos += len;
|
2497
|
+
} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
|
2498
|
+
}
|
2454
2499
|
}
|
2455
2500
|
if (intron.length == 0) {
|
2456
2501
|
++n_sgl;
|
@@ -2509,6 +2554,276 @@ function paf_junceval(args)
|
|
2509
2554
|
}
|
2510
2555
|
}
|
2511
2556
|
|
2557
|
+
/**
 * exoneval: compare exons implied by alignments against annotated exons.
 *
 * Reads a reference GTF (args[getopt.ind]) and an alignment file
 * (args[getopt.ind+1], or the default File() when missing or "-"). Alignment
 * lines are treated as BED (-b), PAF (5th column is +, - or *) or SAM
 * (otherwise). Predicted exons are derived from the CIGAR (or taken directly
 * from BED columns 2-3) and matched against annotated exons on the same
 * contig; an exon is "correct" when both ends are within -l bases of an
 * annotated exon. With -s, base-level Sn/Sp are also reported.
 *
 * Relies on getopt(), File, Bytes, print(), warn(), exit() and Interval,
 * which are defined outside this block.
 *
 * @param args  command-line arguments following the sub-command name
 */
function paf_exoneval(args) // adapted from paf_junceval()
{
	var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false, aa = false, is_bed = false, use_cds = false, eval_base = false;
	while ((c = getopt(args, "l:epcab1ds")) != null) {
		if (c == 'l') l_fuzzy = parseInt(getopt.arg, 10);
		else if (c == 'e') print_err_only = print_ovlp = true;
		else if (c == 'p') print_ovlp = true;
		else if (c == 'c') chr_only = true;
		else if (c == 'a') aa = true, use_cds = true;
		else if (c == 'b') is_bed = true;
		else if (c == '1') first_only = true;
		else if (c == 'd') use_cds = true;
		else if (c == 's') eval_base = true;
	}

	if (args.length - getopt.ind < 1) {
		print("Usage: paftools.js exoneval [options] <gene.gtf> <aln.sam>");
		print("Options:");
		print(" -l INT tolerance of junction positions (0 for exact) [0]");
		print(" -d evaluate coding regions only (exon regions by default)");
		print(" -a miniprot PAF as input (force -d)");
		print(" -p print overlapping exons");
		print(" -e print erroreous overlapping exons");
		print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/");
		print(" -1 only process the first alignment of each query");
		print(" -b BED as input");
		print(" -s compute base Sn and Sp (more memory)");
		exit(1);
	}

	// format a ratio as a percentage with two decimals; an empty denominator gives 0.00 rather than NaN
	function pct(num, den) {
		return (den > 0? num / den * 100 : 0).toFixed(2);
	}

	// replace each per-contig interval list in ex with its merged (non-overlapping) form and index it
	function merge_and_index(ex) {
		for (var chr in ex) {
			var a = [];
			var e = ex[chr];
			Interval.sort(e);
			var st = e[0][0], en = e[0][1];
			for (var i = 1; i < e.length; ++i) { // merge
				if (e[i][0] > en) {
					a.push([st, en]);
					st = e[i][0], en = e[i][1];
				} else {
					en = en > e[i][1]? en : e[i][1];
				}
			}
			a.push([st, en]);
			Interval.index_end(a);
			ex[chr] = a;
		}
	}

	// return [total bases in a1, bases of a1 covered by a0]
	function cal_sn(a0, a1) {
		var tot = 0, cov = 0;
		for (var chr in a1) {
			var e0 = a0[chr], e1 = a1[chr];
			for (var i = 0; i < e1.length; ++i)
				tot += e1[i][1] - e1[i][0];
			if (e0 == null) continue;
			for (var i = 0; i < e1.length; ++i) {
				var o = Interval.find_ovlp(e0, e1[i][0], e1[i][1]);
				for (var j = 0; j < o.length; ++j) { // this only works when there are no overlaps between intervals
					var st = e1[i][0] > o[j][0]? e1[i][0] : o[j][0];
					var en = e1[i][1] < o[j][1]? e1[i][1] : o[j][1];
					cov += en - st;
				}
			}
		}
		return [tot, cov];
	}

	var file, buf = new Bytes();

	warn("Reading reference GTF...");
	var tr = {}; // transcript_id -> [contig, strand, start, end, [[exon_start, exon_end], ...]]
	file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		if (t[0].charAt(0) == '#') continue;
		if (use_cds) {
			if (t[2] != "cds" && t[2] != "CDS") continue;
		} else {
			if (t[2] != 'exon') continue;
		}
		var st = parseInt(t[3], 10) - 1; // GTF start is 1-based; intervals here are 0-based, half-open
		var en = parseInt(t[4], 10);
		if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue;
		var tid = m[1];
		if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []];
		tr[tid][4].push([st, en]); // this keeps transcript
	}
	file.close();

	var anno = {}; // contig -> list of annotated exon intervals
	for (var tid in tr) { // traverse each transcript
		var t = tr[tid];
		Interval.sort(t[4]);
		t[2] = t[4][0][0];
		t[3] = t[4][t[4].length - 1][1];
		if (anno[t[0]] == null) anno[t[0]] = [];
		var s = t[4];
		for (var i = 0; i < s.length; ++i) // traverse each exon
			anno[t[0]].push([s[i][0], s[i][1]]);
	}
	tr = null;

	for (var chr in anno) { // index exons
		var e = anno[chr];
		if (e.length == 0) continue;
		Interval.sort(e);
		var k = 0;
		for (var i = 1; i < e.length; ++i) // dedup
			if (e[i][0] != e[k][0] || e[i][1] != e[k][1])
				e[++k] = e[i].slice(0);
		e.length = k + 1;
		Interval.index_end(e);
	}

	var n_pri = 0, n_unmapped = 0, n_mapped = 0;
	var n_exon = 0, n_exon_hit = 0, n_exon_novel = 0;

	file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
	var last_qname = null, qexon = {};
	var re_cigar = /(\d+)([MIDNSHP=XFGUV])/g;

	warn("Evaluating alignments...");
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		var ctg_name = null, cigar = null, pos = null, qname;

		if (t[0].charAt(0) == '@') continue;
		if (t[0] == "##PAF") t.shift();
		qname = t[0];
		if (is_bed) { // BED carries no CIGAR, so cigar stays null
			ctg_name = t[0], pos = parseInt(t[1], 10);
		} else if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
			ctg_name = t[5], pos = parseInt(t[7], 10);
			var type = 'P';
			for (var i = 12; i < t.length; ++i) {
				if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) {
					if (m[1] == 'tp:A') type = m[2];
					else cigar = m[2];
				}
			}
			if (type == 'S') continue; // secondary
		} else { // SAM
			ctg_name = t[2], pos = parseInt(t[3], 10) - 1, cigar = t[5];
			var flag = parseInt(t[1], 10);
			if (flag&0x100) continue; // secondary
		}

		if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue;
		if (first_only && last_qname == qname) continue;
		if (ctg_name == '*') { // unmapped
			++n_unmapped;
			continue;
		} else {
			++n_pri;
			if (last_qname != qname) {
				++n_mapped;
				last_qname = qname;
			}
		}

		var exon = [];
		if (is_bed) { // BED
			exon.push([pos, parseInt(t[2], 10)]);
		} else if (aa) {
			// Offsets are accumulated relative to the alignment start. M/X/=/D advance by
			// 3 bases per unit and F/G by 1. N/U/V end an exon: U leaves 1 base before
			// and 2 bases after the gap in the flanking exons, V leaves 2 before and 1 after.
			var tmp_exon = [], tmp = 0, tmp_st = 0;
			while ((m = re_cigar.exec(cigar)) != null) {
				var len = parseInt(m[1], 10), op = m[2];
				if (op == 'N') {
					tmp_exon.push([tmp_st, tmp]);
					tmp_st = tmp + len, tmp += len;
				} else if (op == 'U') {
					tmp_exon.push([tmp_st, tmp + 1]);
					tmp_st = tmp + len - 2, tmp += len;
				} else if (op == 'V') {
					tmp_exon.push([tmp_st, tmp + 2]);
					tmp_st = tmp + len - 1, tmp += len;
				} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') {
					tmp += len * 3;
				} else if (op == 'F' || op == 'G') {
					tmp += len;
				}
			}
			tmp_exon.push([tmp_st, tmp]);
			if (t[4] == '+') {
				for (var i = 0; i < tmp_exon.length; ++i)
					exon.push([pos + tmp_exon[i][0], pos + tmp_exon[i][1]]);
			} else if (t[4] == '-') { // For protein-to-genome alignment, the coordinates are on the query strand. Need to flip them.
				var glen = parseInt(t[8], 10) - parseInt(t[7], 10);
				for (var i = tmp_exon.length - 1; i >= 0; --i)
					exon.push([pos + (glen - tmp_exon[i][1]), pos + (glen - tmp_exon[i][0])]);
			}
		} else {
			var tmp_st = pos;
			while ((m = re_cigar.exec(cigar)) != null) {
				var len = parseInt(m[1], 10), op = m[2];
				if (op == 'N') {
					exon.push([tmp_st, pos]);
					tmp_st = pos + len, pos += len;
				} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
			}
			exon.push([tmp_st, pos]);
		}
		n_exon += exon.length;

		var ref_exons = anno[ctg_name];
		if (ref_exons != null) {
			for (var i = 0; i < exon.length; ++i) {
				// NOTE(review): predicted exons on contigs absent from the annotation are
				// never added to qexon, so they do not count in the base Sp denominator -- confirm intended.
				if (eval_base) {
					if (qexon[ctg_name] == null) qexon[ctg_name] = [];
					qexon[ctg_name].push([exon[i][0], exon[i][1]]);
				}
				var o = Interval.find_ovlp(ref_exons, exon[i][0], exon[i][1]);
				if (o.length > 0) {
					var hit = false;
					for (var j = 0; j < o.length; ++j) {
						var st_diff = exon[i][0] - o[j][0];
						var en_diff = exon[i][1] - o[j][1];
						if (st_diff < 0) st_diff = -st_diff;
						if (en_diff < 0) en_diff = -en_diff;
						if (st_diff <= l_fuzzy && en_diff <= l_fuzzy)
							++n_exon_hit, hit = true;
						if (hit) break;
					}
					if (print_ovlp) {
						var type = hit? 'C' : 'P';
						if (hit && print_err_only) continue;
						var x = '[';
						for (var j = 0; j < o.length; ++j) {
							if (j) x += ', ';
							x += '(' + o[j][0] + "," + o[j][1] + ')';
						}
						x += ']';
						print(type, qname, i+1, ctg_name, exon[i][0], exon[i][1], x);
					}
				} else {
					++n_exon_novel;
					if (print_ovlp)
						print('N', qname, i+1, ctg_name, exon[i][0], exon[i][1]);
				}
			}
		} else {
			n_exon_novel += exon.length;
		}
	}
	file.close();

	buf.destroy();

	if (!print_ovlp) {
		print("# unmapped reads: " + n_unmapped);
		print("# mapped reads: " + n_mapped);
		print("# primary alignments: " + n_pri);
		print("# predicted exons: " + n_exon);
		print("# non-overlapping exons: " + n_exon_novel);
		print("# correct exons: " + n_exon_hit + " (" + pct(n_exon_hit, n_exon) + "%)");
	}

	if (eval_base) {
		warn("Computing base Sn and Sp...");
		merge_and_index(qexon);
		merge_and_index(anno);
		var sn = cal_sn(qexon, anno);
		var sp = cal_sn(anno, qexon);
		print("Base Sn: " + sn[1] + " / " + sn[0] + " = " + pct(sn[1], sn[0]) + "%");
		print("Base Sp: " + sp[1] + " / " + sp[0] + " = " + pct(sp[1], sp[0]) + "%");
	}
}
|
2826
|
+
|
2512
2827
|
// evaluate overlap sensitivity
|
2513
2828
|
function paf_ov_eval(args)
|
2514
2829
|
{
|
@@ -2704,6 +3019,23 @@ function paf_misjoin(args)
|
|
2704
3019
|
return len < (en - st) * cen_ratio? false : true;
|
2705
3020
|
}
|
2706
3021
|
|
3022
|
+
/**
 * Test whether a single coordinate lies inside any interval listed for a contig.
 *
 * @param cen  object mapping a contig name to an array of [start, end] pairs
 * @param chr  contig name to look up in cen
 * @param x    coordinate to test
 * @return     true if start <= x < end for at least one pair; false otherwise,
 *             including when the contig has no entry in cen
 */
function test_cen_point(cen, chr, x) {
	var b = cen[chr];
	if (b == null) return false;
	for (var j = 0; j < b.length; ++j)
		if (x >= b[j][0] && x < b[j][1]) // half-open: the end coordinate itself is outside
			return true;
	return false;
}
|
3030
|
+
|
3031
|
+
if (show_err || show_long) {
|
3032
|
+
print("C\tJ inter-chromosomal misjoin");
|
3033
|
+
print("C\tj inter-chromosomal misjoin with both breakpoints ending in centromeres");
|
3034
|
+
print("C\tG long gap on the reference genome");
|
3035
|
+
print("C\tg long gap on the reference genome with both breakpoints ending in centromeres");
|
3036
|
+
print("C\tM closed inversion");
|
3037
|
+
print("C");
|
3038
|
+
}
|
2707
3039
|
function process(a) {
|
2708
3040
|
var k = 0;
|
2709
3041
|
for (var i = 0; i < a.length; ++i) {
|
@@ -2716,14 +3048,17 @@ function paf_misjoin(args)
|
|
2716
3048
|
a = a.sort(function(x,y){return x[2]-y[2]});
|
2717
3049
|
if (show_long) for (var i = 0; i < a.length; ++i) print(a[i].join("\t"));
|
2718
3050
|
for (var i = 1; i < a.length; ++i) {
|
2719
|
-
var ov = [false, false];
|
3051
|
+
var ov = [false, false], end_cen = [false, false];
|
2720
3052
|
ov[0] = test_cen(cen, a[i-1][5], a[i-1][7], a[i-1][8]);
|
2721
3053
|
ov[1] = test_cen(cen, a[i][5], a[i][7], a[i][8]);
|
3054
|
+
end_cen[0] = test_cen_point(cen, a[i-1][5], a[i-1][4] == '+'? a[i-1][8] : a[i-1][7]);
|
3055
|
+
end_cen[1] = test_cen_point(cen, a[i][5], a[i][4] == '+'? a[i][7] : a[i][8]);
|
2722
3056
|
if (a[i-1][5] != a[i][5]) { // different chr
|
2723
3057
|
if (ov[0] || ov[1]) ++n_diff[1];
|
2724
3058
|
else if (show_err) {
|
2725
|
-
|
2726
|
-
print(
|
3059
|
+
var label = end_cen[0] && end_cen[1]? 'j' : 'J';
|
3060
|
+
print(label, a[i-1].slice(0, 12).join("\t"));
|
3061
|
+
print(label, a[i].slice(0, 12).join("\t"));
|
2727
3062
|
}
|
2728
3063
|
++n_diff[0];
|
2729
3064
|
} else if (a[i-1][4] == a[i][4]) { // a gap
|
@@ -2733,8 +3068,9 @@ function paf_misjoin(args)
|
|
2733
3068
|
if (gap > max_gap) {
|
2734
3069
|
if (ov[0] || ov[1]) ++n_gap[1];
|
2735
3070
|
else if (show_err) {
|
2736
|
-
|
2737
|
-
print(
|
3071
|
+
var label = end_cen[0] && end_cen[1]? 'g' : 'G';
|
3072
|
+
print(label, a[i-1].slice(0, 12).join("\t"));
|
3073
|
+
print(label, a[i].slice(0, 12).join("\t"));
|
2738
3074
|
}
|
2739
3075
|
++n_gap[0];
|
2740
3076
|
}
|
@@ -3084,6 +3420,183 @@ function paf_pafcmp(args)
|
|
3084
3420
|
buf.destroy();
|
3085
3421
|
}
|
3086
3422
|
|
3423
|
+
/**
 * longcs2seq: rebuild the aligned target (default) or query (-q) sequence of
 * each PAF record from its long-form cs:Z tag and print it as a two-line
 * FASTA record named <name>_<start>_<end>.
 *
 * Records without a cs:Z tag are skipped. A ':' operation (short-form cs)
 * raises Error("Long cs is required"); a '*' operation with fewer than two
 * letters raises an Error as well.
 *
 * Relies on getopt(), File, Bytes and print(), defined outside this block.
 *
 * @param args  command-line arguments following the sub-command name
 */
function paf_longcs2seq(args) {
	var c, opt = { query:false };
	while ((c = getopt(args, "q")) != null)
		if (c == 'q') opt.query = true;
	if (args.length == getopt.ind) {
		print("Usage: paftools.js longcs2seq [-q] <long-cs.paf>");
		return;
	}
	var re_cs = /([:=*+-])(\d+|[A-Za-z]+)/g;
	var buf = new Bytes();
	var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var m, cs = null, t = buf.toString().split("\t");
		for (var i = 12; i < t.length; ++i)
			if ((m = /^cs:Z:(\S+)/.exec(t[i])) != null) {
				cs = m[1];
				break;
			}
		if (cs == null) continue;
		var ts = "", qs = "";
		re_cs.lastIndex = 0; // the regex is global and reused across records; always scan from the start
		while ((m = re_cs.exec(cs)) != null) {
			if (m[1] == "=") ts += m[2], qs += m[2]; // identical run: present in both sequences
			else if (m[1] == "+") qs += m[2].toUpperCase(); // only added to the query sequence
			else if (m[1] == "-") ts += m[2].toUpperCase(); // only added to the target sequence
			else if (m[1] == "*") { // substitution: first letter goes to the target, second to the query
				if (m[2].length < 2) throw Error("malformed cs substitution: *" + m[2]);
				ts += m[2][0].toUpperCase(), qs += m[2][1].toUpperCase();
			} else if (m[1] == ":") throw Error("Long cs is required");
		}
		if (opt.query) {
			print(">" + t[0] + "_" + t[2] + "_" + t[3]);
			print(qs);
		} else {
			print(">" + t[5] + "_" + t[7] + "_" + t[8]);
			print(ts);
		}
	}
	file.close();
	buf.destroy();
}
|
3461
|
+
|
3462
|
+
/**
 * paf2gff: convert PAF records carrying cg:Z and AS:i tags into one
 * 'transcript' line plus one line per exon/CDS (and start/stop codon lines
 * when da:i / do:i are 0).
 *
 * With -a, M/=/X/D CIGAR units advance three bases each, and on the '-'
 * strand the features are flipped around the target interval.
 * Records whose 6th column is '*' are skipped. Throws an Error when cg:Z or
 * AS:i is missing, or when the CIGAR does not span exactly column 9 minus
 * column 8.
 *
 * Relies on getopt(), File, Bytes and print(), defined outside this block.
 *
 * @param args  command-line arguments following the sub-command name
 */
function paf_paf2gff(args) {
	var c, opt = { aa:false };
	var re_cigar = /(\d+)([A-Z=])/g;
	while ((c = getopt(args, "a")) != null) {
		if (c == 'a') opt.aa = true;
	}
	if (args.length == getopt.ind) {
		print("Usage: paftools.js paf2gff [-a] <in.paf>");
		return;
	}
	var buf = new Bytes();
	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var hid = 1, last_name = null; // hid numbers consecutive hits of the same query name
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		if (t[5] == '*') continue; // skip unmapped lines

		if (t[0] != last_name) last_name = t[0], hid = 1;
		else ++hid;
		for (var i = 1; i <= 3; ++i) t[i] = parseInt(t[i], 10);
		for (var i = 6; i <= 11; ++i) t[i] = parseInt(t[i], 10);
		var cigar = null, score = null, np = null, dist_stop = null, dist_start = null;
		for (var i = 12; i < t.length; ++i) {
			if ((m = /^(cg:Z|AS:i|np:i|da:i|do:i):(\S+)/.exec(t[i])) != null) {
				if (m[1] == 'cg:Z') cigar = m[2];
				else if (m[1] == 'AS:i') score = parseInt(m[2], 10);
				else if (m[1] == 'np:i') np = parseInt(m[2], 10);
				else if (m[1] == 'do:i') dist_stop = parseInt(m[2], 10);
				else if (m[1] == 'da:i') dist_start = parseInt(m[2], 10);
			}
		}
		if (cigar == null) throw Error("failed to find the cg:Z tag");
		if (score == null) throw Error("failed to find the AS:i tag");

		// st/en are offsets from the alignment start; they are shifted by t[7] when printed
		var st = 0, en = 0, phase = 0, pseudo = false, fs = 0, a = [];
		if (dist_start != null && dist_start == 0)
			a.push([t[5], 'paf2gff', 'start_codon', 0, 3, 0, t[4], '.', 0]);
		re_cigar.lastIndex = 0; // the regex is global and reused across records
		while ((m = re_cigar.exec(cigar)) != null) {
			var len = parseInt(m[1], 10);
			if (m[2] == 'M' || m[2] == '=' || m[2] == 'X' || m[2] == 'D') { // '=' and 'X' advance the same way as 'M'
				en += opt.aa? len * 3 : len;
			} else if (m[2] == 'F' || m[2] == 'G' || m[2] == 'R') { // these ops mark the record as a pseudogene
				en += len, pseudo = true, fs = 1;
			} else if (m[2] == 'N') {
				a.push([t[5], 'paf2gff', 'exon', st, en, 0, t[4], phase, fs]);
				st = en + len, en += len, phase = 0, fs = 0;
			} else if (m[2] == 'U') { // ...xGT...AGxx...
				a.push([t[5], 'paf2gff', 'exon', st, en + 1, 0, t[4], phase, fs]);
				st = en + len - 2, en += len, phase = 2, fs = 0;
			} else if (m[2] == 'V') { // ...xxGT...AGx...
				a.push([t[5], 'paf2gff', 'exon', st, en + 2, 0, t[4], phase, fs]);
				st = en + len - 1, en += len, phase = 1, fs = 0;
			}
		}
		a.push([t[5], 'paf2gff', 'exon', st, en, 0, t[4], phase, fs]);
		if (en != t[8] - t[7]) throw Error("inconsistent cigar");
		if (dist_stop != null && dist_stop == 0)
			a.push([t[5], 'paf2gff', 'stop_codon', en, en + 3, 0, t[4], '.', 0]);
		var type = pseudo? 'pseudogene' : 'protein_coding';
		var attr = ['transcript_id=' + t[0] + '#' + hid, 'transcript_type=' + type].join(";");
		var trans_attr = 'identity=' + (t[9] / t[10]).toFixed(4);
		if (np != null) trans_attr += ';positive=' + (np * 3 / t[10]).toFixed(4);
		trans_attr += ';aa_start=' + t[2];
		trans_attr += ';aa_end=' + (t[1] - t[3]);
		if (dist_start != null && dist_start >= 0) trans_attr += ';dist_start_codon=' + dist_start;
		if (dist_stop != null && dist_stop >= 0) trans_attr += ';dist_stop_codon=' + dist_stop;
		var trans_st = t[7], trans_en = t[8];
		if (dist_stop != null && dist_stop == 0) { // the stop codon sits outside the alignment; extend the transcript over it
			if (t[4] == '-') trans_st -= 3;
			else trans_en += 3;
		}
		print([t[5], 'paf2gff', 'transcript', trans_st + 1, trans_en, score, t[4], '.', attr + ';' + trans_attr].join("\t"));
		if (opt.aa && t[4] == '-') { // flip feature offsets around the target interval and reverse their order
			var b = [], glen = t[8] - t[7];
			for (var i = a.length - 1; i >= 0; --i) {
				var x = glen - a[i][3];
				a[i][3] = glen - a[i][4];
				a[i][4] = x;
				//a[i][7] = a[i][7] == 0? 0 : 3 - a[i][7]; // not sure if this line is needed
				b.push(a[i]);
			}
			a = b;
		}
		for (var i = 0; i < a.length; ++i) {
			if (!pseudo && a[i][2] == "exon") a[i][2] = "CDS";
			a[i][3] += t[7] + 1; // to 1-based inclusive start
			a[i][4] += t[7];
			a[i][8] = attr + ";frameshift=" + a[i][8];
			print(a[i].join("\t"));
		}
	}
	file.close();
	buf.destroy();
}
|
3556
|
+
|
3557
|
+
/**
 * gff2junc: print the gaps between consecutive features of one type
 * (default "CDS", set with -f, compared case-insensitively) that share a
 * Parent= attribute, one gap per line.
 *
 * Features are grouped while consecutive input lines carry the same Parent
 * value. Within a group they are sorted by start, and each gap is printed as
 * contig, previous end, next start, parent, 0, strand.
 * Lines with fewer than nine tab-separated columns (blank lines included)
 * and lines starting with '#' are skipped; matching features without a
 * Parent attribute are skipped with a warning.
 *
 * Relies on getopt(), File, Bytes, print() and warn(), defined outside this block.
 *
 * @param args  command-line arguments following the sub-command name
 */
function paf_gff2junc(args) {
	var c, feat = "CDS";
	while ((c = getopt(args, "f:")) != null) {
		if (c == 'f') feat = getopt.arg;
	}
	if (getopt.ind == args.length) {
		print("Usage: paftools.js gff2junc [-f feature] <in.gff3>");
		return;
	}
	var buf = new Bytes();
	var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	var feat_lc = feat.toLowerCase();

	// Each element of a is a GFF row with the parent id prepended, so the
	// columns are: 0 parent, 1 contig, 4 start (0-based), 5 end, 7 strand.
	function process_a(a) {
		if (a.length < 2) return;
		a = a.sort(function(x, y) { return x[4] - y[4] });
		for (var i = 1; i < a.length; ++i)
			print([a[i][1], a[i-1][5], a[i][4], a[i][0], 0, a[i][7]].join("\t"));
	}

	var a = [];
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		if (t.length < 9) continue; // blank or malformed line: no type/attribute columns to read
		if (t[0][0] == '#') continue;
		if (t[2].toLowerCase() != feat_lc) continue;
		//print(t.join("\t"));
		if ((m = /\bParent=([^;]+)/.exec(t[8])) == null) {
			warn("Can't find Parent");
			continue;
		}
		t[3] = parseInt(t[3], 10) - 1;
		t[4] = parseInt(t[4], 10);
		t.unshift(m[1]);
		if (a.length > 0 && a[0][0] != m[1]) { // the parent changed: flush the finished group first
			process_a(a);
			a.length = 0;
		}
		a.push(t);
	}
	process_a(a);
	file.close();
	buf.destroy();
}
|
3599
|
+
|
3087
3600
|
/*************************
|
3088
3601
|
***** main function *****
|
3089
3602
|
*************************/
|
@@ -3098,6 +3611,9 @@ function main(args)
|
|
3098
3611
|
print(" sam2paf convert SAM to PAF");
|
3099
3612
|
print(" delta2paf convert MUMmer's delta to PAF");
|
3100
3613
|
print(" gff2bed convert GTF/GFF3 to BED12");
|
3614
|
+
print(" gff2junc convert GFF3 to junction BED");
|
3615
|
+
print(" longcs2seq convert long-cs PAF to sequences");
|
3616
|
+
// print(" paf2gff convert PAF to GFF3 (tested for miniprot only)");
|
3101
3617
|
print("");
|
3102
3618
|
print(" stat collect basic mapping information in PAF/SAM");
|
3103
3619
|
print(" asmstat collect basic assembly information");
|
@@ -3115,6 +3631,7 @@ function main(args)
|
|
3115
3631
|
print(" mason2fq convert mason2-simulated SAM to FASTQ");
|
3116
3632
|
print(" pbsim2fq convert PBSIM-simulated MAF to FASTQ");
|
3117
3633
|
print(" junceval evaluate splice junction consistency with known annotations");
|
3634
|
+
print(" exoneval evaluate exon-level consistency with known annotations");
|
3118
3635
|
print(" ov-eval evaluate read overlap sensitivity using read-to-ref mapping");
|
3119
3636
|
exit(1);
|
3120
3637
|
}
|
@@ -3125,6 +3642,7 @@ function main(args)
|
|
3125
3642
|
else if (cmd == 'delta2paf') paf_delta2paf(args);
|
3126
3643
|
else if (cmd == 'splice2bed') paf_splice2bed(args);
|
3127
3644
|
else if (cmd == 'gff2bed') paf_gff2bed(args);
|
3645
|
+
else if (cmd == 'gff2junc') paf_gff2junc(args);
|
3128
3646
|
else if (cmd == 'stat') paf_stat(args);
|
3129
3647
|
else if (cmd == 'asmstat') paf_asmstat(args);
|
3130
3648
|
else if (cmd == 'asmgene') paf_asmgene(args);
|
@@ -3138,10 +3656,13 @@ function main(args)
|
|
3138
3656
|
else if (cmd == 'mason2fq') paf_mason2fq(args);
|
3139
3657
|
else if (cmd == 'pbsim2fq') paf_pbsim2fq(args);
|
3140
3658
|
else if (cmd == 'junceval') paf_junceval(args);
|
3659
|
+
else if (cmd == 'exoneval') paf_exoneval(args);
|
3141
3660
|
else if (cmd == 'ov-eval') paf_ov_eval(args);
|
3142
3661
|
else if (cmd == 'vcfstat') paf_vcfstat(args);
|
3143
3662
|
else if (cmd == 'sveval') paf_sveval(args);
|
3144
3663
|
else if (cmd == 'vcfsel') paf_vcfsel(args);
|
3664
|
+
else if (cmd == 'longcs2seq') paf_longcs2seq(args);
|
3665
|
+
else if (cmd == 'paf2gff') paf_paf2gff(args);
|
3145
3666
|
else if (cmd == 'version') print(paftools_version);
|
3146
3667
|
else throw Error("unrecognized command: " + cmd);
|
3147
3668
|
}
|