minimap2 0.2.24.6 → 0.2.25.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env k8
2
2
 
3
- var paftools_version = '2.24-r1122';
3
+ var paftools_version = '2.25-r1173';
4
4
 
5
5
  /*****************************
6
6
  ***** Library functions *****
@@ -1532,22 +1532,24 @@ function paf_view(args)
1532
1532
 
1533
1533
  function paf_gff2bed(args)
1534
1534
  {
1535
- var c, fn_ucsc_fai = null, is_short = false, keep_gff = false, print_junc = false, output_gene = false;
1536
- while ((c = getopt(args, "u:sgjG")) != null) {
1535
+ var c, fn_ucsc_fai = null, is_short = false, keep_gff = false, print_junc = false, output_gene = false, ens_canon_only = false;
1536
+ while ((c = getopt(args, "u:sgjGe")) != null) {
1537
1537
  if (c == 'u') fn_ucsc_fai = getopt.arg;
1538
1538
  else if (c == 's') is_short = true;
1539
1539
  else if (c == 'g') keep_gff = true;
1540
1540
  else if (c == 'j') print_junc = true;
1541
1541
  else if (c == 'G') output_gene = true;
1542
+ else if (c == 'e') ens_canon_only = true;
1542
1543
  }
1543
1544
 
1544
1545
  if (getopt.ind == args.length) {
1545
1546
  print("Usage: paftools.js gff2bed [options] <in.gff>");
1546
1547
  print("Options:");
1547
- print(" -j Output junction BED");
1548
- print(" -s Print names in the short form");
1548
+ print(" -j output junction BED");
1549
+ print(" -s print names in the short form");
1549
1550
  print(" -u FILE hg38.fa.fai for chr name conversion");
1550
- print(" -g Output GFF (used with -u)");
1551
+ print(" -e only show transcript tagged with 'Ensembl_canonical'");
1552
+ print(" -g output GFF (used with -u)");
1551
1553
  exit(1);
1552
1554
  }
1553
1555
 
@@ -1606,7 +1608,7 @@ function paf_gff2bed(args)
1606
1608
  print(a[0][0], st, en, name, 1000, a[0][3], cds_st, cds_en, color, a.length, sizes.join(",") + ",", starts.join(",") + ",");
1607
1609
  }
1608
1610
 
1609
- var re_gtf = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name) "([^"]+)";/g;
1611
+ var re_gtf = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name|tag) "([^"]+)";/g;
1610
1612
  var re_gff3 = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name)=([^;]+)/g;
1611
1613
  var re_gtf_gene = /\b(gene_id|gene_type|gene_name) "([^;]+)";/g;
1612
1614
  var re_gff3_gene = /\b(gene_id|gene_type|source_gene|gene_biotype|gene_name)=([^;]+);/g;
@@ -1646,13 +1648,14 @@ function paf_gff2bed(args)
1646
1648
  if (t[2] != "CDS" && t[2] != "exon") continue;
1647
1649
  t[3] = parseInt(t[3]) - 1;
1648
1650
  t[4] = parseInt(t[4]);
1649
- var id = null, type = "", name = "N/A", biotype = "", m, tname = "N/A";
1651
+ var id = null, type = "", name = "N/A", biotype = "", m, tname = "N/A", ens_canonical = false;
1650
1652
  while ((m = re_gtf.exec(t[8])) != null) {
1651
1653
  if (m[1] == "transcript_id") id = m[2];
1652
1654
  else if (m[1] == "transcript_type") type = m[2];
1653
1655
  else if (m[1] == "transcript_biotype" || m[1] == "gbkey") biotype = m[2];
1654
1656
  else if (m[1] == "gene_name" || m[1] == "gene_id") name = m[2];
1655
1657
  else if (m[1] == "transcript_name") tname = m[2];
1658
+ else if (m[1] == "tag" && m[2] == "Ensembl_canonical") ens_canonical = true;
1656
1659
  }
1657
1660
  while ((m = re_gff3.exec(t[8])) != null) {
1658
1661
  if (m[1] == "transcript_id") id = m[2];
@@ -1661,6 +1664,7 @@ function paf_gff2bed(args)
1661
1664
  else if (m[1] == "gene_name" || m[1] == "gene_id") name = m[2];
1662
1665
  else if (m[1] == "transcript_name") tname = m[2];
1663
1666
  }
1667
+ if (ens_canon_only && !ens_canonical) continue;
1664
1668
  if (type == "" && biotype != "") type = biotype;
1665
1669
  if (id == null) throw Error("No transcript_id");
1666
1670
  if (id != last_id) {
@@ -2341,12 +2345,15 @@ function paf_pbsim2fq(args)
2341
2345
 
2342
2346
  function paf_junceval(args)
2343
2347
  {
2344
- var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false;
2345
- while ((c = getopt(args, "l:epc")) != null) {
2348
+ var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false, aa = false, is_bed = false;
2349
+ while ((c = getopt(args, "l:epcab1")) != null) {
2346
2350
  if (c == 'l') l_fuzzy = parseInt(getopt.arg);
2347
2351
  else if (c == 'e') print_err_only = print_ovlp = true;
2348
2352
  else if (c == 'p') print_ovlp = true;
2349
2353
  else if (c == 'c') chr_only = true;
2354
+ else if (c == 'a') aa = true;
2355
+ else if (c == 'b') is_bed = true;
2356
+ else if (c == '1') first_only = true;
2350
2357
  }
2351
2358
 
2352
2359
  if (args.length - getopt.ind < 1) {
@@ -2356,6 +2363,9 @@ function paf_junceval(args)
2356
2363
  print(" -p print overlapping introns");
2357
2364
  print(" -e print erroreous overlapping introns");
2358
2365
  print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/");
2366
+ print(" -a miniprot PAF as input");
2367
+ print(" -b BED as input");
2368
+ print(" -1 only process the first alignment of each query");
2359
2369
  exit(1);
2360
2370
  }
2361
2371
 
@@ -2409,13 +2419,17 @@ function paf_junceval(args)
2409
2419
 
2410
2420
  file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
2411
2421
  var last_qname = null;
2412
- var re_cigar = /(\d+)([MIDNSHP=X])/g;
2422
+ var re_cigar = /(\d+)([MIDNSHP=XFGUV])/g;
2413
2423
  while (file.readline(buf) >= 0) {
2414
2424
  var m, t = buf.toString().split("\t");
2415
- var ctg_name = null, cigar = null, pos = null, qname = t[0];
2425
+ var ctg_name = null, cigar = null, pos = null, qname;
2416
2426
 
2417
2427
  if (t[0].charAt(0) == '@') continue;
2418
- if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
2428
+ if (t[0] == "##PAF") t.shift();
2429
+ qname = t[0];
2430
+ if (is_bed) {
2431
+ ctg_name = t[0], pos = parseInt(t[1]), cigar == null;
2432
+ } else if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
2419
2433
  ctg_name = t[5], pos = parseInt(t[7]);
2420
2434
  var type = 'P';
2421
2435
  for (i = 12; i < t.length; ++i) {
@@ -2445,12 +2459,43 @@ function paf_junceval(args)
2445
2459
  }
2446
2460
 
2447
2461
  var intron = [];
2448
- while ((m = re_cigar.exec(cigar)) != null) {
2449
- var len = parseInt(m[1]), op = m[2];
2450
- if (op == 'N') {
2451
- intron.push([pos, pos + len]);
2452
- pos += len;
2453
- } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
2462
+ if (is_bed) {
2463
+ intron.push([pos, parseInt(t[2])]);
2464
+ } else if (aa) {
2465
+ var tmp_junc = [], tmp = 0;
2466
+ while ((m = re_cigar.exec(cigar)) != null) {
2467
+ var len = parseInt(m[1]), op = m[2];
2468
+ if (op == 'N') {
2469
+ tmp_junc.push([tmp, tmp + len]);
2470
+ tmp += len;
2471
+ } else if (op == 'U') {
2472
+ tmp_junc.push([tmp + 1, tmp + len - 2]);
2473
+ tmp += len;
2474
+ } else if (op == 'V') {
2475
+ tmp_junc.push([tmp + 2, tmp + len - 1]);
2476
+ tmp += len;
2477
+ } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') {
2478
+ tmp += len * 3;
2479
+ } else if (op == 'F' || op == 'G') {
2480
+ tmp += len;
2481
+ }
2482
+ }
2483
+ if (t[4] == '+') {
2484
+ for (var i = 0; i < tmp_junc.length; ++i)
2485
+ intron.push([pos + tmp_junc[i][0], pos + tmp_junc[i][1]]);
2486
+ } else if (t[4] == '-') {
2487
+ var glen = parseInt(t[8]) - parseInt(t[7]);
2488
+ for (var i = tmp_junc.length - 1; i >= 0; --i)
2489
+ intron.push([pos + (glen - tmp_junc[i][1]), pos + (glen - tmp_junc[i][0])]);
2490
+ }
2491
+ } else {
2492
+ while ((m = re_cigar.exec(cigar)) != null) {
2493
+ var len = parseInt(m[1]), op = m[2];
2494
+ if (op == 'N') {
2495
+ intron.push([pos, pos + len]);
2496
+ pos += len;
2497
+ } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
2498
+ }
2454
2499
  }
2455
2500
  if (intron.length == 0) {
2456
2501
  ++n_sgl;
@@ -2509,6 +2554,276 @@ function paf_junceval(args)
2509
2554
  }
2510
2555
  }
2511
2556
 
2557
/**
 * paftools.js exoneval: compare the exons implied by alignments against the
 * exons (or CDS) of a reference GTF. Adapted from paf_junceval().
 *
 * @param args  command-line arguments after the sub-command name:
 *              [options] <gene.gtf> [aln]. The alignment input may be SAM,
 *              PAF (detected by a strand character in column 5), miniprot PAF
 *              (-a) or BED (-b). A missing or "-" file name is opened with
 *              File() and no argument (presumably stdin in k8 -- confirm).
 *
 * Options:
 *   -l INT  tolerance on both exon ends for a prediction to count as correct
 *   -d      use CDS records of the GTF instead of exon records
 *   -a      miniprot PAF: the CIGAR counts amino acids; implies -d
 *   -p      print every predicted exon with the annotated exons it overlaps
 *   -e      like -p but only exons that are not correct
 *   -c      only keep alignments to contigs matching /^(chr)?([0-9]+|X|Y)$/
 *   -1      only process the first alignment of each query
 *   -b      BED input: one exon per line
 *   -s      additionally print base-level Sn and Sp
 *
 * Output goes through print(); progress messages go through warn().
 */
function paf_exoneval(args) // adapted from paf_junceval()
{
	var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false;
	var aa = false, is_bed = false, use_cds = false, eval_base = false;
	while ((c = getopt(args, "l:epcab1ds")) != null) {
		if (c == 'l') l_fuzzy = parseInt(getopt.arg, 10);
		else if (c == 'e') print_err_only = print_ovlp = true;
		else if (c == 'p') print_ovlp = true;
		else if (c == 'c') chr_only = true;
		else if (c == 'a') aa = true, use_cds = true;
		else if (c == 'b') is_bed = true;
		else if (c == '1') first_only = true;
		else if (c == 'd') use_cds = true;
		else if (c == 's') eval_base = true;
	}

	if (args.length - getopt.ind < 1) {
		print("Usage: paftools.js exoneval [options] <gene.gtf> <aln.sam>");
		print("Options:");
		print(" -l INT tolerance of junction positions (0 for exact) [0]");
		print(" -d evaluate coding regions only (exon regions by default)");
		print(" -a miniprot PAF as input (force -d)");
		print(" -p print overlapping exons");
		print(" -e print erroreous overlapping exons");
		print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/");
		print(" -1 only process the first alignment of each query");
		print(" -b BED as input");
		print(" -s compute base Sn and Sp (more memory)");
		exit(1);
	}

	// Format num/den as a percentage with two decimals. A zero denominator
	// (no predicted exons, no bases) yields "0.00" instead of "NaN".
	function percent(num, den) {
		return (den > 0? num / den * 100 : 0).toFixed(2);
	}

	// Exons of a nucleotide alignment: walk the CIGAR from pos and cut at every N.
	// A CIGAR without any recognised operator gives the single interval [pos, pos].
	function exons_from_nt_cigar(re, cigar, pos) {
		var m, exons = [], st = pos;
		while ((m = re.exec(cigar)) != null) {
			var len = parseInt(m[1], 10), op = m[2];
			if (op == 'N') {
				exons.push([st, pos]);
				pos += len, st = pos;
			} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
		}
		exons.push([st, pos]);
		return exons;
	}

	// Exons of a protein-to-genome alignment (PAF fields in t). M/X/=/D count
	// amino acids (3 bases each); F/G and the intron operators count bases.
	// U and V are introns splitting a codon: U leaves 1 base on the preceding
	// exon and 2 on the next one, V leaves 2 and 1. Offsets are first computed
	// relative to the alignment start along the protein, then placed on the
	// contig; for '-' strand hits they are mirrored within [t[7], t[8]).
	// Any other strand value produces no exons.
	function exons_from_aa_cigar(re, cigar, pos, t) {
		var m, rel = [], off = 0, st = 0, exons = [], i;
		while ((m = re.exec(cigar)) != null) {
			var len = parseInt(m[1], 10), op = m[2];
			if (op == 'N') {
				rel.push([st, off]);
				off += len, st = off;
			} else if (op == 'U') {
				rel.push([st, off + 1]);
				st = off + len - 2, off += len;
			} else if (op == 'V') {
				rel.push([st, off + 2]);
				st = off + len - 1, off += len;
			} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') {
				off += len * 3;
			} else if (op == 'F' || op == 'G') {
				off += len;
			}
		}
		rel.push([st, off]);
		if (t[4] == '+') {
			for (i = 0; i < rel.length; ++i)
				exons.push([pos + rel[i][0], pos + rel[i][1]]);
		} else if (t[4] == '-') {
			var glen = parseInt(t[8], 10) - parseInt(t[7], 10);
			for (i = rel.length - 1; i >= 0; --i)
				exons.push([pos + (glen - rel[i][1]), pos + (glen - rel[i][0])]);
		}
		return exons;
	}

	// Replace every per-contig interval list in ex by its sorted, merged
	// (non-overlapping) equivalent and index it for Interval.find_ovlp().
	function merge_and_index(ex) {
		for (var chr in ex) {
			var merged = [], e = ex[chr];
			if (e.length == 0) continue;
			Interval.sort(e);
			var st = e[0][0], en = e[0][1];
			for (var i = 1; i < e.length; ++i) {
				if (e[i][0] > en) {
					merged.push([st, en]);
					st = e[i][0], en = e[i][1];
				} else {
					en = en > e[i][1]? en : e[i][1];
				}
			}
			merged.push([st, en]);
			Interval.index_end(merged);
			ex[chr] = merged;
		}
	}

	// Return [total bases in a1, bases of a1 also covered by a0].
	// Only valid when both inputs went through merge_and_index(): overlapping
	// intervals within a0 would be counted more than once.
	function cal_sn(a0, a1) {
		var tot = 0, cov = 0;
		for (var chr in a1) {
			var e0 = a0[chr], e1 = a1[chr], i;
			for (i = 0; i < e1.length; ++i)
				tot += e1[i][1] - e1[i][0];
			if (e0 == null) continue;
			for (i = 0; i < e1.length; ++i) {
				var o = Interval.find_ovlp(e0, e1[i][0], e1[i][1]);
				for (var j = 0; j < o.length; ++j) {
					var st = e1[i][0] > o[j][0]? e1[i][0] : o[j][0];
					var en = e1[i][1] < o[j][1]? e1[i][1] : o[j][1];
					cov += en - st;
				}
			}
		}
		return [tot, cov];
	}

	var file, buf = new Bytes();
	var m, t, i, j;

	warn("Reading reference GTF...");
	var tr = {}; // transcript_id -> { ctg, exons }; the contig is the one of the first record seen
	file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		t = buf.toString().split("\t");
		if (t[0].charAt(0) == '#') continue;
		if (use_cds) {
			if (t[2] != "cds" && t[2] != "CDS") continue;
		} else {
			if (t[2] != 'exon') continue;
		}
		var st = parseInt(t[3], 10) - 1; // GTF is 1-based inclusive; convert to 0-based half-open
		var en = parseInt(t[4], 10);
		if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue;
		var tid = m[1];
		if (tr[tid] == null) tr[tid] = { ctg: t[0], exons: [] };
		tr[tid].exons.push([st, en]);
	}
	file.close();

	var anno = {}; // contig -> list of annotated [start, end) intervals
	for (var id in tr) {
		var rec = tr[id];
		Interval.sort(rec.exons);
		if (anno[rec.ctg] == null) anno[rec.ctg] = [];
		for (i = 0; i < rec.exons.length; ++i)
			anno[rec.ctg].push([rec.exons[i][0], rec.exons[i][1]]);
	}
	tr = null;

	for (var ctg in anno) { // sort, drop exact duplicates, then index
		var e = anno[ctg];
		if (e.length == 0) continue;
		Interval.sort(e);
		var k = 0;
		for (i = 1; i < e.length; ++i)
			if (e[i][0] != e[k][0] || e[i][1] != e[k][1])
				e[++k] = e[i].slice(0);
		e.length = k + 1;
		Interval.index_end(e);
	}

	var n_pri = 0, n_unmapped = 0, n_mapped = 0;
	var n_exon = 0, n_exon_hit = 0, n_exon_novel = 0;

	file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
	var last_qname = null, qexon = {};
	var re_cigar = /(\d+)([MIDNSHP=XFGUV])/g;

	warn("Evaluating alignments...");
	while (file.readline(buf) >= 0) {
		t = buf.toString().split("\t");
		var ctg_name = null, cigar = null, pos = null, qname;

		if (t[0].charAt(0) == '@') continue; // SAM header
		if (t[0] == "##PAF") t.shift();      // PAF record carrying a "##PAF" prefix column
		qname = t[0];
		if (is_bed) {
			ctg_name = t[0], pos = parseInt(t[1], 10), cigar = null;
		} else if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
			ctg_name = t[5], pos = parseInt(t[7], 10);
			var aln_type = 'P';
			for (i = 12; i < t.length; ++i) {
				if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) {
					if (m[1] == 'tp:A') aln_type = m[2];
					else cigar = m[2];
				}
			}
			if (aln_type == 'S') continue; // secondary
		} else { // SAM
			ctg_name = t[2], pos = parseInt(t[3], 10) - 1, cigar = t[5];
			var flag = parseInt(t[1], 10);
			if (flag&0x100) continue; // secondary
		}

		if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue;
		if (first_only && last_qname == qname) continue;
		if (ctg_name == '*') { // unmapped
			++n_unmapped;
			continue;
		} else {
			++n_pri;
			if (last_qname != qname) {
				++n_mapped;
				last_qname = qname;
			}
		}

		var exon;
		if (is_bed) exon = [[pos, parseInt(t[2], 10)]];
		else if (aa) exon = exons_from_aa_cigar(re_cigar, cigar, pos, t);
		else exon = exons_from_nt_cigar(re_cigar, cigar, pos);
		n_exon += exon.length;

		var ref_exons = anno[ctg_name];
		if (ref_exons == null) { // contig absent from the annotation: nothing can overlap
			n_exon_novel += exon.length;
			continue;
		}
		for (i = 0; i < exon.length; ++i) {
			if (eval_base) {
				if (qexon[ctg_name] == null) qexon[ctg_name] = [];
				qexon[ctg_name].push([exon[i][0], exon[i][1]]);
			}
			var o = Interval.find_ovlp(ref_exons, exon[i][0], exon[i][1]);
			if (o.length == 0) {
				++n_exon_novel;
				if (print_ovlp)
					print('N', qname, i+1, ctg_name, exon[i][0], exon[i][1]);
				continue;
			}
			// correct if any overlapping annotated exon has both ends within l_fuzzy
			var hit = false;
			for (j = 0; j < o.length && !hit; ++j) {
				var st_diff = exon[i][0] - o[j][0];
				var en_diff = exon[i][1] - o[j][1];
				if (st_diff < 0) st_diff = -st_diff;
				if (en_diff < 0) en_diff = -en_diff;
				if (st_diff <= l_fuzzy && en_diff <= l_fuzzy)
					++n_exon_hit, hit = true;
			}
			if (print_ovlp) {
				if (hit && print_err_only) continue;
				var x = '[';
				for (j = 0; j < o.length; ++j) {
					if (j) x += ', ';
					x += '(' + o[j][0] + "," + o[j][1] + ')';
				}
				x += ']';
				print(hit? 'C' : 'P', qname, i+1, ctg_name, exon[i][0], exon[i][1], x);
			}
		}
	}
	file.close();

	buf.destroy();

	if (!print_ovlp) {
		print("# unmapped reads: " + n_unmapped);
		print("# mapped reads: " + n_mapped);
		print("# primary alignments: " + n_pri);
		print("# predicted exons: " + n_exon);
		print("# non-overlapping exons: " + n_exon_novel);
		print("# correct exons: " + n_exon_hit + " (" + percent(n_exon_hit, n_exon) + "%)");
	}

	if (eval_base) {
		warn("Computing base Sn and Sp...");
		merge_and_index(qexon);
		merge_and_index(anno);
		var sn = cal_sn(qexon, anno);
		var sp = cal_sn(anno, qexon);
		print("Base Sn: " + sn[1] + " / " + sn[0] + " = " + percent(sn[1], sn[0]) + "%");
		print("Base Sp: " + sp[1] + " / " + sp[0] + " = " + percent(sp[1], sp[0]) + "%");
	}
}
2826
+
2512
2827
  // evaluate overlap sensitivity
2513
2828
  function paf_ov_eval(args)
2514
2829
  {
@@ -2704,6 +3019,23 @@ function paf_misjoin(args)
2704
3019
  return len < (en - st) * cen_ratio? false : true;
2705
3020
  }
2706
3021
 
3022
+ function test_cen_point(cen, chr, x) {
3023
+ var b = cen[chr];
3024
+ if (b == null) return false;
3025
+ for (var j = 0; j < b.length; ++j)
3026
+ if (x >= b[j][0] && x < b[j][1])
3027
+ return true;
3028
+ return false;
3029
+ }
3030
+
3031
+ if (show_err || show_long) {
3032
+ print("C\tJ inter-chromosomal misjoin");
3033
+ print("C\tj inter-chromosomal misjoin with both breakpoints ending in centromeres");
3034
+ print("C\tG long gap on the reference genome");
3035
+ print("C\tg long gap on the reference genome with both breakpoints ending in centromeres");
3036
+ print("C\tM closed inversion");
3037
+ print("C");
3038
+ }
2707
3039
  function process(a) {
2708
3040
  var k = 0;
2709
3041
  for (var i = 0; i < a.length; ++i) {
@@ -2716,14 +3048,17 @@ function paf_misjoin(args)
2716
3048
  a = a.sort(function(x,y){return x[2]-y[2]});
2717
3049
  if (show_long) for (var i = 0; i < a.length; ++i) print(a[i].join("\t"));
2718
3050
  for (var i = 1; i < a.length; ++i) {
2719
- var ov = [false, false];
3051
+ var ov = [false, false], end_cen = [false, false];
2720
3052
  ov[0] = test_cen(cen, a[i-1][5], a[i-1][7], a[i-1][8]);
2721
3053
  ov[1] = test_cen(cen, a[i][5], a[i][7], a[i][8]);
3054
+ end_cen[0] = test_cen_point(cen, a[i-1][5], a[i-1][4] == '+'? a[i-1][8] : a[i-1][7]);
3055
+ end_cen[1] = test_cen_point(cen, a[i][5], a[i][4] == '+'? a[i][7] : a[i][8]);
2722
3056
  if (a[i-1][5] != a[i][5]) { // different chr
2723
3057
  if (ov[0] || ov[1]) ++n_diff[1];
2724
3058
  else if (show_err) {
2725
- print("J", a[i-1].slice(0, 12).join("\t"));
2726
- print("J", a[i].slice(0, 12).join("\t"));
3059
+ var label = end_cen[0] && end_cen[1]? 'j' : 'J';
3060
+ print(label, a[i-1].slice(0, 12).join("\t"));
3061
+ print(label, a[i].slice(0, 12).join("\t"));
2727
3062
  }
2728
3063
  ++n_diff[0];
2729
3064
  } else if (a[i-1][4] == a[i][4]) { // a gap
@@ -2733,8 +3068,9 @@ function paf_misjoin(args)
2733
3068
  if (gap > max_gap) {
2734
3069
  if (ov[0] || ov[1]) ++n_gap[1];
2735
3070
  else if (show_err) {
2736
- print("G", a[i-1].slice(0, 12).join("\t"));
2737
- print("G", a[i].slice(0, 12).join("\t"));
3071
+ var label = end_cen[0] && end_cen[1]? 'g' : 'G';
3072
+ print(label, a[i-1].slice(0, 12).join("\t"));
3073
+ print(label, a[i].slice(0, 12).join("\t"));
2738
3074
  }
2739
3075
  ++n_gap[0];
2740
3076
  }
@@ -3084,6 +3420,183 @@ function paf_pafcmp(args)
3084
3420
  buf.destroy();
3085
3421
  }
3086
3422
 
3423
/**
 * paftools.js longcs2seq: rebuild the aligned target (default) or query (-q)
 * sequence of each PAF record from its long-form cs:Z tag and print it as a
 * two-line FASTA record named <name>_<start>_<end>.
 *
 * Records without a cs tag are skipped. A short-form cs (":<n>" segments)
 * raises Error("Long cs is required"). Segments other than = + - * : are
 * not matched by the parser and therefore contribute nothing.
 *
 * @param args  command-line arguments after the sub-command name: [-q] <long-cs.paf>
 */
function paf_longcs2seq(args) {
	var ch, want_query = false;
	while ((ch = getopt(args, "q")) != null) {
		if (ch == 'q') want_query = true;
	}
	if (args.length == getopt.ind) {
		print("Usage: paftools.js longcs2seq [-q] <long-cs.paf>");
		return;
	}
	var re_op = /([:=*+-])(\d+|[A-Za-z]+)/g;
	var line = new Bytes();
	var fp = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	while (fp.readline(line) >= 0) {
		var f = line.toString().split("\t");
		var hit, cs = null;
		// optional tags start at column 13; take the first cs:Z tag
		for (var k = 12; k < f.length && cs == null; ++k) {
			hit = /^cs:Z:(\S+)/.exec(f[k]);
			if (hit != null) cs = hit[1];
		}
		if (cs == null) continue;
		var tseq = "", qseq = "";
		while ((hit = re_op.exec(cs)) != null) {
			var operand = hit[2];
			switch (hit[1]) {
			case "=": // identical stretch: present in both sequences as written
				tseq += operand;
				qseq += operand;
				break;
			case "+": // insertion: query only
				qseq += operand.toUpperCase();
				break;
			case "-": // deletion: target only
				tseq += operand.toUpperCase();
				break;
			case "*": // substitution: first letter is the target base, second the query base
				tseq += operand[0].toUpperCase();
				qseq += operand[1].toUpperCase();
				break;
			case ":":
				throw Error("Long cs is required");
			}
		}
		if (want_query) {
			print(">" + t_name(f, 0, 2, 3));
			print(qseq);
		} else {
			print(">" + t_name(f, 5, 7, 8));
			print(tseq);
		}
	}
	fp.close();
	line.destroy();

	// FASTA name built from the name/start/end columns of a PAF record
	function t_name(cols, i_name, i_st, i_en) {
		return cols[i_name] + "_" + cols[i_st] + "_" + cols[i_en];
	}
}
3461
+
3462
/**
 * paftools.js paf2gff: print each mapped PAF record as a 'transcript' line
 * followed by its exon/CDS (and, when the da:i / do:i tags are 0, start_codon /
 * stop_codon) lines. The source column is 'paf2gff'.
 *
 * Requires the cg:Z and AS:i tags on every mapped record; throws Error
 * otherwise, and also when the CIGAR length disagrees with the target
 * interval in columns 8-9. With -a, M and D lengths are counted in amino
 * acids (3 bases each) and '-' strand features are mirrored within the
 * target interval. Records whose CIGAR contains F, G or R are typed
 * 'pseudogene' and keep 'exon' features; all others are 'protein_coding'
 * with 'exon' renamed to 'CDS'.
 *
 * @param args  command-line arguments after the sub-command name: [-a] <in.paf>
 */
function paf_paf2gff(args) {
	var c, is_aa = false;
	var re_op = /(\d+)([A-Z=])/g;
	while ((c = getopt(args, "a")) != null)
		if (c == 'a') is_aa = true;
	if (getopt.ind == args.length) {
		print("Usage: paftools.js paf2gff [-a] <in.paf>");
		return;
	}

	// One output row; column 9 temporarily holds the frameshift flag until printing.
	function gff_row(ctg, strand, feature, st, en, phase, fs) {
		return [ctg, 'paf2gff', feature, st, en, 0, strand, phase, fs];
	}

	var line = new Bytes();
	var fp = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var hit_id = 1, prev_name = null;
	while (fp.readline(line) >= 0) {
		var m, k, f = line.toString().split("\t");
		if (f[5] == '*') continue; // skip unmapped lines

		// number consecutive hits of the same query 1, 2, 3, ...
		if (f[0] == prev_name) {
			++hit_id;
		} else {
			prev_name = f[0];
			hit_id = 1;
		}
		// numeric PAF columns: 2-4 and 7-12
		for (k = 1; k <= 11; ++k)
			if (k <= 3 || k >= 6) f[k] = parseInt(f[k]);

		var cigar = null, score = null, n_pos = null, d_stop = null, d_start = null;
		for (k = 12; k < f.length; ++k) {
			m = /^(cg:Z|AS:i|np:i|da:i|do:i):(\S+)/.exec(f[k]);
			if (m == null) continue;
			switch (m[1]) {
			case 'cg:Z': cigar = m[2]; break;
			case 'AS:i': score = parseInt(m[2]); break;
			case 'np:i': n_pos = parseInt(m[2]); break;
			case 'do:i': d_stop = parseInt(m[2]); break;
			case 'da:i': d_start = parseInt(m[2]); break;
			}
		}
		if (cigar == null) throw Error("failed to find the cg:Z tag");
		if (score == null) throw Error("failed to find the AS:i tag");

		// st/en are offsets from the alignment start; shifted to contig coordinates when printed
		var st = 0, en = 0, phase = 0, is_pseudo = false, fs = 0, rows = [];
		if (d_start === 0)
			rows.push(gff_row(f[5], f[4], 'start_codon', 0, 3, '.', 0));
		while ((m = re_op.exec(cigar)) != null) {
			var n = parseInt(m[1]);
			switch (m[2]) {
			case 'M':
			case 'D':
				en += is_aa? n * 3 : n;
				break;
			case 'F':
			case 'G':
			case 'R':
				en += n;
				is_pseudo = true;
				fs = 1;
				break;
			case 'N':
				rows.push(gff_row(f[5], f[4], 'exon', st, en, phase, fs));
				st = en + n;
				en += n;
				phase = 0;
				fs = 0;
				break;
			case 'U': // ...xGT...AGxx...
				rows.push(gff_row(f[5], f[4], 'exon', st, en + 1, phase, fs));
				st = en + n - 2;
				en += n;
				phase = 2;
				fs = 0;
				break;
			case 'V': // ...xxGT...AGx...
				rows.push(gff_row(f[5], f[4], 'exon', st, en + 2, phase, fs));
				st = en + n - 1;
				en += n;
				phase = 1;
				fs = 0;
				break;
			}
		}
		rows.push(gff_row(f[5], f[4], 'exon', st, en, phase, fs));
		if (en != f[8] - f[7]) throw Error("inconsistent cigar");
		if (d_stop === 0)
			rows.push(gff_row(f[5], f[4], 'stop_codon', en, en + 3, '.', 0));

		var type = is_pseudo? 'pseudogene' : 'protein_coding';
		var attr = 'transcript_id=' + f[0] + '#' + hit_id + ';' + 'transcript_type=' + type;
		var extra = ['identity=' + (f[9] / f[10]).toFixed(4)];
		if (n_pos != null) extra.push('positive=' + (n_pos * 3 / f[10]).toFixed(4));
		extra.push('aa_start=' + f[2]);
		extra.push('aa_end=' + (f[1] - f[3]));
		if (d_start != null && d_start >= 0) extra.push('dist_start_codon=' + d_start);
		if (d_stop != null && d_stop >= 0) extra.push('dist_stop_codon=' + d_stop);

		// the transcript interval grows by the stop codon on its 3' side
		var tr_st = f[7], tr_en = f[8];
		if (d_stop === 0) {
			if (f[4] == '-') tr_st -= 3;
			else tr_en += 3;
		}
		print([f[5], 'paf2gff', 'transcript', tr_st + 1, tr_en, score, f[4], '.', attr + ';' + extra.join(";")].join("\t"));

		if (is_aa && f[4] == '-') {
			// mirror the offsets within the target interval and restore ascending order;
			// the phase column is left untouched (the original author was unsure whether it needs flipping)
			var span = f[8] - f[7];
			for (k = 0; k < rows.length; ++k) {
				var old_st = rows[k][3];
				rows[k][3] = span - rows[k][4];
				rows[k][4] = span - old_st;
			}
			rows.reverse();
		}
		for (k = 0; k < rows.length; ++k) {
			var r = rows[k];
			if (!is_pseudo && r[2] == "exon") r[2] = "CDS";
			r[3] += f[7] + 1; // 1-based start
			r[4] += f[7];
			r[8] = attr + ";frameshift=" + r[8];
			print(r.join("\t"));
		}
	}
	fp.close();
	line.destroy();
}
3556
+
3557
/**
 * paftools.js gff2junc: print the gaps between consecutive features that
 * share a Parent= attribute as tab-separated junction lines
 * (contig, end of the left feature, 0-based start of the right feature,
 * parent id, 0, strand).
 *
 * Features are grouped by runs of consecutive lines with the same Parent, so
 * the input is expected to list the features of one parent contiguously.
 * Matching features without a Parent attribute trigger a warn() and are
 * skipped. Blank lines and lines with fewer than three columns are ignored.
 *
 * @param args  command-line arguments after the sub-command name:
 *              [-f feature] <in.gff3>; the feature type defaults to "CDS"
 *              and is compared case-insensitively.
 */
function paf_gff2junc(args) {
	var c, feat = "CDS";
	while ((c = getopt(args, "f:")) != null) {
		if (c == 'f') feat = getopt.arg;
	}
	if (getopt.ind == args.length) {
		print("Usage: paftools.js gff2junc [-f feature] <in.gff3>");
		return;
	}
	var buf = new Bytes();
	var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	var feat_lc = feat.toLowerCase();

	// Emit one junction per pair of neighbouring features of a single parent.
	// Rows are [parent, contig, source, type, start, end, score, strand, ...].
	function process_a(a) {
		if (a.length < 2) return;
		a = a.sort(function(x, y) { return x[4] - y[4] });
		for (var i = 1; i < a.length; ++i)
			print([a[i][1], a[i-1][5], a[i][4], a[i][0], 0, a[i][7]].join("\t"));
	}

	var a = [];
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		if (t[0].charAt(0) == '#') continue;
		if (t.length < 3) continue; // blank or truncated line: no feature type column to test
		if (t[2].toLowerCase() != feat_lc) continue;
		if ((m = /\bParent=([^;]+)/.exec(t[8])) == null) {
			warn("Can't find Parent");
			continue;
		}
		t[3] = parseInt(t[3], 10) - 1; // 1-based inclusive -> 0-based half-open
		t[4] = parseInt(t[4], 10);
		t.unshift(m[1]);
		if (a.length > 0 && a[0][0] != m[1]) { // parent changed: flush the previous group
			process_a(a);
			a.length = 0;
		}
		a.push(t);
	}
	process_a(a);
	file.close();
	buf.destroy();
}
3599
+
3087
3600
  /*************************
3088
3601
  ***** main function *****
3089
3602
  *************************/
@@ -3098,6 +3611,9 @@ function main(args)
3098
3611
  print(" sam2paf convert SAM to PAF");
3099
3612
  print(" delta2paf convert MUMmer's delta to PAF");
3100
3613
  print(" gff2bed convert GTF/GFF3 to BED12");
3614
+ print(" gff2junc convert GFF3 to junction BED");
3615
+ print(" longcs2seq convert long-cs PAF to sequences");
3616
+ // print(" paf2gff convert PAF to GFF3 (tested for miniprot only)");
3101
3617
  print("");
3102
3618
  print(" stat collect basic mapping information in PAF/SAM");
3103
3619
  print(" asmstat collect basic assembly information");
@@ -3115,6 +3631,7 @@ function main(args)
3115
3631
  print(" mason2fq convert mason2-simulated SAM to FASTQ");
3116
3632
  print(" pbsim2fq convert PBSIM-simulated MAF to FASTQ");
3117
3633
  print(" junceval evaluate splice junction consistency with known annotations");
3634
+ print(" exoneval evaluate exon-level consistency with known annotations");
3118
3635
  print(" ov-eval evaluate read overlap sensitivity using read-to-ref mapping");
3119
3636
  exit(1);
3120
3637
  }
@@ -3125,6 +3642,7 @@ function main(args)
3125
3642
  else if (cmd == 'delta2paf') paf_delta2paf(args);
3126
3643
  else if (cmd == 'splice2bed') paf_splice2bed(args);
3127
3644
  else if (cmd == 'gff2bed') paf_gff2bed(args);
3645
+ else if (cmd == 'gff2junc') paf_gff2junc(args);
3128
3646
  else if (cmd == 'stat') paf_stat(args);
3129
3647
  else if (cmd == 'asmstat') paf_asmstat(args);
3130
3648
  else if (cmd == 'asmgene') paf_asmgene(args);
@@ -3138,10 +3656,13 @@ function main(args)
3138
3656
  else if (cmd == 'mason2fq') paf_mason2fq(args);
3139
3657
  else if (cmd == 'pbsim2fq') paf_pbsim2fq(args);
3140
3658
  else if (cmd == 'junceval') paf_junceval(args);
3659
+ else if (cmd == 'exoneval') paf_exoneval(args);
3141
3660
  else if (cmd == 'ov-eval') paf_ov_eval(args);
3142
3661
  else if (cmd == 'vcfstat') paf_vcfstat(args);
3143
3662
  else if (cmd == 'sveval') paf_sveval(args);
3144
3663
  else if (cmd == 'vcfsel') paf_vcfsel(args);
3664
+ else if (cmd == 'longcs2seq') paf_longcs2seq(args);
3665
+ else if (cmd == 'paf2gff') paf_paf2gff(args);
3145
3666
  else if (cmd == 'version') print(paftools_version);
3146
3667
  else throw Error("unrecognized command: " + cmd);
3147
3668
  }