minimap2 0.2.24.6 → 0.2.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env k8
2
2
 
3
- var paftools_version = '2.24-r1122';
3
+ var paftools_version = '2.25-r1173';
4
4
 
5
5
  /*****************************
6
6
  ***** Library functions *****
@@ -1532,22 +1532,24 @@ function paf_view(args)
1532
1532
 
1533
1533
  function paf_gff2bed(args)
1534
1534
  {
1535
- var c, fn_ucsc_fai = null, is_short = false, keep_gff = false, print_junc = false, output_gene = false;
1536
- while ((c = getopt(args, "u:sgjG")) != null) {
1535
+ var c, fn_ucsc_fai = null, is_short = false, keep_gff = false, print_junc = false, output_gene = false, ens_canon_only = false;
1536
+ while ((c = getopt(args, "u:sgjGe")) != null) {
1537
1537
  if (c == 'u') fn_ucsc_fai = getopt.arg;
1538
1538
  else if (c == 's') is_short = true;
1539
1539
  else if (c == 'g') keep_gff = true;
1540
1540
  else if (c == 'j') print_junc = true;
1541
1541
  else if (c == 'G') output_gene = true;
1542
+ else if (c == 'e') ens_canon_only = true;
1542
1543
  }
1543
1544
 
1544
1545
  if (getopt.ind == args.length) {
1545
1546
  print("Usage: paftools.js gff2bed [options] <in.gff>");
1546
1547
  print("Options:");
1547
- print(" -j Output junction BED");
1548
- print(" -s Print names in the short form");
1548
+ print(" -j output junction BED");
1549
+ print(" -s print names in the short form");
1549
1550
  print(" -u FILE hg38.fa.fai for chr name conversion");
1550
- print(" -g Output GFF (used with -u)");
1551
+ print(" -e only show transcript tagged with 'Ensembl_canonical'");
1552
+ print(" -g output GFF (used with -u)");
1551
1553
  exit(1);
1552
1554
  }
1553
1555
 
@@ -1606,7 +1608,7 @@ function paf_gff2bed(args)
1606
1608
  print(a[0][0], st, en, name, 1000, a[0][3], cds_st, cds_en, color, a.length, sizes.join(",") + ",", starts.join(",") + ",");
1607
1609
  }
1608
1610
 
1609
- var re_gtf = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name) "([^"]+)";/g;
1611
+ var re_gtf = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name|tag) "([^"]+)";/g;
1610
1612
  var re_gff3 = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name)=([^;]+)/g;
1611
1613
  var re_gtf_gene = /\b(gene_id|gene_type|gene_name) "([^;]+)";/g;
1612
1614
  var re_gff3_gene = /\b(gene_id|gene_type|source_gene|gene_biotype|gene_name)=([^;]+);/g;
@@ -1646,13 +1648,14 @@ function paf_gff2bed(args)
1646
1648
  if (t[2] != "CDS" && t[2] != "exon") continue;
1647
1649
  t[3] = parseInt(t[3]) - 1;
1648
1650
  t[4] = parseInt(t[4]);
1649
- var id = null, type = "", name = "N/A", biotype = "", m, tname = "N/A";
1651
+ var id = null, type = "", name = "N/A", biotype = "", m, tname = "N/A", ens_canonical = false;
1650
1652
  while ((m = re_gtf.exec(t[8])) != null) {
1651
1653
  if (m[1] == "transcript_id") id = m[2];
1652
1654
  else if (m[1] == "transcript_type") type = m[2];
1653
1655
  else if (m[1] == "transcript_biotype" || m[1] == "gbkey") biotype = m[2];
1654
1656
  else if (m[1] == "gene_name" || m[1] == "gene_id") name = m[2];
1655
1657
  else if (m[1] == "transcript_name") tname = m[2];
1658
+ else if (m[1] == "tag" && m[2] == "Ensembl_canonical") ens_canonical = true;
1656
1659
  }
1657
1660
  while ((m = re_gff3.exec(t[8])) != null) {
1658
1661
  if (m[1] == "transcript_id") id = m[2];
@@ -1661,6 +1664,7 @@ function paf_gff2bed(args)
1661
1664
  else if (m[1] == "gene_name" || m[1] == "gene_id") name = m[2];
1662
1665
  else if (m[1] == "transcript_name") tname = m[2];
1663
1666
  }
1667
+ if (ens_canon_only && !ens_canonical) continue;
1664
1668
  if (type == "" && biotype != "") type = biotype;
1665
1669
  if (id == null) throw Error("No transcript_id");
1666
1670
  if (id != last_id) {
@@ -2341,12 +2345,15 @@ function paf_pbsim2fq(args)
2341
2345
 
2342
2346
  function paf_junceval(args)
2343
2347
  {
2344
- var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false;
2345
- while ((c = getopt(args, "l:epc")) != null) {
2348
+ var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false, aa = false, is_bed = false;
2349
+ while ((c = getopt(args, "l:epcab1")) != null) {
2346
2350
  if (c == 'l') l_fuzzy = parseInt(getopt.arg);
2347
2351
  else if (c == 'e') print_err_only = print_ovlp = true;
2348
2352
  else if (c == 'p') print_ovlp = true;
2349
2353
  else if (c == 'c') chr_only = true;
2354
+ else if (c == 'a') aa = true;
2355
+ else if (c == 'b') is_bed = true;
2356
+ else if (c == '1') first_only = true;
2350
2357
  }
2351
2358
 
2352
2359
  if (args.length - getopt.ind < 1) {
@@ -2356,6 +2363,9 @@ function paf_junceval(args)
2356
2363
  print(" -p print overlapping introns");
2357
2364
  print(" -e print erroreous overlapping introns");
2358
2365
  print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/");
2366
+ print(" -a miniprot PAF as input");
2367
+ print(" -b BED as input");
2368
+ print(" -1 only process the first alignment of each query");
2359
2369
  exit(1);
2360
2370
  }
2361
2371
 
@@ -2409,13 +2419,17 @@ function paf_junceval(args)
2409
2419
 
2410
2420
  file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
2411
2421
  var last_qname = null;
2412
- var re_cigar = /(\d+)([MIDNSHP=X])/g;
2422
+ var re_cigar = /(\d+)([MIDNSHP=XFGUV])/g;
2413
2423
  while (file.readline(buf) >= 0) {
2414
2424
  var m, t = buf.toString().split("\t");
2415
- var ctg_name = null, cigar = null, pos = null, qname = t[0];
2425
+ var ctg_name = null, cigar = null, pos = null, qname;
2416
2426
 
2417
2427
  if (t[0].charAt(0) == '@') continue;
2418
- if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
2428
+ if (t[0] == "##PAF") t.shift();
2429
+ qname = t[0];
2430
+ if (is_bed) {
2431
+ ctg_name = t[0], pos = parseInt(t[1]), cigar == null;
2432
+ } else if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
2419
2433
  ctg_name = t[5], pos = parseInt(t[7]);
2420
2434
  var type = 'P';
2421
2435
  for (i = 12; i < t.length; ++i) {
@@ -2445,12 +2459,43 @@ function paf_junceval(args)
2445
2459
  }
2446
2460
 
2447
2461
  var intron = [];
2448
- while ((m = re_cigar.exec(cigar)) != null) {
2449
- var len = parseInt(m[1]), op = m[2];
2450
- if (op == 'N') {
2451
- intron.push([pos, pos + len]);
2452
- pos += len;
2453
- } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
2462
+ if (is_bed) {
2463
+ intron.push([pos, parseInt(t[2])]);
2464
+ } else if (aa) {
2465
+ var tmp_junc = [], tmp = 0;
2466
+ while ((m = re_cigar.exec(cigar)) != null) {
2467
+ var len = parseInt(m[1]), op = m[2];
2468
+ if (op == 'N') {
2469
+ tmp_junc.push([tmp, tmp + len]);
2470
+ tmp += len;
2471
+ } else if (op == 'U') {
2472
+ tmp_junc.push([tmp + 1, tmp + len - 2]);
2473
+ tmp += len;
2474
+ } else if (op == 'V') {
2475
+ tmp_junc.push([tmp + 2, tmp + len - 1]);
2476
+ tmp += len;
2477
+ } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') {
2478
+ tmp += len * 3;
2479
+ } else if (op == 'F' || op == 'G') {
2480
+ tmp += len;
2481
+ }
2482
+ }
2483
+ if (t[4] == '+') {
2484
+ for (var i = 0; i < tmp_junc.length; ++i)
2485
+ intron.push([pos + tmp_junc[i][0], pos + tmp_junc[i][1]]);
2486
+ } else if (t[4] == '-') {
2487
+ var glen = parseInt(t[8]) - parseInt(t[7]);
2488
+ for (var i = tmp_junc.length - 1; i >= 0; --i)
2489
+ intron.push([pos + (glen - tmp_junc[i][1]), pos + (glen - tmp_junc[i][0])]);
2490
+ }
2491
+ } else {
2492
+ while ((m = re_cigar.exec(cigar)) != null) {
2493
+ var len = parseInt(m[1]), op = m[2];
2494
+ if (op == 'N') {
2495
+ intron.push([pos, pos + len]);
2496
+ pos += len;
2497
+ } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
2498
+ }
2454
2499
  }
2455
2500
  if (intron.length == 0) {
2456
2501
  ++n_sgl;
@@ -2509,6 +2554,276 @@ function paf_junceval(args)
2509
2554
  }
2510
2555
  }
2511
2556
 
2557
/**
 * exoneval: compare exons implied by alignments against exons in a reference GTF.
 * Adapted from paf_junceval().
 *
 * Command-line options (parsed from `args` with getopt):
 *   -l INT  tolerance on exon start/end positions (0 means exact match)
 *   -d      use CDS features of the GTF instead of exon features
 *   -a      input is a miniprot-style PAF (amino-acid CIGAR); also turns on -d
 *   -p      print every query exon that overlaps an annotated exon
 *   -e      like -p but only print exons that are not an exact/fuzzy match
 *   -c      only consider contigs matching /^(chr)?([0-9]+|X|Y)$/
 *   -1      only process the first alignment of each query
 *   -b      input is BED (one interval per line) instead of PAF/SAM
 *   -s      additionally compute base-level Sn and Sp
 *
 * Positional arguments: <gene.gtf> [<aln.sam|paf|bed>]; "-" or a missing second
 * argument makes `new File()` be used (presumably stdin -- confirm against k8).
 *
 * Output goes through print(); nothing is returned.
 */
function paf_exoneval(args) // adapted from paf_junceval()
{
	var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false, aa = false, is_bed = false, use_cds = false, eval_base = false;
	while ((c = getopt(args, "l:epcab1ds")) != null) {
		if (c == 'l') l_fuzzy = parseInt(getopt.arg);
		else if (c == 'e') print_err_only = print_ovlp = true;
		else if (c == 'p') print_ovlp = true;
		else if (c == 'c') chr_only = true;
		else if (c == 'a') aa = true, use_cds = true;
		else if (c == 'b') is_bed = true;
		else if (c == '1') first_only = true;
		else if (c == 'd') use_cds = true;
		else if (c == 's') eval_base = true;
	}

	if (args.length - getopt.ind < 1) {
		print("Usage: paftools.js exoneval [options] <gene.gtf> <aln.sam>");
		print("Options:");
		print(" -l INT tolerance of junction positions (0 for exact) [0]");
		print(" -d evaluate coding regions only (exon regions by default)");
		print(" -a miniprot PAF as input (force -d)");
		print(" -p print overlapping exons");
		print(" -e print erroreous overlapping exons");
		print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/");
		print(" -1 only process the first alignment of each query");
		print(" -b BED as input");
		print(" -s compute base Sn and Sp (more memory)");
		exit(1);
	}

	var file, buf = new Bytes();

	// Pass 1: collect exon (or CDS) intervals per transcript from the GTF.
	warn("Reading reference GTF...");
	var tr = {};
	file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		if (t[0].charAt(0) == '#') continue;
		if (use_cds) {
			if (t[2] != "cds" && t[2] != "CDS") continue;
		} else {
			if (t[2] != 'exon') continue;
		}
		var st = parseInt(t[3]) - 1; // convert the 1-based GTF start to 0-based
		var en = parseInt(t[4]);
		if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue;
		var tid = m[1];
		// per-transcript record: [contig, strand, start, end, list of [st,en]]
		if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []];
		tr[tid][4].push([st, en]); // this keeps transcript
	}
	file.close();

	// Flatten transcripts into one interval list per contig.
	var anno = {};
	for (var tid in tr) { // traverse each transcript
		var t = tr[tid];
		Interval.sort(t[4]);
		// transcript span; recorded here but not read again before `tr` is dropped
		t[2] = t[4][0][0];
		t[3] = t[4][t[4].length - 1][1];
		if (anno[t[0]] == null) anno[t[0]] = [];
		var s = t[4];
		for (var i = 0; i < s.length; ++i) // traverse each exon
			anno[t[0]].push([s[i][0], s[i][1]]);
	}
	tr = null;

	for (var chr in anno) { // index exons
		var e = anno[chr];
		if (e.length == 0) continue;
		Interval.sort(e);
		var k = 0;
		for (var i = 1; i < e.length; ++i) // dedup identical [start,end] pairs shared by transcripts
			if (e[i][0] != e[k][0] || e[i][1] != e[k][1])
				e[++k] = e[i].slice(0);
		e.length = k + 1;
		Interval.index_end(e);
	}

	var n_pri = 0, n_unmapped = 0, n_mapped = 0;
	var n_exon = 0, n_exon_hit = 0, n_exon_novel = 0;

	file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
	var last_qname = null, qexon = {};
	var re_cigar = /(\d+)([MIDNSHP=XFGUV])/g;

	// Pass 2: derive exons from each alignment and look them up in the annotation.
	warn("Evaluating alignments...");
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		var ctg_name = null, cigar = null, pos = null, qname;

		if (t[0].charAt(0) == '@') continue;
		if (t[0] == "##PAF") t.shift(); // drop a leading "##PAF" marker column so the PAF fields line up
		qname = t[0];
		if (is_bed) {
			// BED carries no CIGAR; the interval itself is the exon
			ctg_name = t[0], pos = parseInt(t[1]), cigar = null;
		} else if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
			ctg_name = t[5], pos = parseInt(t[7]);
			var type = 'P';
			for (var i = 12; i < t.length; ++i) {
				if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) {
					if (m[1] == 'tp:A') type = m[2];
					else cigar = m[2];
				}
			}
			if (type == 'S') continue; // secondary
		} else { // SAM
			ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5];
			var flag = parseInt(t[1]);
			if (flag&0x100) continue; // secondary
		}

		if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue;
		if (first_only && last_qname == qname) continue;
		if (ctg_name == '*') { // unmapped
			++n_unmapped;
			continue;
		} else {
			++n_pri;
			if (last_qname != qname) {
				++n_mapped;
				last_qname = qname;
			}
		}

		var exon = [];
		if (is_bed) { // BED
			exon.push([pos, parseInt(t[2])]);
		} else if (aa) {
			// Amino-acid CIGAR: offsets are accumulated relative to the alignment
			// start (tmp) and turned into contig coordinates afterwards.
			var tmp_exon = [], tmp = 0, tmp_st = 0;
			while ((m = re_cigar.exec(cigar)) != null) {
				var len = parseInt(m[1]), op = m[2];
				if (op == 'N') {
					tmp_exon.push([tmp_st, tmp]);
					tmp_st = tmp + len, tmp += len;
				} else if (op == 'U') {
					// first exon takes 1 extra base, next exon starts 2 bases before the op ends
					tmp_exon.push([tmp_st, tmp + 1]);
					tmp_st = tmp + len - 2, tmp += len;
				} else if (op == 'V') {
					// first exon takes 2 extra bases, next exon starts 1 base before the op ends
					tmp_exon.push([tmp_st, tmp + 2]);
					tmp_st = tmp + len - 1, tmp += len;
				} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') {
					tmp += len * 3; // one residue spans three bases
				} else if (op == 'F' || op == 'G') {
					tmp += len; // length counted in bases, not residues
				}
			}
			tmp_exon.push([tmp_st, tmp]);
			if (t[4] == '+') {
				for (var i = 0; i < tmp_exon.length; ++i)
					exon.push([pos + tmp_exon[i][0], pos + tmp_exon[i][1]]);
			} else if (t[4] == '-') { // For protein-to-genome alignment, the coordinates are on the query strand. Need to flip them.
				var glen = parseInt(t[8]) - parseInt(t[7]);
				for (var i = tmp_exon.length - 1; i >= 0; --i)
					exon.push([pos + (glen - tmp_exon[i][1]), pos + (glen - tmp_exon[i][0])]);
			}
		} else {
			// Nucleotide CIGAR: every N closes the current exon and opens the next one.
			var tmp_st = pos;
			while ((m = re_cigar.exec(cigar)) != null) {
				var len = parseInt(m[1]), op = m[2];
				if (op == 'N') {
					exon.push([tmp_st, pos]);
					tmp_st = pos + len, pos += len;
				} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
			}
			exon.push([tmp_st, pos]);
		}
		n_exon += exon.length;

		var chr = anno[ctg_name];
		if (chr != null) {
			for (var i = 0; i < exon.length; ++i) {
				if (eval_base) {
					// remember query exons for the base-level Sn/Sp computed at the end
					if (qexon[ctg_name] == null) qexon[ctg_name] = [];
					qexon[ctg_name].push([exon[i][0], exon[i][1]]);
				}
				var o = Interval.find_ovlp(chr, exon[i][0], exon[i][1]);
				if (o.length > 0) {
					var hit = false;
					for (var j = 0; j < o.length; ++j) {
						var st_diff = exon[i][0] - o[j][0];
						var en_diff = exon[i][1] - o[j][1];
						if (st_diff < 0) st_diff = -st_diff;
						if (en_diff < 0) en_diff = -en_diff;
						if (st_diff <= l_fuzzy && en_diff <= l_fuzzy)
							++n_exon_hit, hit = true;
						if (hit) break; // count each query exon at most once
					}
					if (print_ovlp) {
						var label = hit? 'C' : 'P'; // C: both ends within tolerance; P: overlap only
						if (hit && print_err_only) continue;
						var x = '[';
						for (var j = 0; j < o.length; ++j) {
							if (j) x += ', ';
							x += '(' + o[j][0] + "," + o[j][1] + ')';
						}
						x += ']';
						print(label, qname, i+1, ctg_name, exon[i][0], exon[i][1], x);
					}
				} else {
					++n_exon_novel;
					if (print_ovlp)
						print('N', qname, i+1, ctg_name, exon[i][0], exon[i][1]);
				}
			}
		} else {
			// contig absent from the annotation: all its exons count as non-overlapping
			n_exon_novel += exon.length;
		}
	}
	file.close();

	buf.destroy();

	if (!print_ovlp) {
		print("# unmapped reads: " + n_unmapped);
		print("# mapped reads: " + n_mapped);
		print("# primary alignments: " + n_pri);
		print("# predicted exons: " + n_exon);
		print("# non-overlapping exons: " + n_exon_novel);
		print("# correct exons: " + n_exon_hit + " (" + (n_exon_hit / n_exon * 100).toFixed(2) + "%)");
	}

	// Replace each per-contig interval list in `ex` with its merged
	// (non-overlapping) equivalent and index the result.
	function merge_and_index(ex) {
		for (var chr in ex) {
			var a = [];
			var e = ex[chr];
			Interval.sort(e);
			var st = e[0][0], en = e[0][1];
			for (var i = 1; i < e.length; ++i) { // merge
				if (e[i][0] > en) {
					a.push([st, en]);
					st = e[i][0], en = e[i][1];
				} else {
					en = en > e[i][1]? en : e[i][1];
				}
			}
			a.push([st, en]);
			Interval.index_end(a);
			ex[chr] = a;
		}
	}

	// Return [total bases in a1, bases of a1 covered by a0], summed over the contigs of a1.
	function cal_sn(a0, a1) {
		var tot = 0, cov = 0;
		for (var chr in a1) {
			var e0 = a0[chr], e1 = a1[chr];
			for (var i = 0; i < e1.length; ++i)
				tot += e1[i][1] - e1[i][0];
			if (e0 == null) continue;
			for (var i = 0; i < e1.length; ++i) {
				var o = Interval.find_ovlp(e0, e1[i][0], e1[i][1]);
				for (var j = 0; j < o.length; ++j) { // this only works when there are no overlaps between intervals
					var st = e1[i][0] > o[j][0]? e1[i][0] : o[j][0];
					var en = e1[i][1] < o[j][1]? e1[i][1] : o[j][1];
					cov += en - st;
				}
			}
		}
		return [tot, cov];
	}

	if (eval_base) {
		warn("Computing base Sn and Sp...");
		merge_and_index(qexon);
		merge_and_index(anno);
		var sn = cal_sn(qexon, anno);
		var sp = cal_sn(anno, qexon);
		print("Base Sn: " + sn[1] + " / " + sn[0] + " = " + (sn[1] / sn[0] * 100).toFixed(2) + "%");
		print("Base Sp: " + sp[1] + " / " + sp[0] + " = " + (sp[1] / sp[0] * 100).toFixed(2) + "%");
	}
}
2826
+
2512
2827
  // evaluate overlap sensitivity
2513
2828
  function paf_ov_eval(args)
2514
2829
  {
@@ -2704,6 +3019,23 @@ function paf_misjoin(args)
2704
3019
  return len < (en - st) * cen_ratio? false : true;
2705
3020
  }
2706
3021
 
3022
+ function test_cen_point(cen, chr, x) {
3023
+ var b = cen[chr];
3024
+ if (b == null) return false;
3025
+ for (var j = 0; j < b.length; ++j)
3026
+ if (x >= b[j][0] && x < b[j][1])
3027
+ return true;
3028
+ return false;
3029
+ }
3030
+
3031
+ if (show_err || show_long) {
3032
+ print("C\tJ inter-chromosomal misjoin");
3033
+ print("C\tj inter-chromosomal misjoin with both breakpoints ending in centromeres");
3034
+ print("C\tG long gap on the reference genome");
3035
+ print("C\tg long gap on the reference genome with both breakpoints ending in centromeres");
3036
+ print("C\tM closed inversion");
3037
+ print("C");
3038
+ }
2707
3039
  function process(a) {
2708
3040
  var k = 0;
2709
3041
  for (var i = 0; i < a.length; ++i) {
@@ -2716,14 +3048,17 @@ function paf_misjoin(args)
2716
3048
  a = a.sort(function(x,y){return x[2]-y[2]});
2717
3049
  if (show_long) for (var i = 0; i < a.length; ++i) print(a[i].join("\t"));
2718
3050
  for (var i = 1; i < a.length; ++i) {
2719
- var ov = [false, false];
3051
+ var ov = [false, false], end_cen = [false, false];
2720
3052
  ov[0] = test_cen(cen, a[i-1][5], a[i-1][7], a[i-1][8]);
2721
3053
  ov[1] = test_cen(cen, a[i][5], a[i][7], a[i][8]);
3054
+ end_cen[0] = test_cen_point(cen, a[i-1][5], a[i-1][4] == '+'? a[i-1][8] : a[i-1][7]);
3055
+ end_cen[1] = test_cen_point(cen, a[i][5], a[i][4] == '+'? a[i][7] : a[i][8]);
2722
3056
  if (a[i-1][5] != a[i][5]) { // different chr
2723
3057
  if (ov[0] || ov[1]) ++n_diff[1];
2724
3058
  else if (show_err) {
2725
- print("J", a[i-1].slice(0, 12).join("\t"));
2726
- print("J", a[i].slice(0, 12).join("\t"));
3059
+ var label = end_cen[0] && end_cen[1]? 'j' : 'J';
3060
+ print(label, a[i-1].slice(0, 12).join("\t"));
3061
+ print(label, a[i].slice(0, 12).join("\t"));
2727
3062
  }
2728
3063
  ++n_diff[0];
2729
3064
  } else if (a[i-1][4] == a[i][4]) { // a gap
@@ -2733,8 +3068,9 @@ function paf_misjoin(args)
2733
3068
  if (gap > max_gap) {
2734
3069
  if (ov[0] || ov[1]) ++n_gap[1];
2735
3070
  else if (show_err) {
2736
- print("G", a[i-1].slice(0, 12).join("\t"));
2737
- print("G", a[i].slice(0, 12).join("\t"));
3071
+ var label = end_cen[0] && end_cen[1]? 'g' : 'G';
3072
+ print(label, a[i-1].slice(0, 12).join("\t"));
3073
+ print(label, a[i].slice(0, 12).join("\t"));
2738
3074
  }
2739
3075
  ++n_gap[0];
2740
3076
  }
@@ -3084,6 +3420,183 @@ function paf_pafcmp(args)
3084
3420
  buf.destroy();
3085
3421
  }
3086
3422
 
3423
/**
 * longcs2seq: rebuild the aligned target (default) or query (-q) sequence of each
 * PAF record from its long-form cs:Z tag and print it as a FASTA record.
 *
 * The FASTA name is "<name>_<start>_<end>", built from PAF columns 6/8/9 for the
 * target or columns 1/3/4 for the query. Records without a cs:Z tag are skipped.
 *
 * @param args  command-line arguments: [-q] <long-cs.paf>; "-" makes `new File()`
 *              be used (presumably stdin -- confirm against k8)
 * @throws Error if a short-form ":" operator is found, or if a "*" substitution
 *               carries fewer than two bases
 */
function paf_longcs2seq(args) {
	var c, opt = { query:false };
	while ((c = getopt(args, "q")) != null)
		if (c == 'q') opt.query = true;
	if (args.length == getopt.ind) {
		print("Usage: paftools.js longcs2seq [-q] <long-cs.paf>");
		return;
	}
	// operator followed by either a length (short form) or a run of bases (long form)
	var re_cs = /([:=*+-])(\d+|[A-Za-z]+)/g;
	var buf = new Bytes();
	var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	try {
		while (file.readline(buf) >= 0) {
			var m, cs = null, t = buf.toString().split("\t");
			for (var i = 12; i < t.length; ++i)
				if ((m = /^cs:Z:(\S+)/.exec(t[i])) != null) {
					cs = m[1];
					break;
				}
			if (cs == null) continue;
			var ts = "", qs = "";
			while ((m = re_cs.exec(cs)) != null) {
				if (m[1] == "=") ts += m[2], qs += m[2]; // identical stretch: goes to both sequences
				else if (m[1] == "+") qs += m[2].toUpperCase(); // present in the query only
				else if (m[1] == "-") ts += m[2].toUpperCase(); // present in the target only
				else if (m[1] == "*") { // substitution: first base is the target's, second the query's
					if (m[2].length < 2) throw Error("Malformed substitution in cs tag: *" + m[2]);
					ts += m[2].charAt(0).toUpperCase(), qs += m[2].charAt(1).toUpperCase();
				} else if (m[1] == ":") throw Error("Long cs is required");
			}
			if (opt.query) {
				print(">" + t[0] + "_" + t[2] + "_" + t[3]);
				print(qs);
			} else {
				print(">" + t[5] + "_" + t[7] + "_" + t[8]);
				print(ts);
			}
		}
	} finally {
		// release the handle and buffer even when a malformed record aborts the loop
		file.close();
		buf.destroy();
	}
}
3461
+
3462
/**
 * paf2gff: convert PAF records that carry cg:Z and AS:i tags into GFF3-style
 * lines: one "transcript" line per alignment followed by its exon/CDS lines and,
 * when the da:i / do:i tags equal 0, start_codon / stop_codon lines.
 *
 * @param args  command-line arguments: [-a] <in.paf>; -a treats M/D/=/X CIGAR
 *              lengths as residues (three reference bases each). "-" makes
 *              `new File()` be used (presumably stdin -- confirm against k8)
 * @throws Error if a mapped record lacks cg:Z or AS:i, or if the reference span
 *               implied by the CIGAR differs from PAF columns 9 - 8
 */
function paf_paf2gff(args) {
	var c, opt = { aa:false };
	var re_cigar = /(\d+)([A-Z=])/g;
	while ((c = getopt(args, "a")) != null) {
		if (c == 'a') opt.aa = true;
	}
	if (args.length == getopt.ind) {
		print("Usage: paftools.js paf2gff [-a] <in.paf>");
		return;
	}
	var buf = new Bytes();
	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var hid = 1, last_name = null; // hid numbers consecutive hits of the same query
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		if (t[5] == '*') continue; // skip unmapped lines

		if (t[0] != last_name) last_name = t[0], hid = 1;
		else ++hid;
		for (var i = 1; i <= 3; ++i) t[i] = parseInt(t[i]);
		for (var i = 6; i <= 11; ++i) t[i] = parseInt(t[i]);
		var cigar = null, score = null, np = null, dist_stop = null, dist_start = null;
		for (var i = 12; i < t.length; ++i) {
			if ((m = /^(cg:Z|AS:i|np:i|da:i|do:i):(\S+)/.exec(t[i])) != null) {
				if (m[1] == 'cg:Z') cigar = m[2];
				else if (m[1] == 'AS:i') score = parseInt(m[2]);
				else if (m[1] == 'np:i') np = parseInt(m[2]);
				else if (m[1] == 'do:i') dist_stop = parseInt(m[2]);
				else if (m[1] == 'da:i') dist_start = parseInt(m[2]);
			}
		}
		if (cigar == null) throw Error("failed to find the cg:Z tag");
		if (score == null) throw Error("failed to find the AS:i tag");

		// Feature rows: [contig, source, type, st, en, score, strand, phase, frameshift].
		// st/en are offsets from the alignment start until the output loop below.
		var st = 0, en = 0, phase = 0, pseudo = false, fs = 0, a = [];
		if (dist_start != null && dist_start == 0)
			a.push([t[5], 'paf2gff', 'start_codon', 0, 3, 0, t[4], '.', 0]);
		while ((m = re_cigar.exec(cigar)) != null) {
			var len = parseInt(m[1]);
			if (m[2] == 'M' || m[2] == 'D' || m[2] == '=' || m[2] == 'X') {
				// '=' and 'X' advance the reference exactly like 'M'
				en += opt.aa? len * 3 : len;
			} else if (m[2] == 'F' || m[2] == 'G' || m[2] == 'R') {
				en += len, pseudo = true, fs = 1; // marks the record as a pseudogene with a frameshift
			} else if (m[2] == 'N') {
				a.push([t[5], 'paf2gff', 'exon', st, en, 0, t[4], phase, fs]);
				st = en + len, en += len, phase = 0, fs = 0;
			} else if (m[2] == 'U') { // ...xGT...AGxx...
				a.push([t[5], 'paf2gff', 'exon', st, en + 1, 0, t[4], phase, fs]);
				st = en + len - 2, en += len, phase = 2, fs = 0;
			} else if (m[2] == 'V') { // ...xxGT...AGx...
				a.push([t[5], 'paf2gff', 'exon', st, en + 2, 0, t[4], phase, fs]);
				st = en + len - 1, en += len, phase = 1, fs = 0;
			}
		}
		a.push([t[5], 'paf2gff', 'exon', st, en, 0, t[4], phase, fs]);
		if (en != t[8] - t[7]) throw Error("inconsistent cigar");
		if (dist_stop != null && dist_stop == 0)
			a.push([t[5], 'paf2gff', 'stop_codon', en, en + 3, 0, t[4], '.', 0]);
		var type = pseudo? 'pseudogene' : 'protein_coding';
		var attr = ['transcript_id=' + t[0] + '#' + hid, 'transcript_type=' + type].join(";");
		var trans_attr = 'identity=' + (t[9] / t[10]).toFixed(4);
		if (np != null) trans_attr += ';positive=' + (np * 3 / t[10]).toFixed(4);
		trans_attr += ';aa_start=' + t[2];
		trans_attr += ';aa_end=' + (t[1] - t[3]);
		if (dist_start != null && dist_start >= 0) trans_attr += ';dist_start_codon=' + dist_start;
		if (dist_stop != null && dist_stop >= 0) trans_attr += ';dist_stop_codon=' + dist_stop;
		var trans_st = t[7], trans_en = t[8];
		if (dist_stop != null && dist_stop == 0) {
			// widen the transcript by the three stop-codon bases on the downstream side
			if (t[4] == '-') trans_st -= 3;
			else trans_en += 3;
		}
		print([t[5], 'paf2gff', 'transcript', trans_st + 1, trans_en, score, t[4], '.', attr + ';' + trans_attr].join("\t"));
		if (opt.aa && t[4] == '-') {
			// offsets were accumulated along the protein; mirror them within the
			// aligned span and reverse the order for the minus strand
			var b = [], len = t[8] - t[7];
			for (var i = a.length - 1; i >= 0; --i) {
				var x = len - a[i][3];
				a[i][3] = len - a[i][4];
				a[i][4] = x;
				//a[i][7] = a[i][7] == 0? 0 : 3 - a[i][7]; // not sure if this line is needed
				b.push(a[i]);
			}
			a = b;
		}
		for (var i = 0; i < a.length; ++i) {
			if (!pseudo && a[i][2] == "exon") a[i][2] = "CDS";
			a[i][3] += t[7] + 1; // to 1-based contig start
			a[i][4] += t[7];
			a[i][8] = attr + ";frameshift=" + a[i][8];
			print(a[i].join("\t"));
		}
	}
	file.close();
	buf.destroy();
}
3556
+
3557
/**
 * gff2junc: print one BED-like junction line for every pair of adjacent features
 * (CDS by default, -f to choose another type) that share the same Parent= value.
 *
 * Features are grouped by consecutive lines with an identical Parent, so the
 * features of one parent must be contiguous in the input. Each output line is:
 * contig, end of the previous feature, 0-based start of the next feature,
 * parent ID, 0, strand.
 *
 * @param args  command-line arguments: [-f feature] <in.gff3>; "-" makes
 *              `new File()` be used (presumably stdin -- confirm against k8)
 */
function paf_gff2junc(args) {
	var c, feat = "CDS";
	while ((c = getopt(args, "f:")) != null) {
		if (c == 'f') feat = getopt.arg;
	}
	if (getopt.ind == args.length) {
		print("Usage: paftools.js gff2junc [-f feature] <in.gff3>");
		return;
	}
	var buf = new Bytes();
	var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	var feat_lc = feat.toLowerCase(); // feature type is compared case-insensitively

	// Emit the gaps between start-sorted features of one parent. Each row has the
	// parent ID prepended, so GFF column k sits at index k (1-based k).
	function process_a(a) {
		if (a.length < 2) return;
		a = a.sort(function(x, y) { return x[4] - y[4] });
		for (var i = 1; i < a.length; ++i)
			print([a[i][1], a[i-1][5], a[i][4], a[i][0], 0, a[i][7]].join("\t"));
	}

	var a = [];
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		if (t[0].charAt(0) == '#') continue;
		if (t.length < 9) continue; // blank or truncated line: no type/attribute columns to read
		if (t[2].toLowerCase() != feat_lc) continue;
		if ((m = /\bParent=([^;]+)/.exec(t[8])) == null) {
			warn("Can't find Parent");
			continue;
		}
		t[3] = parseInt(t[3]) - 1; // convert the 1-based GFF start to 0-based
		t[4] = parseInt(t[4]);
		t.unshift(m[1]);
		if (a.length > 0 && a[0][0] != m[1]) { // parent changed: flush the previous group
			process_a(a);
			a.length = 0;
		}
		a.push(t);
	}
	process_a(a);
	file.close();
	buf.destroy();
}
3599
+
3087
3600
  /*************************
3088
3601
  ***** main function *****
3089
3602
  *************************/
@@ -3098,6 +3611,9 @@ function main(args)
3098
3611
  print(" sam2paf convert SAM to PAF");
3099
3612
  print(" delta2paf convert MUMmer's delta to PAF");
3100
3613
  print(" gff2bed convert GTF/GFF3 to BED12");
3614
+ print(" gff2junc convert GFF3 to junction BED");
3615
+ print(" longcs2seq convert long-cs PAF to sequences");
3616
+ // print(" paf2gff convert PAF to GFF3 (tested for miniprot only)");
3101
3617
  print("");
3102
3618
  print(" stat collect basic mapping information in PAF/SAM");
3103
3619
  print(" asmstat collect basic assembly information");
@@ -3115,6 +3631,7 @@ function main(args)
3115
3631
  print(" mason2fq convert mason2-simulated SAM to FASTQ");
3116
3632
  print(" pbsim2fq convert PBSIM-simulated MAF to FASTQ");
3117
3633
  print(" junceval evaluate splice junction consistency with known annotations");
3634
+ print(" exoneval evaluate exon-level consistency with known annotations");
3118
3635
  print(" ov-eval evaluate read overlap sensitivity using read-to-ref mapping");
3119
3636
  exit(1);
3120
3637
  }
@@ -3125,6 +3642,7 @@ function main(args)
3125
3642
  else if (cmd == 'delta2paf') paf_delta2paf(args);
3126
3643
  else if (cmd == 'splice2bed') paf_splice2bed(args);
3127
3644
  else if (cmd == 'gff2bed') paf_gff2bed(args);
3645
+ else if (cmd == 'gff2junc') paf_gff2junc(args);
3128
3646
  else if (cmd == 'stat') paf_stat(args);
3129
3647
  else if (cmd == 'asmstat') paf_asmstat(args);
3130
3648
  else if (cmd == 'asmgene') paf_asmgene(args);
@@ -3138,10 +3656,13 @@ function main(args)
3138
3656
  else if (cmd == 'mason2fq') paf_mason2fq(args);
3139
3657
  else if (cmd == 'pbsim2fq') paf_pbsim2fq(args);
3140
3658
  else if (cmd == 'junceval') paf_junceval(args);
3659
+ else if (cmd == 'exoneval') paf_exoneval(args);
3141
3660
  else if (cmd == 'ov-eval') paf_ov_eval(args);
3142
3661
  else if (cmd == 'vcfstat') paf_vcfstat(args);
3143
3662
  else if (cmd == 'sveval') paf_sveval(args);
3144
3663
  else if (cmd == 'vcfsel') paf_vcfsel(args);
3664
+ else if (cmd == 'longcs2seq') paf_longcs2seq(args);
3665
+ else if (cmd == 'paf2gff') paf_paf2gff(args);
3145
3666
  else if (cmd == 'version') print(paftools_version);
3146
3667
  else throw Error("unrecognized command: " + cmd);
3147
3668
  }