minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,3149 @@
1
+ #!/usr/bin/env k8
2
+
3
+ var paftools_version = '2.24-r1122';
4
+
5
+ /*****************************
6
+ ***** Library functions *****
7
+ *****************************/
8
+
9
+ /*******************************
10
+ * Command line option parsing *
11
+ *******************************/
12
+
13
/**
 * Minimal BSD-style getopt() for single-letter options.
 *
 * Scanning state is kept on the function object itself:
 *   getopt.ind   - index in `args` of the next argument to scan
 *   getopt.arg   - argument of the option just returned (null if it takes none)
 *   getopt.place - position inside args[getopt.ind]; -1 means "start a new argument"
 *
 * @param args  array of command-line arguments
 * @param ostr  option string; a letter followed by ':' takes an argument
 * @return the option letter; '?' for an unknown option (or a missing argument);
 *         ':' for a missing argument when `ostr` starts with ':'; null when the
 *         first non-option argument, a lone "-", or "--" is reached
 */
var getopt = function(args, ostr) {
	var oli; // option letter list index
	if (typeof(getopt.place) == 'undefined')
		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
	if (getopt.place == -1) { // update scanning pointer
		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
			getopt.place = -1;
			return null;
		}
		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
			++getopt.ind;
			getopt.place = -1;
			return null;
		}
	}
	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
		if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
		// The unknown letter was the last one in this argument: move on to the
		// next argument. Without this, the next call would read '' past the end,
		// and ostr.indexOf('') == 0 would make it look like a valid option.
		if (getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
		return '?';
	}
	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
		getopt.arg = null;
		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
	} else { // need an argument
		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
			getopt.arg = args[getopt.ind].substr(getopt.place); // "-l10" form
		else if (args.length <= ++getopt.ind) { // no arg
			getopt.place = -1;
			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
			return '?';
		} else getopt.arg = args[getopt.ind]; // white space ("-l 10" form)
		getopt.place = -1;
		++getopt.ind;
	}
	return optopt;
}
50
+
51
+ /***********************
52
+ * Interval operations *
53
+ ***********************/
54
+
55
/**
 * Namespace of helpers for lists of intervals. An interval is an array whose
 * first two elements are [start, end]; overlap tests treat it as half-open.
 * Declared with `var` so that it is not an implicit global.
 */
var Interval = {};

/**
 * Sort `a` in place. A list of numbers is sorted numerically; a list of
 * intervals is sorted by start, then by end.
 */
Interval.sort = function(a)
{
	if (typeof a[0] == 'number')
		a.sort(function(x, y) { return x - y });
	else a.sort(function(x, y) { return x[0] != y[0]? x[0] - y[0] : x[1] - y[1] });
}

/**
 * Merge overlapping or touching intervals of `a` in place.
 *
 * @param a       list of [start, end, ...] intervals
 * @param sorted  whether `a` is already sorted (default true); if false it is sorted first
 */
Interval.merge = function(a, sorted)
{
	if (a.length == 0) return; // otherwise `a.length = k + 1` below would grow an empty list to one hole
	if (typeof sorted == 'undefined') sorted = true;
	if (!sorted) Interval.sort(a);
	var k = 0; // index of the interval currently being extended
	for (var i = 1; i < a.length; ++i) {
		if (a[k][1] >= a[i][0])
			a[k][1] = a[k][1] > a[i][1]? a[k][1] : a[i][1];
		else a[++k] = a[i].slice(0);
	}
	a.length = k + 1;
}

/**
 * Append to every interval of `a` an extra trailing element: an index into `a`
 * at which Interval.find_ovlp() starts scanning for a query beginning at or
 * after that interval's start.
 *
 * @param a       list of [start, end] intervals; modified in place
 * @param sorted  whether `a` is already sorted (default true); if false it is sorted first
 */
Interval.index_end = function(a, sorted)
{
	if (a.length == 0) return;
	if (typeof sorted == 'undefined') sorted = true;
	if (!sorted) Interval.sort(a);
	a[0].push(0);
	var k = 0, k_en = a[0][1];
	for (var i = 1; i < a.length; ++i) {
		if (k_en <= a[i][0]) { // a[k] ends at or before a[i] starts: advance k
			for (++k; k < i; ++k)
				if (a[k][1] > a[i][0])
					break;
			k_en = a[k][1];
		}
		a[i].push(k);
	}
}

/**
 * Binary search in a sorted list of numbers or of intervals (compared by start).
 *
 * @return the index of an element equal to `x` if the search hits one;
 *         otherwise the index of the last element smaller than `x`, or -1
 */
Interval.find_intv = function(a, x)
{
	var left = -1, right = a.length;
	if (typeof a[0] == 'number') {
		while (right - left > 1) {
			var mid = left + ((right - left) >> 1);
			if (a[mid] > x) right = mid;
			else if (a[mid] < x) left = mid;
			else return mid;
		}
	} else {
		while (right - left > 1) {
			var mid = left + ((right - left) >> 1);
			if (a[mid][0] > x) right = mid;
			else if (a[mid][0] < x) left = mid;
			else return mid;
		}
	}
	return left;
}

/**
 * Collect the intervals of `a` overlapping [st, en).
 * Reads the last element of an interval as the scan start written by
 * Interval.index_end(), so `a` must have been indexed beforehand.
 *
 * @return array of the overlapping interval objects (not copies); empty if none
 */
Interval.find_ovlp = function(a, st, en)
{
	if (a.length == 0 || st >= en) return [];
	var l = Interval.find_intv(a, st);
	var k = l < 0? 0 : a[l][a[l].length - 1];
	var b = [];
	for (var i = k; i < a.length; ++i) {
		if (a[i][0] >= en) break;
		else if (st < a[i][1])
			b.push(a[i]);
	}
	return b;
}
129
+
130
+ /**********************************
131
+ * Reverse and reverse complement *
132
+ **********************************/
133
+
134
/**
 * Read a FASTA file into memory.
 *
 * @param fn  file name, or '-' to construct File() without a name
 *            (presumably standard input in k8 -- confirm against the k8 docs)
 * @return [h, seqlen] where h maps a sequence name (the first whitespace-free
 *         token after '>') to a Bytes object holding the sequence, and seqlen
 *         is a list of [name, length] pairs in file order. Release the
 *         sequences with fasta_free(h).
 */
function fasta_read(fn)
{
	var h = {}, gt = '>'.charCodeAt(0);
	var file = fn == '-'? new File() : new File(fn);
	var buf = new Bytes(), seq = null, name = null, seqlen = [];
	while (file.readline(buf) >= 0) {
		if (buf[0] == gt) { // header line: flush the previous record, if any
			if (seq != null && name != null) {
				seqlen.push([name, seq.length]);
				h[name] = seq;
				name = seq = null;
			}
			var m, line = buf.toString();
			if ((m = /^>(\S+)/.exec(line)) != null) {
				name = m[1];
				seq = new Bytes();
			}
		} else if (seq != null) {
			// Lines seen before the first header, or after a header without a
			// name, have no record to go to; skip them instead of dereferencing null.
			seq.set(buf);
		}
	}
	if (seq != null && name != null) { // flush the last record
		seqlen.push([name, seq.length]);
		h[name] = seq;
	}
	buf.destroy();
	file.close();
	return [h, seqlen];
}
161
+
162
/**
 * Call destroy() on every value stored in `fa`, e.g. the name-to-sequence
 * map returned as the first element of fasta_read()'s result.
 */
function fasta_free(fa)
{
	var key;
	for (key in fa) {
		var stored = fa[key];
		stored.destroy();
	}
}
167
+
168
/**
 * Reverse the elements of this object in place by swapping from both ends
 * towards the middle.
 */
Bytes.prototype.reverse = function()
{
	var head = 0, tail = this.length - 1;
	while (head < tail) {
		var held = this[head];
		this[head] = this[tail];
		this[tail] = held;
		++head;
		--tail;
	}
}
176
+
177
/**
 * Reverse-complement this DNA/RNA sequence in place.
 *
 * The complement table is built once and cached on Bytes.rctab. It covers the
 * letters listed below in both cases (U complements to A); every other byte
 * value maps to 0.
 * NOTE(review): mapping unlisted bytes (e.g. '-' or '*') to 0 looks
 * unintended, but it is the existing behaviour and is kept here.
 */
Bytes.prototype.revcomp = function()
{
	if (Bytes.rctab == null) {
		var from = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn';
		var to   = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn';
		var table = [];
		for (var code = 0; code < 256; ++code) table[code] = 0;
		for (var j = 0; j < from.length; ++j)
			table[from.charCodeAt(j)] = to.charCodeAt(j);
		Bytes.rctab = table;
	}
	var head = 0, tail = this.length - 1;
	while (head < tail) { // swap the two ends, complementing both
		var right = this[tail];
		this[tail] = Bytes.rctab[this[head]];
		this[head] = Bytes.rctab[right];
		++head;
		--tail;
	}
	if (head == tail) // odd length: complement the middle base in place
		this[head] = Bytes.rctab[this[head]];
}
196
+
197
+ /********************
198
+ ***** paftools *****
199
+ ********************/
200
+
201
+ /*****************
202
+ * Miscellaneous *
203
+ *****************/
204
+
205
+ // liftover
206
/**
 * "liftover" command: project BED intervals given on query sequences onto the
 * target sequences through the alignments of a PAF file with cg:Z: CIGAR tags.
 *
 * For each primary alignment (tp:A:P or tp:A:I) passing the filters, one line
 * is printed per overlapping BED interval: target name, lifted start, lifted
 * end, "<query>_<start>_<end>" (suffixed with "_t5"/"_t3" when the
 * corresponding end had to be clipped to the alignment boundary), 0, strand.
 *
 * @param args  command-line arguments following the command name
 */
function paf_liftover(args)
{
	// Read a BED file into {chr: [[start, end, index], ...]}, sorted, optionally
	// merged, and indexed for Interval.find_ovlp().
	function read_bed(fn, to_merge)
	{
		if (fn == null) return null;
		if (typeof to_merge == 'undefined') to_merge = true;
		var file = fn == '-'? new File() : new File(fn);
		var buf = new Bytes();
		var bed = {};
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			if (t.length < 3) continue; // blank or malformed line; would otherwise yield NaN coordinates
			if (bed[t[0]] == null) bed[t[0]] = [];
			bed[t[0]].push([parseInt(t[1], 10), parseInt(t[2], 10)]);
		}
		buf.destroy();
		file.close();

		for (var chr in bed) {
			Interval.sort(bed[chr]);
			if (to_merge)
				Interval.merge(bed[chr], true);
			Interval.index_end(bed[chr], true);
		}
		return bed;
	}

	// Both regexes carry the /g flag and are reused across lines; every exec()
	// loop below runs until it returns null, which resets lastIndex to 0.
	var re_cigar = /(\d+)([MID])/g, re_tag = /^(\S\S):([AZif]):(\S+)$/;
	var c, to_merge = false, min_mapq = 5, min_len = 50000, max_div = 2.0;
	while ((c = getopt(args, "mq:l:d:")) != null) {
		if (c == 'm') to_merge = true;
		else if (c == 'q') min_mapq = parseInt(getopt.arg, 10);
		else if (c == 'l') min_len = parseInt(getopt.arg, 10);
		else if (c == 'd') max_div = parseFloat(getopt.arg);
	}
	if (args.length - getopt.ind < 2) {
		print("Usage: paftools.js liftover [options] <aln.paf> <query.bed>");
		print("Options:");
		print(" -m merge overlapping or adjacent BED intervals before lifting");
		print(" -q INT min mapping quality [" + min_mapq + "]");
		print(" -l INT min alignment length [" + min_len + "]");
		print(" -d FLOAT max sequence divergence (>=1 to disable) [" + max_div + "]");
		exit(1);
	}
	var bed = read_bed(args[getopt.ind+1], to_merge);

	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var buf = new Bytes();
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");

		if (bed[t[0]] == null) continue; // sequence not present in BED; skip

		// parse tp and cg tags
		var m, tp = null, cg = null;
		for (var i = 12; i < t.length; ++i) {
			if ((m = re_tag.exec(t[i])) != null) {
				if (m[1] == 'tp') tp = m[3];
				else if (m[1] == 'cg') cg = m[3];
			}
		}
		if (tp != 'P' && tp != 'I') continue; // only process primary alignments
		if (cg == null) throw Error("unable to find the 'cg' tag");

		// filter out bad alignments and check overlaps
		for (var i = 1; i <= 3; ++i)
			t[i] = parseInt(t[i], 10);
		for (var i = 6; i <= 11; ++i)
			t[i] = parseInt(t[i], 10);
		if (t[11] < min_mapq || t[10] < min_len) continue;
		var regs = Interval.find_ovlp(bed[t[0]], t[2], t[3]);
		if (regs.length == 0) continue; // not overlapping any regions in input BED
		if (max_div >= 0.0 && max_div < 1.0) {
			// gap-compressed divergence: every gap counts as one difference
			var n_gaps = 0, n_opens = 0;
			while ((m = re_cigar.exec(cg)) != null)
				if (m[2] == 'I' || m[2] == 'D')
					n_gaps += parseInt(m[1], 10), ++n_opens;
			var n_mm = t[10] - t[9] - n_gaps;
			var n_diff2 = n_mm + n_opens;
			if (n_diff2 / (n_diff2 + t[9]) > max_div)
				continue;
		}

		// extract start and end positions
		// a[] holds [query position on the aligned strand, 0=start/1=end, region index, lifted position]
		var a = [], r = [], strand = t[4];
		for (var i = 0; i < regs.length; ++i) {
			var s = regs[i][0], e = regs[i][1];
			if (strand == '+') {
				a.push([s, 0, i, -2]);
				a.push([e - 1, 1, i, -2]);
			} else { // flip to reverse-strand query coordinates
				a.push([t[1] - e, 0, i, -2]);
				a.push([t[1] - s - 1, 1, i, -2]);
			}
			r.push([-2, -2]);
		}
		a.sort(function(x, y) { return x[0] - y[0] });

		// lift start/end positions
		// x walks the target, y walks the query (on the aligned strand)
		var k = 0, x = t[7], y = strand == '+'? t[2] : t[1] - t[3];
		while ((m = re_cigar.exec(cg)) != null) { // TODO: be more careful about edge cases
			var len = parseInt(m[1], 10);
			if (m[2] == 'D') { // do nothing for D
				x += len;
				continue;
			}
			while (k < a.length && a[k][0] < y) ++k; // skip out-of-range positions
			for (var i = k; i < a.length; ++i) {
				if (y <= a[i][0] && a[i][0] < y + len)
					a[i][3] = m[2] == 'M'? x + (a[i][0] - y) : x;
				else break;
			}
			y += len;
			if (m[2] == 'M') x += len;
		}
		if (x != t[8] || (strand == '+' && y != t[3]) || (strand == '-' && y != t[1] - t[2]))
			throw Error("CIGAR is inconsistent with mapping coordinates");

		// generate result
		for (var i = 0; i < a.length; ++i) {
			if (a[i][1] == 0) r[a[i][2]][0] = a[i][3];
			else r[a[i][2]][1] = a[i][3] + 1; // change to half-close-half-open
		}
		for (var i = 0; i < r.length; ++i) {
			var name = [t[0], regs[i][0], regs[i][1]].join("_");
			// an end left at -2 fell outside the alignment: clip it to the alignment boundary
			if (r[i][0] < 0) name += "_t5", r[i][0] = t[7];
			if (r[i][1] < 0) name += "_t3", r[i][1] = t[8];
			print(t[5], r[i][0], r[i][1], name, 0, strand);
		}
	}
	buf.destroy();
	file.close();
}
338
+
339
+ // variant calling
340
/**
 * "call" command: call variants from the cs:Z: tags of a PAF stream sorted by
 * target name and target start.
 *
 * Without -f, prints 'R' lines (target regions covered by exactly one
 * alignment) and 'V' lines (variants); with -f, prints VCF. Summary counts are
 * written with warn().
 *
 * Each pending variant in `out` is an array:
 *   [ctg, start, end, coverage, mapq, ref allele, query allele,
 *    query name, query start, query end, query strand]
 *
 * @param args  command-line arguments following the command name
 */
function paf_call(args)
{
	// Both regexes carry /g; each exec() loop below runs until null, which resets lastIndex.
	var re_cs = /([:=*+-])(\d+|[A-Za-z]+)/g, re_tag = /\t(\S\S:[AZif]):(\S+)/g;
	var c, min_cov_len = 10000, min_var_len = 50000, gap_thres = 50, gap_thres_long = 1000, min_mapq = 5;
	var fa_tmp = null, fa, fa_lens, is_vcf = false, sample_name = "sample";
	// "G:" is required for the -G branch below to be reachable; "B:" is still
	// accepted (and ignored) so existing command lines keep working.
	while ((c = getopt(args, "l:L:g:G:q:B:f:s:")) != null) {
		if (c == 'l') min_cov_len = parseInt(getopt.arg, 10);
		else if (c == 'L') min_var_len = parseInt(getopt.arg, 10);
		else if (c == 'g') gap_thres = parseInt(getopt.arg, 10);
		else if (c == 'G') gap_thres_long = parseInt(getopt.arg, 10);
		else if (c == 'q') min_mapq = parseInt(getopt.arg, 10);
		else if (c == 'f') fa_tmp = fasta_read(getopt.arg);
		else if (c == 's') sample_name = getopt.arg;
	}
	if (fa_tmp != null) fa = fa_tmp[0], fa_lens = fa_tmp[1], is_vcf = true;

	if (args.length == getopt.ind) {
		print("Usage: sort -k6,6 -k8,8n <with-cs.paf> | paftools.js call [options] -");
		print("Options:");
		print(" -l INT min alignment length to compute coverage ["+min_cov_len+"]");
		print(" -L INT min alignment length to call variants ["+min_var_len+"]");
		print(" -q INT min mapping quality ["+min_mapq+"]");
		print(" -g INT short/long gap threshold (for statistics only) ["+gap_thres+"]");
		print(" -G INT long gap threshold (for statistics only) ["+gap_thres_long+"]");
		print(" -f FILE reference sequences (enabling VCF output) [null]");
		print(" -s NAME sample name in VCF header ["+sample_name+"]");
		exit(1);
	}

	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var buf = new Bytes();
	var tot_len = 0, n_sub = [0, 0, 0], n_ins = [0, 0, 0, 0, 0], n_del = [0, 0, 0, 0, 0];

	// Print one variant as a VCF record. Indels are anchored on the preceding
	// reference base, which is looked up in `fa`.
	function print_vcf(o, fa)
	{
		var v = null;
		if (o[3] != 1) return; // only report variants covered by exactly one alignment
		if (o[5] == '-' && o[6] == '-') return;
		if (o[5] != '-' && o[6] != '-') { // snp
			v = [o[0], o[1] + 1, '.', o[5].toUpperCase(), o[6].toUpperCase()];
		} else if (o[1] > 0) { // an indel at position 0 has no anchor base; shouldn't happen in theory
			if (fa[o[0]] == null) throw Error('sequence "' + o[0] + '" is absent from the reference FASTA');
			if (o[1] >= fa[o[0]].length) throw Error('position ' + o[1] + ' exceeds the length of sequence "' + o[0] + '"');
			var ref = String.fromCharCode(fa[o[0]][o[1]-1]).toUpperCase();
			if (o[5] == '-') // insertion
				v = [o[0], o[1], '.', ref, ref + o[6].toUpperCase()];
			else // deletion
				v = [o[0], o[1], '.', ref + o[5].toUpperCase(), ref];
		}
		// Check before touching v, so the intended error is raised rather than a TypeError.
		if (v == null) throw Error("unexpected variant: [" + o.join(",") + "]");
		// o[10] is the strand recorded with the variant. The enclosing `rev`
		// belongs to whichever alignment is being read when this record is flushed.
		v.push(o[4], '.', 'QNAME=' + o[7] + ';QSTART=' + (o[8]+1) + ';QSTRAND=' + o[10], 'GT', '1/1');
		print(v.join("\t"));
	}

	// Update the summary counters for one variant covered by at most one alignment.
	function count_var(o)
	{
		if (o[3] > 1) return;
		if (o[5] == '-' && o[6] == '-') return;
		if (o[5] == '-') { // insertion
			var l = o[6].length;
			if (l == 1) ++n_ins[0];
			else if (l == 2) ++n_ins[1];
			else if (l < gap_thres) ++n_ins[2];
			else if (l < gap_thres_long) ++n_ins[3];
			else ++n_ins[4];
		} else if (o[6] == '-') { // deletion
			var l = o[5].length;
			if (l == 1) ++n_del[0];
			else if (l == 2) ++n_del[1];
			else if (l < gap_thres) ++n_del[2];
			else if (l < gap_thres_long) ++n_del[3];
			else ++n_del[4];
		} else {
			++n_sub[0];
			var s = (o[5] + o[6]).toLowerCase();
			if (s == 'ag' || s == 'ga' || s == 'ct' || s == 'tc') // transition
				++n_sub[1];
			else ++n_sub[2]; // transversion
		}
	}

	if (is_vcf) {
		print('##fileformat=VCFv4.1');
		for (var i = 0; i < fa_lens.length; ++i)
			print('##contig=<ID=' + fa_lens[i][0] + ',length=' + fa_lens[i][1] + '>');
		print('##INFO=<ID=QNAME,Number=1,Type=String,Description="Query name">');
		print('##INFO=<ID=QSTART,Number=1,Type=Integer,Description="Query start">');
		print('##INFO=<ID=QSTRAND,Number=1,Type=String,Description="Query strand">');
		print('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">');
		// the VCF column header must be tab-delimited, like the records
		print(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', sample_name].join("\t"));
	}

	var a = [], out = []; // a: earlier alignments still overlapping; out: variants not yet printed
	var c1_ctg = null, c1_start = 0, c1_end = 0, c1_counted = false, c1_len = 0;
	while (file.readline(buf) >= 0) {
		var line = buf.toString();
		var m, t = line.split("\t", 12);
		if (t.length < 12 || t[5] == '*') continue; // unmapped
		for (var i = 6; i <= 11; ++i)
			t[i] = parseInt(t[i], 10);
		if (t[10] < min_cov_len || t[11] < min_mapq) continue;
		//print(t[0], t[7], t[8], c1_start, c1_end);
		for (var i = 1; i <= 3; ++i)
			t[i] = parseInt(t[i], 10);
		var ctg = t[5], x = t[7], end = t[8];
		var query = t[0], rev = (t[4] == '-'), y = rev? t[3] : t[2];
		// collect tags
		var cs = null, tp = null, have_s1 = false, have_s2 = false;
		while ((m = re_tag.exec(line)) != null) {
			if (m[1] == 'cs:Z') cs = m[2];
			else if (m[1] == 'tp:A') tp = m[2];
			else if (m[1] == 's1:i') have_s1 = true;
			else if (m[1] == 's2:i') have_s2 = true;
		}
		if (have_s1 && !have_s2) continue;
		if (tp != null && (tp == 'S' || tp == 'i')) continue;
		// compute regions covered by 1 contig
		if (ctg != c1_ctg || x >= c1_end) {
			if (c1_counted && c1_end > c1_start) {
				c1_len += c1_end - c1_start;
				if (!is_vcf) print('R', c1_ctg, c1_start, c1_end);
			}
			c1_ctg = ctg, c1_start = x, c1_end = end;
			c1_counted = (t[10] >= min_var_len);
		} else if (end > c1_end) { // overlap
			if (c1_counted && x > c1_start) {
				c1_len += x - c1_start;
				if (!is_vcf) print('R', c1_ctg, c1_start, x);
			}
			c1_start = c1_end, c1_end = end;
			c1_counted = (t[10] >= min_var_len);
		} else if (end > c1_start) { // contained
			if (c1_counted && x > c1_start) {
				c1_len += x - c1_start;
				if (!is_vcf) print('R', c1_ctg, c1_start, x);
			}
			c1_start = end;
		} // else, the alignment precedes the cov1 region; do nothing
		// output variants ahead of this alignment
		while (out.length) {
			if (out[0][0] != ctg || out[0][2] <= x) {
				count_var(out[0]);
				if (is_vcf) print_vcf(out[0], fa);
				else print('V', out[0].join("\t"));
				out.shift();
			} else break;
		}
		// update coverage
		for (var i = 0; i < out.length; ++i)
			if (out[i][1] >= x && out[i][2] <= end)
				++out[i][3];
		// drop alignments that don't overlap with the current one
		var k = 0;
		for (var i = 0; i < a.length; ++i)
			if (a[i][0] == ctg && a[i][2] > x)
				a[k++] = a[i];
		a.length = k;
		// core loop
		if (t[10] >= min_var_len) {
			if (cs == null) continue; // no cs tag
			var blen = 0, n_diff = 0;
			tot_len += t[10];
			while ((m = re_cs.exec(cs)) != null) {
				// coverage at x: this alignment plus earlier ones still extending past x
				var cov = 1;
				if (m[1] == '*' || m[1] == '+' || m[1] == '-')
					for (var i = 0; i < a.length; ++i)
						if (a[i][2] > x) ++cov;
				var qs, qe;
				if (m[1] == '=' || m[1] == ':') { // match
					var l = m[1] == '='? m[2].length : parseInt(m[2], 10);
					if (rev) y -= l;
					else y += l;
					x += l, blen += l;
				} else if (m[1] == '*') { // substitution
					if (rev) qs = y - 1, qe = y, --y;
					else qs = y, qe = y + 1, ++y;
					var br = m[2].charAt(0), bq = m[2].charAt(1);
					if (br != 'n' && bq != 'n') { // don't call a SNP if there is an ambiguous base
						out.push([t[5], x, x+1, cov, t[11], br, bq, query, qs, qe, rev? '-' : '+']);
						++n_diff;
					}
					++x, ++blen;
				} else if (m[1] == '+') { // insertion to the reference
					var l = m[2].length;
					if (rev) qs = y - l, qe = y, y -= l;
					else qs = y, qe = y + l, y += l;
					out.push([t[5], x, x, cov, t[11], '-', m[2], query, qs, qe, rev? '-' : '+']);
					++blen, ++n_diff;
				} else if (m[1] == '-') { // deletion from the reference
					var l = m[2].length;
					out.push([t[5], x, x + l, cov, t[11], m[2], '-', query, y, y, rev? '-' : '+']);
					x += l, ++blen, ++n_diff;
				}
			}
		}
		a.push([t[5], t[7], t[8]]);
	}
	if (c1_counted && c1_end > c1_start) {
		c1_len += c1_end - c1_start;
		if (!is_vcf) print('R', c1_ctg, c1_start, c1_end);
	}
	while (out.length) { // flush the remaining variants
		count_var(out[0]);
		if (is_vcf) print_vcf(out[0], fa);
		else print('V', out[0].join("\t"));
		out.shift();
	}

	//warn(tot_len + " alignment columns considered in calling");
	warn(c1_len + " reference bases covered by exactly one contig");
	warn(n_sub[0] + " substitutions; ts/tv = " + (n_sub[1]/n_sub[2]).toFixed(3));
	warn(n_del[0] + " 1bp deletions");
	warn(n_ins[0] + " 1bp insertions");
	warn(n_del[1] + " 2bp deletions");
	warn(n_ins[1] + " 2bp insertions");
	warn(n_del[2] + " [3,"+gap_thres+") deletions");
	warn(n_ins[2] + " [3,"+gap_thres+") insertions");
	warn(n_del[3] + " ["+gap_thres+","+gap_thres_long+") deletions");
	warn(n_ins[3] + " ["+gap_thres+","+gap_thres_long+") insertions");
	warn(n_del[4] + " >=" + gap_thres_long + " deletions");
	warn(n_ins[4] + " >=" + gap_thres_long + " insertions");

	buf.destroy();
	file.close();
	if (fa != null) fasta_free(fa);
}
565
+
566
/**
 * "asmstat" command: print a table of assembly statistics (one column per PAF
 * file) given a reference .fai index and one or more assembly-to-reference
 * PAF files.
 *
 * Options -b and -g are parsed but only feed the disabled "nb-plot" code
 * below; note that a positive -b suppresses the table.
 *
 * @param args  command-line arguments following the command name
 */
function paf_asmstat(args)
{
	var c, min_query_len = 0, min_seg_len = 10000, max_diff = 0.01, bp_flank_len = 0, bp_gap_len = 0;
	while ((c = getopt(args, "l:d:b:g:q:")) != null) {
		if (c == 'l') min_seg_len = parseInt(getopt.arg, 10);
		else if (c == 'd') max_diff = parseFloat(getopt.arg);
		else if (c == 'b') bp_flank_len = parseInt(getopt.arg, 10);
		else if (c == 'g') bp_gap_len = parseInt(getopt.arg, 10);
		else if (c == 'q') min_query_len = parseInt(getopt.arg, 10);
	}
	if (getopt.ind == args.length) {
		print("Usage: paftools.js asmstat [options] <ref.fa.fai> <asm1.paf> [...]");
		print("Options:");
		print(" -q INT ignore query shorter than INT [0]");
		print(" -l INT min alignment block length [" + min_seg_len + "]");
		print(" -d FLOAT max gap-compressed sequence divergence [" + max_diff + "]");
		exit(1);
	}

	var file, buf = new Bytes();

	// total reference length: sum of the second column of the .fai file
	var ref_len = 0;
	file = new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		ref_len += parseInt(t[1], 10);
	}
	file.close();

	// Process the alignment blocks of one query. Each block is
	// [qstart, qend, strand, target name, tstart, tend].
	// Appends block lengths to qblock_len and break points [flank, gap] to bp
	// and qi.bp; returns the number of query bases covered.
	function process_query(qblocks, qblock_len, bp, qi) {
		qblocks.sort(function(a,b) { return a[0]-b[0]; });
		var last_k = null, last_blen = null, st = -1, en = -1, qcov = 0;
		for (var k = 0; k < qblocks.length; ++k) {
			var blen = qblocks[k][1] - qblocks[k][0];
			if (k > 0 && qblocks[k][0] < qblocks[k-1][1]) { // overlaps the previous block
				if (qblocks[k][1] < qblocks[k-1][1]) continue; // contained; skip
				blen = qblocks[k][1] - qblocks[k-1][1]; // count the non-overlapping part only
			}
			qblock_len.push(blen);
			if (qblocks[k][0] > en) {
				qcov += en - st;
				st = qblocks[k][0];
				en = qblocks[k][1];
			} else en = en > qblocks[k][1]? en : qblocks[k][1];
			if (last_k != null) {
				var gap = 1000000000;
				if (qblocks[k][2] == qblocks[last_k][2] && qblocks[k][3] == qblocks[last_k][3]) { // same chr and strand
					var g1 = qblocks[k][0] - qblocks[last_k][1];
					var g2 = qblocks[k][2] == '+'? qblocks[k][4] - qblocks[last_k][5] : qblocks[last_k][4] - qblocks[k][5];
					gap = g1 > g2? g1 - g2 : g2 - g1;
				}
				var min = blen < last_blen? blen : last_blen;
				// NOTE(review): k is never 0 here (last_k != null), so flank is always blen -- confirm intent
				var flank = k == 0? min : blen;
				bp.push([flank, gap]);
				qi.bp.push([flank, gap]);
			}
			last_k = k, last_blen = blen;
		}
		qcov += en - st;
		return qcov;
	}

	// Length of the element at which the running sum of the descending-sorted
	// lengths crosses quantile*tot; undefined if the sum never reaches it.
	// Sorts `lens` in place.
	function N50(lens, tot, quantile) {
		lens.sort(function(a,b) { return b - a; });
		if (tot == null) {
			tot = 0;
			for (var k = 0; k < lens.length; ++k)
				tot += lens[k];
		}
		var sum = 0;
		for (var k = 0; k < lens.length; ++k) {
			if (sum <= quantile * tot && sum + lens[k] > quantile * tot)
				return lens[k];
			sum += lens[k];
		}
	}

	// Sum of len*len/tot over the descending-sorted lengths, with the last
	// length truncated once the running sum reaches tot; returned as a
	// string via toFixed(0). Sorts `lens` in place.
	function AUN(lens, tot) {
		lens.sort(function(a,b) { return b - a; });
		if (tot == null) {
			tot = 0;
			for (var k = 0; k < lens.length; ++k)
				tot += lens[k];
		}
		var x = 0, y = 0;
		for (var k = 0; k < lens.length; ++k) {
			var l = x + lens[k] <= tot? lens[k] : tot - x;
			x += lens[k];
			y += l * (l / tot);
			if (x >= tot) break;
		}
		return y.toFixed(0);
	}

	// Count break points whose flank is >= min_blen and whose gap is >= min_gap.
	function count_bp(bp, min_blen, min_gap) {
		var n_bp = 0;
		for (var k = 0; k < bp.length; ++k)
			if (bp[k][0] >= min_blen && bp[k][1] >= min_gap)
				++n_bp;
		return n_bp;
	}

	// Gap-compressed divergence from a CIGAR string and the NM edit distance.
	function compute_diff(cigar, NM) {
		var m, re = /(\d+)([MID])/g;
		var n_M = 0, n_gapo = 0, n_gaps = 0;
		while ((m = re.exec(cigar)) != null) {
			var len = parseInt(m[1], 10);
			if (m[2] == 'M') n_M += len;
			else ++n_gapo, n_gaps += len;
		}
		if (NM < n_gaps) throw Error('NM is smaller than the number of gaps');
		return (NM - n_gaps + n_gapo) / (n_M + n_gapo);
	}

	// NOTE(review): the two bp(...) labels print min_seg_len, while the counts
	// below use a fixed minimum flank of 500 -- confirm which one is intended.
	var labels = ['Length', 'l_cov', 'Rcov', 'Rdup', 'Qcov', 'NG75', 'NG50', 'NGA50', 'AUNGA', '#breaks', 'bp(' + min_seg_len + ',0)', 'bp(' + min_seg_len + ',10k)'];
	var rst = [];
	for (var i = 0; i < labels.length; ++i)
		rst[i] = [];

	var n_asm = args.length - (getopt.ind + 1);
	var header = ["Metric"];
	for (var i = 0; i < n_asm; ++i) {
		var n_breaks = 0, qcov = 0;
		var fn = args[getopt.ind + 1 + i];
		var label = fn.replace(/\.paf(\.gz)?$/, ""); // strip a literal ".paf" or ".paf.gz" suffix only
		header.push(label);
		var ref_blocks = [], qblock_len = [], qblocks = [], bp = [];
		var query = {}, qinfo = {};
		var last_qname = null;
		file = new File(fn);
		while (file.readline(buf) >= 0) {
			var m, line = buf.toString();
			var t = line.split("\t");
			if (t.length < 2) continue; // check the field count before using t[1]
			t[1] = parseInt(t[1], 10);
			if (t[1] < min_query_len) continue;
			query[t[0]] = t[1];
			if (qinfo[t[0]] == null) qinfo[t[0]] = {};
			qinfo[t[0]].len = t[1];
			qinfo[t[0]].bp = [];
			if (t.length < 9 || t[5] == "*") continue;
			if (!/\ttp:A:[PI]/.test(line)) continue; // primary alignments only
			var cigar = (m = /\tcg:Z:(\S+)/.exec(line)) != null? m[1] : null;
			var NM = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1], 10) : null;
			var diff = cigar != null && NM != null? compute_diff(cigar, NM) : 0;
			t[2] = parseInt(t[2], 10);
			t[3] = parseInt(t[3], 10);
			t[7] = parseInt(t[7], 10);
			t[8] = parseInt(t[8], 10);
			if (t[0] == last_qname) ++n_breaks;
			if (diff > max_diff) continue;
			if (t[3] - t[2] < min_seg_len) continue;
			if (t[0] != last_qname) {
				if (last_qname != null)
					qcov += process_query(qblocks, qblock_len, bp, qinfo[last_qname]);
				qblocks = [];
				last_qname = t[0];
			}
			ref_blocks.push([t[5], t[7], t[8]]);
			qblocks.push([t[2], t[3], t[4], t[5], t[7], t[8]]);
		}
		if (last_qname != null)
			qcov += process_query(qblocks, qblock_len, bp, qinfo[last_qname]);
		file.close();

		// compute NG50
		var asm_len = 0, asm_lens = [];
		for (var ctg in query) {
			asm_len += query[ctg];
			asm_lens.push(query[ctg]);
		}
		rst[0][i] = asm_len;
		rst[5][i] = N50(asm_lens, ref_len, 0.75);
		rst[6][i] = N50(asm_lens, ref_len, 0.5);

		// compute coverage
		var l_cov = 0;
		ref_blocks.sort(function(a, b) { return a[0] > b[0]? 1 : a[0] < b[0]? -1 : a[1] - b[1]; });
		var last_ref = null, st = -1, en = -1;
		for (var j = 0; j < ref_blocks.length; ++j) {
			if (ref_blocks[j][0] != last_ref || ref_blocks[j][1] > en) {
				l_cov += en - st;
				last_ref = ref_blocks[j][0];
				st = ref_blocks[j][1];
				en = ref_blocks[j][2];
			} else en = en > ref_blocks[j][2]? en : ref_blocks[j][2];
		}
		l_cov += en - st;
		rst[1][i] = l_cov;
		rst[2][i] = (100.0 * (l_cov / ref_len)).toFixed(2) + '%';
		rst[4][i] = (100.0 * (qcov / asm_len)).toFixed(2) + '%';

		// compute cov1 and cov2+ lengths; see paf_call() for details
		var c1_ctg = null, c1_start = 0, c1_end = 0, c1_len = 0;
		for (var j = 0; j < ref_blocks.length; ++j) {
			if (ref_blocks[j][0] != c1_ctg || ref_blocks[j][1] >= c1_end) {
				if (c1_end > c1_start)
					c1_len += c1_end - c1_start;
				c1_ctg = ref_blocks[j][0], c1_start = ref_blocks[j][1], c1_end = ref_blocks[j][2];
			} else if (ref_blocks[j][2] > c1_end) { // overlap
				if (ref_blocks[j][1] > c1_start)
					c1_len += ref_blocks[j][1] - c1_start;
				c1_start = c1_end, c1_end = ref_blocks[j][2];
			} else if (ref_blocks[j][2] > c1_start) { // contained
				if (ref_blocks[j][1] > c1_start)
					c1_len += ref_blocks[j][1] - c1_start;
				c1_start = ref_blocks[j][2];
			}
			//print(ref_blocks[j][0], ref_blocks[j][1], ref_blocks[j][2], c1_start, c1_end, c1_len);
		}
		if (c1_end > c1_start)
			c1_len += c1_end - c1_start;
		rst[3][i] = (100 * (l_cov - c1_len) / l_cov).toFixed(2) + '%';

		// compute NGA50
		rst[7][i] = N50(qblock_len, ref_len, 0.5);

		// compute AUNGA
		rst[8][i] = AUN(qblock_len, ref_len);

		// compute break points
		rst[9][i] = n_breaks;
		rst[10][i] = count_bp(bp, 500, 0);
		rst[11][i] = count_bp(bp, 500, 10000);

		// nb-plot; NOT USED
		/*
		var qa = [];
		for (var qn in qinfo)
			qa.push([qinfo[qn].len, qinfo[qn].bp]);
		qa = qa.sort(function(a, b) { return b[0] - a[0] });
		var sum = 0, n_bp = 0, next_quantile = 0.1;
		for (var j = 0; j < qa.length; ++j) {
			sum += qa[j][0];
			for (var k = 0; k < qa[j][1].length; ++k)
				if (qa[j][1][k][0] >= bp_flank_len && qa[j][1][k][1] >= bp_gap_len)
					++n_bp;
			if (sum >= ref_len * next_quantile) {
				print(label, Math.floor(next_quantile * 100 + .5), qa[j][0], (sum / n_bp).toFixed(0), n_bp);
				next_quantile += 0.1;
				if (next_quantile >= 1.0) break;
			}
		}
		*/
	}
	buf.destroy();

	if (bp_flank_len <= 0) {
		print(header.join("\t"));
		for (var i = 0; i < labels.length; ++i)
			print(labels[i], rst[i].join("\t"));
	}
}
819
+
820
/**
 * asmgene: compare how completely a set of genes maps to a reference and to
 * one or more assemblies, given one PAF per genome (the first file is the
 * reference and decides which genes are evaluated).
 *
 * Options: -i FLOAT min identity, -c FLOAT min coverage, -e also print the
 * problematic genes, -a skip genes whose reference contig is X/Y.
 *
 * Output (via print): one 'H' header row and one 'X' row per metric, with one
 * column per input file.
 *
 * Fixes relative to the previous revision:
 *  - query coverage now restarts the running interval after a gap, so
 *    disjoint hits are summed correctly;
 *  - reference coordinates/lengths are stored as numbers, so the overlap and
 *    "longer gene" tests are numeric rather than lexicographic;
 *  - the last query of each file is flushed under its own name (a[0][0]);
 *  - an empty first file no longer raises on gene_list[0];
 *  - lines with fewer than 12 columns are skipped;
 *  - the ".paf"/".paf.gz" suffix regex matches literal dots only.
 *
 * NOTE(review): getopt, print, exit, File and Bytes come from the k8 runtime /
 * elsewhere in this file; their exact semantics are not visible here.
 */
function paf_asmgene(args)
{
	var c, opt = { min_cov:0.99, min_iden:0.99 }, print_err = false, auto_only = false;
	while ((c = getopt(args, "i:c:ea")) != null) {
		if (c == 'i') opt.min_iden = parseFloat(getopt.arg);
		else if (c == 'c') opt.min_cov = parseFloat(getopt.arg);
		else if (c == 'e') print_err = true;
		else if (c == 'a') auto_only = true;
	}

	var n_fn = args.length - getopt.ind;
	if (n_fn < 2) {
		print("Usage: paftools.js asmgene [options] <ref-splice.paf> <asm-splice.paf> [...]");
		print("Options:");
		print(" -i FLOAT min identity [" + opt.min_iden + "]");
		print(" -c FLOAT min coverage [" + opt.min_cov + "]");
		print(" -a only evaluate genes mapped to the autosomes");
		print(" -e print fragmented/missing genes");
		exit(1);
	}

	// Summarize all hits of one query. Each element of a[] is
	// [name, qlen, qstart, qend, mlen, blen]. Returns
	// [number of full-length hits, fraction of the query covered, number of hits kept].
	function process_query(opt, a) {
		var b = [], cnt = [0, 0, 0], j;
		for (j = 0; j < a.length; ++j) {
			if (a[j][4] < a[j][5] * opt.min_iden) // identity (mlen/blen) too low
				continue;
			b.push(a[j].slice(0));
		}
		if (b.length == 0) return cnt;
		// count hits that span (almost) the whole query
		var n_full = 0;
		for (j = 0; j < b.length; ++j)
			if (b[j][3] - b[j][2] >= b[j][1] * opt.min_cov)
				++n_full;
		cnt[0] = n_full;
		// length of the union of the query intervals
		b.sort(function(x, y) { return x[2] - y[2] });
		var l_cov = 0, st = b[0][2], en = b[0][3];
		for (j = 1; j < b.length; ++j) {
			if (b[j][2] <= en) { // overlaps/abuts the running interval: extend it
				if (b[j][3] > en) en = b[j][3];
			} else { // gap: close the running interval and start a new one
				l_cov += en - st;
				st = b[j][2], en = b[j][3];
			}
		}
		l_cov += en - st;
		cnt[1] = l_cov / b[0][1];
		cnt[2] = b.length;
		return cnt;
	}

	// Decide which result row a per-file summary belongs to and which one-letter
	// code is printed with -e (null: nothing to report).
	function classify(x) {
		if (x == null) return [5, 'M'];
		if (x[0] == 1) return [0, null];
		if (x[0] > 1) return [1, 'D'];
		if (x[1] >= opt.min_cov) return [2, 'F'];
		if (x[1] >= 0.5) return [3, '5'];
		if (x[1] >= 0.1) return [4, '1'];
		return [5, '0'];
	}

	var buf = new Bytes();
	var gene = {}, header = [], refpos = {};
	for (var i = getopt.ind; i < args.length; ++i) {
		var fn = args[i], col = i - getopt.ind;
		header.push(fn.replace(/\.paf(\.gz)?$/, ""));
		var file = new File(fn), a = [];
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			if (t.length < 12) continue; // not a complete PAF record
			var ql = parseInt(t[1]), qs = parseInt(t[2]), qe = parseInt(t[3]), mlen = parseInt(t[9]), blen = parseInt(t[10]);
			// position on the reference: [name, qlen, contig, start, end], numeric where compared
			if (col == 0) refpos[t[0]] = [t[0], ql, t[5], parseInt(t[7]), parseInt(t[8])];
			if (gene[t[0]] == null) gene[t[0]] = [];
			if (a.length && t[0] != a[0][0]) { // query name changed: flush the previous one
				gene[a[0][0]][col] = process_query(opt, a);
				a = [];
			}
			a.push([t[0], ql, qs, qe, mlen, blen]);
		}
		if (a.length)
			gene[a[0][0]][col] = process_query(opt, a);
		file.close();
	}

	// select the longest genes (not optimal, but should be good enough)
	var gene_list = [], gene_nr = {}, g;
	for (g in refpos)
		gene_list.push(refpos[g]);
	gene_list.sort(function(a, b) { return a[2] < b[2]? -1 : a[2] > b[2]? 1 : a[3] - b[3] });
	if (gene_list.length > 0) {
		var last = 0;
		for (var j = 1; j < gene_list.length; ++j) {
			if (gene_list[j][2] != gene_list[last][2] || gene_list[j][3] >= gene_list[last][4]) {
				gene_nr[gene_list[last][0]] = 1; // no overlap with the kept gene: commit it
				last = j;
			} else if (gene_list[j][1] > gene_list[last][1]) {
				last = j; // overlapping but longer: prefer this one
			}
		}
		gene_nr[gene_list[last][0]] = 1;
	}

	// count and print
	var col1 = ["full_sgl", "full_dup", "frag", "part50+", "part10+", "part10-", "dup_cnt", "dup_sum"];
	var rst = [], k;
	for (k = 0; k < col1.length; ++k) {
		rst[k] = [];
		for (i = 0; i < n_fn; ++i)
			rst[k][i] = 0;
	}
	for (g in gene) { // count single-copy genes
		if (gene[g][0] == null || gene[g][0][0] != 1) continue;
		if (gene_nr[g] == null) continue;
		if (auto_only && /^(chr)?[XY]$/.test(refpos[g][2])) continue;
		for (i = 0; i < n_fn; ++i) {
			var cls = classify(gene[g][i]);
			rst[cls[0]][i]++;
			if (print_err && cls[1] != null)
				print(cls[1], header[i], refpos[g].join("\t"));
		}
	}
	for (g in gene) { // count multi-copy genes
		if (gene[g][0] == null || gene[g][0][0] <= 1) continue;
		if (gene_nr[g] == null) continue;
		if (auto_only && /^(chr)?[XY]$/.test(refpos[g][2])) continue;
		for (i = 0; i < n_fn; ++i) {
			if (gene[g][i] != null) rst[7][i] += gene[g][i][0];
			if (gene[g][i] != null && gene[g][i][0] > 1) {
				rst[6][i]++;
			} else if (print_err) {
				print('d', header[i], gene[g][0][0], refpos[g].join("\t"));
			}
		}
	}
	print('H', 'Metric', header.join("\t"));
	for (k = 0; k < rst.length; ++k)
		print('X', col1[k], rst[k].join("\t"));
	buf.destroy();
}
962
+
963
/**
 * stat: print basic statistics of a SAM or PAF alignment file ('-' is passed
 * to File() without a name; presumably stdin -- confirm against k8).
 *
 * Options:
 *   -l INT  print one line per insertion/deletion of at least INT bases
 *   -c      print one per-alignment error line instead of the summary
 *
 * A line is treated as PAF when column 5 is '+', '-' or '*', otherwise as SAM.
 * PAF lines without an "s2:i:" tag and SAM lines with flag 0x100 are counted
 * as secondary and otherwise ignored. The summary is printed only when
 * neither -l nor -c is given.
 *
 * Fix relative to the previous revision: the third column of the -c output is
 * the parsed mapping quality (PAF column 12 / SAM column 5). It used to be
 * raw column 12, which for SAM is the first optional tag rather than MAPQ.
 *
 * NOTE(review): getopt, print, warn, exit, File and Bytes are provided
 * outside this block.
 */
function paf_stat(args)
{
	var c, gap_out_len = null, count_err = false;
	while ((c = getopt(args, "cl:")) != null) {
		if (c == 'l') gap_out_len = parseInt(getopt.arg);
		else if (c == 'c') count_err = true;
	}

	if (getopt.ind == args.length) {
		print("Usage: paftools.js stat [-c] [-l gapOutLen] <in.sam>|<in.paf>");
		exit(1);
	}

	var buf = new Bytes();
	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var re = /(\d+)([MIDSHNX=])/g; // stateful; every scan below runs to exhaustion, which resets lastIndex

	var lineno = 0, n_pri = 0, n_2nd = 0, n_seq = 0, n_cigar_64k = 0, l_tot = 0, l_cov = 0;
	var n_gap = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], n_sub = 0; // n_gap[0]: insertions; n_gap[1]: deletions
	var bin_label = ["[0,50)", "[50,100)", "[100,300)", "[300,400)", "[400,1000)", "[1000,inf)"];

	// Total length of the union of [start,end) intervals; regs must be non-empty.
	function cov_len(regs)
	{
		regs.sort(function(a,b) {return a[0]-b[0]});
		var st = regs[0][0], en = regs[0][1], l = 0;
		for (var i = 1; i < regs.length; ++i) {
			if (regs[i][0] < en)
				en = en > regs[i][1]? en : regs[i][1];
			else l += en - st, st = regs[i][0], en = regs[i][1];
		}
		l += en - st;
		return l;
	}

	// Index into bin_label/n_gap[*] for a gap of length l.
	function gap_bin(l)
	{
		if (l < 50) return 0;
		if (l < 100) return 1;
		if (l < 300) return 2;
		if (l < 400) return 3;
		if (l < 1000) return 4;
		return 5;
	}

	var last = null, last_qlen = null, regs = [];
	while (file.readline(buf) >= 0) {
		var line = buf.toString();
		++lineno;
		if (line.charAt(0) == '@') continue; // SAM header
		var t = line.split("\t", 12);
		var m, rs, cigar = null, is_sam = false, is_rev = false, tname = null;
		var atlen = null, aqlen, qs, qe, mapq, ori_qlen, NM = null, nn = 0;
		if (t.length < 2) continue;
		if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
			if (t[4] == '*') continue; // unmapped
			if (!/\ts2:i:\d+/.test(line)) {
				++n_2nd;
				continue;
			}
			if ((m = /\tNM:i:(\d+)/.exec(line)) != null)
				NM = parseInt(m[1]);
			if ((m = /\tnn:i:(\d+)/.exec(line)) != null)
				nn = parseInt(m[1]);
			if ((m = /\tcg:Z:(\S+)/.exec(line)) != null)
				cigar = m[1];
			if (cigar == null) {
				warn("WARNING: no CIGAR at line " + lineno);
				continue;
			}
			tname = t[5];
			qs = parseInt(t[2]), qe = parseInt(t[3]);
			aqlen = qe - qs;
			is_rev = t[4] == '+'? false : true;
			rs = parseInt(t[7]);
			atlen = parseInt(t[8]) - rs;
			mapq = parseInt(t[11]);
			ori_qlen = parseInt(t[1]);
		} else { // SAM
			var flag = parseInt(t[1]);
			if ((flag & 4) || t[2] == '*' || t[5] == '*') continue;
			if (flag & 0x100) {
				++n_2nd;
				continue;
			}
			if ((m = /\tNM:i:(\d+)/.exec(line)) != null)
				NM = parseInt(m[1]);
			if ((m = /\tnn:i:(\d+)/.exec(line)) != null)
				nn = parseInt(m[1]);
			cigar = t[5];
			tname = t[2];
			rs = parseInt(t[3]) - 1; // to 0-based
			mapq = parseInt(t[4]);
			aqlen = t[9].length;
			is_sam = true;
			is_rev = !!(flag&0x10);
		}
		++n_pri;
		if (last != t[0]) { // a new query name: account for the previous query
			if (last != null) {
				l_tot += last_qlen;
				l_cov += cov_len(regs);
			}
			regs = [];
			++n_seq, last = t[0];
		}
		// tl/ql: reference/query bases consumed so far; clip[0]/clip[1]: clipping before/after the first match
		var M = 0, tl = 0, ql = 0, clip = [0, 0], n_cigar = 0, sclip = 0;
		var n_gapo = 0, n_gap_all = 0, l_match = 0;
		while ((m = re.exec(cigar)) != null) {
			var l = parseInt(m[1]), op = m[2];
			++n_cigar;
			if (op == 'M' || op == '=' || op == 'X') {
				tl += l, ql += l, M += l;
				l_match += l;
			} else if (op == 'I' || op == 'D') {
				var type = gap_bin(l);
				if (op == 'I') ql += l, ++n_gap[0][type];
				else tl += l, ++n_gap[1][type];
				if (gap_out_len != null && l >= gap_out_len)
					print(t[0], ql, is_rev? '-' : '+', tname, rs + tl, op, l);
				++n_gapo, n_gap_all += l;
			} else if (op == 'N') {
				tl += l;
			} else if (op == 'S') {
				clip[M == 0? 0 : 1] = l, sclip += l;
			} else if (op == 'H') {
				clip[M == 0? 0 : 1] = l;
			}
		}
		if (NM != null) {
			var tmp = NM - n_gap_all - nn;
			if (tmp < 0 && nn == 0) warn("WARNING: NM is smaller than the number of gaps at line " + lineno + ": NM=" + NM + ", nn=" + nn + ", G=" + n_gap_all);
			if (tmp < 0) tmp = 0;
			n_sub += tmp;
		}
		if (n_cigar > 65535) ++n_cigar_64k;
		if (ql + sclip != aqlen)
			warn("WARNING: aligned query length is inconsistent with CIGAR at line " + lineno + " (" + (ql+sclip) + " != " + aqlen + ")");
		if (atlen != null && atlen != tl)
			warn("WARNING: aligned reference length is inconsistent with CIGAR at line " + lineno);
		if (is_sam) { // SAM has no query coordinates: derive them from clipping
			qs = clip[is_rev? 1 : 0], qe = qs + ql;
			ori_qlen = clip[0] + ql + clip[1];
		}
		if (count_err && NM != null) {
			var n_mm = NM - n_gap_all;
			if (n_mm < 0) warn("WARNING: NM is smaller than the number of gaps at line " + lineno);
			if (n_mm < 0) n_mm = 0;
			print(t[0], ori_qlen, mapq, ori_qlen - (qe - qs), NM, l_match + n_gap_all, n_mm + n_gapo, l_match + n_gapo);
		}
		regs.push([qs, qe]);
		last_qlen = ori_qlen;
	}
	if (regs.length) { // the last query
		l_tot += last_qlen;
		l_cov += cov_len(regs);
	}

	file.close();
	buf.destroy();

	if (gap_out_len == null && !count_err) {
		print("Number of mapped sequences: " + n_seq);
		print("Number of primary alignments: " + n_pri);
		print("Number of secondary alignments: " + n_2nd);
		print("Number of primary alignments with >65535 CIGAR operations: " + n_cigar_64k);
		print("Number of bases in mapped sequences: " + l_tot);
		print("Number of mapped bases: " + l_cov);
		print("Number of substitutions: " + n_sub);
		var kind = ["insertions", "deletions"];
		for (var x = 0; x < 2; ++x)
			for (var y = 0; y < bin_label.length; ++y)
				print("Number of " + kind[x] + " in " + bin_label[y] + ": " + n_gap[x][y]);
	}
}
1140
+
1141
/*
 * bedcov: for every record of the second BED file, report how many of its
 * bases are overlapped by the intervals of the first BED file.
 *
 * Options: -p print an 'F' line per record, -d dedup the first file instead
 * of merging it, -e FILE ignore blocks of the second file that overlap FILE.
 * Totals are written with warn().
 *
 * NOTE(review): Interval.merge/dedup/sort/index_end/find_ovlp are defined
 * outside this block; this code only assumes that find_ovlp() returns the
 * stored interval arrays themselves, since their counters are updated in place.
 */
function paf_bedcov(args)
{
	// Turn one tab-split BED record into a list of [start, end] pairs: one pair
	// per block for BED12 (column 10 all digits), otherwise the record itself.
	function record_blocks(f)
	{
		var chr_st = parseInt(f[1]);
		var chr_en = parseInt(f[2]);
		if (!(f.length >= 12 && /^\d+$/.test(f[9])))
			return [[chr_st, chr_en]];
		var n_blk = parseInt(f[9]);
		var sizes = f[10].split(",");
		var offs = f[11].split(",");
		var pairs = [];
		for (var k = 0; k < n_blk; ++k) {
			var off = parseInt(offs[k]);
			pairs.push([chr_st + off, chr_st + off + parseInt(sizes[k])]);
		}
		return pairs;
	}

	// Load a BED file into { contig: [[start, end, 0, 0, 0], ...] } and prepare
	// each per-contig list with the Interval helpers.
	function read_bed(fn, to_merge, to_dedup)
	{
		var fp = new File(fn);
		var line = new Bytes();
		var by_chr = {};
		while (fp.readline(line) >= 0) {
			var f = line.toString().split("\t");
			if (by_chr[f[0]] == null)
				by_chr[f[0]] = [];
			var pairs = record_blocks(f);
			for (var k = 0; k < pairs.length; ++k)
				by_chr[f[0]].push([pairs[k][0], pairs[k][1], 0, 0, 0]);
		}
		line.destroy();
		fp.close();
		for (var name in by_chr) {
			if (to_merge) Interval.merge(by_chr[name], false);
			else if (to_dedup) Interval.dedup(by_chr[name], false);
			else Interval.sort(by_chr[name]);
			Interval.index_end(by_chr[name]);
		}
		return by_chr;
	}

	var c, print_len = false, to_merge = true, to_dedup = false, fn_excl = null;
	while ((c = getopt(args, "pde:")) != null) {
		if (c == 'e') {
			fn_excl = getopt.arg;
		} else if (c == 'd') {
			to_dedup = true;
			to_merge = false;
		} else if (c == 'p') {
			print_len = true;
		}
	}

	if (args.length - getopt.ind < 2) {
		print("Usage: paftools.js bedcov [options] <regions.bed> <target.bed>");
		print("Options:");
		print(" -e FILE exclude target regions (2nd file) overlapping BED FILE []");
		print(" -p print number of covered bases for each target");
		exit(1);
	}

	var excl = null;
	if (fn_excl != null) excl = read_bed(fn_excl, true, false);
	var target = read_bed(args[getopt.ind], to_merge, to_dedup);

	var buf = new Bytes();
	var tot_len = 0, hit_len = 0;
	var fn2 = args[getopt.ind + 1];
	var file = fn2 != '-'? new File(fn2) : new File();
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		var chr = t[0], blocks = record_blocks(t);

		// keep the blocks that do not touch an excluded region
		var kept = [], feat_len = 0, i;
		for (i = 0; i < blocks.length; ++i) {
			if (excl != null && excl[chr] != null) {
				if (Interval.find_ovlp(excl[chr], blocks[i][0], blocks[i][1]).length > 0)
					continue;
			}
			kept.push(blocks[i]);
			feat_len += blocks[i][1] - blocks[i][0];
		}
		tot_len += feat_len;
		if (target[chr] == null) continue;

		// intersect each kept block with the overlapping region intervals
		var pieces = [];
		for (i = 0; i < kept.length; ++i) {
			var bs = kept[i][0], be = kept[i][1];
			var ov = Interval.find_ovlp(target[chr], bs, be);
			for (var j = 0; j < ov.length; ++j) {
				var lo = ov[j][0] > bs? ov[j][0] : bs;
				var hi = ov[j][1] < be? ov[j][1] : be;
				pieces.push([lo, hi]);
				ov[j][2] += hi - lo; // bases of this region that were hit
				++ov[j][3]; // number of hits
				if (lo == ov[j][0] && hi == ov[j][1])
					++ov[j][4]; // hits covering the whole region
			}
		}

		// length of the union of the intersections
		var feat_hit_len = 0;
		if (pieces.length > 0) {
			pieces.sort(function(x, y) { return x[0] - y[0] });
			var cur_st = pieces[0][0], cur_en = pieces[0][1];
			for (i = 1; i < pieces.length; ++i) {
				if (pieces[i][0] <= cur_en) {
					if (pieces[i][1] > cur_en) cur_en = pieces[i][1];
				} else {
					feat_hit_len += cur_en - cur_st;
					cur_st = pieces[i][0];
					cur_en = pieces[i][1];
				}
			}
			feat_hit_len += cur_en - cur_st;
		}
		hit_len += feat_hit_len;
		if (print_len) print('F', t.slice(0, 4).join("\t"), feat_len, feat_hit_len);
	}
	file.close();

	buf.destroy();

	warn("# target bases: " + tot_len);
	warn("# target bases overlapping regions: " + hit_len + ' (' + (100.0 * hit_len / tot_len).toFixed(2) + '%)');
}
1261
+
1262
/**
 * vcfpair: collapse a two-sample VCF (one sample column per haplotype, each
 * formatted as "a/b:AD") into a single phased sample column.
 *
 * Options: -m the sample is male, -g STR genome version (only '37' has PAR
 * coordinates here), -s STR output sample name.
 *
 * For every kept record: QUAL is set to 30, FILTER lists GAPn/HETn for a
 * haplotype whose genotype is missing/heterozygous, and the sample column
 * becomes "h1|h2:AD" with the per-allele depths summed over both inputs.
 * For a male, an expected missing haplotype on chrX outside the PARs (GAP1)
 * or on chrY (GAP2) is not reported as a filter.
 *
 * Changes relative to the previous revision:
 *  - the missing "-g" case throws an Error object (it threw a bare string),
 *    consistent with the other throw in this function;
 *  - summing AD no longer yields NaN when the second sample lists more values
 *    than the first;
 *  - the unused local GT is gone.
 *
 * NOTE(review): getopt, print, warn, exit, File and Bytes are provided
 * outside this block.
 */
function paf_vcfpair(args)
{
	var c, is_male = false, sample = 'syndip', hgver = null;
	var PAR = { '37':[[0, 2699520], [154931043, 155260560]] };
	while ((c = getopt(args, "ms:g:")) != null) {
		if (c == 'm') is_male = true;
		else if (c == 's') sample = getopt.arg;
		else if (c == 'g') hgver = getopt.arg;
	}
	if (is_male && (hgver == null || PAR[hgver] == null))
		throw Error("for a male, -g must be specified to properly handle PARs on chrX");

	if (getopt.ind == args.length) {
		print("Usage: paftools.js vcfpair [options] <in.pair.vcf>");
		print("Options:");
		print(" -m the sample is male");
		print(" -g STR human genome version '37' []");
		print(" -s STR sample name [" + sample + "]");
		exit(1);
	}

	// contigs to keep: numbered chromosomes and X, plus Y for a male
	var re_ctg = is_male? /^(chr)?([0-9]+|X|Y)$/ : /^(chr)?([0-9]+|X)$/;
	var label = ['1', '2'];
	var buf = new Bytes();
	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var m, t, i, j, line = buf.toString();
		if (line.charAt(0) == '#') {
			if (/^##(source|reference)=/.test(line)) continue;
			if ((m = /^##contig=.*ID=([^\s,]+)/.exec(line)) != null) {
				if (!re_ctg.test(m[1])) continue;
			} else if (/^#CHROM/.test(line)) {
				t = line.split("\t");
				--t.length; // two sample columns become one
				t[t.length-1] = sample;
				line = t.join("\t");
				print('##FILTER=<ID=HET1,Description="Heterozygous in the first haplotype">');
				print('##FILTER=<ID=HET2,Description="Heterozygous in the second haplotype">');
				print('##FILTER=<ID=GAP1,Description="Uncalled in the first haplotype">');
				print('##FILTER=<ID=GAP2,Description="Uncalled in the second haplotype">');
			}
			print(line);
			continue;
		}
		t = line.split("\t");
		if (!re_ctg.test(t[0])) continue;
		var AD = [], FILTER = [], HT = [null, null];
		for (i = 0; i < 2; ++i) {
			if ((m = /^(\.|[0-9]+)\/(\.|[0-9]+):(\S+)/.exec(t[9+i])) == null) {
				warn(line);
				throw Error("malformatted VCF");
			}
			var s = m[3].split(",");
			for (j = 0; j < s.length; ++j)
				AD[j] = (j < AD.length? AD[j] : 0) + parseInt(s[j]);
			if (m[1] == '.') {
				FILTER.push('GAP' + label[i]);
				HT[i] = '.';
			} else if (m[1] != m[2]) {
				FILTER.push('HET' + label[i]);
				HT[i] = '.';
			} else HT[i] = m[1];
		}
		--t.length;
		// test if this is in a haploid region: 0 = diploid, 2 = male chrX outside PARs, 1 = male chrY
		var hap = 0, st = parseInt(t[1]), en = st + t[3].length;
		if (is_male) {
			if (/^(chr)?X/.test(t[0])) {
				if (hgver != null && PAR[hgver] != null) {
					var r = PAR[hgver], in_par = false;
					for (i = 0; i < r.length; ++i)
						if (r[i][0] <= st && en <= r[i][1])
							in_par = true;
					hap = in_par? 0 : 2;
				}
			} else if (/^(chr)?Y/.test(t[0])) {
				hap = 1;
			}
		}
		// special treatment for haploid regions: the expected gap is not an error
		if (hap > 0 && FILTER.length == 1) {
			if ((hap == 2 && FILTER[0] == "GAP1") || (hap == 1 && FILTER[0] == "GAP2"))
				FILTER.length = 0;
		}
		// update VCF
		t[5] = 30; // fake QUAL
		t[6] = FILTER.length? FILTER.join(";") : ".";
		t[9] = HT.join("|") + ":" + AD.join(",");
		print(t.join("\t"));
	}
	file.close();
	buf.destroy();
}
1360
+
1361
+ /**********************
1362
+ * Conversion related *
1363
+ **********************/
1364
+
1365
/*
 * view: convert PAF to a BLAST-like text alignment ("aln", the default), MAF
 * or LASTZ-cigar.
 *
 * Options: -f STR output format, -l INT line width of the BLAST-like output
 * (0 means no wrapping).
 *
 * "aln" and "maf" need the cs tag ("maf" rejects the short ':' form);
 * "lastz-cigar" needs the cg tag. Records without the required tag are
 * skipped with a warning.
 *
 * NOTE(review): Bytes comes from the k8 runtime; this code relies on set()
 * growing the buffer at its end and on assigning .length = 0 to empty it --
 * confirm against the k8 documentation.
 */
function paf_view(args)
{
	var c, line_len = 80, fmt = "aln";
	var known_fmt = { "aln":true, "lastz-cigar":true, "maf":true };
	while ((c = getopt(args, "f:l:")) != null) {
		if (c == 'l') {
			line_len = parseInt(getopt.arg);
		} else if (c == 'f') {
			fmt = getopt.arg;
			if (known_fmt[fmt] !== true)
				throw Error("format must be one of aln, lastz-cigar and maf");
		}
	}
	if (line_len == 0) line_len = 0x7fffffff;

	if (getopt.ind == args.length) {
		print("Usage: paftools.js view [options] <in.paf>");
		print("Options:");
		print(" -f STR output format: aln (BLAST-like), maf or lastz-cigar [aln]");
		print(" -l INT line length in BLAST-like output [80]");
		exit(1);
	}

	// A string made of n copies of ch.
	function run_of(ch, n)
	{
		return Array(n + 1).join(ch);
	}

	// Pad x with spaces to at least len characters; right=true pads on the right.
	function padding_str(x, len, right)
	{
		var s = x.toString();
		if (s.length >= len) return s;
		var fill = run_of(" ", len - s.length);
		return right? s + fill : fill + s;
	}

	// Add one cs operation to the three display rows and advance the
	// reference (slen[0]) and query (slen[1]) counters accordingly.
	function update_aln(s_ref, s_qry, s_mid, type, seq, slen)
	{
		var n = type == '*'? 1 : seq.length;
		switch (type) {
		case '=':
		case ':': // identical bases
			s_ref.set(seq);
			s_qry.set(seq);
			s_mid.set(run_of("|", n));
			slen[0] += n;
			slen[1] += n;
			break;
		case '*': // substitution: seq is the reference base followed by the query base
			s_ref.set(seq.charAt(0));
			s_qry.set(seq.charAt(1));
			s_mid.set(' ');
			slen[0] += 1;
			slen[1] += 1;
			break;
		case '+': // insertion to the reference
			s_ref.set(run_of("-", n));
			s_qry.set(seq);
			s_mid.set(run_of(" ", n));
			slen[1] += n;
			break;
		case '-': // deletion from the reference
			s_ref.set(seq);
			s_qry.set(run_of("-", n));
			s_mid.set(run_of(" ", n));
			slen[0] += n;
			break;
		}
	}

	// Print one three-row block; slen/elen are the consumed lengths at the
	// start/end of the block.
	function print_aln(rs, qs, strand, slen, elen, s_ref, s_qry, s_mid)
	{
		print(["Ref+:", padding_str(rs + slen[0] + 1, 10, false), s_ref.toString(), padding_str(rs + elen[0], 10, true)].join(" "));
		print(" " + s_mid.toString());
		var st, en;
		if (strand == '+') {
			st = qs + slen[1] + 1;
			en = qs + elen[1];
		} else { // reverse strand: query coordinates run downwards from qs
			st = qs - slen[1];
			en = qs - elen[1] + 1;
		}
		print(["Qry" + strand + ":", padding_str(st, 10, false), s_qry.toString(), padding_str(en, 10, true)].join(" "));
	}

	var s_ref = new Bytes(), s_qry = new Bytes(), s_mid = new Bytes(); // these are used to show padded alignment
	var re_cs = /([:=\-\+\*])(\d+|[A-Za-z]+)/g;
	var re_cg = /(\d+)([MIDNSHP=X])/g;
	var lineno = 0;

	// LASTZ-cigar output
	function to_lastz(line, t)
	{
		var m = /\tcg:Z:(\S+)/.exec(line);
		if (m == null) {
			warn("WARNING: converting to LASTZ-cigar format requires the 'cg' tag, which is absent on line " + lineno);
			return;
		}
		var cg = m[1];
		m = /\tAS:i:(\d+)/.exec(line);
		var score = m != null? m[1] : 0;
		var out = ['cigar:', t[0], t[2], t[3], t[4], t[5], t[7], t[8], '+', score];
		while ((m = re_cg.exec(cg)) != null)
			out.push(m[2], m[1]);
		print(out.join(" "));
	}

	// MAF output
	function to_maf(line, t)
	{
		var m = /\tcs:Z:(\S+)/.exec(line);
		if (m == null) {
			warn("WARNING: converting to MAF requires the 'cs' tag, which is absent on line " + lineno);
			return;
		}
		var cs = m[1], elen = [0, 0];
		while ((m = re_cs.exec(cs)) != null) {
			if (m[1] == ':')
				throw Error("converting to MAF only works with 'minimap2 --cs=long'");
			update_aln(s_ref, s_qry, s_mid, m[1], m[2], elen);
		}
		m = /\tAS:i:(\d+)/.exec(line);
		var score = m != null? parseInt(m[1]) : 0;
		var len = t[0].length > t[5].length? t[0].length : t[5].length;
		print("a " + score);
		print(["s", padding_str(t[5], len, true), padding_str(t[7], 10, false), padding_str(parseInt(t[8]) - parseInt(t[7]), 10, false),
			   "+", padding_str(t[6], 10, false), s_ref.toString()].join(" "));
		var qs, qe, ql = parseInt(t[1]);
		if (t[4] == '+') {
			qs = parseInt(t[2]);
			qe = parseInt(t[3]);
		} else { // coordinates on the reverse strand of the query
			qs = ql - parseInt(t[3]);
			qe = ql - parseInt(t[2]);
		}
		print(["s", padding_str(t[0], len, true), padding_str(qs, 10, false), padding_str(qe - qs, 10, false),
			   t[4], padding_str(ql, 10, false), s_qry.toString()].join(" "));
		print("");
	}

	// BLAST-like output
	function to_blast(line, t)
	{
		var m = /\tcs:Z:(\S+)/.exec(line);
		if (m == null) {
			warn("WARNING: converting to BLAST-like alignment requires the 'cs' tag, which is absent on line " + lineno);
			return;
		}
		var cs = m[1], slen = [0, 0], elen = [0, 0];

		// first pass: count mismatches and gap opens/extensions for the header line
		var n_mm = 0, n_oi = 0, n_od = 0, n_ei = 0, n_ed = 0;
		while ((m = re_cs.exec(cs)) != null) {
			if (m[1] == '*') {
				++n_mm;
			} else if (m[1] == '+') {
				++n_oi;
				n_ei += m[2].length;
			} else if (m[1] == '-') {
				++n_od;
				n_ed += m[2].length;
			}
		}
		line = line.replace(/\tc[sg]:Z:\S+/g, ""); // get rid of cs or cg tags
		print('>' + line + "\tmm:i:"+n_mm + "\toi:i:"+n_oi + "\tei:i:"+n_ei + "\tod:i:"+n_od + "\ted:i:"+n_ed);

		// second pass: emit the alignment, wrapped at line_len columns
		var rs = parseInt(t[7]);
		var qs = t[4] == '+'? parseInt(t[2]) : parseInt(t[3]);
		var n_blocks = 0;
		while ((m = re_cs.exec(cs)) != null) {
			var op = m[1];
			if (op == ':') m[2] = run_of("=", parseInt(m[2])); // expand the short form
			var start = 0, rest = op == '*'? 1 : m[2].length;
			while (rest > 0) {
				var wrap = s_ref.length + rest >= line_len;
				var l_proc = wrap? line_len - s_ref.length : rest;
				var piece = op == '*'? m[2] : m[2].substr(start, l_proc);
				update_aln(s_ref, s_qry, s_mid, op, piece, elen);
				if (wrap) { // the row is full: print it and start a new one
					if (n_blocks > 0) print("");
					print_aln(rs, qs, t[4], slen, elen, s_ref, s_qry, s_mid);
					++n_blocks;
					s_mid.length = 0;
					s_qry.length = 0;
					s_ref.length = 0;
					slen[0] = elen[0];
					slen[1] = elen[1];
				}
				rest -= l_proc;
				start += l_proc;
			}
		}
		if (s_ref.length > 0) { // the last, partially filled row
			if (n_blocks > 0) print("");
			print_aln(rs, qs, t[4], slen, elen, s_ref, s_qry, s_mid);
			++n_blocks;
		}
		print("//");
	}

	var buf = new Bytes();
	var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	if (fmt == "maf") print("##maf version=1\n");
	while (file.readline(buf) >= 0) {
		var line = buf.toString();
		var t = line.split("\t", 12);
		++lineno;
		s_mid.length = 0;
		s_qry.length = 0;
		s_ref.length = 0;
		if (fmt == "lastz-cigar") to_lastz(line, t);
		else if (fmt == "maf") to_maf(line, t);
		else to_blast(line, t);
	}
	file.close();
	buf.destroy();

	s_ref.destroy(); s_qry.destroy(); s_mid.destroy();
}
1532
+
1533
+ function paf_gff2bed(args)
1534
+ {
1535
+ var c, fn_ucsc_fai = null, is_short = false, keep_gff = false, print_junc = false, output_gene = false;
1536
+ while ((c = getopt(args, "u:sgjG")) != null) {
1537
+ if (c == 'u') fn_ucsc_fai = getopt.arg;
1538
+ else if (c == 's') is_short = true;
1539
+ else if (c == 'g') keep_gff = true;
1540
+ else if (c == 'j') print_junc = true;
1541
+ else if (c == 'G') output_gene = true;
1542
+ }
1543
+
1544
+ if (getopt.ind == args.length) {
1545
+ print("Usage: paftools.js gff2bed [options] <in.gff>");
1546
+ print("Options:");
1547
+ print(" -j Output junction BED");
1548
+ print(" -s Print names in the short form");
1549
+ print(" -u FILE hg38.fa.fai for chr name conversion");
1550
+ print(" -g Output GFF (used with -u)");
1551
+ exit(1);
1552
+ }
1553
+
1554
+ var ens2ucsc = {};
1555
+ if (fn_ucsc_fai != null) {
1556
+ var buf = new Bytes();
1557
+ var file = new File(fn_ucsc_fai);
1558
+ while (file.readline(buf) >= 0) {
1559
+ var t = buf.toString().split("\t");
1560
+ var s = t[0];
1561
+ if (/_(random|alt|decoy)$/.test(s)) {
1562
+ s = s.replace(/_(random|alt|decoy)$/, '');
1563
+ s = s.replace(/^chr\S+_/, '');
1564
+ } else {
1565
+ s = s.replace(/^chrUn_/, '');
1566
+ }
1567
+ s = s.replace(/v(\d+)/, ".$1");
1568
+ if (s != t[0]) ens2ucsc[s] = t[0];
1569
+ }
1570
+ file.close();
1571
+ buf.destroy();
1572
+ }
1573
+
1574
+ var colors = {
1575
+ 'protein_coding':'0,128,255',
1576
+ 'mRNA':'0,128,255',
1577
+ 'lincRNA':'0,192,0',
1578
+ 'snRNA':'0,192,0',
1579
+ 'miRNA':'0,192,0',
1580
+ 'misc_RNA':'0,192,0'
1581
+ };
1582
+
1583
+ function print_bed12(exons, cds_st, cds_en, is_short, print_junc)
1584
+ {
1585
+ if (exons.length == 0) return;
1586
+ var name = is_short? exons[0][7] + "|" + exons[0][5] : exons[0].slice(4, 7).join("|");
1587
+ var a = exons.sort(function(a,b) {return a[1]-b[1]});
1588
+ if (print_junc) {
1589
+ for (var i = 1; i < a.length; ++i)
1590
+ print(a[i][0], a[i-1][2], a[i][1], name, 1000, a[i][3]);
1591
+ return;
1592
+ }
1593
+ var sizes = [], starts = [], st, en;
1594
+ st = a[0][1];
1595
+ en = a[a.length - 1][2];
1596
+ if (cds_st == 1<<30) cds_st = st;
1597
+ if (cds_en == 0) cds_en = en;
1598
+ if (cds_st < st || cds_en > en)
1599
+ throw Error("inconsistent thick start or end for transcript " + a[0][4]);
1600
+ for (var i = 0; i < a.length; ++i) {
1601
+ sizes.push(a[i][2] - a[i][1]);
1602
+ starts.push(a[i][1] - st);
1603
+ }
1604
+ var color = colors[a[0][5]];
1605
+ if (color == null) color = '196,196,196';
1606
+ print(a[0][0], st, en, name, 1000, a[0][3], cds_st, cds_en, color, a.length, sizes.join(",") + ",", starts.join(",") + ",");
1607
+ }
1608
+
1609
+ var re_gtf = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name) "([^"]+)";/g;
1610
+ var re_gff3 = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name)=([^;]+)/g;
1611
+ var re_gtf_gene = /\b(gene_id|gene_type|gene_name) "([^;]+)";/g;
1612
+ var re_gff3_gene = /\b(gene_id|gene_type|source_gene|gene_biotype|gene_name)=([^;]+);/g;
1613
+ var buf = new Bytes();
1614
+ var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
1615
+
1616
+ var exons = [], cds_st = 1<<30, cds_en = 0, last_id = null;
1617
+ while (file.readline(buf) >= 0) {
1618
+ var t = buf.toString().split("\t");
1619
+ if (keep_gff) {
1620
+ if (t[0].charAt(0) != '#' && ens2ucsc[t[0]] != null)
1621
+ t[0] = ens2ucsc[t[0]];
1622
+ print(t.join("\t"));
1623
+ continue;
1624
+ }
1625
+ if (t[0].charAt(0) == '#') continue;
1626
+ if (output_gene) {
1627
+ var id = null, src = null, biotype = null, type = "", name = "N/A";
1628
+ if (t[2] != "gene") continue;
1629
+ while ((m = re_gtf_gene.exec(t[8])) != null) {
1630
+ if (m[1] == "gene_id") id = m[2];
1631
+ else if (m[1] == "gene_type") type = m[2];
1632
+ else if (m[1] == "gene_name") name = m[2];
1633
+ }
1634
+ while ((m = re_gff3_gene.exec(t[8])) != null) {
1635
+ if (m[1] == "gene_id") id = m[2];
1636
+ else if (m[1] == "source_gene") src = m[2];
1637
+ else if (m[1] == "gene_type") type = m[2];
1638
+ else if (m[1] == "gene_biotype") biotype = m[2];
1639
+ else if (m[1] == "gene_name") name = m[2];
1640
+ }
1641
+ if (src != null) id = src;
1642
+ if (type == "" && biotype != null) type = biotype;
1643
+ print(t[0], parseInt(t[3]) - 1, t[4], [id, type, name].join("|"), 1000, t[6]);
1644
+ continue;
1645
+ }
1646
+ if (t[2] != "CDS" && t[2] != "exon") continue;
1647
+ t[3] = parseInt(t[3]) - 1;
1648
+ t[4] = parseInt(t[4]);
1649
+ var id = null, type = "", name = "N/A", biotype = "", m, tname = "N/A";
1650
+ while ((m = re_gtf.exec(t[8])) != null) {
1651
+ if (m[1] == "transcript_id") id = m[2];
1652
+ else if (m[1] == "transcript_type") type = m[2];
1653
+ else if (m[1] == "transcript_biotype" || m[1] == "gbkey") biotype = m[2];
1654
+ else if (m[1] == "gene_name" || m[1] == "gene_id") name = m[2];
1655
+ else if (m[1] == "transcript_name") tname = m[2];
1656
+ }
1657
+ while ((m = re_gff3.exec(t[8])) != null) {
1658
+ if (m[1] == "transcript_id") id = m[2];
1659
+ else if (m[1] == "transcript_type") type = m[2];
1660
+ else if (m[1] == "transcript_biotype" || m[1] == "gbkey") biotype = m[2];
1661
+ else if (m[1] == "gene_name" || m[1] == "gene_id") name = m[2];
1662
+ else if (m[1] == "transcript_name") tname = m[2];
1663
+ }
1664
+ if (type == "" && biotype != "") type = biotype;
1665
+ if (id == null) throw Error("No transcript_id");
1666
+ if (id != last_id) {
1667
+ print_bed12(exons, cds_st, cds_en, is_short, print_junc);
1668
+ exons = [], cds_st = 1<<30, cds_en = 0;
1669
+ last_id = id;
1670
+ }
1671
+ if (t[2] == "CDS") {
1672
+ cds_st = cds_st < t[3]? cds_st : t[3];
1673
+ cds_en = cds_en > t[4]? cds_en : t[4];
1674
+ } else if (t[2] == "exon") {
1675
+ if (fn_ucsc_fai != null) {
1676
+ if (ens2ucsc[t[0]] != null)
1677
+ t[0] = ens2ucsc[t[0]];
1678
+ else if (/^[A-Z]+\d+\.\d+$/.test(t[0]))
1679
+ t[0] = t[0].replace(/([A-Z]+\d+)\.(\d+)/, "chrUn_$1v$2");
1680
+ }
1681
+ exons.push([t[0], t[3], t[4], t[6], id, type, name, tname]);
1682
+ }
1683
+ }
1684
+ if (last_id != null)
1685
+ print_bed12(exons, cds_st, cds_en, is_short, print_junc);
1686
+
1687
+ file.close();
1688
+ buf.destroy();
1689
+ }
1690
+
1691
+ function paf_sam2paf(args)
1692
+ {
1693
+ var c, pri_only = false, long_cs = false;
1694
+ while ((c = getopt(args, "pL")) != null) {
1695
+ if (c == 'p') pri_only = true;
1696
+ else if (c == 'L') long_cs = true;
1697
+ }
1698
+ if (args.length == getopt.ind) {
1699
+ print("Usage: paftools.js sam2paf [options] <in.sam>");
1700
+ print("Options:");
1701
+ print(" -p convert primary or supplementary alignments only");
1702
+ print(" -L output the cs tag in the long form");
1703
+ exit(1);
1704
+ }
1705
+
1706
+ var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
1707
+ var buf = new Bytes();
1708
+ var re = /(\d+)([MIDSHNX=])/g, re_MD = /(\d+)|(\^[A-Za-z]+)|([A-Za-z])/g, re_tag = /\t(\S\S:[AZif]):(\S+)/g;
1709
+
1710
+ var ctg_len = {}, lineno = 0;
1711
+ while (file.readline(buf) >= 0) {
1712
+ var m, n_cigar = 0, line = buf.toString();
1713
+ ++lineno;
1714
+ if (line.charAt(0) == '@') {
1715
+ if (/^@SQ/.test(line)) {
1716
+ var name = (m = /\tSN:(\S+)/.exec(line)) != null? m[1] : null;
1717
+ var l = (m = /\tLN:(\d+)/.exec(line)) != null? parseInt(m[1]) : null;
1718
+ if (name != null && l != null) ctg_len[name] = l;
1719
+ }
1720
+ continue;
1721
+ }
1722
+ var t = line.split("\t", 11);
1723
+ var flag = parseInt(t[1]);
1724
+ if (t[9] != '*' && t[10] != '*' && t[9].length != t[10].length)
1725
+ throw Error("at line " + lineno + ": inconsistent SEQ and QUAL lengths - " + t[9].length + " != " + t[10].length);
1726
+ if (t[2] == '*' || (flag&4) || t[5] == '*') continue;
1727
+ if (pri_only && (flag&0x100)) continue;
1728
+ var tlen = ctg_len[t[2]];
1729
+ if (tlen == null) throw Error("at line " + lineno + ": can't find the length of contig " + t[2]);
1730
+ // find tags
1731
+ var nn = 0, NM = null, MD = null, cs_str = null, md_list = [];
1732
+ while ((m = re_tag.exec(line)) != null) {
1733
+ if (m[1] == "NM:i") NM = parseInt(m[2]);
1734
+ else if (m[1] == "nn:i") nn = parseInt(m[2]);
1735
+ else if (m[1] == "MD:Z") MD = m[2];
1736
+ else if (m[1] == "cs:Z") cs_str = m[2];
1737
+ }
1738
+ if (t[9] == '*') MD = cs_str = null;
1739
+ // infer various lengths from CIGAR
1740
+ var clip = [0, 0], soft_clip = 0, I = [0, 0], D = [0, 0], M = 0, N = 0, mm = 0, have_M = false, have_ext = false, cigar = [];
1741
+ while ((m = re.exec(t[5])) != null) {
1742
+ var l = parseInt(m[1]), op = m[2];
1743
+ if (op == 'M') M += l, have_M = true;
1744
+ else if (op == 'I') ++I[0], I[1] += l;
1745
+ else if (op == 'D') ++D[0], D[1] += l;
1746
+ else if (op == 'N') N += l;
1747
+ else if (op == 'S') clip[n_cigar == 0? 0 : 1] = l, soft_clip += l;
1748
+ else if (op == 'H') clip[n_cigar == 0? 0 : 1] = l;
1749
+ else if (op == '=') M += l, have_ext = true, op = 'M';
1750
+ else if (op == 'X') M += l, mm += l, have_ext = true, op = 'M';
1751
+ ++n_cigar;
1752
+ if (MD != null && op != 'H') {
1753
+ if (cigar.length > 0 && cigar[cigar.length-1][1] == op)
1754
+ cigar[cigar.length-1][0] += l;
1755
+ else cigar.push([l, op]);
1756
+ }
1757
+ }
1758
+ var ql = M + I[1] + soft_clip;
1759
+ var tl = M + D[1] + N;
1760
+ var ts = parseInt(t[3]) - 1, te = ts + tl;
1761
+ // checking coordinate and length consistencies
1762
+ if (n_cigar > 65535)
1763
+ warn("WARNING at line " + lineno + ": " + n_cigar + " CIGAR operations");
1764
+ if (te > tlen) {
1765
+ warn("WARNING at line " + lineno + ": alignment end position larger than ref length; skipped");
1766
+ continue;
1767
+ }
1768
+ if (t[9] != '*' && t[9].length != ql) {
1769
+ warn("WARNING at line " + lineno + ": SEQ length inconsistent with CIGAR (" + t[9].length + " != " + ql + "); skipped");
1770
+ continue;
1771
+ }
1772
+ // parse MD
1773
+ var cs = [];
1774
+ if (MD != null && cs_str == null && t[9] != "*") {
1775
+ var k = 0, cx = 0, cy = 0, mx = 0, my = 0; // cx: cigar ref position; cy: cigar query; mx: MD ref; my: MD query
1776
+ while ((m = re_MD.exec(MD)) != null) {
1777
+ if (m[2] != null) { // deletion from the reference
1778
+ var len = m[2].length - 1;
1779
+ cs.push('-', m[2].substr(1));
1780
+ mx += len, cx += len, ++k;
1781
+ } else { // copy or mismatch
1782
+ var ml = m[1] != null? parseInt(m[1]) : 1;
1783
+ while (k < cigar.length && cigar[k][1] != 'D') {
1784
+ var cl = cigar[k][0], op = cigar[k][1];
1785
+ if (op == 'M') {
1786
+ if (my + ml < cy + cl) {
1787
+ if (ml > 0) {
1788
+ if (m[3] != null) cs.push('*', m[3], t[9][my]);
1789
+ else if (long_cs) cs.push('=', t[9].substr(my, ml));
1790
+ else cs.push(':', ml);
1791
+ }
1792
+ mx += ml, my += ml, ml = 0;
1793
+ break;
1794
+ } else {
1795
+ var dl = cy + cl - my;
1796
+ if (long_cs) cs.push('=', t[9].substr(my, dl));
1797
+ else cs.push(':', dl);
1798
+ cx += cl, cy += cl, ++k;
1799
+ mx += dl, my += dl, ml -= dl;
1800
+ }
1801
+ } else if (op == 'I') {
1802
+ cs.push('+', t[9].substr(cy, cl));
1803
+ cy += cl, my += cl, ++k;
1804
+ } else if (op == 'S') {
1805
+ cy += cl, my += cl, ++k;
1806
+ } else throw Error("at line " + lineno + ": inconsistent MD tag");
1807
+ }
1808
+ if (ml != 0) throw Error("at line " + lineno + ": inconsistent MD tag");
1809
+ }
1810
+ }
1811
+ if (cx != mx || cy != my) throw Error("at line " + lineno + ": inconsistent MD tag");
1812
+ }
1813
+ // compute matching length, block length and calibrate NM
1814
+ if (have_ext && !have_M) { // extended CIGAR
1815
+ if (NM != null && NM != I[1] + D[1] + mm)
1816
+ warn("WARNING at line " + lineno + ": NM is different from sum of gaps and mismatches");
1817
+ NM = I[1] + D[1] + mm;
1818
+ } else if (NM != null) { // standard CIGAR; NM present
1819
+ if (NM < I[1] + D[1]) {
1820
+ warn("WARNING at line " + lineno + ": NM is less than the total number of gaps (" + NM + " < " + (I[1]+D[1]) + ")");
1821
+ NM = I[1] + D[1];
1822
+ }
1823
+ mm = NM - (I[1] + D[1]);
1824
+ } else { // no way to compute mm
1825
+ warn("WARNING at line " + lineno + ": unable to find the number of mismatches; assuming zero");
1826
+ mm = 0;
1827
+ }
1828
+ var mlen = M - mm;
1829
+ var blen = M + I[1] + D[1];
1830
+ // find query name, start and end
1831
+ var qlen = M + I[1] + clip[0] + clip[1];
1832
+ var qname = t[0], qs, qe;
1833
+ if ((flag&1) && (flag&0x40)) qname += '/1';
1834
+ if ((flag&1) && (flag&0x80)) qname += '/2';
1835
+ if (flag&16) qs = clip[1], qe = qlen - clip[0];
1836
+ else qs = clip[0], qe = qlen - clip[1];
1837
+ // optional tags
1838
+ var type = flag&0x100? 'S' : 'P';
1839
+ var tags = ["tp:A:" + type];
1840
+ if (NM != null) tags.push("mm:i:"+mm);
1841
+ tags.push("gn:i:"+(I[1]+D[1]), "go:i:"+(I[0]+D[0]), "cg:Z:" + t[5].replace(/\d+[SH]/g, ''));
1842
+ if (cs_str != null) tags.push("cs:Z:" + cs_str);
1843
+ else if (cs.length > 0) tags.push("cs:Z:" + cs.join(""));
1844
+ // print out
1845
+ var a = [qname, qlen, qs, qe, flag&16? '-' : '+', t[2], tlen, ts, te, mlen, blen, t[4]];
1846
+ print(a.join("\t"), tags.join("\t"));
1847
+ }
1848
+
1849
+ buf.destroy();
1850
+ file.close();
1851
+ }
1852
+
1853
// Convert a delta file to PAF.
//
// Lines before the first ">ref qry reflen qrylen" header are ignored. After a
// header, a 7-field line starts an alignment (ref start/end, query start/end,
// edit count, ...) and each following single-number line is one delta:
//   d > 0: d-1 matching columns, then one reference base not in the query (D)
//   d < 0: -d-1 matching columns, then one query base not in the reference (I)
//   d = 0: end of the alignment; the record is printed
// CIGAR operations are packed as (length << 4 | code), code indexing "MID".
// Throws when the remaining reference and query spans disagree at d == 0.
function paf_delta2paf(args)
{
	if (args.length == 0) {
		print("Usage: paftools.js delta2paf <in.delta>");
		exit(1);
	}

	var OP_CHARS = "MID";
	var buf = new Bytes();
	var file = args[0] == '-'? new File() : new File(args[0]);

	// Append `run` match columns (if any) followed by one gap column of type
	// `code`; a gap directly following a gap of the same type extends it.
	function add_gap(ops, run, code)
	{
		if (run > 0) ops.push(run << 4);
		var tail = ops.length - 1;
		if (ops.length > 0 && (ops[tail] & 0xf) == code) ops[tail] += 1 << 4;
		else ops.push(1 << 4 | code);
	}

	var ref_name, qry_name, ref_len, qry_len;
	var q_st, q_en, r_st, r_en, strand, n_diff, ops, r_used, q_used;
	var in_body = false;
	while (file.readline(buf) >= 0) {
		var line = buf.toString();
		var hdr = /^>(\S+)\s+(\S+)\s+(\d+)\s+(\d+)/.exec(line);
		if (hdr != null) {
			ref_name = hdr[1];
			qry_name = hdr[2];
			ref_len = parseInt(hdr[3]);
			qry_len = parseInt(hdr[4]);
			in_body = true;
			continue;
		}
		if (!in_body) continue;

		var f = line.split(" ");
		if (f.length == 7) { // alignment header
			for (var j = 0; j < 5; ++j)
				f[j] = parseInt(f[j]);
			var same_dir = (f[0] < f[1] && f[2] < f[3]) || (f[0] > f[1] && f[2] > f[3]);
			strand = same_dir? 1 : -1;
			r_st = (f[0] < f[1]? f[0] : f[1]) - 1;
			r_en = f[1] > f[0]? f[1] : f[0];
			q_st = (f[2] < f[3]? f[2] : f[3]) - 1;
			q_en = f[3] > f[2]? f[3] : f[2];
			r_used = q_used = 0;
			n_diff = parseInt(f[4]);
			ops = [];
		} else if (f.length == 1) { // one delta value
			var d = parseInt(f[0]);
			if (d > 0) {
				r_used += d, q_used += d - 1;
				add_gap(ops, d - 1, 2); // deletion
			} else if (d == 0) {
				// whatever is left on both sequences must be one gap-free run
				var tail_len = r_en - r_st - r_used;
				if (tail_len != q_en - q_st - q_used) throw Error("inconsisnt alignment");
				ops.push(tail_len << 4);
				var aln_len = 0, cg = [];
				for (var k = 0; k < ops.length; ++k) {
					var op_len = ops[k] >> 4;
					aln_len += op_len;
					cg.push(op_len + OP_CHARS.charAt(ops[k] & 0xf));
				}
				var cols = [qry_name, qry_len, q_st, q_en, strand > 0? '+' : '-', ref_name, ref_len, r_st, r_en,
							aln_len - n_diff, aln_len, 0, "NM:i:" + n_diff, "cg:Z:" + cg.join("")];
				print(cols.join("\t"));
			} else {
				r_used += -d - 1, q_used += -d;
				add_gap(ops, -d - 1, 1); // insertion
			}
		}
	}
	file.close();
	buf.destroy();
}
1915
+
1916
// Convert spliced alignments in PAF (with a cg:Z tag) or SAM to BED12.
//
// Options: -f FMT output format (only "bed" produces output); -n FILE a
// two-column TSV used to rename queries; -m keep non-primary alignments.
// Consecutive records of the same query are buffered and printed together so
// that the colour can reflect whether the query has exactly one primary hit.
// Throws on a line that is neither PAF nor SAM and on a PAF line without a
// CIGAR.
function paf_splice2bed(args)
{
	// indexed by the status stored in column 8: 0 = the only primary
	// alignment, 1 = one of several primaries, 2 = non-primary
	var colors = ["0,128,255", "255,0,0", "0,192,0"];

	// Print and clear the buffered BED records of one query.
	function print_lines(a, fmt, keep_multi)
	{
		if (a.length == 0) return;
		if (fmt == "bed") {
			var n_pri = 0;
			for (var i = 0; i < a.length; ++i)
				if (a[i][8] == 0) ++n_pri;
			if (n_pri > 1) {
				for (var i = 0; i < a.length; ++i)
					if (a[i][8] == 0) a[i][8] = 1;
			} else if (n_pri == 0) {
				warn("Warning: " + a[0][3] + " doesn't have a primary alignment");
			}
			for (var i = 0; i < a.length; ++i) {
				if (!keep_multi && a[i][8] == 2) continue;
				a[i][8] = colors[a[i][8]];
				print(a[i].join("\t"));
			}
		}
		a.length = 0;
	}

	var re = /(\d+)([MIDNSHP=X])/g;
	var c, fmt = "bed", fn_name_conv = null, keep_multi = false;
	while ((c = getopt(args, "f:n:m")) != null) {
		if (c == 'f') fmt = getopt.arg;
		else if (c == 'n') fn_name_conv = getopt.arg;
		else if (c == 'm') keep_multi = true;
	}
	if (getopt.ind == args.length) {
		print("Usage: paftools.js splice2bed [options] <in.paf>|<in.sam>");
		print("Options:");
		print(" -m keep multiple mappings (SAM flag 0x100)");
		exit(1);
	}

	var conv = null;
	if (fn_name_conv != null) {
		conv = new Map();
		var file = new File(fn_name_conv);
		var buf = new Bytes();
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			conv.put(t[0], t[1]);
		}
		buf.destroy();
		file.close();
	}

	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var buf = new Bytes();
	var a = [];
	while (file.readline(buf) >= 0) {
		var line = buf.toString();
		if (line.charAt(0) == '@') continue; // skip SAM header lines
		var t = line.split("\t");
		var is_pri = false, cigar = null, a1;
		var qname = conv != null? conv.get(t[0]) : null;
		if (qname != null) t[0] = qname;
		if (t.length >= 10 && t[4] != '+' && t[4] != '-' && /^\d+/.test(t[1])) { // SAM
			var flag = parseInt(t[1]);
			if (flag&1) t[0] += '/' + (flag>>6&3); // paired: append the read number
		}
		if (a.length && a[0][3] != t[0]) { // query changed: flush the previous one
			print_lines(a, fmt, keep_multi);
			a = [];
		}
		if (t.length >= 12 && (t[4] == '+' || t[4] == '-')) { // PAF
			for (var i = 12; i < t.length; ++i) {
				if (t[i].substr(0, 5) == 'cg:Z:') {
					cigar = t[i].substr(5);
				} else if (t[i].substr(0, 5) == 's2:i:') {
					is_pri = true;
				}
			}
			a1 = [t[5], t[7], t[8], t[0], Math.floor(t[9]/t[10]*1000), t[4]];
		} else if (t.length >= 10) { // SAM
			var flag = parseInt(t[1]);
			// skip unmapped records; RNAME is t[2] of the current line
			if ((flag&4) || t[2] == '*') continue;
			cigar = t[5];
			is_pri = (flag&0x100)? false : true;
			a1 = [t[2], parseInt(t[3])-1, null, t[0], 1000, (flag&16)? '-' : '+'];
		} else {
			throw Error("unrecognized input format");
		}
		if (cigar == null) throw Error("missing CIGAR");
		// walk the CIGAR on the reference; every N closes the current block
		var m, x0 = 0, x = 0, bs = [], bl = [];
		while ((m = re.exec(cigar)) != null) {
			// '=' and 'X' consume the reference exactly like 'M'
			if (m[2] == 'M' || m[2] == 'D' || m[2] == '=' || m[2] == 'X') {
				x += parseInt(m[1]);
			} else if (m[2] == 'N') {
				bs.push(x0);
				bl.push(x - x0);
				x += parseInt(m[1]);
				x0 = x;
			}
		}
		bs.push(x0);
		bl.push(x - x0);
		// write the BED12 line
		if (a1[2] == null) a1[2] = a1[1] + x; // SAM has no end column
		a1.push(a1[1], a1[2]); // thick start/end is the same as start/end
		a1.push(is_pri? 0 : 2, bs.length, bl.join(",")+",", bs.join(",")+",");
		a.push(a1);
	}
	print_lines(a, fmt, keep_multi);
	buf.destroy();
	file.close();
	if (conv != null) conv.destroy();
}
2030
+
2031
+ /**********************
2032
+ * Evaluation related *
2033
+ **********************/
2034
+
2035
+ // evaluate mapping accuracy
2036
// Evaluate mapping accuracy of simulated reads from PAF or SAM.
//
// The true position is parsed from the read name (pbsim single-end or mason2
// paired-end naming as produced by the converters in this file). Options:
//   -r FLOAT  overlap_length/union_length threshold for a correct mapping
//   -Q INT    print wrong mappings with mapQ>=INT as 'E' lines
//   -m INT    0: longest alignment only; 1: first only; 2: all primary
//   -c        in mode 2, cap each mapQ at the mapQ of the highest-scoring
//             alignment of the read
// Output: 'Q' lines (mapQ threshold, reads and errors in the bin, cumulative
// error rate, cumulative reads) and, for SAM, a 'U' line with the number of
// unmapped records. Throws when a read name cannot be parsed.
function paf_mapeval(args)
{
	var c, max_mapq = 60, mode = 0, err_out_q = 256, print_err = false, ovlp_ratio = 0.1, cap_short_mapq = false;
	while ((c = getopt(args, "Q:r:m:c")) != null) {
		if (c == 'Q') err_out_q = parseInt(getopt.arg), print_err = true;
		else if (c == 'r') ovlp_ratio = parseFloat(getopt.arg);
		else if (c == 'm') mode = parseInt(getopt.arg);
		else if (c == 'c') cap_short_mapq = true;
	}

	if (args.length == getopt.ind) {
		warn("Usage: paftools.js mapeval [options] <in.paf>|<in.sam>");
		warn("Options:");
		warn(" -r FLOAT mapping correct if overlap_length/union_length>FLOAT [" + ovlp_ratio + "]");
		warn(" -Q INT print wrong mappings with mapQ>INT [don't print]");
		warn(" -m INT 0: eval the longest aln only; 1: first aln only; 2: all primary aln [0]");
		exit(1);
	}

	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var buf = new Bytes();

	// per-mapQ counters of evaluated reads and of wrong reads
	var tot = [], err = [];
	for (var q = 0; q <= max_mapq; ++q)
		tot[q] = err[q] = 0;

	// s: truth [ctg, start, end, strand]; b: alignment with the same leading
	// four fields. Correct if same contig and strand and overlap/union exceeds
	// ovlp_ratio.
	function is_correct(s, b)
	{
		if (s[0] != b[0] || s[3] != b[3]) return false;
		var o, l;
		if (s[1] < b[1]) {
			if (s[2] <= b[1]) return false;
			o = (s[2] < b[2]? s[2] : b[2]) - b[1];
			l = (s[2] > b[2]? s[2] : b[2]) - s[1];
		} else {
			if (b[2] <= s[1]) return false;
			o = (s[2] < b[2]? s[2] : b[2]) - s[1];
			l = (s[2] > b[2]? s[2] : b[2]) - b[1];
		}
		return o/l > ovlp_ratio? true : false;
	}

	// Score all alignments a[] of one read against the truth in its name.
	function count_err(qname, a, tot, err, mode)
	{
		if (a.length == 0) return;

		var m, s;
		if ((m = /^(\S+)!(\S+)!(\d+)!(\d+)!([\+\-])$/.exec(qname)) != null) { // pbsim single-end reads
			s = [m[1], m[2], parseInt(m[3]), parseInt(m[4]), m[5]];
		} else if ((m = /^(\S+)!(\S+)!(\d+)_(\d+)!(\d+)_(\d+)!([\+\-])([\+\-])\/([12])$/.exec(qname)) != null) { // mason2 paired-end reads
			if (m[9] == '1') {
				s = [m[1], m[2], parseInt(m[3]), parseInt(m[5]), m[7]];
			} else {
				s = [m[1], m[2], parseInt(m[4]), parseInt(m[6]), m[8]];
			}
		} else throw Error("Failed to parse simulated read names '" + qname + "'");
		s.shift(); // skip the original read name

		if (mode == 0 || mode == 1) { // longest only or first only
			var max_i = 0;
			if (mode == 0) { // longest only
				var max = 0;
				for (var i = 0; i < a.length; ++i)
					if (a[i][5] > max)
						max = a[i][5], max_i = i;
			}
			var mapq = a[max_i][4];
			++tot[mapq];
			if (!is_correct(s, a[max_i])) {
				if (mapq >= err_out_q)
					print('E', qname, a[max_i].join("\t"));
				++err[mapq];
			}
		} else if (mode == 2) { // all primary mode
			// a read is wrong if any of its alignments is wrong; it is then
			// binned at the highest mapQ among the wrong alignments
			var max_err_mapq = -1, best_mapq = 0, max_err_i = -1;
			if (cap_short_mapq) {
				var max = 0, max_q = 0;
				for (var i = 0; i < a.length; ++i)
					if (a[i][5] > max)
						max = a[i][5], max_q = a[i][4];
				for (var i = 0; i < a.length; ++i)
					a[i][4] = max_q < a[i][4]? max_q : a[i][4];
			}
			for (var i = 0; i < a.length; ++i) {
				best_mapq = best_mapq > a[i][4]? best_mapq : a[i][4];
				if (!is_correct(s, a[i]))
					if (a[i][4] > max_err_mapq)
						max_err_mapq = a[i][4], max_err_i = i;
			}
			if (max_err_mapq >= 0) {
				++tot[max_err_mapq], ++err[max_err_mapq];
				if (max_err_mapq >= err_out_q)
					print('E', qname, a[max_err_i].join("\t"));
			} else ++tot[best_mapq];
		}
	}

	var lineno = 0, last = null, a = [], n_unmapped = null;
	// '=' and 'X' are accepted so that extended CIGARs are measured like 'M'
	var re_cigar = /(\d+)([MIDSHN=X])/g;
	while (file.readline(buf) >= 0) {
		var m, line = buf.toString();
		++lineno;
		if (line[0] != '@') {
			var t = line.split("\t");
			if (t[4] == '+' || t[4] == '-') { // PAF
				if (last != t[0]) {
					if (last != null) count_err(last, a, tot, err, mode);
					a = [], last = t[0];
				}
				if (/\ts1:i:\d+/.test(line) && !/\ts2:i:\d+/.test(line)) // secondary alignment in minimap2 PAF
					continue;
				var mapq = parseInt(t[11]);
				if (mapq > max_mapq) mapq = max_mapq;
				a.push([t[5], parseInt(t[7]), parseInt(t[8]), t[4], mapq, parseInt(t[9])]);
			} else { // SAM
				var flag = parseInt(t[1]);
				var read_no = flag>>6&0x3;
				var qname = t[0];
				if (!/\/[12]$/.test(qname))
					qname = read_no == 1 || read_no == 2? t[0] + '/' + read_no : t[0];
				if (last != qname) {
					if (last != null) count_err(last, a, tot, err, mode);
					a = [], last = qname;
				}
				if (flag&0x100) continue; // secondary alignment
				if ((flag&0x4) || t[2] == '*') { // unmapped
					if (n_unmapped == null) n_unmapped = 0;
					++n_unmapped;
					continue;
				}
				var mapq = parseInt(t[4]);
				if (mapq > max_mapq) mapq = max_mapq;
				var pos = parseInt(t[3]) - 1, pos_end = pos;
				var n_gap = 0, mlen = 0;
				while ((m = re_cigar.exec(t[5])) != null) {
					var len = parseInt(m[1]);
					if (m[2] == 'M' || m[2] == '=' || m[2] == 'X') pos_end += len, mlen += len;
					else if (m[2] == 'I') n_gap += len;
					else if (m[2] == 'D') n_gap += len, pos_end += len;
				}
				// score: reference span, or matching bases when NM allows it
				var score = pos_end - pos;
				if ((m = /\tNM:i:(\d+)/.exec(line)) != null) {
					var NM = parseInt(m[1]);
					if (NM >= n_gap) score = mlen - (NM - n_gap);
				}
				a.push([t[2], pos, pos_end, (flag&16)? '-' : '+', mapq, score]);
			}
		}
	}
	if (last != null) count_err(last, a, tot, err, mode);

	buf.destroy();
	file.close();

	// from high to low mapQ; a new output row starts at each mapQ with errors
	var sum_tot = 0, sum_err = 0, q_out = -1, sum_tot2 = 0, sum_err2 = 0;
	for (var q = max_mapq; q >= 0; --q) {
		if (tot[q] == 0) continue;
		if (q_out < 0 || err[q] > 0) {
			if (q_out >= 0) print('Q', q_out, sum_tot, sum_err, (sum_err2/sum_tot2).toFixed(9), sum_tot2);
			sum_tot = sum_err = 0, q_out = q;
		}
		sum_tot += tot[q], sum_err += err[q];
		sum_tot2 += tot[q], sum_err2 += err[q];
	}
	print('Q', q_out, sum_tot, sum_err, (sum_err2/sum_tot2).toFixed(9), sum_tot2);
	if (n_unmapped != null) print('U', n_unmapped);
}
2203
+
2204
+ // convert mason2 SAM to FASTQ
2205
// Convert a mason2 SAM file into FASTQ whose read names encode the true
// position: single reads as name!chr!start!end!strand, pairs as
// name!chr!st1_st2!en1_en2!strands followed by /1 or /2. The comment after the
// name holds the XE:XS:XI tag values. Two consecutive records with the same
// name are treated as a pair. Reverse-strand records are
// reverse-complemented so that the read is written in its original
// orientation. Throws if a pair does not consist of one read1 and one read2.
function paf_mason2fq(args)
{
	if (args.length == 0) {
		print("Usage: paftools.js mason2fq <mason.sam>");
		exit(1);
	}

	// write one four-line FASTQ record
	function print_fq(title, seq, qual)
	{
		print(title);
		print(seq);
		print("+");
		print(qual);
	}

	// write a record that has no mate
	function print_se(a)
	{
		print_fq('@' + a.slice(0, 5).join("!") + " " + a[8], a[5], a[6]);
	}

	var buf = new Bytes(), buf2 = new Bytes();
	var file = new File(args[0]);
	var re = /(\d+)([MIDSHN])/g;
	var last = null; // pending record: [name, chr, start, end, strand, seq, qual, read_no, comment]
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (t[0].charAt(0) == '@') continue;

		// reference span of the alignment
		var m, l_ref = 0;
		while ((m = re.exec(t[5])) != null) {
			if (m[2] == 'D' || m[2] == 'M' || m[2] == 'N')
				l_ref += parseInt(m[1]);
		}

		var flag = parseInt(t[1]);
		var rev = !!(flag&16);
		var seq = t[9], qual = t[10];
		if (rev) {
			buf2.length = 0;
			buf2.set(t[9], 0);
			buf2.revcomp();
			seq = buf2.toString();
			buf2.set(t[10], 0);
			buf2.reverse();
			qual = buf2.toString();
		}

		var qname = t[0].replace(/^simulated./, "");
		var chr = t[2];
		var pos = parseInt(t[3]) - 1;
		var end = pos + l_ref;
		var strand = (flag&16)? '-' : '+';

		var read_no;
		switch (flag&0xc0) {
			case 0x40: read_no = 1; break;
			case 0x80: read_no = 2; break;
			default: read_no = 0;
		}

		var err = 0, snp = 0, indel = 0;
		for (var i = 11; i < t.length; ++i) {
			if ((m = /^XE:i:(\d+)/.exec(t[i])) != null) err = m[1];
			else if ((m = /^XS:i:(\d+)/.exec(t[i])) != null) snp = m[1];
			else if ((m = /^XI:i:(\d+)/.exec(t[i])) != null) indel = m[1];
		}
		var comment = [err, snp, indel].join(":");

		var is_mate = last != null && last[0] == qname;
		if (!is_mate) {
			if (last != null) print_se(last); // the pending read had no mate
			last = [qname, chr, pos, end, strand, seq, qual, read_no, comment];
			continue;
		}

		var name;
		if (read_no == 2) { // last[] is the first read
			if (last[7] != 1) throw Error("ERROR: can't find read1");
			name = [qname, chr, last[2] + "_" + pos, last[3] + "_" + end, last[4] + strand].join("!");
			print_fq('@' + name + '/1' + ' ' + last[8], last[5], last[6]);
			print_fq('@' + name + '/2' + ' ' + comment, seq, qual);
		} else { // last[] is the second read
			if (last[7] != 2) throw Error("ERROR: can't find read2");
			name = [qname, chr, pos + "_" + last[2], end + "_" + last[3], strand + last[4]].join("!");
			print_fq('@' + name + '/1' + ' ' + comment, seq, qual);
			print_fq('@' + name + '/2' + ' ' + last[8], last[5], last[6]);
		}
		last = null;
	}
	if (last != null) print_se(last);
	file.close();
	buf.destroy();
	buf2.destroy();
}
2284
+
2285
+ // convert pbsim MAF to FASTA (records are printed as ">name" plus sequence, without qualities)
2286
// Convert pbsim MAF files into reads named name!chr!start!end!strand.
// Records are printed as ">name" plus sequence (FASTA; no quality line).
//
// args[0] is a .fai-style file whose first column lists contig names; the
// number N in a read name "S<N>_..." selects the N-th name of that list. In
// each MAF block ('a' line followed by two 's' lines) the first 's' line
// supplies the reference interval and the second the read. Reads containing
// "NN" are dropped; '-' strand reads are reverse-complemented.
// Throws on an unparsable read name, an out-of-range contig index or a read
// whose ungapped length differs from the length column.
function paf_pbsim2fq(args)
{
	if (args.length < 2) {
		print("Usage: paftools.js pbsim2fq <ref.fa.fai> <pbsim1.maf> [[pbsim2.maf] ...]");
		exit(1);
	}

	var file, buf = new Bytes(), buf2 = new Bytes();

	// contig names in index order
	var chr_list = [];
	file = new File(args[0]);
	while (file.readline(buf) >= 0)
		chr_list.push(buf.toString().split(/\s+/)[0]);
	file.close();

	for (var k = 1; k < args.length; ++k) {
		file = new File(args[k]);
		// state 0: waiting for 'a'; 1: waiting for the reference 's' line;
		// 2: waiting for the read 's' line
		var state = 0, reg;
		while (file.readline(buf) >= 0) {
			var line = buf.toString();
			var tag = line.charAt(0);
			if (state == 0) {
				if (tag == 'a') state = 1;
				continue;
			}
			if (tag != 's') continue;

			var f = line.split(/\s+/);
			if (state == 1) { // reference line: start and size
				var ref_st = parseInt(f[2]);
				reg = [ref_st, ref_st + parseInt(f[3])];
				state = 2;
				continue;
			}

			// state == 2: the read line
			var hit = /S(\d+)_\d+/.exec(f[1]);
			if (hit == null) throw Error("Failed to parse the read name");
			var chr_id = parseInt(hit[1]) - 1;
			if (chr_id >= chr_list.length) throw Error("Index outside the chr list");
			var name = [f[1], chr_list[chr_id], reg[0], reg[1], f[4]].join("!");
			var seq = f[6].replace(/\-/g, ""); // strip alignment gaps
			if (seq.length != parseInt(f[5])) throw Error("Inconsistent read length");
			if (seq.indexOf("NN") < 0) {
				if (f[4] == '-') {
					buf2.set(seq, 0);
					buf2.length = seq.length;
					buf2.revcomp();
					seq = buf2.toString();
				}
				print(">" + name);
				print(seq);
			}
			state = 0;
		}
		file.close();
	}
	buf.destroy();
	buf2.destroy();
}
2341
+
2342
// Compare introns implied by spliced alignments (PAF with cg:Z, or SAM)
// against the introns of a GTF annotation.
//
// Options: -l INT tolerance on both intron ends; -p print every intron that
// overlaps an annotated one ('C' when matching within the tolerance, 'P'
// otherwise) and every non-overlapping one ('N'); -e like -p but without the
// 'C' lines; -c only consider contigs named like (chr)?N, X or Y.
// Without -p/-e a summary is printed. With a single file argument the
// alignments are read from the default File().
function paf_junceval(args)
{
	var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false;
	while ((c = getopt(args, "l:epc")) != null) {
		if (c == 'l') l_fuzzy = parseInt(getopt.arg);
		else if (c == 'e') print_err_only = print_ovlp = true;
		else if (c == 'p') print_ovlp = true;
		else if (c == 'c') chr_only = true;
	}

	if (args.length - getopt.ind < 1) {
		print("Usage: paftools.js junceval [options] <gene.gtf> <aln.sam>");
		print("Options:");
		print(" -l INT tolerance of junction positions (0 for exact) [0]");
		print(" -p print overlapping introns");
		print(" -e print erroreous overlapping introns");
		print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/");
		exit(1);
	}

	var file, buf = new Bytes();

	// step 1: collect the exons of every transcript
	var tr = {};
	file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		if (t[0].charAt(0) == '#' || t[2] != 'exon') continue;
		var exon = [parseInt(t[3]) - 1, parseInt(t[4])];
		m = /transcript_id "(\S+)"/.exec(t[8]);
		if (m == null) continue;
		if (tr[m[1]] == null) tr[m[1]] = [t[0], t[6], 0, 0, []];
		tr[m[1]][4].push(exon);
	}
	file.close();

	// step 2: turn gaps between consecutive exons into per-contig intron lists
	var anno = {};
	for (var tid in tr) {
		var rec = tr[tid], s = rec[4];
		Interval.sort(s);
		rec[2] = s[0][0];
		rec[3] = s[s.length - 1][1];
		if (anno[rec[0]] == null) anno[rec[0]] = [];
		for (var i = 0; i + 1 < s.length; ++i) {
			if (s[i][1] >= s[i+1][0])
				warn("WARNING: incorrect annotation for transcript "+tid+" ("+s[i][1]+" >= "+s[i+1][0]+")");
			anno[rec[0]].push([s[i][1], s[i+1][0]]);
		}
	}
	tr = null;

	// step 3: sort, drop duplicated introns and index each contig
	for (var chr in anno) {
		var e = anno[chr];
		if (e.length == 0) continue;
		Interval.sort(e);
		var k = 0;
		for (var i = 1; i < e.length; ++i) {
			var is_dup = e[i][0] == e[k][0] && e[i][1] == e[k][1];
			if (!is_dup) e[++k] = e[i].slice(0);
		}
		e.length = k + 1;
		Interval.index_end(e);
	}

	var n_pri = 0, n_unmapped = 0, n_mapped = 0;
	var n_sgl = 0, n_splice = 0, n_splice_hit = 0, n_splice_novel = 0;

	// step 4: scan the alignments
	var aln_idx = getopt.ind + 1;
	file = aln_idx >= args.length || args[aln_idx] == '-'? new File() : new File(args[aln_idx]);
	var last_qname = null;
	var re_cigar = /(\d+)([MIDNSHP=X])/g;
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		var ctg_name = null, cigar = null, pos = null, qname = t[0];

		if (t[0].charAt(0) == '@') continue;
		if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
			ctg_name = t[5];
			pos = parseInt(t[7]);
			var aln_type = 'P';
			for (var i = 12; i < t.length; ++i) {
				m = /^(tp:A|cg:Z):(\S+)/.exec(t[i]);
				if (m == null) continue;
				if (m[1] == 'tp:A') aln_type = m[2];
				else cigar = m[2];
			}
			if (aln_type == 'S') continue; // secondary
		} else { // SAM
			ctg_name = t[2];
			pos = parseInt(t[3]) - 1;
			cigar = t[5];
			if (parseInt(t[1])&0x100) continue; // secondary
		}

		if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue;
		if (first_only && last_qname == qname) continue;
		if (ctg_name == '*') { // unmapped
			++n_unmapped;
			continue;
		}
		++n_pri;
		if (last_qname != qname) {
			++n_mapped;
			last_qname = qname;
		}

		// every N operation is one predicted intron on the reference
		var intron = [];
		while ((m = re_cigar.exec(cigar)) != null) {
			var len = parseInt(m[1]), op = m[2];
			if (op == 'N') intron.push([pos, pos + len]);
			if (op == 'N' || op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
		}
		if (intron.length == 0) {
			++n_sgl;
			continue;
		}
		n_splice += intron.length;

		var known = anno[ctg_name];
		if (known == null) { // no annotation on this contig at all
			n_splice_novel += intron.length;
			continue;
		}
		for (var i = 0; i < intron.length; ++i) {
			var st = intron[i][0], en = intron[i][1];
			var o = Interval.find_ovlp(known, st, en);
			if (o.length == 0) {
				++n_splice_novel;
				if (print_ovlp)
					print('N', qname, i+1, ctg_name, st, en);
				continue;
			}
			var hit = false;
			for (var j = 0; j < o.length; ++j) {
				if (Math.abs(st - o[j][0]) <= l_fuzzy && Math.abs(en - o[j][1]) <= l_fuzzy) {
					++n_splice_hit;
					hit = true;
					break;
				}
			}
			if (print_ovlp && !(hit && print_err_only)) {
				var shown = [];
				for (var j = 0; j < o.length; ++j)
					shown.push('(' + o[j][0] + "," + o[j][1] + ')');
				print(hit? 'C' : 'P', qname, i+1, ctg_name, st, en, '[' + shown.join(', ') + ']');
			}
		}
	}
	file.close();

	buf.destroy();

	if (!print_ovlp) {
		print("# unmapped reads: " + n_unmapped);
		print("# mapped reads: " + n_mapped);
		print("# primary alignments: " + n_pri);
		print("# singletons: " + n_sgl);
		print("# predicted introns: " + n_splice);
		print("# non-overlapping introns: " + n_splice_novel);
		print("# correct introns: " + n_splice_hit + " (" + (n_splice_hit / n_splice * 100).toFixed(2) + "%)");
	}
}
2511
+
2512
// Evaluate read-overlap sensitivity. Overlaps implied by a read-to-reference
// PAF (first input, "-" for stdin) form the expected set; every pair that also
// shows up in the overlapper's PAF (second input) is marked as found.
//
// args: option/argument array consumed through getopt().
// Prints three summary lines. With fewer than two positional arguments it
// prints the usage text and calls exit(1).
// NOTE(review): the "still overlapping" pruning below only inspects the head of
// the list, so the first input is presumably expected to be sorted by contig
// and start, as the usage line suggests.
function paf_ov_eval(args)
{
	var opt_ch, min_mapq = 10, min_ovlp = 2000, min_frac = 0.95;
	while ((opt_ch = getopt(args, "q:l:f:")) != null) {
		switch (opt_ch) {
		case 'q': min_mapq = parseInt(getopt.arg); break;
		case 'l': min_ovlp = parseInt(getopt.arg); break;
		case 'f': min_frac = parseFloat(getopt.arg); break;
		}
	}
	if (args.length - getopt.ind < 2) {
		var usage = [
			"Usage: sort -k6,6 -k8,8n to-ref.paf | paftools.js ov-eval [options] - <ovlp.paf>",
			"Options:",
			" -l INT min overlap length [2000]",
			" -q INT min mapping quality [10]",
			" -f FLOAT min fraction of mapped length [0.95]"
		];
		for (var u = 0; u < usage.length; ++u)
			print(usage[u]);
		exit(1);
	}

	// canonical key for an unordered pair of read names
	function pair_key(x, y) {
		return x < y? x + "\t" + y : y + "\t" + x;
	}

	var buf = new Bytes();
	var ref_fn = args[getopt.ind];
	var file = ref_fn == '-'? new File() : new File(ref_fn);
	var active = [];   // [contig, start, end, read name] of recently seen mappings
	var expected = {}; // pair key -> overlap length; negated once the pair is found
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (parseInt(t[11]) < min_mapq) continue;
		var is_pri = false;
		for (var i = 12; i < t.length && !is_pri; ++i)
			is_pri = (t[i] == 'tp:A:P');
		if (!is_pri) continue;
		var qname = t[0], ctg = t[5];
		var qlen = parseInt(t[1]), qst = parseInt(t[2]), qen = parseInt(t[3]);
		var st = parseInt(t[7]), en = parseInt(t[8]);
		if (qen - qst < min_ovlp || en - st < min_ovlp || (qen - qst) / qlen < min_frac)
			continue;
		// discard leading entries until the head is on this contig and ends after st
		while (active.length > 0 && !(active[0][0] == ctg && active[0][2] > st))
			active.shift();
		for (var j = 0; j < active.length; ++j) {
			var prev = active[j];
			if (prev[3] == qname) continue;
			var len = (en > prev[2]? prev[2] : en) - st;
			if (len >= min_ovlp)
				expected[pair_key(prev[3], qname)] = len;
		}
		active.push([ctg, st, en, qname]);
	}
	file.close();

	// flip the sign of every expected pair that the overlapper reported
	file = new File(args[getopt.ind + 1]);
	while (file.readline(buf) >= 0) {
		var f = buf.toString().split("\t");
		var k = pair_key(f[0], f[5]);
		if (expected[k] > 0) expected[k] = -expected[k];
	}
	file.close();
	buf.destroy();

	var n_ovlp = 0, n_missing = 0;
	for (var p in expected) {
		n_ovlp += 1;
		if (expected[p] > 0) n_missing += 1; // still positive: never seen in the second file
	}
	print(n_ovlp + " overlaps inferred from the reference mapping");
	print(n_missing + " missed by the read overlapper");
	print((100 * (1 - n_missing / n_ovlp)).toFixed(2) + "% sensitivity");
}
2583
+
2584
// Print substitution and indel statistics for a VCF.
//
// args: option/argument array consumed through getopt(); the first positional
//       argument is the VCF path, and when none is given File() is opened
//       without a name (presumably stdin -- confirm against the File API).
//
// Only ALT alleles spelled out as plain base strings are counted. Missing
// ('.'), overlapping-deletion ('*'), symbolic ('<DEL>') and breakend alleles
// carry no sequence to compare with REF and are skipped. Blank or truncated
// lines (fewer than five columns) are skipped instead of raising. Bases are
// compared case-insensitively.
function paf_vcfstat(args)
{
	var c, ts = { "AG":1, "GA":1, "CT":1, "TC":1 };
	while ((c = getopt(args, "")) != null) {
	}

	// map an indel length (>0) to the suffix of its size-class counter in x
	function size_class(len) {
		if (len == 1) return '1';
		if (len == 2) return '2';
		if (len < 50) return '50';
		if (len < 1000) return '1k';
		if (len < 7000) return '7k';
		return 'inf';
	}

	var buf = new Bytes();
	var file = args.length == getopt.ind? new File() : new File(args[getopt.ind]);
	var x = { sub:0, ts:0, tv:0, ins:0, del:0, ins1:0, del1:0, ins2:0, del2:0, ins50:0, del50:0, ins1k:0, del1k:0, ins7k:0, del7k:0, insinf:0, delinf:0 };
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (t.length < 5) continue; // blank, truncated or "##" meta line: no REF/ALT to read
		if (t[0][0] == '#') continue;
		var ref = t[3].toUpperCase();
		var alt = t[4].toUpperCase().split(",");
		for (var i = 0; i < alt.length; ++i) {
			var a = alt[i];
			if (!/^[A-Z]+$/.test(a)) continue; // '.', '*', <SYMBOLIC> or breakend
			// position-wise comparison over the shared prefix
			var l = ref.length < a.length? ref.length : a.length;
			for (var j = 0; j < l; ++j) {
				if (ref[j] != a[j]) {
					++x.sub;
					if (ts[ref[j] + a[j]]) ++x.ts;
					else ++x.tv;
				}
			}
			// the length difference decides insertion vs. deletion
			var d = a.length - ref.length;
			if (d > 0) {
				++x.ins;
				++x['ins' + size_class(d)];
			} else if (d < 0) {
				++x.del;
				++x['del' + size_class(-d)];
			}
		}
	}
	file.close();
	buf.destroy();
	print("# substitutions: " + x.sub);
	print("ts/tv: " + (x.ts / x.tv).toFixed(3));
	print("# insertions: " + x.ins);
	print("# 1bp insertions: " + x.ins1);
	print("# 2bp insertions: " + x.ins2);
	print("# [3,50) insertions: " + x.ins50);
	print("# [50,1000) insertions: " + x.ins1k);
	print("# [1000,7000) insertions: " + x.ins7k);
	print("# >=7000 insertions: " + x.insinf);
	print("# deletions: " + x.del);
	print("# 1bp deletions: " + x.del1);
	print("# 2bp deletions: " + x.del2);
	print("# [3,50) deletions: " + x.del50);
	print("# [50,1000) deletions: " + x.del1k);
	print("# [1000,7000) deletions: " + x.del7k);
	print("# >=7000 deletions: " + x.delinf);
}
2648
+
2649
// Parse a non-negative decimal number with an optional k/m/g suffix (either
// case), e.g. "500", "2.5k", "1m", "3G", and return it as an integer.
//
// s: text to parse; anything after the number and its suffix is ignored, so
//    "10kb" parses as 10000.
// Returns the scaled value passed through Math.floor(x + .499), i.e. rounded
// to an integer with fractional parts up to about one half dropped.
// Throws Error when s does not start with a number. The previous pattern also
// matched the empty string, so such input silently became NaN and every
// threshold comparison using it evaluated to false.
function paf_parseNum(s) {
	var scale = { '': 1, 'k': 1000, 'm': 1000000, 'g': 1000000000 };
	var m = /^(\d+\.?\d*|\.\d+)([mMgGkK]?)/.exec(s);
	if (m == null)
		throw Error("invalid number: " + s);
	var x = parseFloat(m[1]) * scale[m[2].toLowerCase()];
	return Math.floor(x + .499);
}
2659
+
2660
// Count candidate large-scale misjoins in a PAF: for each query, long alignment
// blocks are ordered by query start and every adjacent pair is classified as
// an inter-chromosomal join, an oversized gap, a bracketed inversion or a
// hanging inversion.
//
// args: option/argument array consumed through getopt(); the positional
//       argument is the PAF path ("-" opens File() without a name).
// Prints four summary lines of the form "total,centromeric"; with -e / -p it
// additionally prints the offending / the long alignment rows. Prints usage and
// returns when no positional argument is given.
// NOTE(review): rows are grouped by consecutive query name, so the input is
// presumably expected to keep each query's records together -- confirm.
function paf_misjoin(args)
{
	var c, fn_cen = null, cen_ratio = 0.5, min_seg_len = 1000000, max_gap = 1000000;
	var show_long = false, show_err = false;
	// each counter is [all events, events where a flanking block is centromeric]
	var n_diff = [0, 0], n_gap = [0, 0], n_inv = [0, 0], n_inv_end = [0, 0];
	while ((c = getopt(args, "l:g:c:per:")) != null) {
		switch (c) {
		case 'l': min_seg_len = paf_parseNum(getopt.arg); break;
		case 'g': max_gap = paf_parseNum(getopt.arg); break;
		case 'c': fn_cen = getopt.arg; break;
		case 'r': cen_ratio = parseFloat(getopt.arg); break;
		case 'p': show_long = true; break;
		case 'e': show_err = true; break;
		}
	}
	if (args.length == getopt.ind) {
		var usage = [
			"Usage: paftools.js misjoin [options] <in.paf>",
			"Options:",
			" -c FILE BED for centromeres []",
			" -r FLOAT count a centromeric event if overlap ratio > FLOAT [" + cen_ratio + "]",
			" -l NUM min alignment block length [1m]",
			" -g NUM max gap size [1m]",
			" -e output misjoins not involving centromeres",
			" -p output long alignment blocks for debugging"
		];
		for (var u = 0; u < usage.length; ++u)
			print(usage[u]);
		return;
	}

	// centromere intervals per contig, read from the optional BED
	var cen = {};
	var file, buf = new Bytes();
	if (fn_cen != null) {
		file = new File(fn_cen);
		while (file.readline(buf) >= 0) {
			var f = buf.toString().split("\t");
			if (cen[f[0]] == null) cen[f[0]] = [];
			cen[f[0]].push([parseInt(f[1]), parseInt(f[2])]);
		}
		file.close();
	}

	// true unless the centromeric part of [st,en) on chr is below cen_ratio of its length
	function test_cen(regions, chr, st, en) {
		var list = regions[chr];
		if (list == null) return false;
		var covered = 0;
		for (var j = 0; j < list.length; ++j) {
			var r = list[j];
			if (r[0] < en && r[1] > st)
				covered += Math.min(r[1], en) - Math.max(r[0], st);
		}
		return !(covered < (en - st) * cen_ratio);
	}

	// bump one event counter; with -e, print the rows of a non-centromeric event
	function tally(counter, in_cen, tag, rows) {
		if (in_cen) {
			++counter[1];
		} else if (show_err && tag != null) {
			for (var k = 0; k < rows.length; ++k)
				print(tag, rows[k].slice(0, 12).join("\t"));
		}
		++counter[0];
	}

	// classify adjacent long blocks of one query
	function process(rows) {
		var segs = [];
		for (var i = 0; i < rows.length; ++i) {
			var row = rows[i];
			for (var j = 1; j <= 11; ++j)
				if (j <= 3 || j >= 6) row[j] = parseInt(row[j]);
			if (row[10] >= min_seg_len) segs.push(row);
		}
		if (segs.length == 1) return;
		segs.sort(function(x, y) { return x[2] - y[2]; });
		if (show_long)
			for (var i = 0; i < segs.length; ++i)
				print(segs[i].join("\t"));
		for (var i = 1; i < segs.length; ++i) {
			var prev = segs[i-1], cur = segs[i];
			var in_cen = test_cen(cen, prev[5], prev[7], prev[8]) || test_cen(cen, cur[5], cur[7], cur[8]);
			if (prev[5] != cur[5]) { // different chr
				tally(n_diff, in_cen, "J", [prev, cur]);
			} else if (prev[4] == cur[4]) { // same strand: a gap
				var dq = cur[2] - prev[3];
				var dr = cur[4] == '+'? cur[7] - prev[8] : prev[7] - cur[8];
				var gap = dr > dq? dr - dq : dq - dr;
				if (gap > max_gap)
					tally(n_gap, in_cen, "G", [prev, cur]);
			} else if (i + 1 < segs.length && segs[i+1][4] == prev[4]) { // bracketed inversion
				tally(n_inv, in_cen, "M", [prev, cur, segs[i+1]]);
				++i; // the closing block has been consumed as part of this event
			} else { // hanging inversion: counted, never printed
				tally(n_inv_end, in_cen, null, null);
			}
		}
	}

	file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	var group = [];
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (group.length > 0 && group[0][0] != t[0]) {
			process(group);
			group.length = 0;
		}
		group.push(t);
	}
	if (group.length > 0) process(group);
	file.close();
	buf.destroy();
	print("# inter-chromosomal misjoins: " + n_diff.join(","));
	print("# intra-chromosomal gaps: " + n_gap.join(","));
	print("# candidate inversions in the middle: " + n_inv.join(","));
	print("# candidate inversions at contig ends: " + n_inv_end.join(","));
}
2775
+
2776
+ function _paf_get_alen(t)
2777
+ {
2778
+ var svlen = null, alen = null;
2779
+ if ((m = /(^|;)SVLEN=(-?\d+)/.exec(t[7])) != null)
2780
+ svlen = parseInt(m[2]);
2781
+ var s = t[4].split(",");
2782
+ var min_abs_diff = 1<<30, max_abs_diff = 0;
2783
+ if (svlen != null && svlen != 0)
2784
+ alen = svlen, min_abs_diff = max_abs_diff = svlen > 0? svlen : -svlen;
2785
+ var rlen = t[3].length;
2786
+ for (var i = 0; i < s.length; ++i) {
2787
+ if (/^<\S+>$/.test(s[i])) continue;
2788
+ var diff = s[i].length - rlen;
2789
+ var abs_diff = diff > 0? diff : -diff;
2790
+ min_abs_diff = min_abs_diff < abs_diff? min_abs_diff : abs_diff;
2791
+ if (max_abs_diff < abs_diff)
2792
+ max_abs_diff = abs_diff, alen = diff;
2793
+ }
2794
+ return [alen, min_abs_diff, max_abs_diff];
2795
+ }
2796
+
2797
// Compare two SV call sets in VCF and print sensitivity (SN), precision (PC)
// and their F1 score.
//
// args: option/argument array consumed through getopt(); the two positional
//       arguments are the baseline VCF and the call VCF.
// Output: "SN total matched ratio", "PC total matched ratio", "F1 value";
// with -e the unmatched records, with -p every candidate pairing ("MA").
// Prints usage and returns when fewer than two positional arguments are given.
//
// Changes from the previous revision:
//  - "b" in compare_vcf() is now a declared local instead of an implicit global
//  - F1 is the harmonic mean 2*SN*PC/(SN+PC); it used to be (SN+PC)/2
//  - usage lists -p and spells "window" correctly
//  - lines with fewer than five columns are skipped instead of raising
function paf_sveval(args)
{
	var c, min_flt = 30, min_size = 50, max_size = 100000, win_size = 500, print_err = false, print_match = false, bed_fn = null;
	var len_diff_ratio = 0.5;
	while ((c = getopt(args, "f:i:x:w:er:pd:")) != null) {
		if (c == 'f') min_flt = paf_parseNum(getopt.arg);
		else if (c == 'i') min_size = paf_parseNum(getopt.arg);
		else if (c == 'x') max_size = paf_parseNum(getopt.arg);
		else if (c == 'w') win_size = paf_parseNum(getopt.arg);
		else if (c == 'd') len_diff_ratio = parseFloat(getopt.arg);
		else if (c == 'r') bed_fn = getopt.arg;
		else if (c == 'e') print_err = true;
		else if (c == 'p') print_match = true;
	}
	if (args.length - getopt.ind < 2) {
		print("Usage: paftools.js sveval [options] <base.vcf> <call.vcf>");
		print("Options:");
		print(" -r FILE confident region in BED []");
		print(" -f INT min length to discard [" + min_flt + "]");
		print(" -i INT min SV length [" + min_size + "]");
		print(" -x INT max SV length [" + max_size + "]");
		print(" -w INT fuzzy window size [" + win_size + "]");
		print(" -d FLOAT max allele diff if there is a single allele in the window [" + len_diff_ratio + "]");
		print(" -e print errors");
		print(" -p print candidate matches");
		return;
	}

	// load a BED file into per-contig interval lists prepared by the Interval helpers
	function read_bed(fn) {
		var buf = new Bytes();
		var file = new File(fn);
		var bed = {};
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			if (bed[t[0]] == null) bed[t[0]] = [];
			bed[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
		}
		file.close();
		buf.destroy();
		for (var x in bed) {
			Interval.sort(bed[x]);
			Interval.merge(bed[x]);
			Interval.index_end(bed[x]);
		}
		return bed;
	}

	var bed = bed_fn != null? read_bed(bed_fn) : null;

	// load the SVs of one VCF as per-contig lists of [start, end, svlen, |svlen|]
	function read_vcf(fn, bed) {
		var buf = new Bytes();
		var file = new File(fn);
		var v = {};
		while (file.readline(buf) >= 0) {
			var m, t = buf.toString().split("\t");
			if (t.length < 5) continue; // blank, truncated or "##" meta line
			if (t[0][0] == '#') continue;
			if (bed != null && bed[t[0]] == null) continue;
			if (t[4] == '<INV>' || t[4] == '<INVDUP>') continue; // no inversion
			if (/[\[\]]/.test(t[4])) continue; // no break points
			var st = parseInt(t[1]) - 1, en = st + t[3].length;
			// parse svlen
			var b = _paf_get_alen(t), svlen = b[0];
			var abslen = svlen == null? 0 : svlen > 0? svlen : -svlen;
			if (abslen < min_flt || abslen > max_size) continue;
			// update end
			if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null)
				en = parseInt(m[2]);
			else if (svlen != null && svlen < 0)
				en = st + (-svlen);
			if (en < st) en = st;
			if (st == en) --st, ++en; // give zero-length events a width
			if (bed != null && Interval.find_ovlp(bed[t[0]], st, en).length == 0) continue;
			// insert
			if (v[t[0]] == null) v[t[0]] = [];
			v[t[0]].push([st, en, svlen, abslen]);
		}
		file.close();
		buf.destroy();
		for (var x in v) {
			Interval.sort(v[x]);
			Interval.index_end(v[x]);
		}
		return v;
	}

	// count how many SVs of v1 (at least min_size long) have a match in v0;
	// returns [number considered, number matched]
	function compare_vcf(v0, v1, label) {
		var m = 0, n = 0;
		for (var x in v1) {
			var a1 = v1[x], a0 = v0[x];
			for (var i = 0; i < a1.length; ++i) {
				if (a1[i][3] < min_size) continue;
				++n;
				if (a0 == null) continue;
				// search window grows with the SV length
				var ws = win_size + (a1[i][3]>>1);
				var st = a1[i][0] > ws? a1[i][0] - ws : 0;
				var b = Interval.find_ovlp(a0, st, a1[i][1] + ws);
				var n_ins = 0, n_del = 0, sv_del = null, sv_ins = null;
				for (var j = 0; j < b.length; ++j) {
					if (b[j][2] < 0) ++n_del, sv_del = -b[j][2];
					else if (b[j][2] > 0) ++n_ins, sv_ins = b[j][2];
					if (print_match)
						print("MA", x, a1[i].slice(0, 3).join("\t"), b[j].slice(0, 3).join("\t"));
				}
				var match = false;
				if (a1[i][2] > 0) { // insertion
					if (n_ins == 1) {
						var diff = sv_ins - a1[i][3];
						if (diff < 0) diff = -diff;
						if (diff < min_size || diff / a1[i][3] < len_diff_ratio)
							match = true;
					} else if (n_ins > 1) match = true; // multiple insertions; ambiguous
				} else if (a1[i][2] < 0) {
					if (n_del == 1) { // deletion
						var diff = sv_del - a1[i][3];
						if (diff < 0) diff = -diff;
						if (diff < min_size || diff / a1[i][3] < len_diff_ratio)
							match = true;
					} else if (n_del > 1) match = true; // multiple deletions; ambiguous
				}
				if (match) ++m;
				else if (print_err) {
					if ((a1[i][2] > 0 && n_ins > 0) || (a1[i][2] < 0 && n_del > 0))
						print("MM", x, a1[i].slice(0, 3).join("\t"));
					print(label, x, a1[i].slice(0, 3).join("\t"));
				}
			}
		}
		return [n, m];
	}

	var v_base = read_vcf(args[getopt.ind+0], bed);
	var v_call = read_vcf(args[getopt.ind+1], bed);
	var fn = compare_vcf(v_call, v_base, 'FN');
	var fp = compare_vcf(v_base, v_call, 'FP');
	var sn = fn[1] / fn[0], pc = fp[1] / fp[0];
	print('SN', fn[0], fn[1], sn.toFixed(6));
	print('PC', fp[0], fp[1], pc.toFixed(6));
	// harmonic mean; defined as 0 when both ratios are 0, NaN when either is undefined
	var f1 = sn + pc == 0? 0 : 2 * sn * pc / (sn + pc);
	print('F1', f1.toFixed(6));
}
2934
+
2935
// Filter a VCF by allele length difference: header lines are echoed, and a
// record is printed unless its largest length difference is below -l or its
// smallest length difference is above -L (both taken from _paf_get_alen()).
//
// args: option/argument array consumed through getopt(); the positional
//       argument is the VCF path ("-" opens File() without a name).
// Returns 1 after printing usage when no input is given, otherwise undefined.
//
// Changes from the previous revision: the line buffer is allocated only after
// the usage check (it was never destroyed on that path), unused locals are
// gone, usage documents -l/-L, and lines with fewer than five columns are
// skipped instead of raising.
function paf_vcfsel(args)
{
	var c, min_l = 0, max_l = 1<<30;
	while ((c = getopt(args, "l:L:")) != null) {
		if (c == 'l') min_l = parseInt(getopt.arg);
		else if (c == 'L') max_l = parseInt(getopt.arg);
	}

	if (getopt.ind == args.length) {
		print("Usage: paftools.js vcfsel [options] <in.vcf>");
		print("Options:");
		print(" -l INT min allele length difference [" + min_l + "]");
		print(" -L INT max allele length difference [" + max_l + "]");
		return 1;
	}
	var buf = new Bytes();
	var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var m, line = buf.toString();
		if (line[0] == '#') {
			print(line);
			continue;
		}
		var t = line.split("\t");
		if (t.length < 5) continue; // blank or truncated record
		// END is only sanity-checked here; it takes no part in the selection
		var st = parseInt(t[1]), en = st + t[3].length - 1;
		if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null)
			en = parseInt(m[2]);
		if (en < st)
			warn("END is smaller than POS: " + en + " < " + st);
		var b = _paf_get_alen(t);
		var min_abs_diff = b[1], max_abs_diff = b[2];
		if (max_abs_diff < min_l || min_abs_diff > max_l)
			continue;
		print(line);
	}
	file.close();
	buf.destroy();
}
2972
+
2973
// Compare the mappings in a test PAF against a baseline PAF, query by query.
//
// args: option/argument array consumed through getopt(); the two positional
//       arguments are the baseline PAF and the test PAF.
// Output: "W" lines for test mappings that disagree with a confident baseline
// mapping, "M" lines for confident baseline mappings never seen in the test
// file, then "X" summary lines. Returns 1 after printing usage when fewer than
// two inputs are given, otherwise undefined.
// Lines containing a tp:A:S tag are ignored. A baseline query is used only if
// it has exactly one remaining record; for the test file only the first record
// of each query is examined. Queries shorter than opt.min_len are ignored.
// NOTE(review): records are grouped by consecutive query name, so each query's
// lines are presumably expected to be adjacent in both files -- confirm.
//
// Changes from the previous revision:
//  - queries absent from the baseline were counted on "opt" instead of the
//    statistics object, so the last summary line always reported 0
//  - the statistics object is no longer called "eval" (illegal in strict mode)
//  - an empty test file no longer raises in process_test()
//  - intersection/union are computed from max(start)/min(end), which is also
//    correct when one interval contains the other
//  - the line buffer is allocated after the usage check
function paf_pafcmp(args)
{
	var c, opt = { min_len:5000, min_mapq:10, min_ovlp:0.5 };
	while ((c = getopt(args, "q:")) != null) {
		if (c == 'q') opt.min_mapq = parseInt(getopt.arg);
	}

	if (args.length - getopt.ind < 2) {
		print("Usage: paftools.js pafcmp [options] <base.paf> <test.paf>");
		print("Options:");
		print(" -q INT min mapping quality [" + opt.min_mapq + "]");
		return 1;
	}

	var buf = new Bytes();
	var stats = { n_base:0, n_test:0, n_out_high:0, n_out_low:0, n_hit:0, n_wrong:0, n_miss:0 };
	// query name -> [target, start, end, mapq, #agreeing test hits, #disagreeing test hits]
	var base = {};

	// convert the numeric PAF columns of one record in place
	function parse_cols(row) {
		for (var i = 1; i < 4; ++i)
			row[i] = parseInt(row[i]);
		for (var i = 6; i < 12; ++i)
			row[i] = parseInt(row[i]);
	}

	// register a baseline query that has exactly one record
	function process_base(base, a) {
		if (a.length != 1) return;
		parse_cols(a[0]);
		if (a[0][1] < opt.min_len) return;
		if (a[0][11] >= opt.min_mapq) ++stats.n_base;
		base[a[0][0]] = [a[0][5], a[0][7], a[0][8], a[0][11], 0, 0];
	}

	// score the first record of a test query against its baseline entry
	function process_test(base, a) {
		if (a.length == 0) return;
		parse_cols(a[0]);
		if (a[0][1] < opt.min_len) return;
		if (a[0][11] >= opt.min_mapq) ++stats.n_test;
		var c = [a[0][5], a[0][7], a[0][8], a[0][11]];
		if (base[a[0][0]] == null) {
			if (c[3] >= opt.min_mapq) ++stats.n_out_high;
			else ++stats.n_out_low;
		} else {
			var b = base[a[0][0]];
			// disjoint by default: no intersection, union is the sum of both lengths
			var inter = 0, union = (b[2] - b[1]) + (c[2] - c[1]);
			if (b[0] == c[0]) { // same chr
				var lo = b[1] > c[1]? b[1] : c[1];
				var hi = b[2] < c[2]? b[2] : c[2];
				if (hi > lo) {
					inter = hi - lo;
					union -= inter;
				}
			}
			if (inter >= union * opt.min_ovlp) {
				if (b[3] >= opt.min_mapq) ++stats.n_hit;
				++b[4];
			} else {
				if (b[3] >= opt.min_mapq) {
					print("W", a[0][0], b.slice(0, 4).join("\t"), c.join("\t"));
					++stats.n_wrong;
				}
				++b[5];
			}
		}
	}

	// read one PAF and hand each run of same-named records to func
	function read_paf(fn, func) {
		var file = new File(fn);
		warn("Reading " + fn + "...");
		var a = [];
		while (file.readline(buf) >= 0) {
			var line = buf.toString();
			var t = line.split("\t");
			if (/\ttp:A:S/.test(line)) continue;
			if (a.length > 0 && a[0][0] != t[0]) {
				func(base, a);
				a = [];
			}
			a.push(t);
		}
		func(base, a);
		file.close();
	}

	read_paf(args[getopt.ind], process_base);
	read_paf(args[getopt.ind+1], process_test);

	for (var r in base) {
		var b = base[r];
		if (b[3] >= opt.min_mapq && b[4] == 0 && b[5] == 0) {
			++stats.n_miss;
			print("M", r, b.slice(0, 4).join("\t"));
		}
	}

	print("X", stats.n_base + " base alignments with mapQ>=" + opt.min_mapq);
	print("X", stats.n_hit + " base alignments correctly mapped by test");
	print("X", stats.n_wrong + " wrong test alignment");
	print("X", stats.n_miss + " base alignments missing");
	print("X", stats.n_out_high + " additional test alignments with mapQ>=" + opt.min_mapq);

	buf.destroy();
}
3086
+
3087
+ /*************************
3088
+ ***** main function *****
3089
+ *************************/
3090
+
3091
// Command-line entry point: args[0] selects the sub-command and the remaining
// elements are handed to its handler. With no arguments the command list is
// printed and exit(1) is called; an unknown command raises an Error.
function main(args)
{
	if (args.length == 0) {
		var usage = [
			"Usage: paftools.js <command> [arguments]",
			"Commands:",
			" view convert PAF to BLAST-like (for eyeballing) or MAF",
			" splice2bed convert spliced alignment in PAF/SAM to BED12",
			" sam2paf convert SAM to PAF",
			" delta2paf convert MUMmer's delta to PAF",
			" gff2bed convert GTF/GFF3 to BED12",
			"",
			" stat collect basic mapping information in PAF/SAM",
			" asmstat collect basic assembly information",
			" asmgene evaluate gene completeness",
			" misjoin evaluate large-scale misjoins",
			" liftover simplistic liftOver",
			" call call variants from asm-to-ref alignment with the cs tag",
			" bedcov compute the number of bases covered",
			" vcfstat VCF statistics",
			" sveval compare two SV callsets in VCF",
			" version print paftools.js version",
			"",
			" mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ",
			" pafcmp compare two PAF files",
			" mason2fq convert mason2-simulated SAM to FASTQ",
			" pbsim2fq convert PBSIM-simulated MAF to FASTQ",
			" junceval evaluate splice junction consistency with known annotations",
			" ov-eval evaluate read overlap sensitivity using read-to-ref mapping"
		];
		for (var i = 0; i < usage.length; ++i)
			print(usage[i]);
		exit(1);
	}

	// a switch keeps handler lookup lazy: only the selected handler name is evaluated
	var cmd = args.shift();
	switch (cmd) {
	case 'view': paf_view(args); break;
	case 'sam2paf': paf_sam2paf(args); break;
	case 'delta2paf': paf_delta2paf(args); break;
	case 'splice2bed': paf_splice2bed(args); break;
	case 'gff2bed': paf_gff2bed(args); break;
	case 'stat': paf_stat(args); break;
	case 'asmstat': paf_asmstat(args); break;
	case 'asmgene': paf_asmgene(args); break;
	case 'misjoin': paf_misjoin(args); break;
	case 'liftover':
	case 'liftOver': paf_liftover(args); break;
	case 'vcfpair': paf_vcfpair(args); break;
	case 'call': paf_call(args); break;
	case 'mapeval': paf_mapeval(args); break;
	case 'pafcmp': paf_pafcmp(args); break;
	case 'bedcov': paf_bedcov(args); break;
	case 'mason2fq': paf_mason2fq(args); break;
	case 'pbsim2fq': paf_pbsim2fq(args); break;
	case 'junceval': paf_junceval(args); break;
	case 'ov-eval': paf_ov_eval(args); break;
	case 'vcfstat': paf_vcfstat(args); break;
	case 'sveval': paf_sveval(args); break;
	case 'vcfsel': paf_vcfsel(args); break;
	case 'version': print(paftools_version); break;
	default: throw Error("unrecognized command: " + cmd);
	}
}
3148
+
3149
+ main(arguments); // script entry: dispatch on the argument list (presumably the k8 shell's global `arguments` array -- confirm)