minimap2 0.2.22.0 → 0.2.24.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,3149 @@
|
|
1
|
+
#!/usr/bin/env k8
|
2
|
+
|
3
|
+
var paftools_version = '2.24-r1122';
|
4
|
+
|
5
|
+
/*****************************
|
6
|
+
***** Library functions *****
|
7
|
+
*****************************/
|
8
|
+
|
9
|
+
/*******************************
|
10
|
+
* Command line option parsing *
|
11
|
+
*******************************/
|
12
|
+
|
13
|
+
// Minimal POSIX-style short-option parser.
//
//   args  array of command-line arguments to scan
//   ostr  option specification: each option letter, followed by ':' when the
//         option takes an argument (e.g. "mq:l:")
//
// Returns the next option letter, '?' for an unknown option (or for a missing
// argument when ostr does not start with ':'), ':' for a missing argument when
// ostr starts with ':', and null once the first non-option argument, a lone
// "-" or "--" is reached.
//
// Scanning state lives on the function object itself:
//   getopt.ind    index of the next element of args to examine
//   getopt.arg    argument of the option just returned, or null
//   getopt.place  position inside args[getopt.ind]; -1 means "start a new argument"
var getopt = function(args, ostr) {
	var oli; // option letter list index
	if (typeof(getopt.place) == 'undefined')
		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
	if (getopt.place == -1) { // update scanning pointer
		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
			getopt.place = -1;
			return null;
		}
		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
			++getopt.ind;
			getopt.place = -1;
			return null;
		}
	}
	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
		if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
		// The unknown letter may have been the last one in this argument. Move on
		// to the next argument; otherwise the following call would read an empty
		// option character, which indexOf() "finds" at position 0 of ostr, and
		// could then consume the next argument as its value.
		if (getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
		return '?';
	}
	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
		getopt.arg = null;
		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
	} else { // need an argument
		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
			getopt.arg = args[getopt.ind].substr(getopt.place); // value glued to the option: "-q7"
		else if (args.length <= ++getopt.ind) { // no arg
			getopt.place = -1;
			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
			return '?';
		} else getopt.arg = args[getopt.ind]; // white space
		getopt.place = -1;
		++getopt.ind;
	}
	return optopt;
}
|
50
|
+
|
51
|
+
/***********************
|
52
|
+
* Interval operations *
|
53
|
+
***********************/
|
54
|
+
|
55
|
+
// Namespace for operations on sorted lists of intervals. An interval is an
// array whose first two elements are [start, end); a list may also be a plain
// array of numbers where noted. Declared with 'var' so that it is not created
// as an implicit global.
var Interval = {};

// Sort a list in place: numerically for an array of numbers, otherwise by
// start and then by end.
Interval.sort = function(a)
{
	if (typeof a[0] == 'number')
		a.sort(function(x, y) { return x - y });
	else a.sort(function(x, y) { return x[0] != y[0]? x[0] - y[0] : x[1] - y[1] });
}

// Merge overlapping or touching intervals in place.
//   a       list of intervals; shrunk to the merged list
//   sorted  whether a is already sorted (default true); if false it is sorted first
Interval.merge = function(a, sorted)
{
	if (a.length == 0) return; // nothing to merge; "a.length = k + 1" below would otherwise grow an empty list to length 1
	if (typeof sorted == 'undefined') sorted = true;
	if (!sorted) Interval.sort(a);
	var k = 0; // index of the last merged interval
	for (var i = 1; i < a.length; ++i) {
		if (a[k][1] >= a[i][0]) // a[i] starts inside or right at the end of a[k]: extend a[k]
			a[k][1] = a[k][1] > a[i][1]? a[k][1] : a[i][1];
		else a[++k] = a[i].slice(0);
	}
	a.length = k + 1;
}

// Append to every interval a[i] the index of the first interval in the list
// whose end is greater than a[i]'s start. Interval.find_ovlp() reads this
// value (the last element of each interval) to know where to start scanning.
//   a       list of intervals; each element gets one extra value pushed
//   sorted  whether a is already sorted (default true); if false it is sorted first
Interval.index_end = function(a, sorted)
{
	if (a.length == 0) return;
	if (typeof sorted == 'undefined') sorted = true;
	if (!sorted) Interval.sort(a);
	a[0].push(0);
	var k = 0, k_en = a[0][1];
	for (var i = 1; i < a.length; ++i) {
		if (k_en <= a[i][0]) { // a[k] ends before a[i] starts: advance k
			for (++k; k < i; ++k)
				if (a[k][1] > a[i][0])
					break;
			k_en = a[k][1];
		}
		a[i].push(k);
	}
}

// Binary search over a sorted list (numbers, or intervals compared by start).
// Returns the index of an element equal to x if one is hit, otherwise the
// index of the last element smaller than x, or -1 if there is none.
Interval.find_intv = function(a, x)
{
	var left = -1, right = a.length;
	if (typeof a[0] == 'number') {
		while (right - left > 1) {
			var mid = left + ((right - left) >> 1);
			if (a[mid] > x) right = mid;
			else if (a[mid] < x) left = mid;
			else return mid;
		}
	} else {
		while (right - left > 1) {
			var mid = left + ((right - left) >> 1);
			if (a[mid][0] > x) right = mid;
			else if (a[mid][0] < x) left = mid;
			else return mid;
		}
	}
	return left;
}

// Return the intervals of a that overlap [st, en). The list must have been
// processed by Interval.index_end(), because the last element of an interval
// is used as the index to start scanning from.
Interval.find_ovlp = function(a, st, en)
{
	if (a.length == 0 || st >= en) return [];
	var l = Interval.find_intv(a, st);
	var k = l < 0? 0 : a[l][a[l].length - 1];
	var b = [];
	for (var i = k; i < a.length; ++i) {
		if (a[i][0] >= en) break;
		else if (st < a[i][1])
			b.push(a[i]);
	}
	return b;
}
|
129
|
+
|
130
|
+
/**********************************
|
131
|
+
* Reverse and reverse complement *
|
132
|
+
**********************************/
|
133
|
+
|
134
|
+
// Read a FASTA file into memory.
//
//   fn  file name; '-' opens File() without a name
//       (presumably standard input in k8 -- confirm against the k8 docs)
//
// Returns [h, seqlen]: h maps each sequence name (the first whitespace-free
// word after '>') to a Bytes object holding its residues, and seqlen is an
// array of [name, length] pairs in file order. Release the Bytes objects
// with fasta_free(h) when done.
function fasta_read(fn)
{
	var h = {}, gt = '>'.charCodeAt(0);
	var file = fn == '-'? new File() : new File(fn);
	var buf = new Bytes(), seq = null, name = null, seqlen = [];
	while (file.readline(buf) >= 0) {
		if (buf[0] == gt) { // header line: store the previous record and start a new one
			if (seq != null && name != null) {
				seqlen.push([name, seq.length]);
				h[name] = seq;
				name = seq = null;
			}
			var m, line = buf.toString();
			if ((m = /^>(\S+)/.exec(line)) != null) {
				name = m[1];
				seq = new Bytes();
			}
		} else if (seq != null) {
			seq.set(buf); // presumably appends when no offset is given -- confirm against the k8 docs
		}
		// else: a sequence line with no open record (text before the first
		// header, or after a header without a name) is skipped instead of
		// failing on a null 'seq'
	}
	if (seq != null && name != null) { // store the last record
		seqlen.push([name, seq.length]);
		h[name] = seq;
	}
	buf.destroy();
	file.close();
	return [h, seqlen];
}
|
161
|
+
|
162
|
+
// Release every sequence buffer stored in a name -> sequence table (the first
// element returned by fasta_read()) by calling destroy() on each value.
function fasta_free(fa)
{
	var key, seq;
	for (key in fa) {
		seq = fa[key];
		seq.destroy();
	}
}
|
167
|
+
|
168
|
+
// Reverse the contents of this byte array in place by swapping elements from
// both ends towards the middle.
Bytes.prototype.reverse = function()
{
	var lo = 0, hi = this.length - 1;
	while (lo < hi) {
		var held = this[lo];
		this[lo] = this[hi];
		this[hi] = held;
		++lo, --hi;
	}
}
|
176
|
+
|
177
|
+
// Reverse-complement, in place, the DNA sequence stored in this byte array.
// The 256-entry complement table is built on first use and cached as
// Bytes.rctab; byte values that are not listed nucleotide codes map to 0.
Bytes.prototype.revcomp = function()
{
	if (Bytes.rctab == null) {
		var bases = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn';
		var comps = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn';
		var c;
		Bytes.rctab = [];
		for (c = 0; c < 256; ++c) Bytes.rctab[c] = 0;
		for (c = 0; c < bases.length; ++c)
			Bytes.rctab[bases.charCodeAt(c)] = comps.charCodeAt(c);
	}
	var n = this.length, half = n >> 1;
	// swap mirrored positions, complementing both bases on the way
	for (var lo = 0, hi = n - 1; lo < half; ++lo, --hi) {
		var right = this[hi];
		this[hi] = Bytes.rctab[this[lo]];
		this[lo] = Bytes.rctab[right];
	}
	// odd length: the middle base stays in place and is only complemented
	if (n & 1)
		this[half] = Bytes.rctab[this[half]];
}
|
196
|
+
|
197
|
+
/********************
|
198
|
+
***** paftools *****
|
199
|
+
********************/
|
200
|
+
|
201
|
+
/*****************
|
202
|
+
* Miscellaneous *
|
203
|
+
*****************/
|
204
|
+
|
205
|
+
// liftover
//
// Lift regions given in query coordinates (a BED file) over to target
// coordinates, using primary alignments in a PAF file that carries 'cg' tags.
//
//   args  command-line arguments: [options] <aln.paf> <query.bed>
//
// For every retained alignment and every BED region overlapping its query
// interval, one line is printed: target name, lifted start, lifted end,
// "<query>_<start>_<end>" (with "_t5"/"_t3" appended when that end of the
// region could not be lifted and was set to the alignment's target start or
// end), 0, and the strand.
// Throws Error when a primary alignment has no 'cg' tag or when the CIGAR
// disagrees with the mapping coordinates.
function paf_liftover(args)
{
	// Read a BED file into {chr: sorted, end-indexed interval list}.
	function read_bed(fn, to_merge)
	{
		if (fn == null) return null;
		if (typeof to_merge == 'undefined') to_merge = true;
		var file = fn == '-'? new File() : new File(fn);
		var buf = new Bytes();
		var bed = {};
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			var st = parseInt(t[1], 10), en = parseInt(t[2], 10);
			// skip blank, header or malformed lines: a NaN coordinate would break
			// the numeric sort and the binary search on this list
			if (isNaN(st) || isNaN(en)) continue;
			if (bed[t[0]] == null) bed[t[0]] = [];
			bed[t[0]].push([st, en]);
		}
		buf.destroy();
		file.close();

		for (var chr in bed) {
			Interval.sort(bed[chr]);
			if (to_merge)
				Interval.merge(bed[chr], true);
			Interval.index_end(bed[chr], true);
		}
		return bed;
	}

	var re_cigar = /(\d+)([MID])/g, re_tag = /^(\S\S):([AZif]):(\S+)$/;
	var c, to_merge = false, min_mapq = 5, min_len = 50000, max_div = 2.0;
	while ((c = getopt(args, "mq:l:d:")) != null) {
		if (c == 'm') to_merge = true;
		else if (c == 'q') min_mapq = parseInt(getopt.arg, 10);
		else if (c == 'l') min_len = parseInt(getopt.arg, 10);
		else if (c == 'd') max_div = parseFloat(getopt.arg);
	}
	if (args.length - getopt.ind < 2) {
		print("Usage: paftools.js liftover [options] <aln.paf> <query.bed>");
		print("Options:");
		print(" -m merge overlapping or adjacent regions in <query.bed>");
		print(" -q INT min mapping quality [" + min_mapq + "]");
		print(" -l INT min alignment length [" + min_len + "]");
		print(" -d FLOAT max sequence divergence (>=1 to disable) [" + max_div + "]");
		exit(1);
	}
	var bed = read_bed(args[getopt.ind+1], to_merge);

	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var buf = new Bytes();
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");

		if (bed[t[0]] == null) continue; // sequence not present in BED; skip

		// parse tp and cg tags
		var m, tp = null, cg = null;
		for (var i = 12; i < t.length; ++i) {
			if ((m = re_tag.exec(t[i])) != null) {
				if (m[1] == 'tp') tp = m[3];
				else if (m[1] == 'cg') cg = m[3];
			}
		}
		if (tp != 'P' && tp != 'I') continue; // only process primary alignments
		if (cg == null) throw Error("unable to find the 'cg' tag");

		// filter out bad alignments and check overlaps
		for (var i = 1; i <= 3; ++i)
			t[i] = parseInt(t[i], 10);
		for (var i = 6; i <= 11; ++i)
			t[i] = parseInt(t[i], 10);
		if (t[11] < min_mapq || t[10] < min_len) continue;
		var regs = Interval.find_ovlp(bed[t[0]], t[2], t[3]);
		if (regs.length == 0) continue; // not overlapping any regions in input BED
		if (max_div >= 0.0 && max_div < 1.0) {
			// gap-compressed divergence: every gap counts once, whatever its length.
			// Both loops over re_cigar run until exec() returns null, which resets
			// the regex's lastIndex for the next use.
			var n_gaps = 0, n_opens = 0;
			while ((m = re_cigar.exec(cg)) != null)
				if (m[2] == 'I' || m[2] == 'D')
					n_gaps += parseInt(m[1], 10), ++n_opens;
			var n_mm = t[10] - t[9] - n_gaps;
			var n_diff2 = n_mm + n_opens;
			if (n_diff2 / (n_diff2 + t[9]) > max_div)
				continue;
		}

		// extract start and end positions
		// a[] holds one entry per region boundary:
		//   [position on the aligned query strand, 0 for start / 1 for end, region index, lifted position (-2 = not lifted)]
		var a = [], r = [], strand = t[4];
		for (var i = 0; i < regs.length; ++i) {
			var s = regs[i][0], e = regs[i][1];
			if (strand == '+') {
				a.push([s, 0, i, -2]);
				a.push([e - 1, 1, i, -2]);
			} else { // on the reverse strand, query coordinates are flipped
				a.push([t[1] - e, 0, i, -2]);
				a.push([t[1] - s - 1, 1, i, -2]);
			}
			r.push([-2, -2]);
		}
		a.sort(function(x, y) { return x[0] - y[0] });

		// lift start/end positions
		// x walks along the target, y along the (strand-adjusted) query
		var k = 0, x = t[7], y = strand == '+'? t[2] : t[1] - t[3];
		while ((m = re_cigar.exec(cg)) != null) { // TODO: be more careful about edge cases
			var len = parseInt(m[1], 10);
			if (m[2] == 'D') { // do nothing for D
				x += len;
				continue;
			}
			while (k < a.length && a[k][0] < y) ++k; // skip out-of-range positions
			for (var i = k; i < a.length; ++i) {
				if (y <= a[i][0] && a[i][0] < y + len)
					a[i][3] = m[2] == 'M'? x + (a[i][0] - y) : x; // inside an insertion: use the current target position
				else break;
			}
			y += len;
			if (m[2] == 'M') x += len;
		}
		if (x != t[8] || (strand == '+' && y != t[3]) || (strand == '-' && y != t[1] - t[2]))
			throw Error("CIGAR is inconsistent with mapping coordinates");

		// generate result
		for (var i = 0; i < a.length; ++i) {
			if (a[i][1] == 0) r[a[i][2]][0] = a[i][3];
			else r[a[i][2]][1] = a[i][3] + 1; // change to half-close-half-open
		}
		for (var i = 0; i < r.length; ++i) {
			var name = [t[0], regs[i][0], regs[i][1]].join("_");
			if (r[i][0] < 0) name += "_t5", r[i][0] = t[7];
			if (r[i][1] < 0) name += "_t3", r[i][1] = t[8];
			print(t[5], r[i][0], r[i][1], name, 0, strand);
		}
	}
	buf.destroy();
	file.close();
}
|
338
|
+
|
339
|
+
// variant calling
//
// Call variants from PAF alignments that carry a 'cs' tag. The usage message
// asks for input sorted by target name and target start.
//
//   args  command-line arguments: [options] <with-cs.paf>
//
// Without -f, prints 'R' lines (ctg, start, end) for target regions covered
// by a single alignment and 'V' lines for variants. With -f, prints VCF and
// only the variants whose coverage is exactly one. Summary counts go to warn().
//
// A buffered variant is an array:
//   [ctg, start, end, coverage, mapq, ref allele, query allele,
//    query name, query start, query end, query strand]
function paf_call(args)
{
	var re_cs = /([:=*+-])(\d+|[A-Za-z]+)/g, re_tag = /\t(\S\S:[AZif]):(\S+)/g;
	var c, min_cov_len = 10000, min_var_len = 50000, gap_thres = 50, gap_thres_long = 1000, min_mapq = 5;
	var fa_tmp = null, fa, fa_lens, is_vcf = false, sample_name = "sample";
	// "G:" makes the -G branch below reachable; "B:" is accepted but ignored,
	// as before.
	while ((c = getopt(args, "l:L:g:G:q:B:f:s:")) != null) {
		if (c == 'l') min_cov_len = parseInt(getopt.arg);
		else if (c == 'L') min_var_len = parseInt(getopt.arg);
		else if (c == 'g') gap_thres = parseInt(getopt.arg);
		else if (c == 'G') gap_thres_long = parseInt(getopt.arg);
		else if (c == 'q') min_mapq = parseInt(getopt.arg);
		else if (c == 'f') fa_tmp = fasta_read(getopt.arg);
		else if (c == 's') sample_name = getopt.arg;
	}
	if (fa_tmp != null) fa = fa_tmp[0], fa_lens = fa_tmp[1], is_vcf = true;

	if (args.length == getopt.ind) {
		print("Usage: sort -k6,6 -k8,8n <with-cs.paf> | paftools.js call [options] -");
		print("Options:");
		print(" -l INT min alignment length to compute coverage ["+min_cov_len+"]");
		print(" -L INT min alignment length to call variants ["+min_var_len+"]");
		print(" -q INT min mapping quality ["+min_mapq+"]");
		print(" -g INT short/long gap threshold (for statistics only) ["+gap_thres+"]");
		print(" -G INT long/very-long gap threshold (for statistics only) ["+gap_thres_long+"]");
		print(" -f FILE reference sequences (enabling VCF output) [null]");
		print(" -s NAME sample name in VCF header ["+sample_name+"]");
		exit(1);
	}

	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var buf = new Bytes();
	var tot_len = 0, n_sub = [0, 0, 0], n_ins = [0, 0, 0, 0, 0], n_del = [0, 0, 0, 0, 0];

	// Print one buffered variant as a VCF record. Indels are anchored on the
	// preceding reference base, which is looked up in fa.
	function print_vcf(o, fa)
	{
		var v = null;
		if (o[3] != 1) return; // only report variants covered by exactly one alignment
		if (o[5] == '-' && o[6] == '-') return;
		if (o[5] != '-' && o[6] != '-') { // snp
			v = [o[0], o[1] + 1, '.', o[5].toUpperCase(), o[6].toUpperCase()];
		} else if (o[1] > 0) { // shouldn't happen in theory
			if (fa[o[0]] == null) throw Error('sequence "' + o[0] + '" is absent from the reference FASTA');
			if (o[1] >= fa[o[0]].length) throw Error('position ' + o[1] + ' exceeds the length of sequence "' + o[0] + '"');
			var ref = String.fromCharCode(fa[o[0]][o[1]-1]).toUpperCase();
			if (o[5] == '-') // insertion
				v = [o[0], o[1], '.', ref, ref + o[6].toUpperCase()];
			else // deletion
				v = [o[0], o[1], '.', ref + o[5].toUpperCase(), ref];
		}
		// check before touching v, so that the intended error is the one raised
		if (v == null) throw Error("unexpected variant: [" + o.join(",") + "]");
		// o[10] is the strand stored with the variant; the enclosing 'rev' is the
		// strand of the alignment currently being read, which may be a different
		// one by the time a buffered variant is printed
		v.push(o[4], '.', 'QNAME=' + o[7] + ';QSTART=' + (o[8]+1) + ';QSTRAND=' + o[10], 'GT', '1/1');
		print(v.join("\t"));
	}

	// Add one buffered variant to the substitution/insertion/deletion counters.
	function count_var(o)
	{
		if (o[3] > 1) return;
		if (o[5] == '-' && o[6] == '-') return;
		if (o[5] == '-') { // insertion
			var l = o[6].length;
			if (l == 1) ++n_ins[0];
			else if (l == 2) ++n_ins[1];
			else if (l < gap_thres) ++n_ins[2];
			else if (l < gap_thres_long) ++n_ins[3];
			else ++n_ins[4];
		} else if (o[6] == '-') { // deletion
			var l = o[5].length;
			if (l == 1) ++n_del[0];
			else if (l == 2) ++n_del[1];
			else if (l < gap_thres) ++n_del[2];
			else if (l < gap_thres_long) ++n_del[3];
			else ++n_del[4];
		} else {
			++n_sub[0];
			var s = (o[5] + o[6]).toLowerCase();
			if (s == 'ag' || s == 'ga' || s == 'ct' || s == 'tc') // transition
				++n_sub[1];
			else ++n_sub[2];
		}
	}

	if (is_vcf) {
		print('##fileformat=VCFv4.1');
		for (var i = 0; i < fa_lens.length; ++i)
			print('##contig=<ID=' + fa_lens[i][0] + ',length=' + fa_lens[i][1] + '>');
		print('##INFO=<ID=QNAME,Number=1,Type=String,Description="Query name">');
		print('##INFO=<ID=QSTART,Number=1,Type=Integer,Description="Query start">');
		print('##INFO=<ID=QSTRAND,Number=1,Type=String,Description="Query strand">');
		print('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">');
		// VCF column names are tab-delimited
		print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+sample_name);
	}

	// a: previous alignments [ctg, start, end] that may still overlap the current one
	// out: buffered variants, flushed once no later alignment can overlap them
	var a = [], out = [];
	var c1_ctg = null, c1_start = 0, c1_end = 0, c1_counted = false, c1_len = 0;
	while (file.readline(buf) >= 0) {
		var line = buf.toString();
		var m, t = line.split("\t", 12);
		if (t.length < 12 || t[5] == '*') continue; // unmapped
		for (var i = 6; i <= 11; ++i)
			t[i] = parseInt(t[i]);
		if (t[10] < min_cov_len || t[11] < min_mapq) continue;
		//print(t[0], t[7], t[8], c1_start, c1_end);
		for (var i = 1; i <= 3; ++i)
			t[i] = parseInt(t[i]);
		var ctg = t[5], x = t[7], end = t[8];
		var query = t[0], rev = (t[4] == '-'), y = rev? t[3] : t[2];
		// collect tags
		var cs = null, tp = null, have_s1 = false, have_s2 = false;
		while ((m = re_tag.exec(line)) != null) {
			if (m[1] == 'cs:Z') cs = m[2];
			else if (m[1] == 'tp:A') tp = m[2];
			else if (m[1] == 's1:i') have_s1 = true;
			else if (m[1] == 's2:i') have_s2 = true;
		}
		if (have_s1 && !have_s2) continue;
		if (tp != null && (tp == 'S' || tp == 'i')) continue;
		// compute regions covered by 1 contig
		if (ctg != c1_ctg || x >= c1_end) {
			if (c1_counted && c1_end > c1_start) {
				c1_len += c1_end - c1_start;
				if (!is_vcf) print('R', c1_ctg, c1_start, c1_end);
			}
			c1_ctg = ctg, c1_start = x, c1_end = end;
			c1_counted = (t[10] >= min_var_len);
		} else if (end > c1_end) { // overlap
			if (c1_counted && x > c1_start) {
				c1_len += x - c1_start;
				if (!is_vcf) print('R', c1_ctg, c1_start, x);
			}
			c1_start = c1_end, c1_end = end;
			c1_counted = (t[10] >= min_var_len);
		} else if (end > c1_start) { // contained
			if (c1_counted && x > c1_start) {
				c1_len += x - c1_start;
				if (!is_vcf) print('R', c1_ctg, c1_start, x);
			}
			c1_start = end;
		} // else, the alignment precedes the cov1 region; do nothing
		// output variants ahead of this alignment
		while (out.length) {
			if (out[0][0] != ctg || out[0][2] <= x) {
				count_var(out[0]);
				if (is_vcf) print_vcf(out[0], fa);
				else print('V', out[0].join("\t"));
				out.shift();
			} else break;
		}
		// update coverage
		for (var i = 0; i < out.length; ++i)
			if (out[i][1] >= x && out[i][2] <= end)
				++out[i][3];
		// drop alignments that don't overlap with the current one
		var k = 0;
		for (var i = 0; i < a.length; ++i)
			if (a[i][0] == ctg && a[i][2] > x)
				a[k++] = a[i];
		a.length = k;
		// core loop
		if (t[10] >= min_var_len) {
			if (cs == null) continue; // no cs tag
			var blen = 0, n_diff = 0;
			tot_len += t[10];
			while ((m = re_cs.exec(cs)) != null) {
				// coverage at the current target position: this alignment plus the
				// earlier ones that extend beyond x
				var cov = 1;
				if (m[1] == '*' || m[1] == '+' || m[1] == '-')
					for (var i = 0; i < a.length; ++i)
						if (a[i][2] > x) ++cov;
				var qs, qe;
				if (m[1] == '=' || m[1] == ':') { // identical block
					var l = m[1] == '='? m[2].length : parseInt(m[2]);
					if (rev) y -= l;
					else y += l;
					x += l, blen += l;
				} else if (m[1] == '*') { // substitution
					if (rev) qs = y - 1, qe = y, --y;
					else qs = y, qe = y + 1, ++y;
					var br = m[2].charAt(0), bq = m[2].charAt(1);
					if (br != 'n' && bq != 'n') { // don't call a SNP if there is an ambiguous base
						out.push([t[5], x, x+1, cov, t[11], br, bq, query, qs, qe, rev? '-' : '+']);
						++n_diff;
					}
					++x, ++blen;
				} else if (m[1] == '+') { // insertion to the reference
					var l = m[2].length;
					if (rev) qs = y - l, qe = y, y -= l;
					else qs = y, qe = y + l, y += l;
					out.push([t[5], x, x, cov, t[11], '-', m[2], query, qs, qe, rev? '-' : '+']);
					++blen, ++n_diff;
				} else if (m[1] == '-') { // deletion from the reference
					var l = m[2].length;
					out.push([t[5], x, x + l, cov, t[11], m[2], '-', query, y, y, rev? '-' : '+']);
					x += l, ++blen, ++n_diff;
				}
			}
		}
		a.push([t[5], t[7], t[8]]);
	}
	// flush the last single-coverage region and the remaining variants
	if (c1_counted && c1_end > c1_start) {
		c1_len += c1_end - c1_start;
		if (!is_vcf) print('R', c1_ctg, c1_start, c1_end);
	}
	while (out.length) {
		count_var(out[0]);
		if (is_vcf) print_vcf(out[0], fa);
		else print('V', out[0].join("\t"));
		out.shift();
	}

	//warn(tot_len + " alignment columns considered in calling");
	warn(c1_len + " reference bases covered by exactly one contig");
	warn(n_sub[0] + " substitutions; ts/tv = " + (n_sub[1]/n_sub[2]).toFixed(3));
	warn(n_del[0] + " 1bp deletions");
	warn(n_ins[0] + " 1bp insertions");
	warn(n_del[1] + " 2bp deletions");
	warn(n_ins[1] + " 2bp insertions");
	warn(n_del[2] + " [3,"+gap_thres+") deletions");
	warn(n_ins[2] + " [3,"+gap_thres+") insertions");
	warn(n_del[3] + " ["+gap_thres+","+gap_thres_long+") deletions");
	warn(n_ins[3] + " ["+gap_thres+","+gap_thres_long+") insertions");
	warn(n_del[4] + " >=" + gap_thres_long + " deletions");
	warn(n_ins[4] + " >=" + gap_thres_long + " insertions");

	buf.destroy();
	file.close();
	if (fa != null) fasta_free(fa);
}
|
565
|
+
|
566
|
+
function paf_asmstat(args)
|
567
|
+
{
|
568
|
+
var c, min_query_len = 0, min_seg_len = 10000, max_diff = 0.01, bp_flank_len = 0, bp_gap_len = 0;
|
569
|
+
while ((c = getopt(args, "l:d:b:g:q:")) != null) {
|
570
|
+
if (c == 'l') min_seg_len = parseInt(getopt.arg);
|
571
|
+
else if (c == 'd') max_diff = parseFloat(getopt.arg);
|
572
|
+
else if (c == 'b') bp_flank_len = parseInt(getopt.arg);
|
573
|
+
else if (c == 'g') bp_gap_len = parseInt(getopt.arg);
|
574
|
+
else if (c == 'q') min_query_len = parseInt(getopt.arg);
|
575
|
+
}
|
576
|
+
if (getopt.ind == args.length) {
|
577
|
+
print("Usage: paftools.js asmstat [options] <ref.fa.fai> <asm1.paf> [...]");
|
578
|
+
print("Options:");
|
579
|
+
print(" -q INT ignore query shorter than INT [0]");
|
580
|
+
print(" -l INT min alignment block length [" + min_seg_len + "]");
|
581
|
+
print(" -d FLOAT max gap-compressed sequence divergence [" + max_diff + "]");
|
582
|
+
exit(1);
|
583
|
+
}
|
584
|
+
|
585
|
+
var file, buf = new Bytes();
|
586
|
+
|
587
|
+
var ref_len = 0;
|
588
|
+
file = new File(args[getopt.ind]);
|
589
|
+
while (file.readline(buf) >= 0) {
|
590
|
+
var t = buf.toString().split("\t");
|
591
|
+
ref_len += parseInt(t[1]);
|
592
|
+
}
|
593
|
+
file.close();
|
594
|
+
|
595
|
+
function process_query(qblocks, qblock_len, bp, qi) {
|
596
|
+
qblocks.sort(function(a,b) { return a[0]-b[0]; });
|
597
|
+
var last_k = null, last_blen = null, st = -1, en = -1, qcov = 0;
|
598
|
+
for (var k = 0; k < qblocks.length; ++k) {
|
599
|
+
var blen = qblocks[k][1] - qblocks[k][0];
|
600
|
+
if (k > 0 && qblocks[k][0] < qblocks[k-1][1]) {
|
601
|
+
if (qblocks[k][1] < qblocks[k-1][1]) continue;
|
602
|
+
blen = qblocks[k][1] - qblocks[k-1][1];
|
603
|
+
}
|
604
|
+
qblock_len.push(blen);
|
605
|
+
if (qblocks[k][0] > en) {
|
606
|
+
qcov += en - st;
|
607
|
+
st = qblocks[k][0];
|
608
|
+
en = qblocks[k][1];
|
609
|
+
} else en = en > qblocks[k][1]? en : qblocks[k][1];
|
610
|
+
if (last_k != null) {
|
611
|
+
var gap = 1000000000;
|
612
|
+
if (qblocks[k][2] == qblocks[last_k][2] && qblocks[k][3] == qblocks[last_k][3]) { // same chr and strand
|
613
|
+
var g1 = qblocks[k][0] - qblocks[last_k][1];
|
614
|
+
var g2 = qblocks[k][2] == '+'? qblocks[k][4] - qblocks[last_k][5] : qblocks[last_k][4] - qblocks[k][5];
|
615
|
+
gap = g1 > g2? g1 - g2 : g2 - g1;
|
616
|
+
}
|
617
|
+
var min = blen < last_blen? blen : last_blen;
|
618
|
+
var flank = k == 0? min : blen;
|
619
|
+
bp.push([flank, gap]);
|
620
|
+
qi.bp.push([flank, gap]);
|
621
|
+
}
|
622
|
+
last_k = k, last_blen = blen;
|
623
|
+
}
|
624
|
+
qcov += en - st;
|
625
|
+
return qcov;
|
626
|
+
}
|
627
|
+
|
628
|
+
function N50(lens, tot, quantile) {
|
629
|
+
lens.sort(function(a,b) { return b - a; });
|
630
|
+
if (tot == null) {
|
631
|
+
tot = 0;
|
632
|
+
for (var k = 0; k < lens.length; ++k)
|
633
|
+
tot += lens[k];
|
634
|
+
}
|
635
|
+
var sum = 0;
|
636
|
+
for (var k = 0; k < lens.length; ++k) {
|
637
|
+
if (sum <= quantile * tot && sum + lens[k] > quantile * tot)
|
638
|
+
return lens[k];
|
639
|
+
sum += lens[k];
|
640
|
+
}
|
641
|
+
}
|
642
|
+
|
643
|
+
function AUN(lens, tot) {
|
644
|
+
lens.sort(function(a,b) { return b - a; });
|
645
|
+
if (tot == null) {
|
646
|
+
tot = 0;
|
647
|
+
for (var k = 0; k < lens.length; ++k)
|
648
|
+
tot += lens[k];
|
649
|
+
}
|
650
|
+
var x = 0, y = 0;
|
651
|
+
for (var k = 0; k < lens.length; ++k) {
|
652
|
+
var l = x + lens[k] <= tot? lens[k] : tot - x;
|
653
|
+
x += lens[k];
|
654
|
+
y += l * (l / tot);
|
655
|
+
if (x >= tot) break;
|
656
|
+
}
|
657
|
+
return y.toFixed(0);
|
658
|
+
}
|
659
|
+
|
660
|
+
function count_bp(bp, min_blen, min_gap) {
|
661
|
+
var n_bp = 0;
|
662
|
+
for (var k = 0; k < bp.length; ++k)
|
663
|
+
if (bp[k][0] >= min_blen && bp[k][1] >= min_gap)
|
664
|
+
++n_bp;
|
665
|
+
return n_bp;
|
666
|
+
}
|
667
|
+
|
668
|
+
function compute_diff(cigar, NM) {
|
669
|
+
var m, re = /(\d+)([MID])/g;
|
670
|
+
var n_M = 0, n_gapo = 0, n_gaps = 0;
|
671
|
+
while ((m = re.exec(cigar)) != null) {
|
672
|
+
var len = parseInt(m[1]);
|
673
|
+
if (m[2] == 'M') n_M += len;
|
674
|
+
else ++n_gapo, n_gaps += len;
|
675
|
+
}
|
676
|
+
if (NM < n_gaps) throw Error('NM is smaller the number of gaps');
|
677
|
+
return (NM - n_gaps + n_gapo) / (n_M + n_gapo);
|
678
|
+
}
|
679
|
+
|
680
|
+
var labels = ['Length', 'l_cov', 'Rcov', 'Rdup', 'Qcov', 'NG75', 'NG50', 'NGA50', 'AUNGA', '#breaks', 'bp(' + min_seg_len + ',0)', 'bp(' + min_seg_len + ',10k)'];
|
681
|
+
var rst = [];
|
682
|
+
for (var i = 0; i < labels.length; ++i)
|
683
|
+
rst[i] = [];
|
684
|
+
|
685
|
+
var n_asm = args.length - (getopt.ind + 1);
|
686
|
+
var header = ["Metric"];
|
687
|
+
for (var i = 0; i < n_asm; ++i) {
|
688
|
+
var n_breaks = 0, qcov = 0;
|
689
|
+
var fn = args[getopt.ind + 1 + i];
|
690
|
+
var label = fn.replace(/.paf(.gz)?$/, "");
|
691
|
+
header.push(label);
|
692
|
+
var ref_blocks = [], qblock_len = [], qblocks = [], bp = [];
|
693
|
+
var query = {}, qinfo = {};
|
694
|
+
var last_qname = null;
|
695
|
+
file = new File(fn);
|
696
|
+
while (file.readline(buf) >= 0) {
|
697
|
+
var m, line = buf.toString();
|
698
|
+
var t = line.split("\t");
|
699
|
+
t[1] = parseInt(t[1]);
|
700
|
+
if (t[1] < min_query_len) continue;
|
701
|
+
if (t.length < 2) continue;
|
702
|
+
query[t[0]] = t[1];
|
703
|
+
if (qinfo[t[0]] == null) qinfo[t[0]] = {};
|
704
|
+
qinfo[t[0]].len = t[1];
|
705
|
+
qinfo[t[0]].bp = [];
|
706
|
+
if (t.length < 9 || t[5] == "*") continue;
|
707
|
+
if (!/\ttp:A:[PI]/.test(line)) continue;
|
708
|
+
var cigar = (m = /\tcg:Z:(\S+)/.exec(line)) != null? m[1] : null;
|
709
|
+
var NM = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : null;
|
710
|
+
var diff = cigar != null && NM != null? compute_diff(cigar, NM) : 0;
|
711
|
+
t[2] = parseInt(t[2]);
|
712
|
+
t[3] = parseInt(t[3]);
|
713
|
+
t[7] = parseInt(t[7]);
|
714
|
+
t[8] = parseInt(t[8]);
|
715
|
+
if (t[0] == last_qname) ++n_breaks;
|
716
|
+
if (diff > max_diff) continue;
|
717
|
+
if (t[3] - t[2] < min_seg_len) continue;
|
718
|
+
if (t[0] != last_qname) {
|
719
|
+
if (last_qname != null)
|
720
|
+
qcov += process_query(qblocks, qblock_len, bp, qinfo[last_qname]);
|
721
|
+
qblocks = [];
|
722
|
+
last_qname = t[0];
|
723
|
+
}
|
724
|
+
ref_blocks.push([t[5], t[7], t[8]]);
|
725
|
+
qblocks.push([t[2], t[3], t[4], t[5], t[7], t[8]]);
|
726
|
+
}
|
727
|
+
if (last_qname != null)
|
728
|
+
qcov += process_query(qblocks, qblock_len, bp, qinfo[last_qname]);
|
729
|
+
file.close();
|
730
|
+
|
731
|
+
// compute NG50
|
732
|
+
var asm_len = 0, asm_lens = []
|
733
|
+
for (var ctg in query) {
|
734
|
+
asm_len += query[ctg];
|
735
|
+
asm_lens.push(query[ctg]);
|
736
|
+
}
|
737
|
+
rst[0][i] = asm_len;
|
738
|
+
rst[5][i] = N50(asm_lens, ref_len, 0.75);
|
739
|
+
rst[6][i] = N50(asm_lens, ref_len, 0.5);
|
740
|
+
|
741
|
+
// compute coverage
|
742
|
+
var l_cov = 0;
|
743
|
+
ref_blocks.sort(function(a, b) { return a[0] > b[0]? 1 : a[0] < b[0]? -1 : a[1] - b[1]; });
|
744
|
+
var last_ref = null, st = -1, en = -1;
|
745
|
+
for (var j = 0; j < ref_blocks.length; ++j) {
|
746
|
+
if (ref_blocks[j][0] != last_ref || ref_blocks[j][1] > en) {
|
747
|
+
l_cov += en - st;
|
748
|
+
last_ref = ref_blocks[j][0];
|
749
|
+
st = ref_blocks[j][1];
|
750
|
+
en = ref_blocks[j][2];
|
751
|
+
} else en = en > ref_blocks[j][2]? en : ref_blocks[j][2];
|
752
|
+
}
|
753
|
+
l_cov += en - st;
|
754
|
+
rst[1][i] = l_cov;
|
755
|
+
rst[2][i] = (100.0 * (l_cov / ref_len)).toFixed(2) + '%';
|
756
|
+
rst[4][i] = (100.0 * (qcov / asm_len)).toFixed(2) + '%';
|
757
|
+
|
758
|
+
// compute cov1 and cov2+ lengths; see paf_call() for details
|
759
|
+
var c1_ctg = null, c1_start = 0, c1_end = 0, c1_len = 0;
|
760
|
+
for (var j = 0; j < ref_blocks.length; ++j) {
|
761
|
+
if (ref_blocks[j][0] != c1_ctg || ref_blocks[j][1] >= c1_end) {
|
762
|
+
if (c1_end > c1_start)
|
763
|
+
c1_len += c1_end - c1_start;
|
764
|
+
c1_ctg = ref_blocks[j][0], c1_start = ref_blocks[j][1], c1_end = ref_blocks[j][2];
|
765
|
+
} else if (ref_blocks[j][2] > c1_end) { // overlap
|
766
|
+
if (ref_blocks[j][1] > c1_start)
|
767
|
+
c1_len += ref_blocks[j][1] - c1_start;
|
768
|
+
c1_start = c1_end, c1_end = ref_blocks[j][2];
|
769
|
+
} else if (ref_blocks[j][2] > c1_start) { // contained
|
770
|
+
if (ref_blocks[j][1] > c1_start)
|
771
|
+
c1_len += ref_blocks[j][1] - c1_start;
|
772
|
+
c1_start = ref_blocks[j][2];
|
773
|
+
}
|
774
|
+
//print(ref_blocks[j][0], ref_blocks[j][1], ref_blocks[j][2], c1_start, c1_end, c1_len);
|
775
|
+
}
|
776
|
+
if (c1_end > c1_start)
|
777
|
+
c1_len += c1_end - c1_start;
|
778
|
+
rst[3][i] = (100 * (l_cov - c1_len) / l_cov).toFixed(2) + '%';
|
779
|
+
|
780
|
+
// compute NGA50
|
781
|
+
rst[7][i] = N50(qblock_len, ref_len, 0.5);
|
782
|
+
|
783
|
+
// compute AUNGA
|
784
|
+
rst[8][i] = AUN(qblock_len, ref_len);
|
785
|
+
|
786
|
+
// compute break points
|
787
|
+
rst[9][i] = n_breaks;
|
788
|
+
rst[10][i] = count_bp(bp, 500, 0);
|
789
|
+
rst[11][i] = count_bp(bp, 500, 10000);
|
790
|
+
|
791
|
+
// nb-plot; NOT USED
|
792
|
+
/*
|
793
|
+
var qa = [];
|
794
|
+
for (var qn in qinfo)
|
795
|
+
qa.push([qinfo[qn].len, qinfo[qn].bp]);
|
796
|
+
qa = qa.sort(function(a, b) { return b[0] - a[0] });
|
797
|
+
var sum = 0, n_bp = 0, next_quantile = 0.1;
|
798
|
+
for (var j = 0; j < qa.length; ++j) {
|
799
|
+
sum += qa[j][0];
|
800
|
+
for (var k = 0; k < qa[j][1].length; ++k)
|
801
|
+
if (qa[j][1][k][0] >= bp_flank_len && qa[j][1][k][1] >= bp_gap_len)
|
802
|
+
++n_bp;
|
803
|
+
if (sum >= ref_len * next_quantile) {
|
804
|
+
print(label, Math.floor(next_quantile * 100 + .5), qa[j][0], (sum / n_bp).toFixed(0), n_bp);
|
805
|
+
next_quantile += 0.1;
|
806
|
+
if (next_quantile >= 1.0) break;
|
807
|
+
}
|
808
|
+
}
|
809
|
+
*/
|
810
|
+
}
|
811
|
+
buf.destroy();
|
812
|
+
|
813
|
+
if (bp_flank_len <= 0) {
|
814
|
+
print(header.join("\t"));
|
815
|
+
for (var i = 0; i < labels.length; ++i)
|
816
|
+
print(labels[i], rst[i].join("\t"));
|
817
|
+
}
|
818
|
+
}
|
819
|
+
|
820
|
+
/**
 * asmgene: classify gene alignments across several PAF files.
 *
 * The first PAF file is the reference set; the remaining files are evaluated
 * against it. For every query (gene) name the per-file result is a triple
 * [n_full, covered_fraction, n_alignments] produced by process_query().
 * Genes with exactly one full-length hit in the first file are tallied into
 * full_sgl/full_dup/frag/part50+/part10+/part10- per file; genes with more
 * than one full-length hit in the first file feed dup_cnt/dup_sum.
 *
 * Options: -i min identity, -c min coverage, -e print per-gene problem lines,
 * -a skip genes whose first-file target name is X/Y (optionally "chr"-prefixed).
 *
 * Fixes relative to the previous version:
 *  - the covered-length merge now restarts the running interval after a
 *    disjoint block (previously the first interval was re-added and later
 *    blocks were ignored);
 *  - reference coordinates/lengths are stored as numbers, so overlap and
 *    "longest gene" comparisons are numeric instead of lexicographic;
 *  - the ".paf"/".paf.gz" suffix regex escapes its dots;
 *  - an empty first file no longer crashes the gene selection step.
 *
 * @param {string[]} args  command-line arguments, parsed with getopt()
 */
function paf_asmgene(args)
{
	var c, opt = { min_cov:0.99, min_iden:0.99 }, print_err = false, auto_only = false;
	while ((c = getopt(args, "i:c:ea")) != null) {
		if (c == 'i') opt.min_iden = parseFloat(getopt.arg);
		else if (c == 'c') opt.min_cov = parseFloat(getopt.arg);
		else if (c == 'e') print_err = true;
		else if (c == 'a') auto_only = true;
	}

	var n_fn = args.length - getopt.ind;
	if (n_fn < 2) {
		print("Usage: paftools.js asmgene [options] <ref-splice.paf> <asm-splice.paf> [...]");
		print("Options:");
		print(" -i FLOAT min identity [" + opt.min_iden + "]");
		print(" -c FLOAT min coverage [" + opt.min_cov + "]");
		print(" -a only evaluate genes mapped to the autosomes");
		print(" -e print fragmented/missing genes");
		exit(1);
	}

	// Summarise all alignments of one query.
	// a[j] = [name, qlen, qstart, qend, mlen, blen]
	// Returns [number of full-length hits, fraction of the query covered, number of hits kept].
	function process_query(opt, a) {
		var b = [], cnt = [0, 0, 0];
		for (var j = 0; j < a.length; ++j) {
			if (a[j][4] < a[j][5] * opt.min_iden) continue; // identity (mlen/blen) too low
			b.push(a[j].slice(0));
		}
		if (b.length == 0) return cnt;
		// count alignments spanning at least min_cov of the query
		var n_full = 0;
		for (var j = 0; j < b.length; ++j)
			if (b[j][3] - b[j][2] >= b[j][1] * opt.min_cov)
				++n_full;
		cnt[0] = n_full;
		// length of the union of the query intervals
		b.sort(function(x, y) { return x[2] - y[2] });
		var l_cov = 0, st = b[0][2], en = b[0][3];
		for (var j = 1; j < b.length; ++j) {
			if (b[j][2] <= en) {
				en = b[j][3] > en? b[j][3] : en;
			} else {
				// disjoint block: bank the finished interval and start a new one
				l_cov += en - st;
				st = b[j][2], en = b[j][3];
			}
		}
		l_cov += en - st;
		cnt[1] = l_cov / b[0][1];
		cnt[2] = b.length;
		return cnt;
	}

	var buf = new Bytes();
	var gene = {}, header = [], refpos = {};
	for (var i = getopt.ind; i < args.length; ++i) {
		var fn = args[i];
		var label = fn.replace(/\.paf(\.gz)?$/, "");
		header.push(label);
		var file = new File(fn), a = [];
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			var ql = parseInt(t[1], 10), qs = parseInt(t[2], 10), qe = parseInt(t[3], 10);
			var mlen = parseInt(t[9], 10), blen = parseInt(t[10], 10);
			// positions on the reference are only recorded from the first file;
			// kept numeric so the later comparisons are not string comparisons
			if (i == getopt.ind) refpos[t[0]] = [t[0], ql, t[5], parseInt(t[7], 10), parseInt(t[8], 10)];
			if (gene[t[0]] == null) gene[t[0]] = [];
			if (a.length && t[0] != a[0][0]) { // query name changed: flush the previous query
				gene[a[0][0]][i - getopt.ind] = process_query(opt, a);
				a = [];
			}
			a.push([t[0], ql, qs, qe, mlen, blen]);
		}
		if (a.length) // flush the last query of the file
			gene[a[0][0]][i - getopt.ind] = process_query(opt, a);
		file.close();
	}

	// select the longest genes (not optimal, but should be good enough)
	var gene_list = [], gene_nr = {};
	for (var g in refpos)
		gene_list.push(refpos[g]);
	gene_list.sort(function(a, b) { return a[2] < b[2]? -1 : a[2] > b[2]? 1 : a[3] - b[3] });
	if (gene_list.length > 0) {
		var last = 0;
		for (var j = 1; j < gene_list.length; ++j) {
			if (gene_list[j][2] != gene_list[last][2] || gene_list[j][3] >= gene_list[last][4]) {
				// different target or no overlap with the current pick: keep the pick
				gene_nr[gene_list[last][0]] = 1;
				last = j;
			} else if (gene_list[j][1] > gene_list[last][1]) {
				last = j; // overlapping and longer: replaces the current pick
			}
		}
		gene_nr[gene_list[last][0]] = 1;
	}

	// count and print
	var col1 = ["full_sgl", "full_dup", "frag", "part50+", "part10+", "part10-", "dup_cnt", "dup_sum"];
	var rst = [];
	for (var k = 0; k < col1.length; ++k) {
		rst[k] = [];
		for (var i = 0; i < n_fn; ++i)
			rst[k][i] = 0;
	}
	for (var g in gene) { // count single-copy genes
		if (gene[g][0] == null || gene[g][0][0] != 1) continue;
		if (gene_nr[g] == null) continue;
		if (auto_only && /^(chr)?[XY]$/.test(refpos[g][2])) continue;
		for (var i = 0; i < n_fn; ++i) {
			var r = gene[g][i], row, code;
			if (r == null) row = 5, code = 'M';                     // no record in this file
			else if (r[0] == 1) row = 0, code = null;               // one full-length hit
			else if (r[0] > 1) row = 1, code = 'D';                 // several full-length hits
			else if (r[1] >= opt.min_cov) row = 2, code = 'F';      // covered, but only in pieces
			else if (r[1] >= 0.5) row = 3, code = '5';
			else if (r[1] >= 0.1) row = 4, code = '1';
			else row = 5, code = '0';
			rst[row][i]++;
			if (print_err && code != null) print(code, header[i], refpos[g].join("\t"));
		}
	}
	for (var g in gene) { // count multi-copy genes
		if (gene[g][0] == null || gene[g][0][0] <= 1) continue;
		if (gene_nr[g] == null) continue;
		if (auto_only && /^(chr)?[XY]$/.test(refpos[g][2])) continue;
		for (var i = 0; i < n_fn; ++i) {
			if (gene[g][i] != null) rst[7][i] += gene[g][i][0];
			if (gene[g][i] != null && gene[g][i][0] > 1) {
				rst[6][i]++;
			} else if (print_err) {
				print('d', header[i], gene[g][0][0], refpos[g].join("\t"));
			}
		}
	}
	print('H', 'Metric', header.join("\t"));
	for (var k = 0; k < rst.length; ++k) {
		print('X', col1[k], rst[k].join("\t"));
	}
	buf.destroy();
}
|
962
|
+
|
963
|
+
/**
 * stat: summarise alignments from a PAF or SAM stream.
 *
 * A line is treated as PAF when its 5th field is '+', '-' or '*'; otherwise it
 * is parsed as SAM. Lines starting with '@' are skipped. For each kept
 * alignment the CIGAR (PAF "cg:Z:" tag or SAM column 6) is walked to tally
 * substitutions and insertions/deletions by length bin, and to track the query
 * interval so the covered query length can be computed per query name.
 *
 * Options:
 *   -l INT  print every I/D of at least INT bases instead of the summary
 *   -c      print one error-count line per alignment instead of the summary
 *
 * Changes relative to the previous version: a SAM record with fewer than 10
 * fields is reported and skipped instead of raising a TypeError on the missing
 * SEQ column; parseInt() always gets an explicit radix; the shared global
 * regex is reset before each CIGAR scan.
 *
 * @param {string[]} args  command-line arguments, parsed with getopt()
 */
function paf_stat(args)
{
	var c, gap_out_len = null, count_err = false;
	while ((c = getopt(args, "cl:")) != null) {
		if (c == 'l') gap_out_len = parseInt(getopt.arg, 10);
		else if (c == 'c') count_err = true;
	}

	if (getopt.ind == args.length) {
		print("Usage: paftools.js stat [-c] [-l gapOutLen] <in.sam>|<in.paf>");
		exit(1);
	}

	var buf = new Bytes();
	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var re = /(\d+)([MIDSHNX=])/g;

	var lineno = 0, n_pri = 0, n_2nd = 0, n_seq = 0, n_cigar_64k = 0, l_tot = 0, l_cov = 0;
	var n_gap = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], n_sub = 0; // [insertions, deletions] x length bin

	// Length of the union of [start, end) intervals; regs must be non-empty (it is sorted in place).
	function cov_len(regs)
	{
		regs.sort(function(a,b) {return a[0]-b[0]});
		var st = regs[0][0], en = regs[0][1], l = 0;
		for (var i = 1; i < regs.length; ++i) {
			if (regs[i][0] < en) {
				en = en > regs[i][1]? en : regs[i][1];
			} else {
				l += en - st;
				st = regs[i][0], en = regs[i][1];
			}
		}
		l += en - st;
		return l;
	}

	// Index of the length bin used by the summary: [0,50) [50,100) [100,300) [300,400) [400,1000) [1000,inf)
	function gap_bin(l)
	{
		if (l < 50) return 0;
		if (l < 100) return 1;
		if (l < 300) return 2;
		if (l < 400) return 3;
		if (l < 1000) return 4;
		return 5;
	}

	var last = null, last_qlen = null, regs = [];
	while (file.readline(buf) >= 0) {
		var line = buf.toString();
		++lineno;
		if (line.charAt(0) == '@') continue; // SAM header
		var t = line.split("\t", 12);
		var m, rs, cigar = null, is_pri = false, is_sam = false, is_rev = false, tname = null;
		var atlen = null, aqlen, qs, qe, mapq, ori_qlen, NM = null, nn = 0;
		if (t.length < 2) continue;
		if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
			if (t[4] == '*') continue; // unmapped
			if (!/\ts2:i:\d+/.test(line)) { // lines without an s2 tag are counted as secondary
				++n_2nd;
				continue;
			}
			if ((m = /\tNM:i:(\d+)/.exec(line)) != null)
				NM = parseInt(m[1], 10);
			if ((m = /\tnn:i:(\d+)/.exec(line)) != null)
				nn = parseInt(m[1], 10);
			if ((m = /\tcg:Z:(\S+)/.exec(line)) != null)
				cigar = m[1];
			if (cigar == null) {
				warn("WARNING: no CIGAR at line " + lineno);
				continue;
			}
			tname = t[5];
			qs = parseInt(t[2], 10), qe = parseInt(t[3], 10);
			aqlen = qe - qs;
			is_rev = t[4] == '+'? false : true;
			rs = parseInt(t[7], 10);
			atlen = parseInt(t[8], 10) - rs;
			mapq = parseInt(t[11], 10);
			ori_qlen = parseInt(t[1], 10);
		} else { // SAM
			var flag = parseInt(t[1], 10);
			if ((flag & 4) || t[2] == '*' || t[5] == '*') continue;
			if (flag & 0x100) {
				++n_2nd;
				continue;
			}
			if (t.length < 10) { // CIGAR/SEQ columns are missing; t[9].length below would throw
				warn("WARNING: truncated SAM record at line " + lineno);
				continue;
			}
			if ((m = /\tNM:i:(\d+)/.exec(line)) != null)
				NM = parseInt(m[1], 10);
			if ((m = /\tnn:i:(\d+)/.exec(line)) != null)
				nn = parseInt(m[1], 10);
			cigar = t[5];
			tname = t[2];
			rs = parseInt(t[3], 10) - 1; // SAM POS is 1-based
			mapq = parseInt(t[4], 10);
			aqlen = t[9].length;
			is_sam = true;
			is_rev = !!(flag&0x10);
		}
		++n_pri;
		if (last != t[0]) { // query name changed: bank the totals of the previous query
			if (last != null) {
				l_tot += last_qlen;
				l_cov += cov_len(regs);
			}
			regs = [];
			++n_seq, last = t[0];
		}
		var M = 0, tl = 0, ql = 0, clip = [0, 0], n_cigar = 0, sclip = 0;
		var n_gapo = 0, n_gap_all = 0, l_match = 0;
		re.lastIndex = 0; // the /g regex is shared across lines; always scan from the start
		while ((m = re.exec(cigar)) != null) {
			var l = parseInt(m[1], 10);
			++n_cigar;
			if (m[2] == 'M' || m[2] == '=' || m[2] == 'X') {
				tl += l, ql += l, M += l;
				l_match += l;
			} else if (m[2] == 'I' || m[2] == 'D') {
				var type = gap_bin(l);
				if (m[2] == 'I') ql += l, ++n_gap[0][type];
				else tl += l, ++n_gap[1][type];
				if (gap_out_len != null && l >= gap_out_len)
					print(t[0], ql, is_rev? '-' : '+', tname, rs + tl, m[2], l);
				++n_gapo, n_gap_all += l;
			} else if (m[2] == 'N') {
				tl += l;
			} else if (m[2] == 'S') {
				clip[M == 0? 0 : 1] = l, sclip += l; // before any match -> leading clip, else trailing
			} else if (m[2] == 'H') {
				clip[M == 0? 0 : 1] = l;
			}
		}
		if (NM != null) {
			var tmp = NM - n_gap_all - nn;
			if (tmp < 0 && nn == 0) warn("WARNING: NM is smaller than the number of gaps at line " + lineno + ": NM=" + NM + ", nn=" + nn + ", G=" + n_gap_all);
			if (tmp < 0) tmp = 0;
			n_sub += tmp;
		}
		if (n_cigar > 65535) ++n_cigar_64k;
		if (ql + sclip != aqlen)
			warn("WARNING: aligned query length is inconsistent with CIGAR at line " + lineno + " (" + (ql+sclip) + " != " + aqlen + ")");
		if (atlen != null && atlen != tl)
			warn("WARNING: aligned reference length is inconsistent with CIGAR at line " + lineno);
		if (is_sam) { // SAM has no query coordinates; derive them from the clips
			qs = clip[is_rev? 1 : 0], qe = qs + ql;
			ori_qlen = clip[0] + ql + clip[1];
		}
		if (count_err && NM != null) {
			var n_mm = NM - n_gap_all;
			if (n_mm < 0) warn("WARNING: NM is smaller than the number of gaps at line " + lineno);
			if (n_mm < 0) n_mm = 0;
			print(t[0], ori_qlen, t[11], ori_qlen - (qe - qs), NM, l_match + n_gap_all, n_mm + n_gapo, l_match + n_gapo);
		}
		regs.push([qs, qe]);
		last_qlen = ori_qlen;
	}
	if (regs.length) { // totals of the last query
		l_tot += last_qlen;
		l_cov += cov_len(regs);
	}

	file.close();
	buf.destroy();

	if (gap_out_len == null && !count_err) {
		print("Number of mapped sequences: " + n_seq);
		print("Number of primary alignments: " + n_pri);
		print("Number of secondary alignments: " + n_2nd);
		print("Number of primary alignments with >65535 CIGAR operations: " + n_cigar_64k);
		print("Number of bases in mapped sequences: " + l_tot);
		print("Number of mapped bases: " + l_cov);
		print("Number of substitutions: " + n_sub);
		print("Number of insertions in [0,50): " + n_gap[0][0]);
		print("Number of insertions in [50,100): " + n_gap[0][1]);
		print("Number of insertions in [100,300): " + n_gap[0][2]);
		print("Number of insertions in [300,400): " + n_gap[0][3]);
		print("Number of insertions in [400,1000): " + n_gap[0][4]);
		print("Number of insertions in [1000,inf): " + n_gap[0][5]);
		print("Number of deletions in [0,50): " + n_gap[1][0]);
		print("Number of deletions in [50,100): " + n_gap[1][1]);
		print("Number of deletions in [100,300): " + n_gap[1][2]);
		print("Number of deletions in [300,400): " + n_gap[1][3]);
		print("Number of deletions in [400,1000): " + n_gap[1][4]);
		print("Number of deletions in [1000,inf): " + n_gap[1][5]);
	}
}
|
1140
|
+
|
1141
|
+
/**
 * bedcov: measure how many bases of the features in the second BED file
 * overlap the intervals of the first BED file.
 *
 * Both files may be 3-column BED or BED12; a BED12 line is expanded into its
 * blocks. With -e FILE, blocks of the second file that overlap any interval
 * of FILE are excluded from both totals. With -p, one 'F' line is printed per
 * feature of the second file (first four columns, counted length, covered
 * length). The two totals are written with warn().
 *
 * Changes relative to the previous version: lines that are not BED records
 * (blank lines, headers, fewer than three fields, non-numeric coordinates) are
 * skipped instead of injecting NaN into every total; the percentage is
 * reported as 0.00% when no target bases were counted instead of NaN%; BED12
 * parsing lives in a single helper.
 *
 * @param {string[]} args  command-line arguments, parsed with getopt()
 */
function paf_bedcov(args)
{
	// Turn the tab-split fields of one BED line into a list of [start, end]
	// blocks. Returns null when the line is not a usable BED record.
	function bed_blocks(t)
	{
		if (t.length < 3) return null;
		var bst = parseInt(t[1], 10);
		var ben = parseInt(t[2], 10);
		if (isNaN(bst) || isNaN(ben)) return null;
		var blocks = [];
		if (t.length >= 12 && /^\d+$/.test(t[9])) { // BED12
			var n = parseInt(t[9], 10);
			var sz = t[10].split(",");
			var st = t[11].split(",");
			for (var i = 0; i < n; ++i) {
				var off = parseInt(st[i], 10); // block start is relative to the feature start
				blocks.push([bst + off, bst + off + parseInt(sz[i], 10)]);
			}
		} else blocks.push([bst, ben]); // 3-column BED
		return blocks;
	}

	// Load a BED file into {chr: [[start, end, 0, 0, 0], ...]} and prepare each
	// list for Interval.find_ovlp(): merged, de-duplicated or just sorted,
	// depending on the flags, then indexed with Interval.index_end().
	function read_bed(fn, to_merge, to_dedup)
	{
		var file = new File(fn);
		var buf = new Bytes();
		var h = {};
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			var blocks = bed_blocks(t);
			if (blocks == null) continue;
			if (h[t[0]] == null)
				h[t[0]] = [];
			for (var i = 0; i < blocks.length; ++i)
				h[t[0]].push([blocks[i][0], blocks[i][1], 0, 0, 0]);
		}
		buf.destroy();
		file.close();
		for (var chr in h) {
			if (to_merge) Interval.merge(h[chr], false);
			else if (to_dedup) Interval.dedup(h[chr], false);
			else Interval.sort(h[chr]);
			Interval.index_end(h[chr]);
		}
		return h;
	}

	var c, print_len = false, to_merge = true, to_dedup = false, fn_excl = null;
	while ((c = getopt(args, "pde:")) != null) {
		if (c == 'p') print_len = true;
		else if (c == 'd') to_dedup = true, to_merge = false;
		else if (c == 'e') fn_excl = getopt.arg;
	}

	if (args.length - getopt.ind < 2) {
		print("Usage: paftools.js bedcov [options] <regions.bed> <target.bed>");
		print("Options:");
		print(" -e FILE exclude target regions (2nd file) overlapping BED FILE []");
		print(" -p print number of covered bases for each target");
		exit(1);
	}

	var excl = fn_excl != null? read_bed(fn_excl, true, false) : null;
	var target = read_bed(args[getopt.ind], to_merge, to_dedup);

	var file, buf = new Bytes();
	var tot_len = 0, hit_len = 0;
	file = args[getopt.ind+1] != '-'? new File(args[getopt.ind+1]) : new File();
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		var blocks = bed_blocks(t);
		if (blocks == null) continue; // not a BED record
		var a = [];
		for (var i = 0; i < blocks.length; ++i)
			a.push([blocks[i][0], blocks[i][1], false]); // third slot: block survived the exclusion filter
		var feat_len = 0;
		for (var i = 0; i < a.length; ++i) {
			if (excl != null && excl[t[0]] != null) {
				var oe = Interval.find_ovlp(excl[t[0]], a[i][0], a[i][1]);
				if (oe.length > 0)
					continue;
			}
			a[i][2] = true;
			feat_len += a[i][1] - a[i][0];
		}
		tot_len += feat_len;
		if (target[t[0]] == null) continue;
		var b = [];
		for (var i = 0; i < a.length; ++i) {
			if (!a[i][2]) continue;
			var o = Interval.find_ovlp(target[t[0]], a[i][0], a[i][1]);
			for (var j = 0; j < o.length; ++j) {
				var max_st = o[j][0] > a[i][0]? o[j][0] : a[i][0];
				var min_en = o[j][1] < a[i][1]? o[j][1] : a[i][1];
				b.push([max_st, min_en]);
				// per-interval bookkeeping: overlapped bases, overlap count, fully-contained count
				o[j][2] += min_en - max_st;
				++o[j][3];
				if (max_st == o[j][0] && min_en == o[j][1])
					++o[j][4];
			}
		}
		// find the length covered
		var feat_hit_len = 0;
		if (b.length > 0) {
			b.sort(function(a,b) {return a[0]-b[0]});
			var st = b[0][0], en = b[0][1];
			for (var i = 1; i < b.length; ++i) {
				if (b[i][0] <= en) en = en > b[i][1]? en : b[i][1];
				else feat_hit_len += en - st, st = b[i][0], en = b[i][1];
			}
			feat_hit_len += en - st;
		}
		hit_len += feat_hit_len;
		if (print_len) print('F', t.slice(0, 4).join("\t"), feat_len, feat_hit_len);
	}
	file.close();

	buf.destroy();

	warn("# target bases: " + tot_len);
	var pct = tot_len > 0? 100.0 * hit_len / tot_len : 0; // avoid NaN% when nothing was counted
	warn("# target bases overlapping regions: " + hit_len + ' (' + pct.toFixed(2) + '%)');
}
|
1261
|
+
|
1262
|
+
/**
 * vcfpair: collapse a two-sample VCF (one column per haplotype) into a
 * single-sample phased VCF written with print().
 *
 * Header handling: ##source/##reference lines are dropped; ##contig lines are
 * kept only for numeric contigs and X (plus Y with -m); the #CHROM line loses
 * its last column, the remaining sample column is renamed, and four FILTER
 * definitions (HET1/HET2/GAP1/GAP2) are emitted in front of it.
 *
 * Record handling: each of the two sample columns must look like
 * "a/b:n,n,..."; the values after the colon are summed element-wise across
 * both columns. A column whose first allele is '.' adds GAPn to FILTER, a
 * column with two different alleles adds HETn; either way that haplotype
 * becomes '.'. With -m, a single GAP1 on chrX outside the PARs or a single
 * GAP2 on chrY is treated as expected and cleared.
 *
 * Changes relative to the previous version: usage is printed before option
 * validation; the -m/-g validation throws an Error object instead of a bare
 * string; the ##contig ID no longer swallows a trailing '>' when ID is the
 * last attribute; element-wise summing tolerates a second column that has
 * more values than the first.
 *
 * @param {string[]} args  command-line arguments, parsed with getopt()
 * @throws {Error} on a malformed sample column, or when -m is given without a known -g
 */
function paf_vcfpair(args)
{
	var c, is_male = false, sample = 'syndip', hgver = null;
	var PAR = { '37':[[0, 2699520], [154931043, 155260560]] }; // chrX intervals treated as PARs, keyed by genome version
	while ((c = getopt(args, "ms:g:")) != null) {
		if (c == 'm') is_male = true;
		else if (c == 's') sample = getopt.arg;
		else if (c == 'g') hgver = getopt.arg;
	}

	if (getopt.ind == args.length) {
		print("Usage: paftools.js vcfpair [options] <in.pair.vcf>");
		print("Options:");
		print(" -m the sample is male");
		print(" -g STR human genome version '37' []");
		print(" -s STR sample name [" + sample + "]");
		exit(1);
	}
	if (is_male && (hgver == null || PAR[hgver] == null))
		throw Error("for a male, -g must be specified to properly handle PARs on chrX");

	var re_ctg = is_male? /^(chr)?([0-9]+|X|Y)$/ : /^(chr)?([0-9]+|X)$/;
	var label = ['1', '2'];
	var buf = new Bytes();
	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var m, line = buf.toString();
		if (line.charAt(0) == '#') {
			if (/^##(source|reference)=/.test(line)) continue;
			if ((m = /^##contig=.*ID=([^\s,>]+)/.exec(line)) != null) {
				if (!re_ctg.test(m[1])) continue; // contig not reported in the output
			} else if (/^#CHROM/.test(line)) {
				var t = line.split("\t");
				--t.length; // two sample columns become one
				t[t.length-1] = sample;
				line = t.join("\t");
				print('##FILTER=<ID=HET1,Description="Heterozygous in the first haplotype">');
				print('##FILTER=<ID=HET2,Description="Heterozygous in the second haplotype">');
				print('##FILTER=<ID=GAP1,Description="Uncalled in the first haplotype">');
				print('##FILTER=<ID=GAP2,Description="Uncalled in the second haplotype">');
			}
			print(line);
			continue;
		}
		var t = line.split("\t");
		if (!re_ctg.test(t[0])) continue;
		var AD = [], FILTER = [], HT = [null, null];
		for (var i = 0; i < 2; ++i) {
			if ((m = /^(\.|[0-9]+)\/(\.|[0-9]+):(\S+)/.exec(t[9+i])) == null) {
				warn(line);
				throw Error("malformatted VCF");
			}
			var s = m[3].split(",");
			for (var j = 0; j < s.length; ++j)
				AD[j] = (AD[j] == null? 0 : AD[j]) + parseInt(s[j], 10);
			if (m[1] == '.') {
				FILTER.push('GAP' + label[i]);
				HT[i] = '.';
			} else if (m[1] != m[2]) {
				FILTER.push('HET' + label[i]);
				HT[i] = '.';
			} else HT[i] = m[1];
		}
		--t.length; // drop the second sample column
		// test if this is in a haploid region
		var hap = 0, st = parseInt(t[1], 10), en = st + t[3].length;
		if (is_male) {
			if (/^(chr)?X/.test(t[0])) {
				if (hgver != null && PAR[hgver] != null) {
					var r = PAR[hgver], in_par = false;
					for (var i = 0; i < r.length; ++i)
						if (r[i][0] <= st && en <= r[i][1])
							in_par = true;
					hap = in_par? 0 : 2;
				}
			} else if (/^(chr)?Y/.test(t[0])) {
				hap = 1;
			}
		}
		// special treatment for haploid regions
		if (hap > 0 && FILTER.length == 1) {
			if ((hap == 2 && FILTER[0] == "GAP1") || (hap == 1 && FILTER[0] == "GAP2"))
				FILTER.length = 0;
		}
		// update VCF
		t[5] = 30; // fake QUAL
		t[6] = FILTER.length? FILTER.join(";") : ".";
		t[9] = HT.join("|") + ":" + AD.join(",");
		print(t.join("\t"));
	}
	file.close();
	buf.destroy();
}
|
1360
|
+
|
1361
|
+
/**********************
|
1362
|
+
* Conversion related *
|
1363
|
+
**********************/
|
1364
|
+
|
1365
|
+
function paf_view(args)
|
1366
|
+
{
|
1367
|
+
var c, line_len = 80, fmt = "aln";
|
1368
|
+
while ((c = getopt(args, "f:l:")) != null) {
|
1369
|
+
if (c == 'f') {
|
1370
|
+
fmt = getopt.arg;
|
1371
|
+
if (fmt != "aln" && fmt != "lastz-cigar" && fmt != "maf")
|
1372
|
+
throw Error("format must be one of aln, lastz-cigar and maf");
|
1373
|
+
} else if (c == 'l') line_len = parseInt(getopt.arg);
|
1374
|
+
}
|
1375
|
+
if (line_len == 0) line_len = 0x7fffffff;
|
1376
|
+
|
1377
|
+
if (getopt.ind == args.length) {
|
1378
|
+
print("Usage: paftools.js view [options] <in.paf>");
|
1379
|
+
print("Options:");
|
1380
|
+
print(" -f STR output format: aln (BLAST-like), maf or lastz-cigar [aln]");
|
1381
|
+
print(" -l INT line length in BLAST-like output [80]");
|
1382
|
+
exit(1);
|
1383
|
+
}
|
1384
|
+
|
1385
|
+
function padding_str(x, len, right)
|
1386
|
+
{
|
1387
|
+
var s = x.toString();
|
1388
|
+
if (s.length < len) {
|
1389
|
+
if (right) s += Array(len - s.length + 1).join(" ");
|
1390
|
+
else s = Array(len - s.length + 1).join(" ") + s;
|
1391
|
+
}
|
1392
|
+
return s;
|
1393
|
+
}
|
1394
|
+
|
1395
|
+
function update_aln(s_ref, s_qry, s_mid, type, seq, slen)
|
1396
|
+
{
|
1397
|
+
var l = type == '*'? 1 : seq.length;
|
1398
|
+
if (type == '=' || type == ':') {
|
1399
|
+
s_ref.set(seq);
|
1400
|
+
s_qry.set(seq);
|
1401
|
+
s_mid.set(Array(l+1).join("|"));
|
1402
|
+
slen[0] += l, slen[1] += l;
|
1403
|
+
} else if (type == '*') {
|
1404
|
+
s_ref.set(seq.charAt(0));
|
1405
|
+
s_qry.set(seq.charAt(1));
|
1406
|
+
s_mid.set(' ');
|
1407
|
+
slen[0] += 1, slen[1] += 1;
|
1408
|
+
} else if (type == '+') {
|
1409
|
+
s_ref.set(Array(l+1).join("-"));
|
1410
|
+
s_qry.set(seq);
|
1411
|
+
s_mid.set(Array(l+1).join(" "));
|
1412
|
+
slen[1] += l;
|
1413
|
+
} else if (type == '-') {
|
1414
|
+
s_ref.set(seq);
|
1415
|
+
s_qry.set(Array(l+1).join("-"));
|
1416
|
+
s_mid.set(Array(l+1).join(" "));
|
1417
|
+
slen[0] += l;
|
1418
|
+
}
|
1419
|
+
}
|
1420
|
+
|
1421
|
+
function print_aln(rs, qs, strand, slen, elen, s_ref, s_qry, s_mid)
|
1422
|
+
{
|
1423
|
+
print(["Ref+:", padding_str(rs + slen[0] + 1, 10, false), s_ref.toString(), padding_str(rs + elen[0], 10, true)].join(" "));
|
1424
|
+
print(" " + s_mid.toString());
|
1425
|
+
var st, en;
|
1426
|
+
if (strand == '+') st = qs + slen[1] + 1, en = qs + elen[1];
|
1427
|
+
else st = qs - slen[1], en = qs - elen[1] + 1;
|
1428
|
+
print(["Qry" + strand + ":", padding_str(st, 10, false), s_qry.toString(), padding_str(en, 10, true)].join(" "));
|
1429
|
+
}
|
1430
|
+
|
1431
|
+
var s_ref = new Bytes(), s_qry = new Bytes(), s_mid = new Bytes(); // these are used to show padded alignment
|
1432
|
+
var re_cs = /([:=\-\+\*])(\d+|[A-Za-z]+)/g;
|
1433
|
+
var re_cg = /(\d+)([MIDNSHP=X])/g;
|
1434
|
+
|
1435
|
+
var buf = new Bytes();
|
1436
|
+
var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
|
1437
|
+
var lineno = 0;
|
1438
|
+
if (fmt == "maf") print("##maf version=1\n");
|
1439
|
+
while (file.readline(buf) >= 0) {
|
1440
|
+
var m, line = buf.toString();
|
1441
|
+
var t = line.split("\t", 12);
|
1442
|
+
++lineno;
|
1443
|
+
s_ref.length = s_qry.length = s_mid.length = 0;
|
1444
|
+
var slen = [0, 0], elen = [0, 0];
|
1445
|
+
if (fmt == "lastz-cigar") { // LASTZ-cigar output
|
1446
|
+
var cg = (m = /\tcg:Z:(\S+)/.exec(line)) != null? m[1] : null;
|
1447
|
+
if (cg == null) {
|
1448
|
+
warn("WARNING: converting to LASTZ-cigar format requires the 'cg' tag, which is absent on line " + lineno);
|
1449
|
+
continue;
|
1450
|
+
}
|
1451
|
+
var score = (m = /\tAS:i:(\d+)/.exec(line)) != null? m[1] : 0;
|
1452
|
+
var out = ['cigar:', t[0], t[2], t[3], t[4], t[5], t[7], t[8], '+', score];
|
1453
|
+
while ((m = re_cg.exec(cg)) != null)
|
1454
|
+
out.push(m[2], m[1]);
|
1455
|
+
print(out.join(" "));
|
1456
|
+
} else if (fmt == "maf") { // MAF output
|
1457
|
+
var cs = (m = /\tcs:Z:(\S+)/.exec(line)) != null? m[1] : null;
|
1458
|
+
if (cs == null) {
|
1459
|
+
warn("WARNING: converting to MAF requires the 'cs' tag, which is absent on line " + lineno);
|
1460
|
+
continue;
|
1461
|
+
}
|
1462
|
+
while ((m = re_cs.exec(cs)) != null) {
|
1463
|
+
if (m[1] == ':')
|
1464
|
+
throw Error("converting to MAF only works with 'minimap2 --cs=long'");
|
1465
|
+
update_aln(s_ref, s_qry, s_mid, m[1], m[2], elen);
|
1466
|
+
}
|
1467
|
+
var score = (m = /\tAS:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0;
|
1468
|
+
var len = t[0].length > t[5].length? t[0].length : t[5].length;
|
1469
|
+
print("a " + score);
|
1470
|
+
print(["s", padding_str(t[5], len, true), padding_str(t[7], 10, false), padding_str(parseInt(t[8]) - parseInt(t[7]), 10, false),
|
1471
|
+
"+", padding_str(t[6], 10, false), s_ref.toString()].join(" "));
|
1472
|
+
var qs, qe, ql = parseInt(t[1]);
|
1473
|
+
if (t[4] == '+') {
|
1474
|
+
qs = parseInt(t[2]);
|
1475
|
+
qe = parseInt(t[3]);
|
1476
|
+
} else {
|
1477
|
+
qs = ql - parseInt(t[3]);
|
1478
|
+
qe = ql - parseInt(t[2]);
|
1479
|
+
}
|
1480
|
+
print(["s", padding_str(t[0], len, true), padding_str(qs, 10, false), padding_str(qe - qs, 10, false),
|
1481
|
+
t[4], padding_str(ql, 10, false), s_qry.toString()].join(" "));
|
1482
|
+
print("");
|
1483
|
+
} else { // BLAST-like output
|
1484
|
+
var cs = (m = /\tcs:Z:(\S+)/.exec(line)) != null? m[1] : null;
|
1485
|
+
if (cs == null) {
|
1486
|
+
warn("WARNING: converting to BLAST-like alignment requires the 'cs' tag, which is absent on line " + lineno);
|
1487
|
+
continue;
|
1488
|
+
}
|
1489
|
+
var n_mm = 0, n_oi = 0, n_od = 0, n_ei = 0, n_ed = 0;
|
1490
|
+
while ((m = re_cs.exec(cs)) != null) {
|
1491
|
+
if (m[1] == '*') ++n_mm;
|
1492
|
+
else if (m[1] == '+') ++n_oi, n_ei += m[2].length;
|
1493
|
+
else if (m[1] == '-') ++n_od, n_ed += m[2].length;
|
1494
|
+
}
|
1495
|
+
line = line.replace(/\tc[sg]:Z:\S+/g, ""); // get rid of cs or cg tags
|
1496
|
+
print('>' + line + "\tmm:i:"+n_mm + "\toi:i:"+n_oi + "\tei:i:"+n_ei + "\tod:i:"+n_od + "\ted:i:"+n_ed);
|
1497
|
+
var rs = parseInt(t[7]), qs = t[4] == '+'? parseInt(t[2]) : parseInt(t[3]);
|
1498
|
+
var n_blocks = 0;
|
1499
|
+
while ((m = re_cs.exec(cs)) != null) {
|
1500
|
+
if (m[1] == ':') m[2] = Array(parseInt(m[2]) + 1).join("=");
|
1501
|
+
var start = 0, rest = m[1] == '*'? 1 : m[2].length;
|
1502
|
+
while (rest > 0) {
|
1503
|
+
var l_proc;
|
1504
|
+
if (s_ref.length + rest >= line_len) {
|
1505
|
+
l_proc = line_len - s_ref.length;
|
1506
|
+
update_aln(s_ref, s_qry, s_mid, m[1], m[1] == '*'? m[2] : m[2].substr(start, l_proc), elen);
|
1507
|
+
if (n_blocks > 0) print("");
|
1508
|
+
print_aln(rs, qs, t[4], slen, elen, s_ref, s_qry, s_mid);
|
1509
|
+
++n_blocks;
|
1510
|
+
s_ref.length = s_qry.length = s_mid.length = 0;
|
1511
|
+
slen[0] = elen[0], slen[1] = elen[1];
|
1512
|
+
} else {
|
1513
|
+
l_proc = rest;
|
1514
|
+
update_aln(s_ref, s_qry, s_mid, m[1], m[1] == '*'? m[2] : m[2].substr(start, l_proc), elen);
|
1515
|
+
}
|
1516
|
+
rest -= l_proc, start += l_proc;
|
1517
|
+
}
|
1518
|
+
}
|
1519
|
+
if (s_ref.length > 0) {
|
1520
|
+
if (n_blocks > 0) print("");
|
1521
|
+
print_aln(rs, qs, t[4], slen, elen, s_ref, s_qry, s_mid);
|
1522
|
+
++n_blocks;
|
1523
|
+
}
|
1524
|
+
print("//");
|
1525
|
+
}
|
1526
|
+
}
|
1527
|
+
file.close();
|
1528
|
+
buf.destroy();
|
1529
|
+
|
1530
|
+
s_ref.destroy(); s_qry.destroy(); s_mid.destroy();
|
1531
|
+
}
|
1532
|
+
|
1533
|
+
// gff2bed: convert a GTF/GFF3 annotation to BED.
//
// Output modes, selected by command-line options:
//   (default)  one BED12 line per transcript, built from its "exon" records,
//              with thickStart/thickEnd taken from its "CDS" records
//   -j         one 6-column junction line per pair of adjacent exons
//   -G         one 6-column line per "gene" record
//   -g         echo the input, replacing contig names found in the -u table
//
// args: argument array for this sub-command; options are parsed with getopt()
//       and the first non-option argument is the input file ('-' makes the
//       code call `new File()` with no name).
// Throws if an exon/CDS record carries no transcript_id, or if the CDS range of
// a transcript extends beyond its exons.
// getopt, File, Bytes, print and exit are not defined in this block; they are
// expected from the hosting runtime.
function paf_gff2bed(args)
{
	var c, fn_ucsc_fai = null, is_short = false, keep_gff = false, print_junc = false, output_gene = false;
	while ((c = getopt(args, "u:sgjG")) != null) {
		if (c == 'u') fn_ucsc_fai = getopt.arg;
		else if (c == 's') is_short = true;
		else if (c == 'g') keep_gff = true;
		else if (c == 'j') print_junc = true;
		else if (c == 'G') output_gene = true;
	}

	if (getopt.ind == args.length) {
		print("Usage: paftools.js gff2bed [options] <in.gff>");
		print("Options:");
		print(" -j Output junction BED");
		print(" -s Print names in the short form");
		print(" -u FILE hg38.fa.fai for chr name conversion");
		print(" -g Output GFF (used with -u)");
		print(" -G Output gene BED (one line per 'gene' record)");
		exit(1);
	}

	// Build a lookup from the stripped contig name to the name in the .fai:
	// drop a _random/_alt/_decoy suffix together with the "chr*_" prefix (or a
	// leading "chrUn_"), and turn "v<N>" into ".<N>".
	var ens2ucsc = {};
	if (fn_ucsc_fai != null) {
		var buf = new Bytes();
		var file = new File(fn_ucsc_fai);
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			var s = t[0];
			if (/_(random|alt|decoy)$/.test(s)) {
				s = s.replace(/_(random|alt|decoy)$/, '');
				s = s.replace(/^chr\S+_/, '');
			} else {
				s = s.replace(/^chrUn_/, '');
			}
			s = s.replace(/v(\d+)/, ".$1");
			if (s != t[0]) ens2ucsc[s] = t[0];
		}
		file.close();
		buf.destroy();
	}

	// itemRgb by transcript type; anything else falls back to grey in print_bed12()
	var colors = {
		'protein_coding':'0,128,255',
		'mRNA':'0,128,255',
		'lincRNA':'0,192,0',
		'snRNA':'0,192,0',
		'miRNA':'0,192,0',
		'misc_RNA':'0,192,0'
	};

	// Print one transcript. Each element of exons is
	// [contig, start, end, strand, transcript_id, type, gene_name, transcript_name].
	// cds_st == 1<<30 and cds_en == 0 are the "no CDS seen" sentinels.
	function print_bed12(exons, cds_st, cds_en, is_short, print_junc)
	{
		if (exons.length == 0) return;
		// the name is taken from the first exon pushed, i.e. before sorting
		var name = is_short? exons[0][7] + "|" + exons[0][5] : exons[0].slice(4, 7).join("|");
		var a = exons.sort(function(a,b) {return a[1]-b[1]});
		if (print_junc) {
			// a junction spans from the end of one exon to the start of the next
			for (var i = 1; i < a.length; ++i)
				print(a[i][0], a[i-1][2], a[i][1], name, 1000, a[i][3]);
			return;
		}
		var sizes = [], starts = [], st, en;
		st = a[0][1];
		en = a[a.length - 1][2];
		if (cds_st == 1<<30) cds_st = st;
		if (cds_en == 0) cds_en = en;
		if (cds_st < st || cds_en > en)
			throw Error("inconsistent thick start or end for transcript " + a[0][4]);
		for (var i = 0; i < a.length; ++i) {
			sizes.push(a[i][2] - a[i][1]);
			starts.push(a[i][1] - st);
		}
		var color = colors[a[0][5]];
		if (color == null) color = '196,196,196';
		print(a[0][0], st, en, name, 1000, a[0][3], cds_st, cds_en, color, a.length, sizes.join(",") + ",", starts.join(",") + ",");
	}

	// Attribute parsers: key "value"; pairs (GTF) and key=value pairs (GFF3).
	// All are /g regexes driven by exec() until it returns null, which also
	// resets lastIndex for the next line.
	var re_gtf = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name) "([^"]+)";/g;
	var re_gff3 = /\b(transcript_id|transcript_type|transcript_biotype|gene_name|gene_id|gbkey|transcript_name)=([^;]+)/g;
	var re_gtf_gene = /\b(gene_id|gene_type|gene_name) "([^;]+)";/g;
	var re_gff3_gene = /\b(gene_id|gene_type|source_gene|gene_biotype|gene_name)=([^;]+);/g;
	var buf = new Bytes();
	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);

	var exons = [], cds_st = 1<<30, cds_en = 0, last_id = null;
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		if (keep_gff) { // -g: echo the line, only renaming the contig
			if (t[0].charAt(0) != '#' && ens2ucsc[t[0]] != null)
				t[0] = ens2ucsc[t[0]];
			print(t.join("\t"));
			continue;
		}
		if (t[0].charAt(0) == '#') continue;
		if (output_gene) { // -G: gene records only
			var id = null, src = null, biotype = null, type = "", name = "N/A";
			if (t[2] != "gene") continue;
			while ((m = re_gtf_gene.exec(t[8])) != null) {
				if (m[1] == "gene_id") id = m[2];
				else if (m[1] == "gene_type") type = m[2];
				else if (m[1] == "gene_name") name = m[2];
			}
			while ((m = re_gff3_gene.exec(t[8])) != null) {
				if (m[1] == "gene_id") id = m[2];
				else if (m[1] == "source_gene") src = m[2];
				else if (m[1] == "gene_type") type = m[2];
				else if (m[1] == "gene_biotype") biotype = m[2];
				else if (m[1] == "gene_name") name = m[2];
			}
			if (src != null) id = src; // source_gene, when present, replaces gene_id
			if (type == "" && biotype != null) type = biotype;
			print(t[0], parseInt(t[3]) - 1, t[4], [id, type, name].join("|"), 1000, t[6]);
			continue;
		}
		if (t[2] != "CDS" && t[2] != "exon") continue;
		t[3] = parseInt(t[3]) - 1; // 1-based start -> 0-based
		t[4] = parseInt(t[4]);
		var id = null, type = "", name = "N/A", biotype = "", tname = "N/A";
		while ((m = re_gtf.exec(t[8])) != null) {
			if (m[1] == "transcript_id") id = m[2];
			else if (m[1] == "transcript_type") type = m[2];
			else if (m[1] == "transcript_biotype" || m[1] == "gbkey") biotype = m[2];
			else if (m[1] == "gene_name" || m[1] == "gene_id") name = m[2];
			else if (m[1] == "transcript_name") tname = m[2];
		}
		while ((m = re_gff3.exec(t[8])) != null) {
			if (m[1] == "transcript_id") id = m[2];
			else if (m[1] == "transcript_type") type = m[2];
			else if (m[1] == "transcript_biotype" || m[1] == "gbkey") biotype = m[2];
			else if (m[1] == "gene_name" || m[1] == "gene_id") name = m[2];
			else if (m[1] == "transcript_name") tname = m[2];
		}
		if (type == "" && biotype != "") type = biotype;
		if (id == null) throw Error("No transcript_id");
		if (id != last_id) { // a new transcript starts: flush the previous one
			print_bed12(exons, cds_st, cds_en, is_short, print_junc);
			exons = [], cds_st = 1<<30, cds_en = 0;
			last_id = id;
		}
		if (t[2] == "CDS") {
			cds_st = cds_st < t[3]? cds_st : t[3];
			cds_en = cds_en > t[4]? cds_en : t[4];
		} else if (t[2] == "exon") {
			if (fn_ucsc_fai != null) {
				if (ens2ucsc[t[0]] != null)
					t[0] = ens2ucsc[t[0]];
				else if (/^[A-Z]+\d+\.\d+$/.test(t[0])) // name with a version and no table entry
					t[0] = t[0].replace(/([A-Z]+\d+)\.(\d+)/, "chrUn_$1v$2");
			}
			exons.push([t[0], t[3], t[4], t[6], id, type, name, tname]);
		}
	}
	if (last_id != null)
		print_bed12(exons, cds_st, cds_en, is_short, print_junc);

	file.close();
	buf.destroy();
}
|
1690
|
+
|
1691
|
+
// sam2paf: convert SAM alignments to PAF.
//
// Options: -p skips records with flag 0x100; -L writes matches in the long cs
// form ("=SEQ") when the cs tag is rebuilt from MD.
//
// For each mapped record one line is printed with the 12 PAF columns
// (qname, qlen, qstart, qend, strand, tname, tlen, tstart, tend, matches,
// block length, mapq) followed by tp/mm/gn/go/cg tags and, when available or
// derivable from MD and SEQ, a cs tag.
//
// Contig lengths come from @SQ header lines; a record whose contig has no @SQ
// line raises an error, as do inconsistent SEQ/QUAL lengths and MD tags that
// disagree with the CIGAR. Records whose end exceeds the contig length or whose
// SEQ length disagrees with the CIGAR are skipped with a warning.
// getopt, File, Bytes, print, warn and exit are expected from the hosting runtime.
function paf_sam2paf(args)
{
	var c, pri_only = false, long_cs = false;
	while ((c = getopt(args, "pL")) != null) {
		if (c == 'p') pri_only = true;
		else if (c == 'L') long_cs = true;
	}
	if (args.length == getopt.ind) {
		print("Usage: paftools.js sam2paf [options] <in.sam>");
		print("Options:");
		print(" -p convert primary or supplementary alignments only");
		print(" -L output the cs tag in the long form");
		exit(1);
	}

	var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	var buf = new Bytes();
	var re = /(\d+)([MIDSHNX=])/g, re_MD = /(\d+)|(\^[A-Za-z]+)|([A-Za-z])/g, re_tag = /\t(\S\S:[AZif]):(\S+)/g;

	var ctg_len = {}, lineno = 0;
	while (file.readline(buf) >= 0) {
		var m, n_cigar = 0, line = buf.toString();
		++lineno;
		if (line.charAt(0) == '@') { // header: only @SQ name/length pairs are kept
			if (/^@SQ/.test(line)) {
				var name = (m = /\tSN:(\S+)/.exec(line)) != null? m[1] : null;
				var l = (m = /\tLN:(\d+)/.exec(line)) != null? parseInt(m[1]) : null;
				if (name != null && l != null) ctg_len[name] = l;
			}
			continue;
		}
		var t = line.split("\t", 11);
		var flag = parseInt(t[1]);
		if (t[9] != '*' && t[10] != '*' && t[9].length != t[10].length)
			throw Error("at line " + lineno + ": inconsistent SEQ and QUAL lengths - " + t[9].length + " != " + t[10].length);
		if (t[2] == '*' || (flag&4) || t[5] == '*') continue;
		if (pri_only && (flag&0x100)) continue;
		var tlen = ctg_len[t[2]];
		if (tlen == null) throw Error("at line " + lineno + ": can't find the length of contig " + t[2]);
		// find tags
		var nn = 0, NM = null, MD = null, cs_str = null, md_list = [];
		while ((m = re_tag.exec(line)) != null) {
			if (m[1] == "NM:i") NM = parseInt(m[2]);
			else if (m[1] == "nn:i") nn = parseInt(m[2]);
			else if (m[1] == "MD:Z") MD = m[2];
			else if (m[1] == "cs:Z") cs_str = m[2];
		}
		if (t[9] == '*') MD = cs_str = null; // without SEQ neither MD nor cs is used
		// infer various lengths from CIGAR
		// clip[0]/clip[1]: clipped bases (S and H together) before/after the aligned part
		var clip = [0, 0], soft_clip = 0, I = [0, 0], D = [0, 0], M = 0, N = 0, mm = 0, have_M = false, have_ext = false, cigar = [];
		var seen_aln = false; // becomes true at the first non-clipping operation
		while ((m = re.exec(t[5])) != null) {
			var l = parseInt(m[1]), op = m[2];
			if (op == 'S' || op == 'H') {
				// Accumulate per side: SAM allows H and S on the same end (e.g. "2H3S10M"),
				// so the side must not be derived from the operation index alone.
				clip[seen_aln? 1 : 0] += l;
				if (op == 'S') soft_clip += l;
			} else {
				seen_aln = true;
				if (op == 'M') M += l, have_M = true;
				else if (op == 'I') ++I[0], I[1] += l;
				else if (op == 'D') ++D[0], D[1] += l;
				else if (op == 'N') N += l;
				else if (op == '=') M += l, have_ext = true, op = 'M';
				else if (op == 'X') M += l, mm += l, have_ext = true, op = 'M';
			}
			++n_cigar;
			if (MD != null && op != 'H') {
				// keep a merged CIGAR (=/X folded into M) for walking along the MD tag
				if (cigar.length > 0 && cigar[cigar.length-1][1] == op)
					cigar[cigar.length-1][0] += l;
				else cigar.push([l, op]);
			}
		}
		var ql = M + I[1] + soft_clip;
		var tl = M + D[1] + N;
		var ts = parseInt(t[3]) - 1, te = ts + tl;
		// checking coordinate and length consistencies
		if (n_cigar > 65535)
			warn("WARNING at line " + lineno + ": " + n_cigar + " CIGAR operations");
		if (te > tlen) {
			warn("WARNING at line " + lineno + ": alignment end position larger than ref length; skipped");
			continue;
		}
		if (t[9] != '*' && t[9].length != ql) {
			warn("WARNING at line " + lineno + ": SEQ length inconsistent with CIGAR (" + t[9].length + " != " + ql + "); skipped");
			continue;
		}
		// parse MD
		var cs = [];
		if (MD != null && cs_str == null && t[9] != "*") {
			var k = 0, cx = 0, cy = 0, mx = 0, my = 0; // cx: cigar ref position; cy: cigar query; mx: MD ref; my: MD query
			while ((m = re_MD.exec(MD)) != null) {
				if (m[2] != null) { // deletion from the reference
					var len = m[2].length - 1;
					cs.push('-', m[2].substr(1));
					mx += len, cx += len, ++k;
				} else { // copy or mismatch
					var ml = m[1] != null? parseInt(m[1]) : 1;
					while (k < cigar.length && cigar[k][1] != 'D') {
						var cl = cigar[k][0], op = cigar[k][1];
						if (op == 'M') {
							if (my + ml < cy + cl) { // the MD item ends inside this M run
								if (ml > 0) {
									if (m[3] != null) cs.push('*', m[3], t[9][my]);
									else if (long_cs) cs.push('=', t[9].substr(my, ml));
									else cs.push(':', ml);
								}
								mx += ml, my += ml, ml = 0;
								break;
							} else { // the MD item reaches the end of this M run
								var dl = cy + cl - my;
								if (long_cs) cs.push('=', t[9].substr(my, dl));
								else cs.push(':', dl);
								cx += cl, cy += cl, ++k;
								mx += dl, my += dl, ml -= dl;
							}
						} else if (op == 'I') {
							cs.push('+', t[9].substr(cy, cl));
							cy += cl, my += cl, ++k;
						} else if (op == 'S') {
							cy += cl, my += cl, ++k;
						} else throw Error("at line " + lineno + ": inconsistent MD tag");
					}
					if (ml != 0) throw Error("at line " + lineno + ": inconsistent MD tag");
				}
			}
			if (cx != mx || cy != my) throw Error("at line " + lineno + ": inconsistent MD tag");
		}
		// compute matching length, block length and calibrate NM
		if (have_ext && !have_M) { // extended CIGAR
			if (NM != null && NM != I[1] + D[1] + mm)
				warn("WARNING at line " + lineno + ": NM is different from sum of gaps and mismatches");
			NM = I[1] + D[1] + mm;
		} else if (NM != null) { // standard CIGAR; NM present
			if (NM < I[1] + D[1]) {
				warn("WARNING at line " + lineno + ": NM is less than the total number of gaps (" + NM + " < " + (I[1]+D[1]) + ")");
				NM = I[1] + D[1];
			}
			mm = NM - (I[1] + D[1]);
		} else { // no way to compute mm
			warn("WARNING at line " + lineno + ": unable to find the number of mismatches; assuming zero");
			mm = 0;
		}
		var mlen = M - mm;
		var blen = M + I[1] + D[1];
		// find query name, start and end
		var qlen = M + I[1] + clip[0] + clip[1];
		var qname = t[0], qs, qe;
		if ((flag&1) && (flag&0x40)) qname += '/1';
		if ((flag&1) && (flag&0x80)) qname += '/2';
		// on the reverse strand the clips swap ends in query coordinates
		if (flag&16) qs = clip[1], qe = qlen - clip[0];
		else qs = clip[0], qe = qlen - clip[1];
		// optional tags
		var type = flag&0x100? 'S' : 'P';
		var tags = ["tp:A:" + type];
		if (NM != null) tags.push("mm:i:"+mm);
		tags.push("gn:i:"+(I[1]+D[1]), "go:i:"+(I[0]+D[0]), "cg:Z:" + t[5].replace(/\d+[SH]/g, ''));
		if (cs_str != null) tags.push("cs:Z:" + cs_str);
		else if (cs.length > 0) tags.push("cs:Z:" + cs.join(""));
		// print out
		var a = [qname, qlen, qs, qe, flag&16? '-' : '+', t[2], tlen, ts, te, mlen, blen, t[4]];
		print(a.join("\t"), tags.join("\t"));
	}

	buf.destroy();
	file.close();
}
|
1852
|
+
|
1853
|
+
// delta2paf: convert a delta alignment file to PAF.
//
// Parsing, as implemented below:
//   ">ref qry reflen qrylen"  starts a sequence pair; lines before the first such
//                             header are ignored
//   a line of 7 space-separated fields starts an alignment
//                             (ref start, ref end, qry start, qry end, errors, ...)
//   a single integer d        d > 0: d-1 matching columns, then one base present
//                                    in the reference only (CIGAR 'D')
//                             d < 0: -d-1 matching columns, then one base present
//                                    in the query only (CIGAR 'I')
//                             d == 0: end of the alignment; the PAF line is printed
//
// args[0] is the input file ('-' makes the code call `new File()` with no name).
// Throws if the remaining reference and query lengths differ at the end of an
// alignment. File, Bytes, print and exit are expected from the hosting runtime.
function paf_delta2paf(args)
{
	if (args.length == 0) {
		print("Usage: paftools.js delta2paf <in.delta>");
		exit(1);
	}

	var buf = new Bytes();
	var file = args[0] == '-'? new File() : new File(args[0]);

	var OP_M = 0, OP_I = 1, OP_D = 2; // indices into "MID"
	var rname, qname, rlen, qlen, qs, qe, rs, re, strand, NM, cigar, x, y, seen_gt = false;
	while (file.readline(buf) >= 0) {
		var m, line = buf.toString();
		if ((m = /^>(\S+)\s+(\S+)\s+(\d+)\s+(\d+)/.exec(line)) != null) {
			rname = m[1], qname = m[2], rlen = parseInt(m[3]), qlen = parseInt(m[4]);
			seen_gt = true;
			continue;
		}
		if (!seen_gt) continue;
		var t = line.split(" ");
		if (t.length == 7) { // alignment header
			for (var i = 0; i < 5; ++i)
				t[i] = parseInt(t[i]);
			// same orientation on both sequences means forward strand
			strand = ((t[0] < t[1] && t[2] < t[3]) || (t[0] > t[1] && t[2] > t[3]))? 1 : -1;
			// 1-based inclusive coordinates -> 0-based half-open
			rs = (t[0] < t[1]? t[0] : t[1]) - 1;
			re = t[1] > t[0]? t[1] : t[0];
			qs = (t[2] < t[3]? t[2] : t[3]) - 1;
			qe = t[3] > t[2]? t[3] : t[2];
			x = y = 0; // bases consumed so far on the reference (x) and query (y)
			NM = parseInt(t[4]);
			// CIGAR as [length, op] pairs. Lengths are kept as plain numbers rather
			// than packed with "len<<4|op": JavaScript shifts operate on 32-bit
			// integers, so packing corrupts runs of 2^27 bases or more.
			cigar = [];
		} else if (t.length == 1) {
			var d = parseInt(t[0]);
			if (d == 0) { // end of alignment: close with the trailing match run and print
				var blen = 0, cigar_str = [];
				if (re - rs - x != qe - qs - y) throw Error("inconsistent alignment");
				cigar.push([re - rs - x, OP_M]);
				for (var i = 0; i < cigar.length; ++i) {
					blen += cigar[i][0];
					cigar_str.push(cigar[i][0] + "MID".charAt(cigar[i][1]));
				}
				print([qname, qlen, qs, qe, strand > 0? '+' : '-', rname, rlen, rs, re, blen - NM, blen, 0, "NM:i:" + NM, "cg:Z:" + cigar_str.join("")].join("\t"));
			} else if (d > 0) {
				var l = d - 1;
				x += l + 1, y += l;
				if (l > 0) cigar.push([l, OP_M]);
				if (cigar.length > 0 && cigar[cigar.length-1][1] == OP_D)
					cigar[cigar.length-1][0] += 1; // extend the open deletion
				else cigar.push([1, OP_D]); // deletion
			} else {
				var l = -d - 1;
				x += l, y += l + 1;
				if (l > 0) cigar.push([l, OP_M]);
				if (cigar.length > 0 && cigar[cigar.length-1][1] == OP_I)
					cigar[cigar.length-1][0] += 1; // extend the open insertion
				else cigar.push([1, OP_I]); // insertion
			}
		}
	}
	file.close();
	buf.destroy();
}
|
1915
|
+
|
1916
|
+
// splice2bed: convert spliced alignments in PAF or SAM to BED12.
//
// Each alignment becomes one BED12 line whose blocks are the CIGAR segments
// between 'N' operations. Consecutive lines with the same query name are
// buffered and printed together so that colours can be assigned per read:
// colors[0] for a single primary, colors[1] when several alignments of the read
// are primary, colors[2] for non-primary ones (printed only with -m).
//
// Options: -f FMT output format (only "bed" produces output), -n FILE a
// two-column name conversion table, -m keep non-primary alignments.
// Throws on lines that are neither PAF nor SAM and on PAF lines without a
// cg:Z tag. getopt, File, Bytes, Map (used here with put/get/destroy, i.e. not
// the ECMAScript Map), print, warn and exit are expected from the hosting runtime.
function paf_splice2bed(args)
{
	var colors = ["0,128,255", "255,0,0", "0,192,0"];

	// Print and then empty the buffered records of one read. Column 8 (itemRgb)
	// holds 0 for primary and 2 for non-primary until it is replaced here.
	function print_lines(a, fmt, keep_multi)
	{
		if (a.length == 0) return;
		if (fmt == "bed") {
			var n_pri = 0;
			for (var i = 0; i < a.length; ++i)
				if (a[i][8] == 0) ++n_pri;
			if (n_pri > 1) { // several primary lines: mark all of them with the second colour
				for (var i = 0; i < a.length; ++i)
					if (a[i][8] == 0) a[i][8] = 1;
			} else if (n_pri == 0) {
				warn("Warning: " + a[0][3] + " doesn't have a primary alignment");
			}
			for (var i = 0; i < a.length; ++i) {
				if (!keep_multi && a[i][8] == 2) continue;
				a[i][8] = colors[a[i][8]];
				print(a[i].join("\t"));
			}
		}
		a.length = 0;
	}

	var re = /(\d+)([MIDNSHP=X])/g;
	var c, fmt = "bed", fn_name_conv = null, keep_multi = false;
	while ((c = getopt(args, "f:n:m")) != null) {
		if (c == 'f') fmt = getopt.arg;
		else if (c == 'n') fn_name_conv = getopt.arg;
		else if (c == 'm') keep_multi = true;
	}
	if (getopt.ind == args.length) {
		print("Usage: paftools.js splice2bed [options] <in.paf>|<in.sam>");
		print("Options:");
		print(" -m keep multiple mappings (SAM flag 0x100)");
		exit(1);
	}

	// optional query-name conversion table: column 1 -> column 2
	var conv = null;
	if (fn_name_conv != null) {
		conv = new Map();
		var file = new File(fn_name_conv);
		var buf = new Bytes();
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			conv.put(t[0], t[1]);
		}
		buf.destroy();
		file.close();
	}

	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var buf = new Bytes();
	var a = []; // buffered BED records of the current read
	while (file.readline(buf) >= 0) {
		var line = buf.toString();
		if (line.charAt(0) == '@') continue; // skip SAM header lines
		var t = line.split("\t");
		var is_pri = false, cigar = null, a1;
		var qname = conv != null? conv.get(t[0]) : null;
		if (qname != null) t[0] = qname;
		if (t.length >= 10 && t[4] != '+' && t[4] != '-' && /^\d+/.test(t[1])) { // SAM
			var flag = parseInt(t[1]);
			if (flag&1) t[0] += '/' + (flag>>6&3); // paired: append the read number bits
		}
		if (a.length && a[0][3] != t[0]) { // a new read starts: flush the previous one
			print_lines(a, fmt, keep_multi);
			a = [];
		}
		if (t.length >= 12 && (t[4] == '+' || t[4] == '-')) { // PAF
			for (var i = 12; i < t.length; ++i) {
				if (t[i].substr(0, 5) == 'cg:Z:') {
					cigar = t[i].substr(5);
				} else if (t[i].substr(0, 5) == 's2:i:') {
					is_pri = true; // the presence of s2 is what marks a PAF line as primary here
				}
			}
			a1 = [t[5], t[7], t[8], t[0], Math.floor(t[9]/t[10]*1000), t[4]];
		} else if (t.length >= 10) { // SAM
			var flag = parseInt(t[1]);
			// Skip unmapped records. The RNAME test must look at the current line
			// (t[2]); testing the record buffer never matches.
			if ((flag&4) || t[2] == '*') continue;
			cigar = t[5];
			is_pri = (flag&0x100)? false : true;
			a1 = [t[2], parseInt(t[3])-1, null, t[0], 1000, (flag&16)? '-' : '+'];
		} else {
			throw Error("unrecognized input format");
		}
		if (cigar == null) throw Error("missing CIGAR");
		// x: reference offset consumed so far; x0: start of the current block
		var m, x0 = 0, x = 0, bs = [], bl = [];
		while ((m = re.exec(cigar)) != null) {
			if (m[2] == 'M' || m[2] == 'D') {
				x += parseInt(m[1]);
			} else if (m[2] == 'N') { // an intron closes the current block
				bs.push(x0);
				bl.push(x - x0);
				x += parseInt(m[1]);
				x0 = x;
			}
		}
		bs.push(x0);
		bl.push(x - x0);
		// write the BED12 line
		if (a1[2] == null) a1[2] = a1[1] + x; // SAM has no end column; derive it from the CIGAR
		a1.push(a1[1], a1[2]); // thick start/end is the same as start/end
		a1.push(is_pri? 0 : 2, bs.length, bl.join(",")+",", bs.join(",")+",");
		a.push(a1);
	}
	print_lines(a, fmt, keep_multi);
	buf.destroy();
	file.close();
	if (conv != null) conv.destroy();
}
|
2030
|
+
|
2031
|
+
/**********************
|
2032
|
+
* Evaluation related *
|
2033
|
+
**********************/
|
2034
|
+
|
2035
|
+
// evaluate mapping accuracy
|
2036
|
+
// mapeval: evaluate mapping accuracy of simulated reads from PAF or SAM.
//
// The true origin of each read is parsed from its name (two name layouts are
// recognised, see count_err). A mapping counts as correct when contig and
// strand agree and overlap_length/union_length exceeds the -r threshold.
//
// Options: -r FLOAT overlap threshold; -Q INT also print wrong mappings with
// mapQ >= INT as 'E' lines; -m INT evaluation mode (0 best-scoring alignment,
// 1 first alignment, 2 all primary alignments); -c in mode 2, cap every mapQ at
// the mapQ of the best-scoring alignment.
//
// Output: one 'Q' line per reported mapQ threshold (threshold, reads and errors
// in this bin, cumulative error rate, cumulative reads) and a final 'U' line
// with the number of unmapped SAM records if any were seen.
// getopt, File, Bytes, print, warn and exit are expected from the hosting runtime.
function paf_mapeval(args)
{
	var c, max_mapq = 60, mode = 0, err_out_q = 256, print_err = false, ovlp_ratio = 0.1, cap_short_mapq = false;
	while ((c = getopt(args, "Q:r:m:c")) != null) {
		if (c == 'Q') err_out_q = parseInt(getopt.arg), print_err = true;
		else if (c == 'r') ovlp_ratio = parseFloat(getopt.arg);
		else if (c == 'm') mode = parseInt(getopt.arg);
		else if (c == 'c') cap_short_mapq = true;
	}

	if (args.length == getopt.ind) {
		warn("Usage: paftools.js mapeval [options] <in.paf>|<in.sam>");
		warn("Options:");
		warn(" -r FLOAT mapping correct if overlap_length/union_length>FLOAT [" + ovlp_ratio + "]");
		warn(" -Q INT print wrong mappings with mapQ>INT [don't print]");
		warn(" -m INT 0: eval the longest aln only; 1: first aln only; 2: all primary aln [0]");
		exit(1);
	}

	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var buf = new Bytes();

	// per-mapQ counters of evaluated reads and of wrong reads
	var tot = [], err = [];
	for (var q = 0; q <= max_mapq; ++q)
		tot[q] = err[q] = 0;

	// s: truth [contig, start, end, strand]; b: mapping [contig, start, end, strand, ...]
	function is_correct(s, b)
	{
		if (s[0] != b[0] || s[3] != b[3]) return false;
		var o, l; // o: overlap length; l: length of the union
		if (s[1] < b[1]) {
			if (s[2] <= b[1]) return false;
			o = (s[2] < b[2]? s[2] : b[2]) - b[1];
			l = (s[2] > b[2]? s[2] : b[2]) - s[1];
		} else {
			if (b[2] <= s[1]) return false;
			o = (s[2] < b[2]? s[2] : b[2]) - s[1];
			l = (s[2] > b[2]? s[2] : b[2]) - b[1];
		}
		return o/l > ovlp_ratio? true : false;
	}

	// Score the alignments of one read; a[i] is [contig, start, end, strand, mapq, score].
	function count_err(qname, a, tot, err, mode)
	{
		if (a.length == 0) return;

		var m, s;
		if ((m = /^(\S+)!(\S+)!(\d+)!(\d+)!([\+\-])$/.exec(qname)) != null) { // pbsim single-end reads
			s = [m[1], m[2], parseInt(m[3]), parseInt(m[4]), m[5]];
		} else if ((m = /^(\S+)!(\S+)!(\d+)_(\d+)!(\d+)_(\d+)!([\+\-])([\+\-])\/([12])$/.exec(qname)) != null) { // mason2 paired-end reads
			if (m[9] == '1') {
				s = [m[1], m[2], parseInt(m[3]), parseInt(m[5]), m[7]];
			} else {
				s = [m[1], m[2], parseInt(m[4]), parseInt(m[6]), m[8]];
			}
		} else throw Error("Failed to parse simulated read names '" + qname + "'");
		s.shift(); // skip the original read name

		if (mode == 0 || mode == 1) { // longest only or first only
			var max_i = 0;
			if (mode == 0) { // longest only
				var max = 0;
				for (var i = 0; i < a.length; ++i)
					if (a[i][5] > max)
						max = a[i][5], max_i = i;
			}
			var mapq = a[max_i][4];
			++tot[mapq];
			if (!is_correct(s, a[max_i])) {
				if (mapq >= err_out_q)
					print('E', qname, a[max_i].join("\t"));
				++err[mapq];
			}
		} else if (mode == 2) { // all primary mode
			// note: this max_mapq is local and shadows the cap defined in paf_mapeval
			var max_err_mapq = -1, max_mapq = 0, max_err_i = -1;
			if (cap_short_mapq) {
				var max = 0, max_q = 0;
				for (var i = 0; i < a.length; ++i)
					if (a[i][5] > max)
						max = a[i][5], max_q = a[i][4];
				for (var i = 0; i < a.length; ++i)
					a[i][4] = max_q < a[i][4]? max_q : a[i][4];
			}
			for (var i = 0; i < a.length; ++i) {
				max_mapq = max_mapq > a[i][4]? max_mapq : a[i][4];
				if (!is_correct(s, a[i]))
					if (a[i][4] > max_err_mapq)
						max_err_mapq = a[i][4], max_err_i = i;
			}
			if (max_err_mapq >= 0) { // the read is wrong if any primary alignment is wrong
				++tot[max_err_mapq], ++err[max_err_mapq];
				if (max_err_mapq >= err_out_q)
					print('E', qname, a[max_err_i].join("\t"));
			} else ++tot[max_mapq];
		}
	}

	var lineno = 0, last = null, a = [], n_unmapped = null;
	// '=' and 'X' are accepted as well so that extended CIGARs are not silently ignored
	var re_cigar = /(\d+)([MIDSHN=X])/g;
	while (file.readline(buf) >= 0) {
		var m, line = buf.toString();
		++lineno;
		if (line[0] != '@') {
			var t = line.split("\t");
			if (t[4] == '+' || t[4] == '-') { // PAF
				if (last != t[0]) {
					if (last != null) count_err(last, a, tot, err, mode);
					a = [], last = t[0];
				}
				if (/\ts1:i:\d+/.test(line) && !/\ts2:i:\d+/.test(line)) // secondary alignment in minimap2 PAF
					continue;
				var mapq = parseInt(t[11]);
				if (mapq > max_mapq) mapq = max_mapq;
				a.push([t[5], parseInt(t[7]), parseInt(t[8]), t[4], mapq, parseInt(t[9])]);
			} else { // SAM
				var flag = parseInt(t[1]);
				var read_no = flag>>6&0x3;
				var qname = t[0];
				if (!/\/[12]$/.test(qname))
					qname = read_no == 1 || read_no == 2? t[0] + '/' + read_no : t[0];
				if (last != qname) {
					if (last != null) count_err(last, a, tot, err, mode);
					a = [], last = qname;
				}
				if (flag&0x100) continue; // secondary alignment
				if ((flag&0x4) || t[2] == '*') { // unmapped
					if (n_unmapped == null) n_unmapped = 0;
					++n_unmapped;
					continue;
				}
				var mapq = parseInt(t[4]);
				if (mapq > max_mapq) mapq = max_mapq;
				var pos = parseInt(t[3]) - 1, pos_end = pos;
				var n_gap = 0, mlen = 0;
				while ((m = re_cigar.exec(t[5])) != null) {
					var len = parseInt(m[1]);
					// '=' and 'X' are aligned columns just like 'M'
					if (m[2] == 'M' || m[2] == '=' || m[2] == 'X') pos_end += len, mlen += len;
					else if (m[2] == 'I') n_gap += len;
					else if (m[2] == 'D') n_gap += len, pos_end += len;
				}
				var score = pos_end - pos;
				if ((m = /\tNM:i:(\d+)/.exec(line)) != null) {
					var NM = parseInt(m[1]);
					if (NM >= n_gap) score = mlen - (NM - n_gap); // aligned columns minus mismatches
				}
				a.push([t[2], pos, pos_end, (flag&16)? '-' : '+', mapq, score]);
			}
		}
	}
	if (last != null) count_err(last, a, tot, err, mode);

	buf.destroy();
	file.close();

	// Walk mapQ from high to low. A new 'Q' line starts at the first non-empty
	// bin and at every bin that contains errors; sum_*2 are cumulative totals.
	var sum_tot = 0, sum_err = 0, q_out = -1, sum_tot2 = 0, sum_err2 = 0;
	for (var q = max_mapq; q >= 0; --q) {
		if (tot[q] == 0) continue;
		if (q_out < 0 || err[q] > 0) {
			if (q_out >= 0) print('Q', q_out, sum_tot, sum_err, (sum_err2/sum_tot2).toFixed(9), sum_tot2);
			sum_tot = sum_err = 0, q_out = q;
		}
		sum_tot += tot[q], sum_err += err[q];
		sum_tot2 += tot[q], sum_err2 += err[q];
	}
	print('Q', q_out, sum_tot, sum_err, (sum_err2/sum_tot2).toFixed(9), sum_tot2);
	if (n_unmapped != null) print('U', n_unmapped);
}
|
2203
|
+
|
2204
|
+
// convert mason2 SAM to FASTQ
|
2205
|
+
// mason2fq: convert a SAM file of simulated reads to FASTQ, encoding the true
// position of every read in its name.
//
// Single reads are named  name!chr!start!end!strand.
// Two consecutive records with the same name are treated as a pair and named
//   name!chr!start1_start2!end1_end2!strand1strand2  followed by /1 or /2.
// The FASTQ comment is "XE:XS:XI" taken from the tags of the same names.
// Reverse-strand records are turned back to the read orientation (SEQ is
// reverse-complemented, QUAL reversed).
//
// args[0] is the input SAM. Throws when the second record of a pair does not
// complement the read number of the first.
// File, Bytes (with set/revcomp/reverse), print and exit are expected from the
// hosting runtime.
function paf_mason2fq(args)
{
	if (args.length == 0) {
		print("Usage: paftools.js mason2fq <mason.sam>");
		exit(1);
	}

	// a: [qname, chr, start, end, strand, seq, qual, read_no, comment]
	function print_se(a)
	{
		print('@' + a.slice(0, 5).join("!") + " " + a[8]);
		print(a[5]);
		print("+");
		print(a[6]);
	}

	var buf = new Bytes(), buf2 = new Bytes();
	var file = new File(args[0]);
	var re = /(\d+)([MIDSHN])/g;
	var last = null; // pending record waiting for a possible mate
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (t[0].charAt(0) == '@') continue;
		var m, l_ref = 0; // l_ref: reference span of the alignment
		while ((m = re.exec(t[5])) != null)
			if (m[2] == 'D' || m[2] == 'M' || m[2] == 'N')
				l_ref += parseInt(m[1]);
		var flag = parseInt(t[1]);
		var rev = !!(flag&16);
		var seq, qual;
		if (rev) {
			buf2.length = 0;
			buf2.set(t[9], 0);
			buf2.revcomp();
			seq = buf2.toString();
			// Clear the scratch buffer again before loading QUAL: if QUAL is shorter
			// than SEQ (e.g. "*"), leftover sequence bytes could otherwise end up in
			// the reversed quality string. NOTE(review): assumes Bytes.set() overwrites
			// in place without truncating - confirm against the k8 documentation.
			buf2.length = 0;
			buf2.set(t[10], 0);
			buf2.reverse();
			qual = buf2.toString();
		} else seq = t[9], qual = t[10];
		var qname = t[0];
		qname = qname.replace(/^simulated./, "");
		var chr = t[2];
		var pos = parseInt(t[3]) - 1;
		var strand = (flag&16)? '-' : '+';
		var read_no = flag&0xc0; // first-in-pair / second-in-pair bits
		if (read_no == 0x40) read_no = 1;
		else if (read_no == 0x80) read_no = 2;
		else read_no = 0;
		var err = 0, snp = 0, indel = 0;
		for (var i = 11; i < t.length; ++i) {
			if ((m = /^XE:i:(\d+)/.exec(t[i])) != null) err = m[1];
			else if ((m = /^XS:i:(\d+)/.exec(t[i])) != null) snp = m[1];
			else if ((m = /^XI:i:(\d+)/.exec(t[i])) != null) indel = m[1];
		}
		var comment = [err, snp, indel].join(":");
		if (last == null) {
			last = [qname, chr, pos, pos + l_ref, strand, seq, qual, read_no, comment];
		} else if (last[0] != qname) { // the pending read had no mate
			print_se(last);
			last = [qname, chr, pos, pos + l_ref, strand, seq, qual, read_no, comment];
		} else { // same name as the pending record: print both as a pair
			if (read_no == 2) { // last[] is the first read
				if (last[7] != 1) throw Error("ERROR: can't find read1");
				var name = [qname, chr, last[2] + "_" + pos, last[3] + "_" + (pos + l_ref), last[4] + strand].join("!");
				print('@' + name + '/1' + ' ' + last[8]); print(last[5]); print("+"); print(last[6]);
				print('@' + name + '/2' + ' ' + comment); print(seq); print("+"); print(qual);
			} else {
				if (last[7] != 2) throw Error("ERROR: can't find read2");
				var name = [qname, chr, pos + "_" + last[2], (pos + l_ref) + "_" + last[3], strand + last[4]].join("!");
				print('@' + name + '/1' + ' ' + comment); print(seq); print("+"); print(qual);
				print('@' + name + '/2' + ' ' + last[8]); print(last[5]); print("+"); print(last[6]);
			}
			last = null;
		}
	}
	if (last != null) print_se(last);
	file.close();
	buf.destroy();
	buf2.destroy();
}
|
2284
|
+
|
2285
|
+
// convert pbsim MAF to FASTA (records are printed with a '>' header and no quality line)
|
2286
|
+
// Usage: paftools.js pbsim2fq <ref.fa.fai> <pbsim1.maf> [[pbsim2.maf] ...]
// The first column of each line in args[0] gives the contig names. Every
// remaining argument is scanned for blocks made of an 'a' line followed by two
// 's' lines; the first 's' line gives the interval, the second the read.
// Each read is printed as ">name" + sequence, where name is "!"-joined:
// read name, contig, start, end, strand.
function paf_pbsim2fq(args)
{
	if (args.length < 2) {
		print("Usage: paftools.js pbsim2fq <ref.fa.fai> <pbsim1.maf> [[pbsim2.maf] ...]");
		exit(1);
	}

	var line_buf = new Bytes(), rc_buf = new Bytes();

	// contig names, in file order; read names refer to them by 1-based index
	var ctg_names = [];
	var file = new File(args[0]);
	while (file.readline(line_buf) >= 0)
		ctg_names.push(line_buf.toString().split(/\s+/)[0]);
	file.close();

	for (var k = 1; k < args.length; ++k) {
		file = new File(args[k]);
		// 0: waiting for an 'a' line; 1: waiting for the first 's' line; 2: waiting for the second
		var state = 0, span = null;
		while (file.readline(line_buf) >= 0) {
			var line = line_buf.toString();
			var tag = line.charAt(0);
			if (state == 0) {
				if (tag == 'a') state = 1;
				continue;
			}
			if (tag != 's') continue;
			var f = line.split(/\s+/);
			if (state == 1) { // first 's' line: start and length
				var beg = parseInt(f[2]);
				span = [beg, beg + parseInt(f[3])];
				state = 2;
				continue;
			}
			// second 's' line: the read itself
			var m = /S(\d+)_\d+/.exec(f[1]);
			if (m == null) throw Error("Failed to parse the read name");
			var ctg_idx = parseInt(m[1]) - 1;
			if (ctg_idx >= ctg_names.length) throw Error("Index outside the chr list");
			var name = [f[1], ctg_names[ctg_idx], span[0], span[1], f[4]].join("!");
			var bases = f[6].replace(/\-/g, ""); // drop alignment gaps
			if (bases.length != parseInt(f[5])) throw Error("Inconsistent read length");
			if (bases.indexOf("NN") < 0) { // reads containing "NN" are not printed
				if (f[4] == '-') {
					rc_buf.set(bases, 0);
					rc_buf.length = bases.length;
					rc_buf.revcomp();
					bases = rc_buf.toString();
				}
				print(">" + name);
				print(bases);
			}
			state = 0;
		}
		file.close();
	}
	line_buf.destroy();
	rc_buf.destroy();
}
|
2341
|
+
|
2342
|
+
// Usage: paftools.js junceval [options] <gene.gtf> <aln.sam>
// Compare introns found in alignments (CIGAR 'N' operations) against introns
// derived from the exon records of an annotation file, then print either a
// per-intron report (-p/-e) or summary counts.
// The alignment input may be PAF or SAM (told apart by column 5) and is read
// from stdin when the second argument is missing or "-".
function paf_junceval(args)
{
	// NOTE(review): first_only is tested below but no option in this function ever sets it
	var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false;
	while ((c = getopt(args, "l:epc")) != null) {
		if (c == 'l') l_fuzzy = parseInt(getopt.arg);
		else if (c == 'e') print_err_only = print_ovlp = true; // -e implies -p
		else if (c == 'p') print_ovlp = true;
		else if (c == 'c') chr_only = true;
	}

	if (args.length - getopt.ind < 1) {
		print("Usage: paftools.js junceval [options] <gene.gtf> <aln.sam>");
		print("Options:");
		print(" -l INT tolerance of junction positions (0 for exact) [0]");
		print(" -p print overlapping introns");
		print(" -e print erroreous overlapping introns");
		print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/");
		exit(1);
	}

	var file, buf = new Bytes();

	// pass 1: collect exons per transcript; tr[tid] = [contig, strand column, start, end, [[st, en], ...]]
	var tr = {};
	file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		if (t[0].charAt(0) == '#') continue;
		if (t[2] != 'exon') continue;
		var st = parseInt(t[3]) - 1; // start is shifted down by one; end is kept as is
		var en = parseInt(t[4]);
		if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue;
		var tid = m[1];
		if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []];
		tr[tid][4].push([st, en]);
	}
	file.close();

	// turn each transcript's sorted exons into introns: [end of exon i, start of exon i+1]
	var anno = {};
	for (var tid in tr) {
		var t = tr[tid];
		Interval.sort(t[4]); // presumably sorts by start coordinate -- defined outside this function
		t[2] = t[4][0][0];
		t[3] = t[4][t[4].length - 1][1];
		if (anno[t[0]] == null) anno[t[0]] = [];
		var s = t[4];
		for (var i = 0; i < s.length - 1; ++i) {
			// adjacent exons that touch or overlap are reported, but the intron is still recorded
			if (s[i][1] >= s[i+1][0])
				warn("WARNING: incorrect annotation for transcript "+tid+" ("+s[i][1]+" >= "+s[i+1][0]+")")
			anno[t[0]].push([s[i][1], s[i+1][0]]);
		}
	}
	tr = null;

	// per contig: sort introns, drop consecutive duplicates, then index for overlap queries
	for (var chr in anno) {
		var e = anno[chr];
		if (e.length == 0) continue;
		Interval.sort(e);
		var k = 0;
		for (var i = 1; i < e.length; ++i) // dedup
			if (e[i][0] != e[k][0] || e[i][1] != e[k][1])
				e[++k] = e[i].slice(0);
		e.length = k + 1;
		Interval.index_end(e); // presumably prepares e for Interval.find_ovlp() -- defined elsewhere
	}

	var n_pri = 0, n_unmapped = 0, n_mapped = 0;
	var n_sgl = 0, n_splice = 0, n_splice_hit = 0, n_splice_novel = 0;

	// pass 2: alignments; stdin is used when the second file is missing or "-"
	file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
	var last_qname = null;
	var re_cigar = /(\d+)([MIDNSHP=X])/g;
	while (file.readline(buf) >= 0) {
		var m, t = buf.toString().split("\t");
		var ctg_name = null, cigar = null, pos = null, qname = t[0];

		if (t[0].charAt(0) == '@') continue;
		if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
			ctg_name = t[5], pos = parseInt(t[7]);
			var type = 'P';
			// i is the function-scoped variable declared by the other loops in this function
			for (i = 12; i < t.length; ++i) {
				if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) {
					if (m[1] == 'tp:A') type = m[2];
					else cigar = m[2];
				}
			}
			if (type == 'S') continue; // secondary
		} else { // SAM
			ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5];
			var flag = parseInt(t[1]);
			if (flag&0x100) continue; // secondary
		}

		// with -c this filter runs before the unmapped check, so records on '*' are skipped uncounted
		if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue;
		if (first_only && last_qname == qname) continue;
		if (ctg_name == '*') { // unmapped
			++n_unmapped;
			continue;
		} else {
			++n_pri;
			// n_mapped counts runs of consecutive records sharing a query name
			if (last_qname != qname) {
				++n_mapped;
				last_qname = qname;
			}
		}

		// walk the CIGAR: N yields an intron [pos, pos+len); M, X, =, D and N advance pos
		var intron = [];
		while ((m = re_cigar.exec(cigar)) != null) {
			var len = parseInt(m[1]), op = m[2];
			if (op == 'N') {
				intron.push([pos, pos + len]);
				pos += len;
			} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
		}
		if (intron.length == 0) {
			++n_sgl;
			continue;
		}
		n_splice += intron.length;

		var chr = anno[ctg_name];
		if (chr != null) {
			for (var i = 0; i < intron.length; ++i) {
				var o = Interval.find_ovlp(chr, intron[i][0], intron[i][1]);
				if (o.length > 0) {
					// a hit needs both ends within l_fuzzy of one annotated intron
					var hit = false;
					for (var j = 0; j < o.length; ++j) {
						var st_diff = intron[i][0] - o[j][0];
						var en_diff = intron[i][1] - o[j][1];
						if (st_diff < 0) st_diff = -st_diff;
						if (en_diff < 0) en_diff = -en_diff;
						if (st_diff <= l_fuzzy && en_diff <= l_fuzzy)
							++n_splice_hit, hit = true;
						if (hit) break; // count each predicted intron at most once
					}
					if (print_ovlp) {
						// 'C': matched an annotated intron; 'P': overlaps annotation without matching
						var type = hit? 'C' : 'P';
						if (hit && print_err_only) continue;
						var x = '[';
						for (var j = 0; j < o.length; ++j) {
							if (j) x += ', ';
							x += '(' + o[j][0] + "," + o[j][1] + ')';
						}
						x += ']';
						print(type, qname, i+1, ctg_name, intron[i][0], intron[i][1], x);
					}
				} else {
					// 'N': no annotated intron overlaps this one
					++n_splice_novel;
					if (print_ovlp)
						print('N', qname, i+1, ctg_name, intron[i][0], intron[i][1]);
				}
			}
		} else {
			// contig absent from the annotation: counted as novel, nothing printed
			n_splice_novel += intron.length;
		}
	}
	file.close();

	buf.destroy();

	// the summary is printed only when no per-intron report was requested
	if (!print_ovlp) {
		print("# unmapped reads: " + n_unmapped);
		print("# mapped reads: " + n_mapped);
		print("# primary alignments: " + n_pri);
		print("# singletons: " + n_sgl);
		print("# predicted introns: " + n_splice);
		print("# non-overlapping introns: " + n_splice_novel);
		// NOTE(review): this percentage is NaN when n_splice is 0
		print("# correct introns: " + n_splice_hit + " (" + (n_splice_hit / n_splice * 100).toFixed(2) + "%)");
	}
}
|
2511
|
+
|
2512
|
+
// evaluate overlap sensitivity
|
2513
|
+
// Usage: sort -k6,6 -k8,8n to-ref.paf | paftools.js ov-eval [options] - <ovlp.paf>
// Step 1 infers read pairs that should overlap from read-to-reference
// alignments; the sweep below relies on the input order requested by the usage
// line. Step 2 marks the pairs that also appear in the second file.
// Prints the number of inferred pairs, the number left unmarked and the
// resulting sensitivity.
function paf_ov_eval(args)
{
	var c, min_ovlp = 2000, min_frac = 0.95, min_mapq = 10;
	while ((c = getopt(args, "q:l:f:")) != null) {
		switch (c) {
			case 'q': min_mapq = parseInt(getopt.arg); break;
			case 'l': min_ovlp = parseInt(getopt.arg); break;
			case 'f': min_frac = parseFloat(getopt.arg); break;
		}
	}
	if (args.length - getopt.ind < 2) {
		print("Usage: sort -k6,6 -k8,8n to-ref.paf | paftools.js ov-eval [options] - <ovlp.paf>");
		print("Options:");
		print(" -l INT min overlap length [2000]");
		print(" -q INT min mapping quality [10]");
		print(" -f FLOAT min fraction of mapped length [0.95]");
		exit(1);
	}

	// order-independent key for a pair of names
	function pair_key(x, y)
	{
		return x < y? x + "\t" + y : y + "\t" + x;
	}

	var buf = new Bytes();
	var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
	var active = []; // [ctg, start, end, read name] of reads that may still overlap later ones
	var pairs = {};  // pair key -> overlap length; negated once seen in the second file
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (parseInt(t[11]) < min_mapq) continue;
		if (t.slice(12).indexOf('tp:A:P') < 0) continue; // only records tagged tp:A:P
		var qlen = parseInt(t[1]), qst = parseInt(t[2]), qen = parseInt(t[3]);
		var st = parseInt(t[7]), en = parseInt(t[8]);
		if (qen - qst < min_ovlp) continue;
		if (en - st < min_ovlp) continue;
		if ((qen - qst) / qlen < min_frac) continue;
		var ctg = t[5], rname = t[0];

		// drop leading entries on another contig or ending at/before the current start
		while (active.length > 0 && !(active[0][0] == ctg && active[0][2] > st))
			active.shift();

		for (var j = 0; j < active.length; ++j) {
			var other = active[j];
			if (other[3] == rname) continue;
			var shared = (en > other[2]? other[2] : en) - st;
			if (shared >= min_ovlp)
				pairs[pair_key(other[3], rname)] = shared;
		}
		active.push([ctg, st, en, rname]);
	}
	file.close();

	file = new File(args[getopt.ind + 1]);
	while (file.readline(buf) >= 0) {
		var u = buf.toString().split("\t");
		var found = pair_key(u[0], u[5]);
		if (pairs[found] > 0) pairs[found] = -pairs[found];
	}
	file.close();
	buf.destroy();

	var n_ovlp = 0, n_missing = 0;
	for (var key in pairs) {
		n_ovlp += 1;
		if (pairs[key] > 0) n_missing += 1; // still positive: never marked by the second file
	}
	print(n_ovlp + " overlaps inferred from the reference mapping");
	print(n_missing + " missed by the read overlapper");
	print((100 * (1 - n_missing / n_ovlp)).toFixed(2) + "% sensitivity");
}
|
2583
|
+
|
2584
|
+
// Print substitution and indel counts for a VCF file (stdin when no file is given).
// REF and each ALT allele are compared position by position over their shared
// length: mismatches count as substitutions (split into ts/tv) and the length
// difference is binned as an insertion or deletion.
function paf_vcfstat(args)
{
	var c, ts = { "AG":1, "GA":1, "CT":1, "TC":1 };
	while ((c = getopt(args, "")) != null) {
	}

	// suffix of the counter that a length difference d >= 1 falls into
	function len_bin(d)
	{
		if (d == 1) return "1";
		if (d == 2) return "2";
		if (d < 50) return "50";
		if (d < 1000) return "1k";
		if (d < 7000) return "7k";
		return "inf";
	}

	var buf = new Bytes();
	var file = args.length == getopt.ind? new File() : new File(args[getopt.ind]);
	var cnt = { sub:0, ts:0, tv:0, ins:0, del:0, ins1:0, del1:0, ins2:0, del2:0, ins50:0, del50:0, ins1k:0, del1k:0, ins7k:0, del7k:0, insinf:0, delinf:0 };
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (t[0][0] == '#') continue;
		var alt = t[4].split(",");
		var ref = t[3];
		for (var i = 0; i < alt.length; ++i) {
			var a = alt[i];
			if (a[0] == '<' || a[1] == '>') continue;
			var shared = ref.length < a.length? ref.length : a.length;
			for (var j = 0; j < shared; ++j) {
				if (ref[j] == a[j]) continue;
				++cnt.sub;
				if (ts[ref[j] + a[j]]) ++cnt.ts;
				else ++cnt.tv;
			}
			var d = a.length - ref.length;
			if (d == 0) continue;
			var kind = d > 0? "ins" : "del";
			if (d < 0) d = -d;
			++cnt[kind];
			++cnt[kind + len_bin(d)];
		}
	}
	file.close();
	buf.destroy();

	print("# substitutions: " + cnt.sub);
	print("ts/tv: " + (cnt.ts / cnt.tv).toFixed(3));
	// [label prefix, counter suffix] in output order
	var bins = [["", ""], ["1bp ", "1"], ["2bp ", "2"], ["[3,50) ", "50"], ["[50,1000) ", "1k"], ["[1000,7000) ", "7k"], [">=7000 ", "inf"]];
	var kinds = [["insertions", "ins"], ["deletions", "del"]];
	for (var p = 0; p < kinds.length; ++p)
		for (var q = 0; q < bins.length; ++q)
			print("# " + bins[q][0] + kinds[p][0] + ": " + cnt[kinds[p][1] + bins[q][1]]);
}
|
2648
|
+
|
2649
|
+
// Parse a number with an optional k/m/g suffix (either case; 1e3, 1e6, 1e9)
// and round it to an integer with Math.floor(x + .499).
// Anything after the recognised prefix is ignored; input without a leading
// number yields NaN.
function paf_parseNum(s) {
	var scale = { k: 1000, m: 1000000, g: 1000000000 };
	var value = null;
	var hit = /^(\d*\.?\d*)([mMgGkK]?)/.exec(s);
	if (hit != null) {
		value = parseFloat(hit[1]);
		var unit = hit[2].toLowerCase();
		if (unit != "") value *= scale[unit];
	}
	return Math.floor(value + .499);
}
|
2659
|
+
|
2660
|
+
// Usage: paftools.js misjoin [options] <in.paf>
// For each run of consecutive PAF records sharing a query name, keep the long
// alignment blocks, order them along the query and classify every adjacent
// pair as an inter-chromosomal join, a large gap, a bracketed inversion or a
// hanging inversion. Each counter is a pair: [all events, events where one of
// the two blocks passes the centromere test].
function paf_misjoin(args)
{
	var c, min_seg_len = 1000000, max_gap = 1000000, fn_cen = null, show_long = false, show_err = false, cen_ratio = 0.5;
	var n_diff = [0, 0], n_gap = [0, 0], n_inv = [0, 0], n_inv_end = [0, 0];
	while ((c = getopt(args, "l:g:c:per:")) != null) {
		if (c == 'l') min_seg_len = paf_parseNum(getopt.arg);
		else if (c == 'g') max_gap = paf_parseNum(getopt.arg);
		else if (c == 'c') fn_cen = getopt.arg;
		else if (c == 'r') cen_ratio = parseFloat(getopt.arg);
		else if (c == 'p') show_long = true;
		else if (c == 'e') show_err = true;
	}
	if (args.length == getopt.ind) {
		print("Usage: paftools.js misjoin [options] <in.paf>");
		print("Options:");
		print(" -c FILE BED for centromeres []");
		print(" -r FLOAT count a centromeric event if overlap ratio > FLOAT [" + cen_ratio + "]");
		print(" -l NUM min alignment block length [1m]");
		print(" -g NUM max gap size [1m]");
		print(" -e output misjoins not involving centromeres");
		print(" -p output long alignment blocks for debugging");
		return;
	}
	// cen[chr] = list of [start, end] intervals read from the -c file (empty without -c)
	var cen = {};
	var file, buf = new Bytes();
	if (fn_cen != null) {
		file = new File(fn_cen);
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			if (cen[t[0]] == null) cen[t[0]] = [];
			cen[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
		}
		file.close();
	}

	// Return true when the summed overlap between [st, en) and the intervals of
	// cen[chr] is at least (en - st) * cen_ratio; false when chr has no intervals.
	// Overlaps are summed per interval, without merging the intervals first.
	function test_cen(cen, chr, st, en) {
		var b = cen[chr], len = 0;
		if (b == null) return false;
		for (var j = 0; j < b.length; ++j)
			if (b[j][0] < en && b[j][1] > st) {
				var s = b[j][0] > st? b[j][0] : st;
				var e = b[j][1] < en? b[j][1] : en;
				len += e - s;
			}
		return len < (en - st) * cen_ratio? false : true;
	}

	// Classify the records of one query. a is modified in place: numeric
	// columns are parsed, short records are removed and the rest is sorted.
	function process(a) {
		var k = 0;
		for (var i = 0; i < a.length; ++i) {
			for (var j = 1; j <= 3; ++j) a[i][j] = parseInt(a[i][j]);
			for (var j = 6; j <= 11; ++j) a[i][j] = parseInt(a[i][j]);
			// keep records whose column index 10 reaches min_seg_len
			// (alignment block length, going by the -l usage text)
			if (a[i][10] >= min_seg_len) a[k++] = a[i];
		}
		a.length = k;
		if (a.length == 1) return; // a single block has nothing to be compared with
		a = a.sort(function(x,y){return x[2]-y[2]}); // order by query start
		if (show_long) for (var i = 0; i < a.length; ++i) print(a[i].join("\t"));
		for (var i = 1; i < a.length; ++i) {
			// ov[0]/ov[1]: centromere test for the previous/current block on the target
			var ov = [false, false];
			ov[0] = test_cen(cen, a[i-1][5], a[i-1][7], a[i-1][8]);
			ov[1] = test_cen(cen, a[i][5], a[i][7], a[i][8]);
			if (a[i-1][5] != a[i][5]) { // different chr
				if (ov[0] || ov[1]) ++n_diff[1];
				else if (show_err) {
					print("J", a[i-1].slice(0, 12).join("\t"));
					print("J", a[i].slice(0, 12).join("\t"));
				}
				++n_diff[0];
			} else if (a[i-1][4] == a[i][4]) { // a gap
				// dq: distance between the blocks on the query; dr: on the target, by strand
				var dq = a[i][2] - a[i-1][3];
				var dr = a[i][4] == '+'? a[i][7] - a[i-1][8] : a[i-1][7] - a[i][8];
				var gap = dr > dq? dr - dq : dq - dr;
				if (gap > max_gap) {
					if (ov[0] || ov[1]) ++n_gap[1];
					else if (show_err) {
						print("G", a[i-1].slice(0, 12).join("\t"));
						print("G", a[i].slice(0, 12).join("\t"));
					}
					++n_gap[0];
				}
			} else if (i + 1 < a.length && a[i+1][4] == a[i-1][4]) { // bracketed inversion
				if (ov[0] || ov[1]) ++n_inv[1];
				else if (show_err) {
					print("M", a[i-1].slice(0, 12).join("\t"));
					print("M", a[i].slice(0, 12).join("\t"));
					print("M", a[i+1].slice(0, 12).join("\t"));
				}
				++n_inv[0];
				++i; // skip the pair formed by the inverted block and the block after it
			} else { // hanging inversion
				// counted only; nothing is printed for this class even with -e
				if (ov[0] || ov[1]) ++n_inv_end[1];
				++n_inv_end[0];
			}
		}
	}

	// group consecutive records by query name and classify each group
	file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	var a = [];
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (a.length > 0 && a[0][0] != t[0]) {
			process(a);
			a.length = 0;
		}
		a.push(t);
	}
	if (a.length > 0) process(a);
	file.close();
	buf.destroy();
	print("# inter-chromosomal misjoins: " + n_diff.join(","));
	print("# intra-chromosomal gaps: " + n_gap.join(","));
	print("# candidate inversions in the middle: " + n_inv.join(","));
	print("# candidate inversions at contig ends: " + n_inv_end.join(","));
}
|
2775
|
+
|
2776
|
+
function _paf_get_alen(t)
|
2777
|
+
{
|
2778
|
+
var svlen = null, alen = null;
|
2779
|
+
if ((m = /(^|;)SVLEN=(-?\d+)/.exec(t[7])) != null)
|
2780
|
+
svlen = parseInt(m[2]);
|
2781
|
+
var s = t[4].split(",");
|
2782
|
+
var min_abs_diff = 1<<30, max_abs_diff = 0;
|
2783
|
+
if (svlen != null && svlen != 0)
|
2784
|
+
alen = svlen, min_abs_diff = max_abs_diff = svlen > 0? svlen : -svlen;
|
2785
|
+
var rlen = t[3].length;
|
2786
|
+
for (var i = 0; i < s.length; ++i) {
|
2787
|
+
if (/^<\S+>$/.test(s[i])) continue;
|
2788
|
+
var diff = s[i].length - rlen;
|
2789
|
+
var abs_diff = diff > 0? diff : -diff;
|
2790
|
+
min_abs_diff = min_abs_diff < abs_diff? min_abs_diff : abs_diff;
|
2791
|
+
if (max_abs_diff < abs_diff)
|
2792
|
+
max_abs_diff = abs_diff, alen = diff;
|
2793
|
+
}
|
2794
|
+
return [alen, min_abs_diff, max_abs_diff];
|
2795
|
+
}
|
2796
|
+
|
2797
|
+
// Usage: paftools.js sveval [options] <base.vcf> <call.vcf>
// Compare two VCF files of length-changing variants. A variant of one set is
// matched when the other set has a variant of the same type (insertion or
// deletion) inside a window around it, subject to the length test in
// compare_vcf(). Prints three lines:
//   SN  <#base>  <#base matched>  <ratio>
//   PC  <#call>  <#call matched>  <ratio>
//   F1  <value>
function paf_sveval(args)
{
	var c, min_flt = 30, min_size = 50, max_size = 100000, win_size = 500, print_err = false, print_match = false, bed_fn = null;
	var len_diff_ratio = 0.5;
	while ((c = getopt(args, "f:i:x:w:er:pd:")) != null) {
		if (c == 'f') min_flt = paf_parseNum(getopt.arg);
		else if (c == 'i') min_size = paf_parseNum(getopt.arg);
		else if (c == 'x') max_size = paf_parseNum(getopt.arg);
		else if (c == 'w') win_size = paf_parseNum(getopt.arg);
		else if (c == 'd') len_diff_ratio = parseFloat(getopt.arg);
		else if (c == 'r') bed_fn = getopt.arg;
		else if (c == 'e') print_err = true;
		else if (c == 'p') print_match = true;
	}
	if (args.length - getopt.ind < 2) {
		print("Usage: paftools.js sveval [options] <base.vcf> <call.vcf>");
		print("Options:");
		print(" -r FILE confident region in BED []");
		print(" -f INT min length to discard [" + min_flt + "]");
		print(" -i INT min SV length [" + min_size + "]");
		print(" -x INT max SV length [" + max_size + "]");
		print(" -w INT fuzzy windown size [" + win_size + "]");
		print(" -d FLOAT max allele diff if there is a single allele in the window [" + len_diff_ratio + "]");
		print(" -e print errors");
		print(" -p print candidate matches");
		return;
	}

	// Load a BED file into { contig: [[start, end], ...] }, then sort, merge
	// and index each list with the Interval helpers (defined outside this function).
	function read_bed(fn) {
		var buf = new Bytes();
		var file = new File(fn);
		var bed = {};
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			if (bed[t[0]] == null) bed[t[0]] = [];
			bed[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
		}
		file.close();
		buf.destroy();
		for (var x in bed) {
			Interval.sort(bed[x]);
			Interval.merge(bed[x]);
			Interval.index_end(bed[x]);
		}
		return bed;
	}

	var bed = bed_fn != null? read_bed(bed_fn) : null;

	// Load a VCF file into { contig: [[start, end, svlen, abslen], ...] }.
	// Skipped records: inversions, breakends, alleles shorter than min_flt or
	// longer than max_size and, when bed is given, those outside its intervals.
	function read_vcf(fn, bed) {
		var buf = new Bytes();
		var file = new File(fn);
		var v = {};
		while (file.readline(buf) >= 0) {
			var m, t = buf.toString().split("\t");
			if (t[0][0] == '#') continue;
			if (bed != null && bed[t[0]] == null) continue;
			if (t[4] == '<INV>' || t[4] == '<INVDUP>') continue; // no inversion
			if (/[\[\]]/.test(t[4])) continue; // no break points
			var st = parseInt(t[1]) - 1, en = st + t[3].length;
			// parse svlen
			var b = _paf_get_alen(t), svlen = b[0];
			var abslen = svlen == null? 0 : svlen > 0? svlen : -svlen;
			if (abslen < min_flt || abslen > max_size) continue;
			// update end
			if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null)
				en = parseInt(m[2]);
			else if (svlen != null && svlen < 0)
				en = st + (-svlen);
			if (en < st) en = st;
			if (st == en) --st, ++en; // widen an empty interval by one base on each side
			if (bed != null && Interval.find_ovlp(bed[t[0]], st, en).length == 0) continue;
			// insert
			if (v[t[0]] == null) v[t[0]] = [];
			v[t[0]].push([st, en, svlen, abslen]);
		}
		file.close();
		buf.destroy();
		for (var x in v) {
			Interval.sort(v[x]);
			Interval.index_end(v[x]);
		}
		return v;
	}

	// For every variant in v1 of at least min_size, look for a match in v0.
	// Returns [n, m]: variants considered and variants matched. With -e,
	// unmatched variants are printed under `label` (preceded by an "MM" line
	// when a same-type candidate existed but failed the length test); with -p
	// every candidate is printed as an "MA" line.
	function compare_vcf(v0, v1, label) {
		var m = 0, n = 0;
		for (var x in v1) {
			var a1 = v1[x], a0 = v0[x];
			for (var i = 0; i < a1.length; ++i) {
				if (a1[i][3] < min_size) continue;
				++n;
				if (a0 == null) continue;
				// the search window grows by half the variant length on each side
				var ws = win_size + (a1[i][3]>>1);
				var st = a1[i][0] > ws? a1[i][0] - ws : 0;
				// declared locally: this used to be an undeclared (global) assignment
				var b = Interval.find_ovlp(a0, st, a1[i][1] + ws);
				var n_ins = 0, n_del = 0, sv_del = null, sv_ins = null;
				for (var j = 0; j < b.length; ++j) {
					if (b[j][2] < 0) ++n_del, sv_del = -b[j][2];
					else if (b[j][2] > 0) ++n_ins, sv_ins = b[j][2];
					if (print_match)
						print("MA", x, a1[i].slice(0, 3).join("\t"), b[j].slice(0, 3).join("\t"));
				}
				var match = false;
				if (a1[i][2] > 0) { // insertion
					if (n_ins == 1) {
						// a single candidate must be close in length, absolutely or relatively
						var diff = sv_ins - a1[i][3];
						if (diff < 0) diff = -diff;
						if (diff < min_size || diff / a1[i][3] < len_diff_ratio)
							match = true;
					} else if (n_ins > 1) match = true; // multiple insertions; ambiguous
				} else if (a1[i][2] < 0) {
					if (n_del == 1) { // deletion
						var diff = sv_del - a1[i][3];
						if (diff < 0) diff = -diff;
						if (diff < min_size || diff / a1[i][3] < len_diff_ratio)
							match = true;
					} else if (n_del > 1) match = true; // multiple deletions; ambiguous
				}
				if (match) ++m;
				else if (print_err) {
					if ((a1[i][2] > 0 && n_ins > 0) || (a1[i][2] < 0 && n_del > 0))
						print("MM", x, a1[i].slice(0, 3).join("\t"));
					print(label, x, a1[i].slice(0, 3).join("\t"));
				}
			}
		}
		return [n, m];
	}

	var v_base = read_vcf(args[getopt.ind+0], bed);
	var v_call = read_vcf(args[getopt.ind+1], bed);
	var fn = compare_vcf(v_call, v_base, 'FN');
	var fp = compare_vcf(v_base, v_call, 'FP');
	print('SN', fn[0], fn[1], (fn[1] / fn[0]).toFixed(6));
	print('PC', fp[0], fp[1], (fp[1] / fp[0]).toFixed(6));
	// NOTE(review): the value labelled F1 is the arithmetic mean of SN and PC,
	// not the harmonic mean; left as is so reported numbers stay comparable
	print('F1', ((fn[1] / fn[0] + fp[1] / fp[0]) / 2).toFixed(6));
}
|
2934
|
+
|
2935
|
+
// Usage: paftools.js vcfsel [options] <in.vcf>
// Copy a VCF file ("-" for stdin) to stdout, keeping header lines and those
// records whose allele-length range, as computed by _paf_get_alen(), overlaps
// [min_l, max_l]. Returns 1 after printing the usage text when no input is
// given; otherwise returns nothing.
function paf_vcfsel(args)
{
	var c, min_l = 0, max_l = 1<<30;
	while ((c = getopt(args, "l:L:")) != null) {
		if (c == 'l') min_l = parseInt(getopt.arg);
		else if (c == 'L') max_l = parseInt(getopt.arg);
	}

	if (getopt.ind == args.length) {
		print("Usage: paftools.js vcfsel [options] <in.vcf>");
		print("Options:");
		print(" -l INT min allele length difference [" + min_l + "]");
		print(" -L INT max allele length difference [" + max_l + "]");
		return 1;
	}

	// allocated only after the usage check so the early return does not leak the buffer
	var buf = new Bytes();
	var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var m, line = buf.toString();
		if (line[0] == '#') { // header lines pass through unchanged
			print(line);
			continue;
		}
		var t = line.split("\t");
		// st/en only feed the sanity warning below; filtering uses allele lengths
		var st = parseInt(t[1]), en = st + t[3].length - 1;
		if ((m = /(^|;)END=(\d+)/.exec(t[7])) != null)
			en = parseInt(m[2]);
		if (en < st)
			warn("END is smaller than POS: " + en + " < " + st);
		var b = _paf_get_alen(t);
		var min_abs_diff = b[1], max_abs_diff = b[2];
		if (max_abs_diff < min_l || min_abs_diff > max_l)
			continue;
		print(line);
	}
	file.close();
	buf.destroy();
}
|
2972
|
+
|
2973
|
+
// Usage: paftools.js pafcmp [options] <base.paf> <test.paf>
// Compare the alignments of the same queries in two PAF files. Records whose
// line contains "\ttp:A:S" are ignored and consecutive records are grouped by
// query name.
//   base file: only queries with exactly one remaining record are kept
//   test file: the first record of each query is compared with the base record
// Two alignments agree when they are on the same target and their
// intersection is at least min_ovlp times their union. Prints "W" lines for
// disagreeing test alignments, "M" lines for base alignments never seen in the
// test file, and "X" summary lines. Returns 1 after printing the usage text.
function paf_pafcmp(args)
{
	var c, opt = { min_len:5000, min_mapq:10, min_ovlp:0.5 };
	while ((c = getopt(args, "q:")) != null) {
		if (c == 'q') opt.min_mapq = parseInt(getopt.arg);
	}

	if (args.length - getopt.ind < 2) {
		print("Usage: paftools.js pafcmp [options] <base.paf> <test.paf>");
		print("Options:");
		print(" -q INT min mapping quality [" + opt.min_mapq + "]");
		return 1;
	}

	// allocated after the usage check so the early return does not leak the buffer
	var buf = new Bytes();
	// counters; not named `eval`, which is not a legal variable name in strict mode
	var stat = { n_base:0, n_test:0, n_out_high:0, n_out_low:0, n_hit:0, n_wrong:0, n_miss:0 };

	// parse the numeric PAF columns (indices 1-3 and 6-11) of one record in place
	function parse_cols(rec) {
		for (var i = 1; i < 4; ++i)
			rec[i] = parseInt(rec[i]);
		for (var i = 6; i < 12; ++i)
			rec[i] = parseInt(rec[i]);
	}

	// read file fn and call func(base, group) for each run of records sharing a query name
	function scan_paf(fn, base, func) {
		var file = new File(fn);
		warn("Reading " + fn + "...");
		var a = [];
		while (file.readline(buf) >= 0) {
			var line = buf.toString();
			var t = line.split("\t");
			if (/\ttp:A:S/.test(line)) continue;
			if (a.length > 0 && a[0][0] != t[0]) {
				func(base, a);
				a = [];
			}
			a.push(t);
		}
		func(base, a); // last group; empty when the file had no usable record
		file.close();
	}

	// record base[qname] = [target, start, end, mapq, #agreeing, #disagreeing]
	function process_base(base, a) {
		if (a.length != 1) return;
		parse_cols(a[0]);
		if (a[0][1] < opt.min_len) return;
		if (a[0][11] >= opt.min_mapq) ++stat.n_base;
		base[a[0][0]] = [a[0][5], a[0][7], a[0][8], a[0][11], 0, 0];
	}

	// compare the first test record of a query with its base record
	function process_test(base, a) {
		if (a.length == 0) return; // nothing was read for this group
		parse_cols(a[0]);
		if (a[0][1] < opt.min_len) return;
		if (a[0][11] >= opt.min_mapq) ++stat.n_test;
		var c = [a[0][5], a[0][7], a[0][8], a[0][11]];
		if (base[a[0][0]] == null) {
			// these counters live in stat; incrementing fields of opt left the report at 0
			if (c[3] >= opt.min_mapq) ++stat.n_out_high;
			else ++stat.n_out_low;
		} else {
			var b = base[a[0][0]];
			// on different targets: no intersection, union is the sum of both lengths
			var inter = 0, union = (b[2] - b[1]) + (c[2] - c[1]);
			if (b[0] == c[0]) { // same chr
				// min/max form stays correct when one interval contains the other
				var lo = b[1] > c[1]? b[1] : c[1];
				var hi = b[2] < c[2]? b[2] : c[2];
				if (hi > lo) {
					inter = hi - lo;
					union = (b[2] > c[2]? b[2] : c[2]) - (b[1] < c[1]? b[1] : c[1]);
				}
			}
			if (inter >= union * opt.min_ovlp) {
				if (b[3] >= opt.min_mapq) ++stat.n_hit;
				++b[4];
			} else {
				if (b[3] >= opt.min_mapq) {
					print("W", a[0][0], b.slice(0, 4).join("\t"), c.join("\t"));
					++stat.n_wrong;
				}
				++b[5];
			}
		}
	}

	var base = {};
	scan_paf(args[getopt.ind], base, process_base);
	scan_paf(args[getopt.ind+1], base, process_test);

	// confident base alignments that no test record was compared against
	for (var r in base) {
		var b = base[r];
		if (b[3] >= opt.min_mapq && b[4] == 0 && b[5] == 0) {
			++stat.n_miss;
			print("M", r, b.slice(0, 4).join("\t"));
		}
	}

	print("X", stat.n_base + " base alignments with mapQ>=" + opt.min_mapq);
	print("X", stat.n_hit + " base alignments correctly mapped by test");
	print("X", stat.n_wrong + " wrong test alignment");
	print("X", stat.n_miss + " base alignments missing");
	print("X", stat.n_out_high + " additional test alignments with mapQ>=" + opt.min_mapq);

	buf.destroy();
}
|
3086
|
+
|
3087
|
+
/*************************
|
3088
|
+
***** main function *****
|
3089
|
+
*************************/
|
3090
|
+
|
3091
|
+
/**
 * Command-line dispatcher.
 *
 * With no arguments, prints the usage text and calls exit(1). Otherwise the
 * first argument is removed from `args` and used as the command name; the
 * matching paf_* handler receives the remaining arguments. "version" prints
 * paftools_version. An unknown command throws an Error.
 *
 * @param args  array of command-line arguments; modified in place (shift)
 */
function main(args)
{
	if (args.length == 0) {
		var usage = [
			"Usage: paftools.js <command> [arguments]",
			"Commands:",
			" view convert PAF to BLAST-like (for eyeballing) or MAF",
			" splice2bed convert spliced alignment in PAF/SAM to BED12",
			" sam2paf convert SAM to PAF",
			" delta2paf convert MUMmer's delta to PAF",
			" gff2bed convert GTF/GFF3 to BED12",
			"",
			" stat collect basic mapping information in PAF/SAM",
			" asmstat collect basic assembly information",
			" asmgene evaluate gene completeness",
			" misjoin evaluate large-scale misjoins",
			" liftover simplistic liftOver",
			" call call variants from asm-to-ref alignment with the cs tag",
			" bedcov compute the number of bases covered",
			" vcfstat VCF statistics",
			" sveval compare two SV callsets in VCF",
			" version print paftools.js version",
			"",
			" mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ",
			" pafcmp compare two PAF files",
			" mason2fq convert mason2-simulated SAM to FASTQ",
			" pbsim2fq convert PBSIM-simulated MAF to FASTQ",
			" junceval evaluate splice junction consistency with known annotations",
			" ov-eval evaluate read overlap sensitivity using read-to-ref mapping"
		];
		for (var u = 0; u < usage.length; ++u)
			print(usage[u]);
		exit(1);
	}

	var cmd = args.shift();

	// Ordered (name, action) pairs. Actions are thunks so that a handler is only
	// looked up when its command is actually selected.
	var table = [
		['view',       function () { paf_view(args); }],
		['sam2paf',    function () { paf_sam2paf(args); }],
		['delta2paf',  function () { paf_delta2paf(args); }],
		['splice2bed', function () { paf_splice2bed(args); }],
		['gff2bed',    function () { paf_gff2bed(args); }],
		['stat',       function () { paf_stat(args); }],
		['asmstat',    function () { paf_asmstat(args); }],
		['asmgene',    function () { paf_asmgene(args); }],
		['misjoin',    function () { paf_misjoin(args); }],
		['liftover',   function () { paf_liftover(args); }],
		['liftOver',   function () { paf_liftover(args); }],
		['vcfpair',    function () { paf_vcfpair(args); }],
		['call',       function () { paf_call(args); }],
		['mapeval',    function () { paf_mapeval(args); }],
		['pafcmp',     function () { paf_pafcmp(args); }],
		['bedcov',     function () { paf_bedcov(args); }],
		['mason2fq',   function () { paf_mason2fq(args); }],
		['pbsim2fq',   function () { paf_pbsim2fq(args); }],
		['junceval',   function () { paf_junceval(args); }],
		['ov-eval',    function () { paf_ov_eval(args); }],
		['vcfstat',    function () { paf_vcfstat(args); }],
		['sveval',     function () { paf_sveval(args); }],
		['vcfsel',     function () { paf_vcfsel(args); }],
		['version',    function () { print(paftools_version); }]
	];

	for (var k = 0; k < table.length; ++k) {
		if (cmd == table[k][0]) {
			table[k][1]();
			return;
		}
	}
	throw Error("unrecognized command: " + cmd);
}
|
3148
|
+
|
3149
|
+
// Script entry point: run the command dispatcher.
// NOTE(review): `arguments` is not defined in the visible code; presumably it is
// the command-line argument array supplied by the host interpreter -- confirm.
main(arguments);
|