ruby-minigraph 0.0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,1451 @@
|
|
1
|
+
#!/usr/bin/env k8
|
2
|
+
|
3
|
+
/*******************************
|
4
|
+
* Command line option parsing *
|
5
|
+
*******************************/
|
6
|
+
|
7
|
+
// BSD-style command line option scanner.
//
// args: array of command line strings
// ostr: option specification; a letter followed by ':' takes an argument
//
// Returns the next option letter, '?' for an unknown option or a missing
// argument (':' for a missing argument when ostr starts with ':'), or null
// when scanning stops (first non-option word, "--", or end of args).
//
// Scanner state is kept on the function object itself:
//   getopt.ind   - index in args of the word being scanned / next to scan
//   getopt.arg   - argument of the option just returned, or null
//   getopt.place - position inside args[getopt.ind]; -1 means "start a new word"
var getopt = function(args, ostr) {
	var oli; // option letter list index
	if (typeof(getopt.place) == 'undefined')
		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
	if (getopt.place == -1) { // update scanning pointer
		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
			getopt.place = -1;
			return null;
		}
		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
			++getopt.ind;
			getopt.place = -1;
			return null;
		}
	}
	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
		if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
		// The unknown letter was the last one in this word: move on to the next
		// word. Without this, the next call would read the empty string past the
		// end of the word, which indexOf() finds at position 0 of ostr, yielding
		// a phantom option that may even swallow the following argument.
		if (getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
		return '?';
	}
	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
		getopt.arg = null;
		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
	} else { // need an argument
		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
			getopt.arg = args[getopt.ind].substr(getopt.place); // attached: "-dVALUE"
		else if (args.length <= ++getopt.ind) { // no arg
			getopt.place = -1;
			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
			return '?';
		} else getopt.arg = args[getopt.ind]; // white space
		getopt.place = -1;
		++getopt.ind;
	}
	return optopt;
}
|
44
|
+
|
45
|
+
// Sort the intervals in a[] by start and annotate them, in place, as an
// implicit binary tree: a[i] = [start, end] becomes [start, end, max], where
// max is the largest end found in the subtree rooted at index i.
// Returns the level of the top-most node that was filled in (0 for a single
// interval), or -1 when a[] is empty.
function it_index(a) {
	var n = a.length;
	if (n == 0) return -1;
	a.sort(function(p, q) { return p[0] - q[0]; });
	var i, level, tail_max, tail_i;
	// level 0: even indices are leaves, so their subtree max is their own end
	for (i = 0; i < n; i += 2) {
		a[i][2] = a[i][1];
		tail_max = a[i][2];
		tail_i = i;
	}
	for (level = 1; (1 << level) <= n; ++level) {
		var half = 1 << (level - 1);
		var first = (1 << level) - 1, stride = 1 << (level + 1);
		for (i = first; i < n; i += stride) {
			var best = a[i][1];
			var left_max = a[i - half][2];
			if (best < left_max) best = left_max;
			// a right child beyond the array is imaginary; use the running tail max
			var right_max = i + half < n ? a[i + half][2] : tail_max;
			if (best < right_max) best = right_max;
			a[i][2] = best;
		}
		// move the tail pointer to its parent at this level
		tail_i = (tail_i >> level & 1) ? tail_i - half : tail_i + half;
		if (tail_i < n) tail_max = tail_max > a[tail_i][2] ? tail_max : a[tail_i][2];
	}
	return level - 1;
}
|
64
|
+
|
65
|
+
// Collect every interval in a[] that overlaps [st, en), i.e. whose start is
// below en and whose end is above st. a[] must be sorted by start with the
// third element of each entry holding a subtree max end (see it_index()).
// Returns an array of the matching entries (the same objects as in a[]);
// returns [] when a is null/undefined.
function it_overlap(a, st, en) {
	if (a == null) return [];
	var n = a.length, hits = [], todo = [];
	var height = 0;
	while ((1 << height) <= n) ++height;
	--height;
	// each frame is [node index, node level, 1 if the left child was already visited]
	todo.push([(1 << height) - 1, height, 0]);
	while (todo.length) {
		var frame = todo.pop();
		var node = frame[0], lev = frame[1], left_done = frame[2];
		if (lev <= 2) {
			// small subtree: scan its whole index range linearly
			var lo = node >> lev << lev, hi = lo + (1 << (lev + 1)) - 1;
			if (hi >= n) hi = n;
			for (var i = lo; i < hi; ++i) {
				var iv = a[i];
				if (iv[0] < en && st < iv[1]) hits.push(iv);
			}
		} else if (left_done == 0) {
			// revisit this node after its left subtree
			todo.push([node, lev, 1]);
			var child = node - (1 << (lev - 1));
			if (child >= n || a[child][2] > st)
				todo.push([child, lev - 1, 0]);
		} else if (node < n && a[node][0] < en) {
			if (st < a[node][1]) hits.push(a[node]);
			todo.push([node + (1 << (lev - 1)), lev - 1, 0]);
		}
	}
	return hits;
}
|
92
|
+
|
93
|
+
// Tell whether [st, en) lies entirely inside at least one interval of the
// indexed array a[]. Returns false when a is null/undefined.
function it_contained(a, st, en) {
	if (a == null) return false;
	var hits = it_overlap(a, st, en);
	for (var i = 0; i < hits.length; ++i) {
		if (hits[i][0] <= st && en <= hits[i][1])
			return true;
	}
	return false;
}
|
103
|
+
|
104
|
+
/****************************
|
105
|
+
***** mgutils commands *****
|
106
|
+
****************************/
|
107
|
+
|
108
|
+
// renamefa: copy a FASTA file to the output, rewriting each header line to
// ">" + prefix + delimiter + name. Any leading non-space run ending in '#' is
// stripped from the original name first. Non-header lines are printed as is.
// Option -d sets the delimiter (default '#').
function mg_cmd_renamefa(args)
{
	var opt, delim = '#';
	while ((opt = getopt(args, "d:")) != null) {
		if (opt == 'd') delim = getopt.arg;
	}
	if (args.length - getopt.ind < 2) {
		print("Usage: mgutils.js renamefa [-d delimitor] <prefix> <in.fa>");
		return;
	}
	var new_prefix = args[getopt.ind];
	var fp = new File(args[getopt.ind+1]);
	var line_buf = new Bytes();
	while (fp.readline(line_buf) >= 0) {
		if (line_buf[0] == 62) { // 62 is the character code of '>'
			var hdr = /^>(.*)/.exec(line_buf.toString());
			if (hdr == null) throw Error("Wrong FASTA format!");
			var stripped = hdr[1].replace(/^\S+#/, "");
			print(">" + new_prefix + delim + stripped);
		} else {
			print(line_buf);
		}
	}
	fp.close();
	line_buf.destroy();
}
|
134
|
+
|
135
|
+
// joinfa: concatenate the sequences of a FASTA file into one record.
// Sequences shorter than min_len (-l, default 150) are dropped; the kept ones
// are joined with a run of len_n 'N' bases (-n, default 20). The single output
// record is named by -s (default "decoy-cat").
function mg_cmd_joinfa(args)
{
	var c, len_n = 20, min_len = 150, name = "decoy-cat";
	while ((c = getopt(args, "n:l:s:")) != null) {
		if (c == 'l') min_len = parseInt(getopt.arg);
		else if (c == 'n') len_n = parseInt(getopt.arg);
		else if (c == 's') name = getopt.arg;
	}
	if (args.length - getopt.ind < 1) {
		print("Usage: mgutils.js joinfa [options] <in.fa>");
		return;
	}
	// seq: joined output; seq1: the record currently being read; nn: the spacer
	var seq = new Bytes(), seq1 = new Bytes(), nn = new Bytes();
	for (var i = 0; i < len_n; ++i) nn.set(78); // 78 is the character code of 'N'
	var buf = new Bytes();
	var file = new File(args[getopt.ind]);

	// append the current record to the output if it is long enough
	function flush_record() {
		if (seq1.length >= min_len) {
			if (seq.length > 0) seq.set(nn);
			seq.set(seq1);
		}
	}

	while (file.readline(buf) >= 0) {
		if (buf[0] == 62) { // header line ('>'): the previous record is complete
			flush_record();
			seq1.length = 0;
		} else seq1.set(buf);
	}
	flush_record(); // last record has no following header
	print(">" + name);
	print(seq);
	file.close();
	buf.destroy();
	seq.destroy();
	seq1.destroy();
	nn.destroy(); // was previously leaked
}
|
172
|
+
|
173
|
+
// anno: annotate each event of <in.bed> with a repeat type.
//
// <in.bed> needs at least four tab-separated columns: name, start, end and
// event length. Events are keyed as "name_start_end"; the optional annotation
// files (-r, -e, -d, -p, -g, -c) are expected to use that key as their
// sequence name. For every event the merged covered length of each feature is
// computed and a type is chosen from the coverage fractions. One line per
// event is printed: the three key columns, the length (or the bubble columns
// when -b is given), the type, then "feature:covered_length" pairs.
function mg_cmd_anno(args)
{
	var c, min_rm_div = 0.2, min_rm_sc = 300, micro_cap = 6, min_feat_len = 30, min_centro_len = 200, mobile = false, max_mobile_div = 2.0, min_segdup_frac = 0.2;
	var fn_rmout = null, fn_etrf = null, fn_dust = null, fn_gap = null, fn_paf = null, fn_centro = null, fn_bb = null, fn_sd = null;
	while ((c = getopt(args, "e:p:g:d:r:c:l:S:b:s:m")) != null) {
		if (c == 'l') min_feat_len = parseInt(getopt.arg);
		else if (c == 'S') min_segdup_frac = parseFloat(getopt.arg);
		else if (c == 'm') mobile = true;
		else if (c == 'e') fn_etrf = getopt.arg;
		else if (c == 'p') fn_paf = getopt.arg;
		else if (c == 'g') fn_gap = getopt.arg;
		else if (c == 'd') fn_dust = getopt.arg;
		else if (c == 'r') fn_rmout = getopt.arg;
		else if (c == 'c') fn_centro = getopt.arg;
		else if (c == 'b') fn_bb = getopt.arg;
		else if (c == 's') fn_sd = getopt.arg;
	}

	if (args.length - getopt.ind < 1) {
		print("Usage: anno.js [options] <in.bed>");
		print("Options:");
		print(" -l INT min feature length [" + min_feat_len + "]");
		print(" -S FLOAT min segdup length [" + min_segdup_frac + "]");
		print(" -r FILE RepeatMasker .out [null]");
		print(" -g FILE seqtk gap output for stretches of Ns [null]");
		print(" -d FILE minimap2/sdust output for LCRs [null]");
		print(" -e FILE etrf output [null]");
		print(" -p FILE PAF alignment against reference [null]");
		print(" -c FILE dna-brnn centromere results [null]");
		print(" -b FILE bubble file [null]");
		print(" -s FILE segdup file (paste gfa2bed bedcov) [null]");
		print(" -m annotate AluY and L1HS separately");
		exit(1);
	}

	var file, buf = new Bytes();

	// bb:  key -> [event length, {feature -> list of [st, en]}, ...bubble columns]
	// bba: keys in input order
	// seg: segment name -> [t[4], segment length, t[6]] from the -s file
	var bb = {}, bba = [], seg = {};

	file = new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (t.length < 4) continue;
		var key = t[0] + "_" + t[1] + "_" + t[2];
		var len = parseInt(t[3]);
		if (len < parseInt(t[2]) - parseInt(t[1]))
			throw Error("ERROR: event length smaller than interval length");
		bb[key] = [len, {}];
		bba.push(key);
	}
	file.close();

	if (fn_bb) {
		if (fn_sd) { // generated by "paste <(gfatools gfa2bed) <(bedtk cov segdup.bed gfa2bed.bed) | cut -f1-5,9,10"
			file = new File(fn_sd);
			while (file.readline(buf) >= 0) {
				var t = buf.toString().split("\t");
				seg[t[3]] = [parseInt(t[4]), parseInt(t[2]) - parseInt(t[1]), parseInt(t[6])];
			}
			file.close();
		}
		file = new File(fn_bb); // parse "gfatools bubble" output
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			var key = t[0] + "_" + t[1] + "_" + t[2];
			if (key in bb) {
				// columns 4..11 of the bubble line become bb[key][2..9]
				bb[key].push(t[3], t[4], t[5], t[6], t[7], t[8], t[9], t[10]);
				var s = t[11].split(","), tot_len = 0, tot_sd = 0, ref_len = 0;
				var dup = {};
				// the first and last listed segments are skipped; each inner segment counts once
				for (var i = 1; i < s.length - 1; ++i) {
					if (seg[s[i]] == null) continue;
					if (dup[s[i]]) continue;
					dup[s[i]] = 1;
					tot_len += seg[s[i]][1], tot_sd += seg[s[i]][2];
					if (seg[s[i]][0] == 0) // NOTE(review): presumably rank 0 marks a reference segment - confirm
						ref_len += seg[s[i]][1];
				}
				// overwrite three of the bubble columns with the totals computed above
				bb[key][7] = tot_len;
				bb[key][8] = tot_sd;
				bb[key][9] = ref_len;
			}
		}
		file.close();
	}

	if (fn_rmout) { // parse RepeatMasker output
		var motif0 = "GGAAT", motif_hash = {}, motif_mut_hash = {};
		{ // dealing with possible (GGAAT)n rotations and mutations
			var comp_tbl = { 'A':'T', 'T':'A', 'C':'G', 'G':'C' };
			var motif = [motif0], motif_alt = [];

			// reverse complement
			for (var i = 0; i < motif.length; ++i) {
				var x = motif[i], y = "";
				for (var j = x.length - 1; j >= 0; --j) {
					y += comp_tbl[x[j]];
				}
				motif_alt.push(y);
			}
			for (var i = 0; i < motif_alt.length; ++i)
				motif.push(motif_alt[i]);

			// rotate
			motif_alt = [];
			for (var i = 0; i < motif.length; ++i) {
				var x = motif[i];
				for (var j = 1; j < x.length; ++j)
					motif_alt.push(x.substr(j) + x.substr(0, j));
			}
			for (var i = 0; i < motif_alt.length; ++i)
				motif.push(motif_alt[i]);

			for (var i = 0; i < motif.length; ++i) motif_hash[motif[i]] = i;

			// mutate: every single-base substitution of every motif variant
			var bases = [ 'A', 'C', 'G', 'T' ];
			for (var x in motif_hash) {
				for (var i = 0; i < x.length; ++i) {
					for (var j = 0; j < bases.length; ++j) {
						var a = x.split("");
						if (a[i] == bases[j]) continue;
						a[i] = bases[j];
						motif_mut_hash[a.join("")] = 1;
					}
				}
			}
		}

		// store the intervals of a run of RepeatMasker lines that share one query name
		function process_rm_line(bb, lines) {
			if (lines.length == 0) return;
			var key = lines[0][4];
			if (bb[key] == null) throw Error("ERROR: missing key: " + key);
			var h = bb[key][1];
			for (var i = 0; i < lines.length; ++i) {
				var t = lines[i];
				var st = parseInt(t[5]) - 1, en = parseInt(t[6]); // start made 0-based
				if (h[t[10]] == null) h[t[10]] = [];
				h[t[10]].push([st, en]);
			}
		}

		file = new File(fn_rmout);
		var lines = [];
		while (file.readline(buf) >= 0) {
			var line = buf.toString();
			var l2 = line.replace(/^\s+/, "");
			var m4, t = l2.split(/\s+/);
			if (t.length < 15) continue;
			// normalize the repeat class in t[10]; t[9] is the repeat name
			if (t[9] == "ALR/Alpha") t[10] = "alpha";
			else if (t[9] == "HSATII") t[10] = "hsat2/3";
			else if (/^LTR\/ERV/.test(t[10])) t[10] = 'LTR/ERV';
			else if (/^LTR/.test(t[10])) t[10] = 'LTR/misc';
			else if (/^DNA/.test(t[10])) t[10] = 'DNA/misc';
			else if (/rRNA|scRNA|snRNA|srpRNA/.test(t[10])) t[10] = 'RNAmisc';
			else if (/^LINE/.test(t[10]) && t[10] != "LINE/L1") t[10] = 'LINE/misc';
			else if ((t[10] == "Simple_repeat" || t[10] == "Satellite") && ((m4 = /^\(([ACGT]+)\)n/.exec(t[9])) != null)) {
				if (motif_hash[m4[1]] != null) {
					t[10] = "hsat2/3";
				} else if (m4[1].length % motif0.length == 0) {
					// split the repeat unit into motif-sized pieces and count exact and one-off matches
					var c = 0, c_mut = 0;
					for (var j = 0; j < m4[1].length; j += motif0.length) {
						// substr() takes (start, length); the previous "j + motif0.length"
						// over-extended every piece after the first one
						var s = m4[1].substr(j, motif0.length);
						if (motif_hash[s] != null)
							++c;
						else if (motif_mut_hash[s] != null)
							++c_mut;
					}
					if (c > 0 && (c + c_mut) * motif0.length == m4[1].length)
						t[10] = "hsat2/3";
				}
			}

			if (mobile) {
				if (t[10] == "LINE/L1" && t[9] == "L1HS" && parseFloat(t[1]) < max_mobile_div) t[10] = "LINE/L1HS";
				if (t[10] == "SINE/Alu" && /^AluY/.test(t[9]) && parseFloat(t[1]) < max_mobile_div) t[10] = "SINE/AluY";
			}
			if (t[10] == 'Simple_repeat' || t[10] == 'Low_complexity') t[10] = 'LCR';
			if (t[10] != 'LCR') {
				// score/divergence filter, currently disabled:
				// if (parseInt(t[0]) < min_rm_sc) continue;
				// if (parseInt(t[1])/100 > min_rm_div) continue;
			}
			if (lines.length > 0 && lines[0][4] != t[4]) {
				process_rm_line(bb, lines);
				lines = [];
			}
			lines.push(t);
		}
		if (lines.length > 0) process_rm_line(bb, lines);
		file.close();

		// build pooled pseudo-features; the leading '_' keeps them out of the sums below
		for (var i = 0; i < bba.length; ++i) {
			var h = bb[bba[i]][1], a = [], b = [], c_alu = [], c_l1 = [];
			for (var key in h) {
				if (/^(DNA|SINE|LINE|Retroposon|LTR)/.test(key))
					for (var j = 0; j < h[key].length; ++j)
						a.push(h[key][j]);
				if (/^(Satellite|hsat2\/3|alpha)/.test(key))
					for (var j = 0; j < h[key].length; ++j)
						b.push(h[key][j]);
				if (/^(SINE\/Alu)/.test(key))
					for (var j = 0; j < h[key].length; ++j)
						c_alu.push(h[key][j]);
				if (/^(LINE\/L1)/.test(key))
					for (var j = 0; j < h[key].length; ++j)
						c_l1.push(h[key][j]);
			}
			if (a.length) h['_inter'] = a;
			if (b.length) h['_sat'] = b;
			if (c_alu.length) h['_alu'] = c_alu;
			if (c_l1.length) h['_l1'] = c_l1;
		}
	}

	if (fn_etrf) { // parse etrf output
		file = new File(fn_etrf);
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			var l = parseInt(t[4]);
			if (l == 1) continue;
			var anno = l <= micro_cap? 'micro' : 'mini';
			if (bb[t[0]][1][anno] == null)
				bb[t[0]][1][anno] = [];
			var st = parseInt(t[1]), en = parseInt(t[2]);
			bb[t[0]][1][anno].push([st, en]);
			// every tandem repeat also counts as LCR
			if (bb[t[0]][1]['LCR'] == null)
				bb[t[0]][1]['LCR'] = [];
			bb[t[0]][1]['LCR'].push([st, en]);
		}
		file.close();
	}

	if (fn_dust) { // parse minimap2/sdust output
		file = new File(fn_dust);
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			var anno = 'LCR';
			if (bb[t[0]][1][anno] == null)
				bb[t[0]][1][anno] = [];
			bb[t[0]][1][anno].push([parseInt(t[1]), parseInt(t[2])]);
		}
		file.close();
	}

	if (fn_paf) { // parse bubble-to-reference PAF for self alignment
		file = new File(fn_paf);
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			var anno = 'self';
			if (bb[t[0]][1][anno] == null)
				bb[t[0]][1][anno] = [];
			bb[t[0]][1][anno].push([parseInt(t[2]), parseInt(t[3])]);
		}
		file.close();
	}

	if (fn_gap) { // parse assembly gaps, generated by "seqtk gap"
		file = new File(fn_gap);
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			var anno = 'gap';
			if (bb[t[0]][1][anno] == null)
				bb[t[0]][1][anno] = [];
			bb[t[0]][1][anno].push([parseInt(t[1]), parseInt(t[2])]);
		}
		file.close();
	}

	if (fn_centro) { // parse centromere results given with -c
		file = new File(fn_centro);
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			var anno = t[3] == '1'? 'hsat2/3' : 'alpha';
			if (bb[t[0]][1][anno] == null)
				bb[t[0]][1][anno] = [];
			var st = parseInt(t[1]), en = parseInt(t[2]);
			if (en - st >= min_centro_len)
				bb[t[0]][1][anno].push([st, en]);
		}
		file.close();
	}

	for (var i = 0; i < bba.length; ++i) {
		var m, key = bba[i], h = bb[key][1], len = bb[key][0];
		if ((m = /^(\S+)_(\d+)_(\d+)/.exec(key)) == null)
			throw Error("Bug!");
		var x = {}, t = [m[1], m[2], m[3]];
		if (fn_bb) t.push(bb[key][2], bb[key][3], bb[key][4], bb[key][5], bb[key][6], bb[key][7], bb[key][8], bb[key][9]);
		else t.push(len);
		for (var c in h) { // calculated the merged length of each feature
			var s, st = 0, en = 0, cov = 0;
			s = h[c].sort(function(a, b) { return a[0] - b[0]; });
			for (var j = 0; j < s.length; ++j) {
				if (s[j][0] > en) { // gap before this interval: close the current run
					cov += en - st;
					st = s[j][0], en = s[j][1];
				} else en = en > s[j][1]? en : s[j][1];
			}
			cov += en - st;
			if (cov >= min_feat_len)
				x[c] = cov;
		}
		var type = "none";
		// max/max_c: largest feature and its name; max2/max_c2: the runner-up
		var max = 0, max2 = 0, max_c2 = null, max_c = null, sum = 0, sum_misc = 0;
		var lcr = x['LCR'] == null? 0 : x['LCR'];
		var self_len = x['self'] == null? 0 : x['self'];
		for (var c in x) {
			if (c == 'LCR' || c == 'self') continue;
			if (c[0] == '_') continue;
			sum += x[c];
			if (c != 'mini' && c != 'micro') sum_misc += x[c];
			if (max < x[c]) max2 = max, max_c2 = max_c, max = x[c], max_c = c;
			else if (max2 < x[c]) max2 = x[c], max_c2 = c;
		}
		if (max >= len * 0.7) {
			type = max_c;
		} else if (lcr >= len * 0.7) {
			type = 'lcr';
			if (max_c == 'mini' || max_c == 'micro') {
				var y = x['mini'] == null? 0 : x['mini'];
				y += x['micro'] == null? 0 : x['micro'];
				if (max >= y * 0.7) type = max_c;
			}
		} else if ((max_c == 'mini' || max_c == 'micro') && max2 < max * 0.1) {
			type = max_c;
		} else if (x['_alu'] != null && x['_alu'] >= len * 0.7) {
			type = 'SINE/Alu';
		} else if (x['_l1'] != null && x['_l1'] >= len * 0.7) {
			type = 'LINE/L1';
		} else if (x['_inter'] != null && x['_inter'] >= len * 0.7) {
			type = 'inter';
		} else if (x['_sat'] != null && x['_sat'] >= len * 0.5) {
			type = 'Satellite';
		} else if (sum_misc + lcr >= len * 0.7) {
			type = 'mixed';
		} else if (sum + lcr > len * 0.05) {
			type = 'partial';
		} else if (self_len >= len * 0.5) {
			type = 'self';
		}
		// with -b, t[8] and t[9] are the tot_len and tot_sd totals stored in bb[key][7] and bb[key][8]
		if ((type == 'partial' || type == 'self' || type == 'none' || type == 'mixed') && fn_bb && t[8] >= 1000 && t[9] >= t[8] * min_segdup_frac)
			type = 'segdup';
		t.push(type);
		for (var c in x)
			t.push(c + ':' + x[c]);
		print(t.join("\t"));
	}

	buf.destroy();
}
|
524
|
+
|
525
|
+
// Map an annotation label to a numbered summary category such as "01_Alu" or
// "30_Low-repeat". Labels that match nothing fall into "20_Other-repeat".
function mg_classify_repeat(anno) {
	// labels that are matched exactly
	var exact = {
		"mini": "11_VNTR",
		"micro": "12_STR",
		"lcr": "13_Other-LCR",
		"LINE/L1": "02_L1",
		"LINE/L1HS": "02_L1",
		"SINE/Alu": "01_Alu",
		"SINE/AluY": "01_Alu",
		"Retroposon/SVA": "03_SVA",
		"LTR/ERV": "04_ERV",
		"inter": "05_Other-TE",
		"alpha": "10_Satellite",
		"hsat2/3": "10_Satellite",
		"_sat": "10_Satellite",
		"self": "30_Low-repeat",
		"none": "30_Low-repeat",
		"mixed": "20_Other-repeat",
		"segdup": "21_SegDup",
		"partial": "30_Low-repeat"
	};
	if (Object.prototype.hasOwnProperty.call(exact, anno)) return exact[anno];
	// remaining transposable-element and satellite classes are matched by prefix
	if (/^(DNA|LINE|SINE|LTR)/.test(anno)) return "05_Other-TE";
	if (/^Satellite/.test(anno)) return "10_Satellite";
	return "20_Other-repeat";
}
|
543
|
+
|
544
|
+
// anno2tbl: summarize annotated events (read from args[0], or from the
// default File() when no argument is given) into one row per repeat category.
// Events typed "gap" and those on chrUn/_random sequences are ignored.
// Per category, events are binned by column 5 capped at 4; for each bin the
// row accumulates an event count, the sum of column 8, and the sum of
// (column 9 - column 11). The row is printed without its first two slots.
function mg_cmd_anno2tbl(args)
{
	var buf = new Bytes();
	var file = args.length == 0? new File() : new File(args[0]);
	var tbl = {};
	while (file.readline(buf) >= 0) {
		var f = buf.toString().split("\t");
		for (var i = 1; i <= 7; ++i) f[i] = parseInt(f[i]);
		if (f[11] == "gap") continue;
		if (/chrUn|_random/.test(f[0])) continue;
		var bin = f[4] < 4? f[4] : 4;
		var cls = mg_classify_repeat(f[11]);
		if (tbl[cls] == null) tbl[cls] = [0, null, 0, 0, 0, 0, 0, 0, 0, 0, 0];
		var row = tbl[cls];
		++row[bin];
		row[bin+3] += f[7];
		if (f[8] >= 0 && f[10] >= 0) row[bin+6] += f[8] - f[10];
	}

	file.close();
	buf.destroy();

	for (var cls in tbl) {
		// the label is the category name without its numeric sort prefix
		var label = cls.replace(/^[0-9]+_/, "");
		print(cls, label, tbl[cls].slice(2).join("\t"));
	}
}
|
572
|
+
|
573
|
+
// paf2bl: print the target name, start and end (PAF columns 6, 8, 9) of
// selected alignments. A line is skipped when it carries tp:A:S or tp:A:I,
// when column 12 is below min_mapq, when column 11 is below min_len, or when
// it has no de:f: tag. Without -s the de value must lie within
// [min_de, max_de] (-d sets min_de); with -s it must not exceed sub_de.
function mg_cmd_paf2bl(args)
{
	var opt, min_de = 0.01, max_de = 0.1, sub_de = 0.002, min_mapq = 5, min_len = 500, is_sub = false;
	while ((opt = getopt(args, "d:s")) != null) {
		if (opt == 'd') min_de = parseFloat(getopt.arg);
		else if (opt == 's') is_sub = true;
	}
	if (args.length - getopt.ind < 1) {
		print("Usage: mgutils.js paf2bl <ins.paf>");
		print("Note: bedtk sub <(mgutils.js paf2bl ins.paf; cat bl100.bed) <(../mgutils.js paf2bl -s ins.paf) | bedtk merge");
		return;
	}
	var file = new File(args[getopt.ind]);
	var buf = new Bytes();
	while (file.readline(buf) >= 0) {
		var line = buf.toString();
		var t = line.split("\t");
		if (/\ttp:A:[SI]/.test(line)) continue;
		if (parseInt(t[11]) < min_mapq) continue;
		if (parseInt(t[10]) < min_len) continue;
		var tag = /\tde:f:(\S+)/.exec(line);
		if (tag == null) continue;
		var de = parseFloat(tag[1]);
		var rejected = is_sub? de > sub_de : (de < min_de || de > max_de);
		if (rejected) continue;
		print(t[5], t[7], t[8]);
	}
	buf.destroy();
	file.close();
}
|
606
|
+
|
607
|
+
// stableGaf: rewrite the path column of a GAF file from segment names to
// stable coordinates taken from the SN/SO/SR (and LN) tags of the GFA S-lines.
// Consecutive path steps that are adjacent on the same stable sequence, with
// the same orientation and rank, are merged. A path that collapses to a single
// interval on a sequence having rank-0 segments is written as that sequence
// name with shifted coordinates; otherwise the path becomes a list of
// "<name:start-end" / ">name:start-end" steps.
// The GAF is read from the second argument, or from the default File() when
// it is omitted.
function mg_cmd_stableGaf(args)
{
	var opt;
	while ((opt = getopt(args, "")) != null) {
	}
	if (args.length - getopt.ind < 1) {
		print("Usage: mgutils.js stableGaf <graph.gfa> <aln.gaf>");
		return;
	}

	var re_tag = /\t(LN|SN|SO|SR):[Zi]:(\S+)/g;
	var re_step = /([><])([^\s><]+)/g;
	var file, buf = new Bytes();

	// segh:    segment name -> [stable name, start, end, rank]
	// pri_len: stable name -> largest end seen among its rank-0 segments
	var pri_len = {}, segh = {};
	file = new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var rec = /^S\t(\S+)\t(\S+)(\t.*)/.exec(buf.toString());
		if (rec == null) continue;
		var seg = rec[1], tags = rec[3];
		var len = rec[2] == '*'? 0 : rec[2].length;
		var sn = null, so = -1, sr = -1, tag;
		while ((tag = re_tag.exec(tags)) != null) {
			var val = tag[2];
			switch (tag[1]) {
			case "LN": len = parseInt(val); break;
			case "SN": sn = val; break;
			case "SO": so = parseInt(val); break;
			case "SR": sr = parseInt(val); break;
			}
		}
		if (sn == null || so < 0 || sr < 0 || len <= 0)
			throw Error("failed to parse tags '" + tags + "'");
		segh[seg] = [sn, so, so + len, sr];
		if (sr == 0) {
			if (pri_len[sn] == null) pri_len[sn] = 0;
			pri_len[sn] = pri_len[sn] > so + len? pri_len[sn] : so + len;
		}
	}
	file.close();

	file = args.length - getopt.ind < 2? new File() : new File(args[getopt.ind+1]);
	while (file.readline(buf) >= 0) {
		var m = /^(\S+)\t(\d+\t\d+\t\d+)\t([+-])\t(\S+)\t(\d+)\t(\d+)\t(\d+)\t(.*)/.exec(buf.toString());
		if (m == null)
			continue;
		// steps: list of [orientation, stable name, start, end, rank]
		var step, steps = [];
		while ((step = re_step.exec(m[4])) != null) {
			var info = segh[step[2]];
			if (info == null)
				throw Error("failed to find segment '" + step[2] + "'");
			var merged = false;
			if (steps.length) {
				var prev = steps[steps.length - 1];
				if (prev[0] == step[1] && info[3] == prev[4] && info[0] == prev[1]) {
					if (prev[0] == '>') {
						// forward: extend the previous step to the right
						if (info[1] == prev[3]) prev[3] = info[2], merged = true;
					} else {
						// reverse: extend the previous step to the left
						if (info[2] == prev[2]) prev[2] = info[1], merged = true;
					}
				}
			}
			if (!merged) steps.push([step[1], info[0], info[1], info[2], info[3]]);
		}
		var path_len = 0, path = "";
		for (var i = 0; i < steps.length; ++i)
			path_len += steps[i][3] - steps[i][2];
		if (path_len != parseInt(m[5]))
			throw Error("inconsistent path length for '" + m[1] + "': " + path_len + "!=" + m[5]);
		var strand = m[3], p_st = m[6], p_en = m[7];
		if (steps.length == 1 && pri_len[steps[0][1]] != null) {
			var only = steps[0];
			p_st = parseInt(p_st);
			p_en = parseInt(p_en);
			if (only[0] == '>') {
				p_st += only[2], p_en += only[2];
			} else {
				strand = strand == '+'? '-' : '+';
				// NOTE(review): the "- 1" terms are kept exactly as in the original;
				// confirm they match the intended coordinate convention
				var flipped_st = only[2] + (path_len - 1 - p_en);
				var flipped_en = only[2] + (path_len - 1 - p_st);
				p_st = flipped_st, p_en = flipped_en;
			}
			path_len = pri_len[only[1]];
			path = only[1];
		} else {
			var parts = [];
			for (var i = 0; i < steps.length; ++i)
				parts.push(steps[i][0] + steps[i][1] + ':' + steps[i][2] + '-' + steps[i][3]);
			path = parts.join("");
		}
		print(m[1], m[2], strand, path, path_len, p_st, p_en, m[8]);
	}
	file.close();
	buf.destroy();
}
|
695
|
+
|
696
|
+
// subgaf: print the lines of <in.gaf> whose path overlaps region <reg>, given
// as "ctg:start-end". The path column is either a plain sequence name or a
// list of ">name:start-end" / "<name:start-end" steps; the first step is
// clipped by the path start and the step that reaches the path length is
// clipped by the path end before the overlap test.
// Throws an Error when <reg> cannot be parsed.
function mg_cmd_subgaf(args) // FIXME: this is BUGGY!!!
{
	// NOTE(review): the FIXME above is from the original author. The clipping
	// below never consults the step orientation captured in m[1]; that may be
	// the problem it refers to - confirm before relying on this command.
	if (args.length < 2) {
		print("Usage: mgutils.js subgaf <in.gaf> <reg>");
		exit(1);
	}

	var m, ctg, st, en;
	if ((m = /^(\S+):(\S+)-(\S+)/.exec(args[1])) != null)
		ctg = m[1], st = parseInt(m[2]), en = parseInt(m[3]);
	// a malformed region used to leave ctg/st/en unset and silently match nothing
	if (m == null || isNaN(st) || isNaN(en))
		throw Error("failed to parse region '" + args[1] + "'");

	var buf = new Bytes();
	var file = new File(args[0]);
	var re = /([><])([^\s><]+):(\d+)-(\d+)/g;

	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (t.length < 9) continue; // blank or truncated line: t[5][0] below would throw
		// l: path length; s, e: start and end on the path
		var l = parseInt(t[6]), s = parseInt(t[7]), e = parseInt(t[8]);
		var regs = [];
		if (t[5][0] == '>' || t[5][0] == '<') {
			var m, x = 0; // x: total length of the steps seen so far
			while ((m = re.exec(t[5])) != null) {
				var a = parseInt(m[3]), b = parseInt(m[4]), c = b - a;
				if (x == 0) {
					if (b - a <= s) throw Error("Inconsistent!");
					a += s;
				}
				if (x + c == l) b -= l - e;
				regs.push([m[2], a, b]);
				x += c;
			}
		} else {
			regs.push([t[5], s, e]);
		}
		var hit = false;
		for (var i = 0; i < regs.length; ++i) {
			if (regs[i][0] == ctg && regs[i][2] > st && en > regs[i][1])
				hit = true;
		}
		if (hit) print(buf);
	}

	file.close();
	buf.destroy();
}
|
743
|
+
|
744
|
+
/**
 * sveval: compare called variants against a baseline VCF inside confident
 * regions and print false-negative / false-positive counts and rates.
 *
 * Positional arguments (after options): <true.vcf> <true.bed> <call.txt>.
 * Options: -f flank, -v min length evaluated, -t min length loaded from the
 * VCF, -s min call score, -a keep contigs named X/Y/chrX/chrY, -e print each
 * FN/FP, -F drop VCF records whose FILTER is neither "." nor "PASS".
 *
 * Output rows are "NA/ND/NI" (baseline: all / negative length change /
 * other) and "PA/PD/PI" (calls), each with errors, total and ratio.
 */
function mg_cmd_sveval(args)
{
	var c, flank = 100, min_var_len = 100, min_test_len = 50, min_sc = 20.0, non_chr = false, out_err = false, flt_vcf = false;
	while ((c = getopt(args, "f:v:t:s:aeF")) != null) {
		if (c == 'f') flank = parseInt(getopt.arg);
		else if (c == 'v') min_var_len = parseInt(getopt.arg);
		else if (c == 't') min_test_len = parseInt(getopt.arg);
		else if (c == 's') min_sc = parseFloat(getopt.arg);
		else if (c == 'a') non_chr = true;
		else if (c == 'e') out_err = true;
		else if (c == 'F') flt_vcf = true;
	}
	if (args.length - getopt.ind < 3) {
		print("Usage: mgutils.js sveval <true.vcf> <true.bed> <call.txt>");
		print("Options:");
		print(" -f INT length of flanking regions [" + flank + "]");
		print(" -v INT min INDEL length [" + min_var_len + "]");
		print(" -t INT min true INDEL length [" + min_test_len + "]");
		print(" -s FLOAT min called score [" + min_sc + "]");
		print(" -a include chrX and chrY");
		print(" -e print errors");
		print(" -F ignore VCF records that fail FILTER");
		exit(1);
	}

	var file, buf = new Bytes();
	var t, i, l, d, st, en, max_diff, max_ev, ctg, v, sub, b;

	// parse true.bed
	warn("Reading confident regions...");
	var bed = {}
	file = new File(args[getopt.ind + 1]);
	while (file.readline(buf) >= 0) {
		t = buf.toString().split("\t");
		if (t.length < 3) continue;
		if (!non_chr && /^(chr)?[XY]$/.test(t[0])) continue;
		if (bed[t[0]] == null) bed[t[0]] = [];
		bed[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
	}
	file.close();
	for (ctg in bed) it_index(bed[ctg]);

	// parse true.vcf
	warn("Reading baseline variants...");
	var vcf = {};
	file = new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		t = buf.toString().split("\t");
		if (t[0][0] == '#') continue;
		if (t.length < 10) continue;
		var flt = (t[6] != '.' && t[6] != 'PASS');
		if (flt_vcf && flt) continue;
		if (bed[t[0]] == null) continue;
		var ref = t[3];
		st = parseInt(t[1]) - 1;
		en = st + ref.length;
		var al = t[4].split(",");
		al.unshift(ref); // al[k] is now the allele with genotype index k
		// cheap pre-filter on the longest length change over all ALT alleles
		max_diff = 0;
		for (i = 1; i < al.length; ++i) {
			l = al[i].length - ref.length;
			if (l < 0) l = -l;
			if (max_diff < l) max_diff = l;
		}
		if (max_diff < min_test_len) continue;
		var s = t[9].split(':');
		if (s.length == 0) continue;
		var gt = s[0].split(/[|\/]/);
		// Loose array-to-number comparison kept on purpose: it is true when
		// the GT string is empty or a lone "0", i.e. nothing to evaluate.
		if (gt == 0) continue;
		// now only consider alleles actually present in the genotype
		max_ev = 0;
		max_diff = 0;
		for (i = 0; i < gt.length; ++i) {
			if (gt[i] == '.') continue;
			var k = parseInt(gt[i]);
			if (al[k] == null) continue; // malformed or out-of-range allele index
			l = al[k].length - ref.length;
			d = l > 0? l : -l;
			if (max_diff < d) max_diff = d, max_ev = l;
		}
		if (max_diff < min_test_len) continue;
		if (vcf[t[0]] == null) vcf[t[0]] = [];
		vcf[t[0]].push([st, en, -1, max_diff, max_ev, flt, s[0]]);
	}
	file.close();
	for (ctg in vcf) it_index(vcf[ctg]);

	// parse rst.txt
	warn("Reading gt results...");
	var rst = {};
	file = new File(args[getopt.ind + 2]);
	while (file.readline(buf) >= 0) {
		t = buf.toString().split("\t");
		if (t.length < 8) continue; // need at least the reference allele column
		if (parseFloat(t[3]) < min_sc) continue;
		if (bed[t[0]] == null) continue;
		if (rst[t[0]] == null) rst[t[0]] = [];
		var ref_len = t[7] == '*'? 0 : t[7].length;
		max_diff = 0, max_ev = 0;
		for (i = 8; i < t.length; ++i) {
			// was t[8].length: every allele was measured with the first one's length
			var alt_len = t[i] == '*'? 0 : t[i].length;
			l = alt_len - ref_len;
			d = l > 0? l : -l;
			if (max_diff < d) max_diff = d, max_ev = l;
		}
		st = parseInt(t[1]), en = parseInt(t[2]);
		rst[t[0]].push([st, en, -1, max_diff, max_ev]);
	}
	file.close();
	for (ctg in rst) it_index(rst[ctg]);

	// sensitivity
	var n_vcf = [0, 0, 0], fn = [0, 0, 0];
	for (ctg in vcf) {
		for (i = 0; i < vcf[ctg].length; ++i) {
			v = vcf[ctg][i];
			if (v[3] < min_var_len) continue;
			if (v[5]) continue; // filtered baseline records are not scored
			st = v[0] - flank, en = v[1] + flank;
			if (st < 0) st = 0;
			if (!it_contained(bed[ctg], st, en)) continue;
			sub = v[4] < 0? 1 : 2;
			++n_vcf[0], ++n_vcf[sub];
			// a contig may have baseline variants but no calls at all
			b = rst[ctg] != null? it_overlap(rst[ctg], st, en) : [];
			if (b.length == 0) {
				if (out_err) print("FN", ctg, v[0], v[1], v[4], v[6]);
				++fn[0], ++fn[sub];
			}
		}
	}

	// specificity
	var n_rst = [0, 0, 0], fp = [0, 0, 0];
	for (ctg in rst) {
		for (i = 0; i < rst[ctg].length; ++i) {
			v = rst[ctg][i];
			if (v[3] < min_var_len) continue;
			st = v[0] - flank, en = v[1] + flank;
			if (st < 0) st = 0;
			if (!it_contained(bed[ctg], st, en)) continue;
			sub = v[4] < 0? 1 : 2;
			++n_rst[0], ++n_rst[sub];
			// a contig may have calls but no baseline variants at all
			b = vcf[ctg] != null? it_overlap(vcf[ctg], st, en) : [];
			if (b.length == 0) {
				if (out_err) print("FP", ctg, v[0], v[1], v[4]);
				++fp[0], ++fp[sub];
			}
		}
	}

	print("NA", fn[0], n_vcf[0], (fn[0]/n_vcf[0]).toFixed(4));
	print("ND", fn[1], n_vcf[1], (fn[1]/n_vcf[1]).toFixed(4));
	print("NI", fn[2], n_vcf[2], (fn[2]/n_vcf[2]).toFixed(4));
	print("PA", fp[0], n_rst[0], (fp[0]/n_rst[0]).toFixed(4));
	print("PD", fp[1], n_rst[1], (fp[1]/n_rst[1]).toFixed(4));
	print("PI", fp[2], n_rst[2], (fp[2]/n_rst[2]).toFixed(4));
}
|
895
|
+
|
896
|
+
/**
 * extractseg: for each record group in the input file(s), report the span
 * between two named segments.
 *
 * Positional arguments (after options): <seg1> <seg2> <in.gaf> [...].
 * Options: -l INT minimum length applied to columns 4-3 and 9-8 of a header
 * line; -e switch which pair of coordinates is printed.
 *
 * A line whose first column is not "*" starts a new group; "*" lines that
 * follow belong to it. Column 2 of a "*" line is an orientation character
 * followed by the segment name.
 */
function mg_cmd_extractseg(args)
{
	// Emit one output line for a finished group, provided both segments were seen.
	function process(ctg, first, last, is_end) {
		if (ctg == null || first[0] == null || first[1] == null) return;
		var p0 = first[0][7], p1 = first[1][7];
		if (p0 == p1) return;
		var fwd = p0 < p1;           // seg1 comes first on the contig
		var up = fwd? 0 : 1;         // index of the segment seen first
		var down = fwd? 1 : 0;       // index of the segment seen second
		// the last hit of the upstream segment must precede the downstream one
		if (last[up][7] >= first[down][7]) return;
		var strand = fwd? '+' : '-';
		if (is_end) print(ctg, last[up][8], first[down][7], '*', 0, strand);
		else print(ctg, last[up][7], first[down][8], '*', 0, strand);
	}

	var opt_c, min_len = 100000, is_end = false;
	while ((opt_c = getopt(args, "el:")) != null) {
		if (opt_c == 'l') min_len = parseInt(getopt.arg);
		else if (opt_c == 'e') is_end = true;
	}
	if (args.length - getopt.ind < 3) {
		print("Usage: mgutils.js extractseg <seg1> <seg2> <in.gaf> [...]");
		return;
	}

	var seg = [args[getopt.ind], args[getopt.ind+1]];
	var buf = new Bytes();
	for (var fi = getopt.ind + 2; fi < args.length; ++fi) {
		var file = new File(args[fi]);
		var skip = false;
		var first = [null, null], last = [null, null], ctg = null;
		while (file.readline(buf) >= 0) {
			var t = buf.toString().split("\t");
			if (t[0] != "*") {
				// header line: flush the previous group and start a new one
				process(ctg, first, last, is_end);
				var span_a = parseInt(t[3]) - parseInt(t[2]);
				var span_b = parseInt(t[8]) - parseInt(t[7]);
				skip = (span_a < min_len || span_b < min_len);
				first = [null, null];
				last = [null, null];
				ctg = t[0];
				continue;
			}
			if (skip) continue;
			var name = t[1].substr(1); // drop the leading orientation character
			t[7] = parseInt(t[7]), t[8] = parseInt(t[8]);
			for (var k = 0; k < 2; ++k) {
				if (name == seg[k] && t[3] != '0') {
					if (first[k] == null) first[k] = t.slice(0);
					last[k] = t.slice(0);
					break; // a line is attributed to at most one of the two segments
				}
			}
		}
		process(ctg, first, last, is_end);
		file.close();
	}
	buf.destroy();
}
|
953
|
+
|
954
|
+
/**
 * bed2sql: turn pasted per-sample call BED lines into SQL statements for
 * the "call" and "bwalk" tables.
 *
 * Positional arguments: <sample.list> [in.bed]. Without the second
 * argument, or when it is "-", input is read via `new File()`. Each input
 * line must hold exactly six columns per sample; the sixth column of each
 * group is "." or "walk:len:strand:ctg:start:end".
 *
 * Every value is emitted as a single-quoted SQL string literal with
 * embedded single quotes doubled, so a quote in a sample or contig name
 * cannot terminate the literal.
 *
 * Throws Error when a line's column count does not match the sample list.
 */
function mg_cmd_bed2sql(args)
{
	// Quote a value as an SQL string literal.
	function sql_quote(v) {
		return "'" + String(v).replace(/'/g, "''") + "'";
	}

	var c;
	while ((c = getopt(args, "")) != null) {
	}
	if (args.length - getopt.ind == 0) {
		print("Usage: paste *.bed | mgutils.js bed2sql <sample.list> | sqlite3 rGFA.db");
		return;
	}

	var file, buf = new Bytes();
	var t, i, j;

	var sample = [];
	file = new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		t = buf.toString().split("\t");
		if (t[0] == "") continue; // a blank line is not a sample
		sample.push(t[0]);
	}
	file.close();

	file = args.length - getopt.ind >= 2 && args[getopt.ind+1] != "-"? new File(args[getopt.ind+1]) : new File();
	print("DROP INDEX IF EXISTS idx_bwalk;");
	print("DROP INDEX IF EXISTS idx_cst;");
	print("DROP INDEX IF EXISTS idx_cen;");
	print("BEGIN TRANSACTION;");
	var wid = 0, bid = 0, ins_walk = [];
	while (file.readline(buf) >= 0) {
		t = buf.toString().split("\t");
		if (t.length != sample.length * 6)
			throw Error("Different number of samples");
		var h = {}, w = []; // walks seen in this bubble and their ids
		for (i = 5, j = 0; i < t.length; i += 6, ++j) {
			if (t[i] == ".") continue;
			var s = t[i].split(":");
			if (!(s[0] in h)) {
				h[s[0]] = w.length;
				ins_walk.push([wid, bid, s[1], s[0]]);
				w.push([s[0], s[1], wid++]);
			}
			var x = w[h[s[0]]];
			var v = [sql_quote(bid), sql_quote(sample[j]), sql_quote(x[2]), sql_quote(s[3]),
					 sql_quote(s[4]), sql_quote(s[5]), sql_quote(s[2] == '+'? 1 : -1)];
			print("INSERT INTO call (bid,sample,wid,ctg,start,end,strand) VALUES (" + v.join(",") + ");");
		}
		++bid;
	}
	for (i = 0; i < ins_walk.length; ++i) {
		var row = ins_walk[i], q = [];
		for (j = 0; j < row.length; ++j)
			q.push(sql_quote(row[j]));
		print("INSERT INTO bwalk (wid,bid,len,walk) VALUES (" + q.join(",") + ");");
	}
	print("END TRANSACTION;");
	print("CREATE INDEX IF NOT EXISTS idx_bwalk ON bwalk (bid);");
	print("CREATE INDEX IF NOT EXISTS idx_cst ON call (ctg, start);");
	print("CREATE INDEX IF NOT EXISTS idx_cen ON call (ctg, end);");
	file.close();

	buf.destroy();
}
|
1014
|
+
|
1015
|
+
/**
 * merge: combine pasted per-sample call BED lines into one multi-sample
 * table with an INFO column and one genotype column per sample.
 *
 * Positional argument: the pasted BED file, or "-" to read via `new File()`.
 * Options: -a FILE annotation (keyed by columns 1-3, value from column 12);
 * -s FILE sample list (first whitespace-separated token of each line
 * becomes a header column).
 *
 * Within each line, alleles (distinct walks) are numbered by decreasing
 * count; alleles with equal counts keep first-seen order, made explicit
 * here so the numbering does not depend on the engine's sort stability.
 */
function mg_cmd_merge(args)
{
	var c, fn_anno = null, fn_sample = null;
	while ((c = getopt(args, "a:s:")) != null) {
		if (c == 'a') fn_anno = getopt.arg;
		else if (c == 's') fn_sample = getopt.arg;
	}
	if (args.length - getopt.ind == 0) {
		print("Usage: paste *.bed | mgutils.js merge -");
		print("Options:");
		print(" -a FILE annotation [null]");
		print(" -s FILE list of samples [null]");
		return;
	}

	var file, buf = new Bytes();
	var t, i, j, key;
	var anno = {};
	if (fn_anno) {
		file = new File(fn_anno);
		while (file.readline(buf) >= 0) {
			t = buf.toString().split("\t");
			key = [t[0], t[1], t[2]].join("_");
			anno[key] = t[11];
		}
		file.close();
	}
	var hdr = ["#CHROM", "START", "END", "INFO", "FORMAT"];
	if (fn_sample) {
		file = new File(fn_sample);
		while (file.readline(buf) >= 0) {
			t = buf.toString().split(/\s+/);
			if (t[0] == "") continue; // blank line: would add an empty header column
			hdr.push(t[0]);
		}
		file.close();
	}
	file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
	print('##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">');
	print('##INFO=<ID=NA,Number=1,Type=Integer,Description="Number of alleles">');
	print('##INFO=<ID=AC,Number=.,Type=Integer,Description="Allele count">');
	print('##INFO=<ID=ALEN,Number=.,Type=Integer,Description="Length of each allele">');
	print('##INFO=<ID=ANNO,Number=1,Type=String,Description="Annotation">');
	print('##INFO=<ID=VS,Number=1,Type=String,Description="Start vertex">');
	print('##INFO=<ID=VE,Number=1,Type=String,Description="End vertex">');
	print('##INFO=<ID=AWALK,Number=.,Type=String,Description="Walk of each allele">');
	print('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">');
	print('##FORMAT=<ID=CSTRAND,Number=1,Type=String,Description="Contig strand">');
	print('##FORMAT=<ID=CTG,Number=1,Type=String,Description="Contig name">');
	print('##FORMAT=<ID=CS,Number=1,Type=String,Description="Contig start, BED-like">');
	print('##FORMAT=<ID=CE,Number=1,Type=String,Description="Contig end, BED-like">');
	print(hdr.join("\t"));
	while (file.readline(buf) >= 0) {
		var line = buf.toString();
		if (line.length == 0) continue;
		t = line.split("\t");
		var a = [t[0], t[1], t[2], "", "GT:CSTRAND:CTG:CS:CE"];
		var ah = {}, aa = [], b = [], ns = 0;
		// pass 1: collect distinct walks and count them
		for (j = 5; j < t.length; j += 6) {
			if (t[j] == ".") {
				b.push(["."]);
				continue;
			}
			++ns;
			var s = t[j].split(":");
			if (ah[s[0]] == null) {
				ah[s[0]] = aa.length;
				aa.push({walk:s[0], len:s[1], cnt:0});
			}
			var k = ah[s[0]];
			++aa[k].cnt;
			s[0] = k; // temporary allele id in first-seen order
			b.push(s);
		}
		for (i = 0; i < aa.length; ++i)
			aa[i].i = i;
		// most frequent allele first; ties keep first-seen order
		aa.sort(function(x, y) { return (y.cnt - x.cnt) || (x.i - y.i) });
		var i2a = [], alen = [], awalk = [], ac = [];
		for (i = 0; i < aa.length; ++i) {
			i2a[aa[i].i] = i;
			alen[i] = aa[i].len;
			awalk[i] = aa[i].walk;
			ac[i] = aa[i].cnt;
		}
		// pass 2: write per-sample fields with the final allele number
		for (j = 0; j < b.length; ++j) {
			if (b[j][0] != ".") {
				var old = b[j].shift();
				b[j][0] = i2a[old]; // overwrites the length field, which moved to ALEN
				a.push(b[j].join(":"));
			} else a.push(".");
		}
		var info = ["NS="+ns, "NA="+aa.length, "ALEN="+alen.join(","), "AC="+ac.join(",")];
		key = [t[0], t[1], t[2]].join("_");
		if (anno[key] != null) info.push("ANNO="+anno[key]);
		info.push("VS="+t[3], "VE="+t[4], "AWALK="+awalk.join(","));
		a[3] = info.join(";");
		print(a.join("\t"));
	}
	buf.destroy();
	file.close();
}
|
1112
|
+
|
1113
|
+
/**
 * merge2vcf: rewrite the table printed by the merge command as VCF.
 *
 * args[0] is the input file; with no argument input is read via
 * `new File()`. "##" lines are copied through. The "#" header line is
 * replaced by the VCF column header followed by the sample names (input
 * columns 6 onward). Every other line becomes a <CNV> record whose INFO is
 * the input INFO plus ";END=<input column 3>", with each sample written as
 * "GT:GT0": "." stays ".", allele 0 becomes "0:0", any other allele k
 * becomes "1:k".
 *
 * The END key is declared in the header because records use it; blank
 * input lines are ignored.
 */
function mg_cmd_merge2vcf(args) {
	var buf = new Bytes();
	var file = args.length == 0? new File() : new File(args[0]);
	print("##fileformat=VCFv4.2");
	print('##ALT=<ID=CNV,Description="description">');
	print('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">');
	print('##FORMAT=<ID=GT0,Number=1,Type=String,Description="Original genotype">');
	while (file.readline(buf) >= 0) {
		var line = buf.toString();
		if (line.length == 0) continue; // would otherwise yield a record of "undefined" fields
		if (/^##/.test(line)) {
			print(line);
			continue;
		}
		var a, i, t = line.split("\t");
		if (line[0] == "#") {
			a = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"];
			for (i = 5; i < t.length; ++i)
				a.push(t[i]);
		} else {
			a = [t[0], t[1], ".", "N", "<CNV>", 30, "PASS", t[3] + ";END=" + t[2], "GT:GT0"];
			for (i = 5; i < t.length; ++i) {
				var s = t[i].split(":");
				if (s[0] == ".") a.push(s[0]);
				else if (s[0] == "0") a.push("0:0");
				else a.push("1:" + s[0]); // collapse all non-0 alleles to ALT, keep the original in GT0
			}
		}
		print(a.join("\t"));
	}
	file.close();
	buf.destroy();
}
|
1144
|
+
|
1145
|
+
/**
 * segfreq: per-segment allele statistics from merged calls.
 *
 * Positional arguments (after options): <gfa2bed.bed> <merged.txt>
 * [bubble.bed]. Option -f FLOAT sets the minimum allele fraction used for
 * the summary written with warn().
 *
 * One row per segment of the first file is printed: its first four
 * columns, column 5 as an integer, then the sample total (sum of AC), the
 * summed AC of the alleles whose walk contains the segment, the ANNO
 * value, mg_classify_repeat(ANNO) and the number of alleles in the bubble.
 * With a bubble file, three more columns (its columns 1-3) are appended to
 * the segments it lists.
 *
 * Returns 1 after printing usage; otherwise returns nothing.
 */
function mg_cmd_segfreq(args) {
	var opt_c, min_af = 0.05;
	while ((opt_c = getopt(args, "f:")) != null) {
		if (opt_c == 'f') min_af = parseFloat(getopt.arg);
	}
	if (args.length - getopt.ind < 2) {
		print("Usage: mgutils.js segfreq [-f minFreq=0.05] <gfa2bed.bed> <merged.txt> [bubble.bed]");
		return 1;
	}
	var file, buf = new Bytes();
	var i, fld;

	// segment table: name -> row index, plus the rows themselves
	var seg_idx = {}, rows = [];
	file = new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		fld = buf.toString().split("\t");
		seg_idx[fld[3]] = rows.length;
		rows.push([fld[0], fld[1], fld[2], fld[3], parseInt(fld[4]), 0, 0, "N/A", "N/A", 0]);
	}
	file.close();

	// merged calls: accumulate counts on every segment an allele walks through
	var re_info = /([^\s=;]+)=([^\s=;]+)/g;
	var re_walk = /([><])([^\s><]+)/g;
	var bubble_anno = {};
	file = new File(args[getopt.ind+1]);
	while (file.readline(buf) >= 0) {
		var hit;
		fld = buf.toString().split("\t", 4);
		if (fld[0][0] == "#") continue;
		var anno = null, ac = null, walk = null;
		while ((hit = re_info.exec(fld[3])) != null) {
			var tag = hit[1], val = hit[2];
			if (tag == "ANNO") {
				anno = val;
			} else if (tag == "AWALK") {
				walk = val.split(",");
			} else if (tag == "AC") {
				ac = val.split(",");
				for (i = 0; i < ac.length; ++i)
					ac[i] = parseInt(ac[i]);
			}
		}
		if (ac == null || walk == null) throw Error("Missing AC or AWALK");
		if (ac.length != walk.length) throw Error("Inconsistent AC or AWALK");
		if (anno == null) anno = "N/A";
		bubble_anno[fld[0]+"_"+fld[1]+"_"+fld[2]] = anno;
		var n_sample = 0;
		for (i = 0; i < walk.length; ++i)
			n_sample += ac[i];
		var seen = {}; // a segment is credited once per bubble, by the first allele containing it
		for (i = 0; i < walk.length; ++i) {
			if (walk[i] == "*") continue;
			while ((hit = re_walk.exec(walk[i])) != null) {
				var name = hit[2];
				if (seg_idx[name] == null) throw Error("Missing segment " + name);
				if (seen[name]) continue;
				seen[name] = 1;
				var row = rows[seg_idx[name]];
				row[5] = n_sample;
				row[6] += ac[i];
				row[7] = anno;
				row[8] = mg_classify_repeat(anno);
				row[9] = walk.length;
			}
		}
	}
	file.close();

	// optional bubble file: tag interior segments with the bubble coordinates
	if (args.length - getopt.ind >= 3) {
		file = new File(args[getopt.ind+2]);
		while (file.readline(buf) >= 0) {
			fld = buf.toString().split("\t");
			var members = fld[11].split(",");
			var b_anno = bubble_anno[fld[0]+"_"+fld[1]+"_"+fld[2]];
			if (b_anno == null) throw Error("Missing bubble");
			// the first and last entries of the list are skipped
			for (i = 1; i < members.length - 1; ++i) {
				if (seg_idx[members[i]] == null) throw Error("Inconsistent bubble file");
				var target = rows[seg_idx[members[i]]];
				target[10] = fld[0], target[11] = fld[1], target[12] = fld[2];
				target[7] = b_anno;
				target[8] = mg_classify_repeat(b_anno);
			}
		}
		file.close();
	}

	buf.destroy();

	// print rows; sum lengths per class, split by allele number (2, 3, more than 3)
	var replen = {};
	for (i = 0; i < rows.length; ++i) {
		var r = rows[i];
		print(r.join("\t"));
		var cls = r[8], len = parseInt(r[2]) - parseInt(r[1]);
		if (!(r[4] > 0 && r[5] > 0 && r[6] >= r[5] * min_af)) continue;
		if (replen[cls] == null) replen[cls] = [0, 0, 0];
		if (r[9] == 2) replen[cls][0] += len;
		else if (r[9] == 3) replen[cls][1] += len;
		else if (r[9] > 3) replen[cls][2] += len;
	}
	for (var x in replen) {
		var y = x.replace(/^\d+_/, "");
		warn(x, y, replen[x].join("\t"));
	}
}
|
1244
|
+
|
1245
|
+
/**
 * genecopy: count distinct placements of each gene from its alignments.
 *
 * Positional arguments (after options): <in.gaf> <src.bed>.
 * Options: -c FLOAT minimum fraction of the query that must be aligned;
 * -r FLOAT minimum aligned length relative to the first kept alignment of
 * the same gene.
 *
 * Alignments are scored from their cg:Z tag, projected to intervals on the
 * target, then visited from highest to lowest score. An alignment is kept
 * ("GH" line) when less than opt.max_prev_ovlp of its length overlaps
 * same-orientation intervals already marked as occupied. "OG" lines list
 * the genes overlapped, and "GC" lines give per-gene totals: alignments
 * loaded and alignments kept.
 *
 * Throws Error("no cg") when an alignment passing the filters lacks cg:Z.
 */
function mg_cmd_genecopy(args)
{
	var opt_c, opt = { min_cov:0.8, min_rel_cov:0.85, max_prev_ovlp:0.5, mm:4, gapo:5 };
	while ((opt_c = getopt(args, "c:r:")) != null) {
		if (opt_c == 'c') opt.min_cov = parseFloat(getopt.arg);
		else if (opt_c == 'r') opt.min_rel_cov = parseFloat(getopt.arg);
	}
	if (args.length - getopt.ind < 2) {
		print("Usage: mgutils.js genecopy [options] <in.gaf> <src.bed>");
		print("Options:");
		print(" -c FLOAT min coverage [" + opt.min_cov + "]");
		print(" -r FLOAT min relative coverage [" + opt.min_rel_cov + "]");
		return;
	}
	var re_cg = /(\d+)([MIDNSHP=X])/g;
	var re_walk = /([><])([^\s><]+):(\d+)-(\d+)/g;
	var file, buf = new Bytes();
	var t, i, j, k, m;

	// source location of each gene: name -> [ctg, start, end, strand]
	var src = {};
	file = new File(args[getopt.ind+1]);
	while (file.readline(buf) >= 0) {
		t = buf.toString().split("\t");
		src[t[3]] = [t[0], parseInt(t[1]), parseInt(t[2]), t[5] == '+'? 1 : -1];
	}
	file.close();

	file = new File(args[getopt.ind]);
	var gene = {}, reg = {};
	while (file.readline(buf) >= 0) {
		t = buf.toString().split("\t");
		var name = t[0];

		// check coverage
		if (/\|([A-Z]+\d*\.\d+|ENSG\d+)$/.test(name)) continue;
		for (i = 1; i <= 3; ++i) t[i] = parseInt(t[i]);
		for (i = 6; i <= 11; ++i) t[i] = parseInt(t[i]);
		var aln_len = t[3] - t[2];
		if (aln_len < t[1] * opt.min_cov) continue;
		if (gene[name] != null) {
			var g0 = gene[name][0];
			if (aln_len < (g0[2] - g0[1]) * opt.min_rel_cov)
				continue;
		}

		// compute de: the last cg:Z tag on the line wins
		var cg = null;
		for (i = 12; i < t.length; ++i) {
			if (t[i].substr(0, 4) == "cg:Z")
				cg = t[i].substr(5);
		}
		if (cg == null) throw Error("no cg");
		var blen = 0, mlen = 0, sc = 0;
		while ((m = re_cg.exec(cg)) != null) {
			var oplen = parseInt(m[1]);
			if (m[2] == '=') {
				mlen += oplen, blen += oplen, sc += oplen;
			} else {
				++blen;
				// NOTE(review): re_cg cannot capture '*', so this branch looks
				// unreachable and 'X' is scored like a gap — confirm intent.
				if (m[2] == '*') sc -= opt.mm;
				else sc -= opt.gapo + oplen;
			}
		}
		var de = (blen - mlen) / blen;

		// find intervals
		var intv = [];
		if (t[5][0] == '>' || t[5][0] == '<') {
			var off = 0; // total length of the walk segments already visited
			while ((m = re_walk.exec(t[5])) != null) {
				var st = parseInt(m[3]), en = parseInt(m[4]);
				var ss = st, ee = en;
				var seg_end = off + en - st;
				if (t[7] >= off && t[7] < seg_end) {
					// segment containing the path start
					if (m[1] == '>') ss = st + t[7];
					else ee = en - t[7];
				} else if (t[8] >= off && t[8] < seg_end) {
					// segment containing the path end
					if (m[1] == '>') ee = st + t[8] - off;
					else ss = st + t[6] - t[8]; // NOTE(review): uses path length t[6], not off — confirm
				}
				intv.push([m[2], ss, ee, m[1] == '>'? 1 : -1]);
				off = seg_end;
			}
		} else intv.push([t[5], t[7], t[8], t[4] == '+'? 1 : -1]);

		// save
		if (gene[name] == null) gene[name] = [];
		var hit_id = gene[name].length;
		for (j = 0; j < intv.length; ++j) {
			var iv = intv[j], pass = true;
			if (reg[iv[0]] == null) reg[iv[0]] = [];
			if (src[name] != null) {
				var loc = src[name];
				if (loc[0] == iv[0] && loc[1] < iv[2] && iv[1] < loc[2]) {
					var ov = (iv[2] < loc[2]? iv[2] : loc[2]) - (iv[1] > loc[1]? iv[1] : loc[1]);
					// an interval sitting on the gene's own source is stored with
					// the flag already false, i.e. it counts as occupied from the start
					if (ov > (iv[2] - iv[1]) * 0.99) pass = false;
				}
			}
			reg[iv[0]].push([iv[1], iv[2], 0, name, hit_id, pass, iv[3]]);
		}
		gene[name].push([t[1], t[2], t[3], sc, de, intv]);
	}
	file.close();
	buf.destroy();

	// preparation: all alignments, highest score first
	var order = [];
	for (var g in gene) {
		var hits = gene[g];
		for (i = 0; i < hits.length; ++i)
			order.push([hits[i][3], g, i]);
	}
	order.sort(function(x,y) { return y[0]-x[0] });
	for (var ctg in reg) it_index(reg[ctg]);

	// select
	var good_hit = [];
	for (i = 0; i < order.length; ++i) {
		var gname = order[i][1], gi = order[i][2];
		var cur = gene[gname][gi][5], cov_tot = 0, len_tot = 0, ovlp_gene = {};
		var y, b;
		for (j = 0; j < cur.length; ++j) {
			y = cur[j];
			len_tot += y[2] - y[1];
			if (reg[y[0]] == null) continue;
			var st0 = y[1], en0 = y[2];
			b = it_overlap(reg[y[0]], st0, en0);
			// length of the union of occupied, same-orientation overlaps, clipped to y
			var cov_st = 0, cov_en = 0, cov = 0;
			for (k = 0; k < b.length; ++k) {
				if (b[k][5] || b[k][6] != y[3]) continue; // flag still true: not occupied
				ovlp_gene[b[k][3]] = 1;
				var st1 = b[k][0] > st0? b[k][0] : st0;
				var en1 = b[k][1] < en0? b[k][1] : en0;
				if (st1 > cov_en) {
					cov += cov_en - cov_st;
					cov_st = st1, cov_en = en1;
				} else cov_en = cov_en > en1? cov_en : en1;
			}
			cov += cov_en - cov_st;
			cov_tot += cov;
		}
		var ovlp_gene_arr = [];
		for (var og in ovlp_gene) ovlp_gene_arr.push(og);
		if (ovlp_gene_arr.length > 0)
			print("OG", gname, gi, cov_tot, len_tot, ovlp_gene_arr);
		if (cov_tot < len_tot * opt.max_prev_ovlp) {
			good_hit.push([gname, gi]);
			// mark this alignment's own intervals as occupied for later ones
			for (j = 0; j < cur.length; ++j) {
				y = cur[j];
				if (reg[y[0]] == null) continue;
				b = it_overlap(reg[y[0]], y[1], y[2]);
				for (k = 0; k < b.length; ++k)
					if (b[k][3] == gname && b[k][4] == gi)
						b[k][5] = false;
			}
		}
	}

	// count good_hit
	var out = {};
	for (var g1 in gene) out[g1] = [gene[g1].length, 0];
	for (i = 0; i < good_hit.length; ++i) {
		var gh = good_hit[i];
		print("GH", gh[0], gene[gh[0]][gh[1]].join("\t"));
		++out[gh[0]][1];
	}
	for (var g2 in out)
		print("GC", g2, out[g2].join("\t"));
}
|
1407
|
+
|
1408
|
+
/*************************
|
1409
|
+
***** main function *****
|
1410
|
+
*************************/
|
1411
|
+
|
1412
|
+
/**
 * Command dispatcher: the first element of args selects the sub-command
 * and the remaining elements are handed to it.
 *
 * With no arguments the command list is printed and exit(1) is called.
 * Throws Error for an unrecognized command name.
 */
function main(args)
{
	if (args.length == 0) {
		// subgaf, sveval and joinfa are dispatched below but not advertised
		var usage = [
			"Usage: mgutils.js <command> [arguments]",
			"Commands:",
			" stableGaf convert unstable GAF to stable GAF",
			" renamefa add a prefix to sequence names in FASTA",
			" paf2bl blacklist regions from insert-to-ref alignment",
			" anno annotate short sequences",
			" anno2tbl summarize anno output",
			" extractseg extract a segment from GAF",
			" merge merge per-sample --call BED",
			" merge2vcf convert merge BED output to VCF",
			" segfreq compute node frequency from merged calls",
			" genecopy gene copy analysis",
			" bed2sql generate SQL from --call BED"
		];
		for (var i = 0; i < usage.length; ++i)
			print(usage[i]);
		exit(1);
	}

	var cmd = args.shift();
	// handlers are referenced only inside their own case, so a name is
	// resolved only when its command is actually requested
	switch (cmd) {
	case 'renamefa': mg_cmd_renamefa(args); break;
	case 'paf2bl': mg_cmd_paf2bl(args); break;
	case 'anno': mg_cmd_anno(args); break;
	case 'anno2tbl': mg_cmd_anno2tbl(args); break;
	case 'subgaf': mg_cmd_subgaf(args); break;
	case 'sveval': mg_cmd_sveval(args); break;
	case 'joinfa': mg_cmd_joinfa(args); break;
	case 'stableGaf': mg_cmd_stableGaf(args); break;
	case 'bed2sql': mg_cmd_bed2sql(args); break;
	case 'extractseg': mg_cmd_extractseg(args); break;
	case 'merge': mg_cmd_merge(args); break;
	case 'merge2vcf': mg_cmd_merge2vcf(args); break;
	case 'segfreq': mg_cmd_segfreq(args); break;
	case 'genecopy': mg_cmd_genecopy(args); break;
	default: throw Error("unrecognized command: " + cmd);
	}
}
|
1450
|
+
|
1451
|
+
// Script entry point: run the dispatcher on the command-line arguments.
// NOTE(review): a top-level `arguments` is presumably supplied by the host
// script runtime (k8) — confirm.
main(arguments);
|