minimap2 0.2.28.0 → 0.2.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/ext/cmappy/cmappy.c +3 -3
- data/ext/cmappy/cmappy.h +1 -1
- data/ext/minimap2/FAQ.md +1 -1
- data/ext/minimap2/Makefile +4 -3
- data/ext/minimap2/NEWS.md +39 -0
- data/ext/minimap2/README.md +30 -14
- data/ext/minimap2/align.c +134 -50
- data/ext/minimap2/cookbook.md +2 -2
- data/ext/minimap2/format.c +57 -3
- data/ext/minimap2/hit.c +14 -6
- data/ext/minimap2/index.c +304 -13
- data/ext/minimap2/jump.c +201 -0
- data/ext/minimap2/kalloc.h +8 -0
- data/ext/minimap2/ksw2.h +5 -2
- data/ext/minimap2/ksw2_dispatch.c +5 -5
- data/ext/minimap2/ksw2_exts2_sse.c +17 -6
- data/ext/minimap2/main.c +60 -12
- data/ext/minimap2/map.c +35 -8
- data/ext/minimap2/minimap.h +14 -3
- data/ext/minimap2/minimap2.1 +92 -45
- data/ext/minimap2/misc/README.md +2 -1
- data/ext/minimap2/misc/pafcluster.js +241 -0
- data/ext/minimap2/misc/paftools.js +8 -3
- data/ext/minimap2/mmpriv.h +24 -2
- data/ext/minimap2/options.c +27 -2
- data/ext/minimap2/python/cmappy.h +3 -3
- data/ext/minimap2/python/cmappy.pxd +4 -2
- data/ext/minimap2/python/mappy.pyx +19 -7
- data/ext/minimap2/setup.py +2 -2
- data/ext/minimap2.patch +2 -2
- data/lib/minimap2/aligner.rb +19 -12
- data/lib/minimap2/ffi/constants.rb +9 -1
- data/lib/minimap2/ffi/functions.rb +145 -6
- data/lib/minimap2/ffi/mappy.rb +1 -1
- data/lib/minimap2/version.rb +1 -1
- data/lib/minimap2.rb +2 -2
- metadata +5 -4
- data/ext/minimap2/misc/mmphase.js +0 -335
@@ -1,335 +0,0 @@
|
|
1
|
-
#!/usr/bin/env k8
|
2
|
-
|
3
|
-
/**
 * Minimal getopt(3)-style command-line option scanner.
 *
 * Scanning state lives on the function object itself:
 *   getopt.ind   - index of the next element of `args` to examine
 *   getopt.arg   - argument of the option just returned, or null
 *   getopt.place - offset inside the word being scanned (-1: start a new word)
 *
 * @param args  array of command-line words
 * @param ostr  option specification; a letter followed by ':' takes an argument
 * @return the option letter; '?' for an unknown option or a missing argument
 *         (':' for a missing argument when `ostr` itself starts with ':');
 *         null once option scanning is over
 */
var getopt = function(args, ostr) {
	var oli; // index of the current option letter inside ostr
	if (typeof(getopt.place) == 'undefined')
		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
	if (getopt.place == -1) { // start scanning a new word
		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
			// out of words, or the word is not an option: stop scanning
			getopt.place = -1;
			return null;
		}
		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
			++getopt.ind;
			getopt.place = -1;
			return null;
		}
	}
	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
		if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
		// Unknown option letter. When it was the last letter of the word, move on
		// to the next word; otherwise the following call would read past the end
		// of this word, return '' and swallow the next word as its argument.
		if (getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
		return '?';
	}
	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
		getopt.arg = null;
		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
	} else { // need an argument
		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
			getopt.arg = args[getopt.ind].substr(getopt.place); // argument glued to the letter: "-l5"
		else if (args.length <= ++getopt.ind) { // no arg
			getopt.place = -1;
			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
			return '?';
		} else getopt.arg = args[getopt.ind]; // white space
		getopt.place = -1;
		++getopt.ind;
	}
	return optopt;
}
|
40
|
-
|
41
|
-
/**
 * Read the next record from a FASTA/FASTQ stream whose sequences each occupy
 * a single line.
 *
 * @param file  object with a readline(buf) method returning a negative value at end of input
 * @param buf   line buffer filled by readline(); its toString() yields the line
 * @return [name, sequence], or null when no header line is left
 * @throws Error when the header is malformed or the sequence line is missing
 */
function read_fastx(file, buf)
{
	if (file.readline(buf) < 0) return null; // nothing left to read
	var header = /^([>@])(\S+)/.exec(buf.toString());
	if (header == null)
		throw Error("wrong fastx format");
	if (file.readline(buf) < 0)
		throw Error("missing sequence line");
	var record = [header[2], buf.toString()];
	if (header[1] == '@') {
		// FASTQ: discard the separator line and the quality line
		for (var skipped = 0; skipped < 2; ++skipped)
			file.readline(buf);
	}
	return record;
}
|
58
|
-
|
59
|
-
/**
 * Drop, in place, the alignment rows of `a` that are too short, too divergent
 * or clipped too much at either end.
 *
 * Row columns read here: [1] and [3] give the unaligned tail ai[1]-ai[3],
 * [2] the unaligned head, [4] the strand ('+' or anything else), [6]..[8]
 * the same quantities on the other sequence, [9] and [10] the numerator and
 * denominator of the identity test.
 *
 * @param a    array of rows; compacted in place, order preserved
 * @param opt  uses opt.min_blen, opt.min_iden and opt.max_clip_len
 */
function filter_paf(a, opt)
{
	function lesser(x, y) { return x < y? x : y; }
	var n_kept = 0;
	for (var idx = 0; idx < a.length; ++idx) {
		var hit = a[idx];
		var too_short = hit[10] < opt.min_blen;
		if (too_short || hit[9] < hit[10] * opt.min_iden) continue;
		var q_head = hit[2], q_tail = hit[1] - hit[3];
		var t_head = hit[7], t_tail = hit[6] - hit[8];
		var clip_head, clip_tail;
		if (hit[4] == '+') {
			clip_head = lesser(q_head, t_head);
			clip_tail = lesser(q_tail, t_tail);
		} else {
			// opposite strand: the head of one sequence faces the tail of the other
			clip_head = lesser(q_head, t_tail);
			clip_tail = lesser(q_tail, t_head);
		}
		if (clip_head > opt.max_clip_len || clip_tail > opt.max_clip_len) continue;
		a[n_kept++] = hit;
	}
	a.length = n_kept;
}
|
80
|
-
|
81
|
-
/**
 * Decode the "cs:Z:" tag of one alignment row into events appended to `ev`.
 *
 * Each event is [start, end, type, index, changed_base]; coordinates start at
 * t[2] and advance with the operations:
 *   type  0  ":<n>"      identical run of n positions
 *   type  1  "*<xy>"     one position; the first letter after '*' is stored
 *   type  2  "+<seq>"    run of seq.length positions
 *   type -1  "-<seq>"    zero-length event carrying <seq>
 *
 * NOTE(review): t[4] (strand) is not consulted here - confirm that walking
 * forward from t[2] is intended for both strands.
 *
 * @param t    row fields; t[0] name, t[2]/t[3] start/end, tags from t[12] on
 * @param ev   output array of events
 * @param id   value stored as the event's `index`
 * @param buf  unused
 * @throws Error when the operations do not end exactly at t[3]
 */
function parse_events(t, ev, id, buf)
{
	var cs_tag = null;
	for (var col = 12; col < t.length && cs_tag == null; ++col) {
		var tag = /^cs:Z:(\S+)/.exec(t[col]);
		if (tag != null) cs_tag = tag[1].toLowerCase();
	}
	if (cs_tag == null) {
		warn("Warning: no cs tag for read '" + t[0] + "'");
		return;
	}
	var op_re = /(:(\d+))|(([\+\-\*])([a-z]+))/g;
	var pos = t[2], op, advance;
	while ((op = op_re.exec(cs_tag)) != null) {
		if (op[2] != null) { // ":<n>"
			advance = parseInt(op[2]);
			ev.push([pos, pos + advance, 0, id]);
		} else {
			switch (op[4]) {
			case '*':
				advance = 1;
				ev.push([pos, pos + 1, 1, id, op[5][0]]);
				break;
			case '+':
				advance = op[5].length;
				ev.push([pos, pos + advance, 2, id]);
				break;
			case '-':
				advance = 0;
				ev.push([pos, pos, -1, id, op[5]]);
				break;
			}
		}
		pos += advance;
	}
	if (pos != t[3])
		throw Error("inconsistent cs for read '" + t[0] + "'");
}
|
120
|
-
|
121
|
-
/**
 * Scan the events and, for every alignment, tally the identical runs that
 * overlap a substitution reported by some alignment ("consistent") and the
 * substitutions that fall inside such a run ("conflictive").
 *
 * The scan keeps an "anchor": the most recent type-0 event, preferring the one
 * with the largest end among type-0 events sharing the same start. A type-1
 * event starting before the anchor's end is counted when the anchor is at
 * least opt.min_mlen long.
 *
 * @param ev   events [start, end, type, index, ...]; the anchor logic relies on
 *             the order in which they are visited
 * @param a    alignment rows (a[i][2], a[i][3], a[i][9], a[i][10] are read)
 * @param opt  uses opt.min_mlen and opt.dbg_ev
 * @return one record per alignment:
 *         [start, end, index, #consistent, lenConsistent, #conflictive, lenConflictive, identity, mlen]
 */
function find_het_sub(ev, a, opt)
{
	var n_aln = a.length, support = [], conflict = [];
	for (var r = 0; r < n_aln; ++r) {
		support.push([]);
		conflict.push([]);
	}
	var anchor = -1;
	for (var p = 0; p < ev.length; ++p) {
		var cur = ev[p];
		if (cur[2] == 0) {
			if (anchor < 0 || cur[0] != ev[anchor][0] || cur[1] > ev[anchor][1])
				anchor = p;
		} else if (cur[2] == 1 && anchor >= 0 && cur[0] < ev[anchor][1]) {
			var ref = ev[anchor];
			if (ref[1] - ref[0] >= opt.min_mlen) {
				if (opt.dbg_ev) print("EV", ref.join("\t"), "|", cur.join("\t"));
				var runs = support[ref[3]];
				// record each anchoring run once, keyed by its start
				var is_new_run = runs.length == 0 || runs[runs.length-1][0] != ref[0];
				if (is_new_run) runs.push([ref[0], ref[1]]);
				conflict[cur[3]].push([cur[0], ref[1] - ref[0]]);
			}
		}
	}
	var summary = [];
	for (var q = 0; q < n_aln; ++q) {
		var len_cons = support[q].reduce(function(sum, run) { return sum + (run[1] - run[0]); }, 0);
		var len_conf = conflict[q].reduce(function(sum, hit) { return sum + hit[1]; }, 0);
		summary.push([a[q][2], a[q][3], q, support[q].length, len_cons, conflict[q].length, len_conf, a[q][9] / a[q][10], a[q][9]]);
	}
	return summary;
}
|
152
|
-
|
153
|
-
/**
 * Filter the per-alignment records for error correction, in place.
 *
 * Step 1 keeps a record when it has no consistent and no conflictive length
 * (b[i][4] == 0 and b[i][6] == 0), or when its conflictive length b[i][6] is
 * below opt.max_ratio0 of b[i][4] + b[i][6].
 * Step 2 sorts the survivors by start, merges overlapping [start, end)
 * intervals into segments and, when more than one segment exists, keeps only
 * the records overlapping the longest segment.
 *
 * @param b    records as produced by find_het_sub(); reordered and truncated
 * @param opt  uses opt.max_ratio0
 */
function flt_utg_for_ec(b, opt)
{
	var n_kept = 0, idx, rec;
	for (idx = 0; idx < b.length; ++idx) {
		rec = b[idx];
		var ambiguous = rec[4] == 0 && rec[6] == 0; // entirely ambiguous
		if (ambiguous || rec[6] < (rec[4] + rec[6]) * opt.max_ratio0)
			b[n_kept++] = rec;
	}
	b.length = n_kept;
	if (b.length == 0) return;

	// find the longest contiguous segment
	b.sort(function(x,y) { return x[0]-y[0] });
	var first_st = b[0][0];
	var seg_st = first_st, seg_en = b[0][1];
	var best_st = 0, best_en = 0, far_en = seg_en;
	for (idx = 1; idx < b.length; ++idx) {
		rec = b[idx];
		if (rec[0] > seg_en) {
			// gap: close the current segment and open a new one
			if (seg_en - seg_st > best_en - best_st) {
				best_st = seg_st;
				best_en = seg_en;
			}
			seg_st = rec[0];
			seg_en = rec[1];
		} else if (!(seg_en > rec[1])) {
			seg_en = rec[1];
		}
		if (!(far_en > rec[1])) far_en = rec[1];
	}
	if (seg_en - seg_st > best_en - best_st) {
		best_st = seg_st;
		best_en = seg_en;
	}
	if (far_en != seg_en || seg_st != first_st) {
		n_kept = 0;
		for (idx = 0; idx < b.length; ++idx) {
			rec = b[idx];
			if (rec[0] < best_en && rec[1] > best_st)
				b[n_kept++] = rec;
		}
		b.length = n_kept;
	}
}
|
186
|
-
|
187
|
-
/**
 * Filter out alignments clearly on the wrong phase, in place.
 *
 * A record survives when it carries no evidence at all (b[i][4] + b[i][6] is
 * zero) or when its consistent length b[i][4] reaches at least
 * opt.max_ratio0 of that total.
 *
 * @param b    records as produced by find_het_sub(); truncated in place
 * @param opt  uses opt.max_ratio0
 */
function flt_utg_for_bin(b, opt) // filter out alignments clearly on the wrong phase
{
	var survivors = b.filter(function(rec) {
		var evidence = rec[4] + rec[6];
		return evidence == 0 || rec[4] >= evidence * opt.max_ratio0;
	});
	for (var s = 0; s < survivors.length; ++s)
		b[s] = survivors[s];
	b.length = survivors.length;
}
|
196
|
-
|
197
|
-
/**
 * Build the corrected sequence in `ecb` from the original bytes in `buf` and
 * the events of the selected alignments.
 *
 * Each selected alignment is given the stretch from the furthest end assigned
 * so far up to its own end (the first one keeps its full range); only events
 * starting inside the owning alignment's stretch are applied:
 *   type 0  copies buf[start..end)
 *   type 1  writes the first character code of the event's base
 *   type <0 writes every character code of the event's string
 *   type 2  writes nothing
 *
 * @param b    non-empty list of records [start, end, index, ...]
 * @param n_a  number of alignments (size of the index -> stretch table)
 * @param ev   events [start, end, type, index, changed_base]
 * @param buf  source bytes; also provides `capacity`
 * @param ecb  output buffer; `capacity` and `length` are assigned here and
 *             `length` is grown before the matching bytes are written
 * @throws Error("BUG!") if the written count and ecb.length disagree
 */
function ec_core(b, n_a, ev, buf, ecb) // error correction
{
	var owned = [], idx;
	for (idx = 0; idx < n_a; ++idx)
		owned.push(null);
	var covered_to = b[0][1];
	owned[b[0][2]] = [b[0][0], covered_to];
	for (idx = 1; idx < b.length; ++idx) {
		var aln_en = b[idx][1];
		if (aln_en <= covered_to) continue; // adds nothing beyond what is already covered
		owned[b[idx][2]] = [covered_to, aln_en];
		covered_to = aln_en;
	}

	var n_out = 0, at;
	ecb.capacity = buf.capacity;
	ecb.length = 0;
	for (idx = 0; idx < ev.length; ++idx) {
		var e = ev[idx], span = owned[e[3]];
		if (span == null) continue;
		// this is to reduce duplicated events around junctions
		if (!(e[0] >= span[0] && e[0] < span[1])) continue;
		var type = e[2];
		if (type == 0) {
			ecb.length += e[1] - e[0];
			for (at = e[0]; at < e[1]; ++at)
				ecb[n_out++] = buf[at];
		} else if (type == 1) {
			++ecb.length;
			ecb[n_out++] = e[4].charCodeAt(0);
		} else if (type < 0) {
			var extra = e[4];
			ecb.length += extra.length;
			for (at = 0; at < extra.length; ++at)
				ecb[n_out++] = extra.charCodeAt(at);
		} // else, skip type == 2
	}
	if (ecb.length != n_out) throw Error("BUG!");
}
|
233
|
-
|
234
|
-
/**
 * Process all alignment rows of one read.
 *
 * Without a sequence file, prints an "SQ" line, one "TS" line per retained
 * alignment and a closing "//" line. With a sequence file, reads forward in
 * it until the read's record is found, then prints the corrected sequence as
 * a ">name" line followed by the corrected bytes.
 *
 * @param a       rows of a single read; filtered in place
 * @param opt     options (min_rlen, ec and those used by the helpers)
 * @param fp_seq  sequence file handle, or null
 * @param buf     scratch line buffer; also receives the read sequence
 * @param ecb     output buffer for the corrected sequence
 * @throws Error when the read is missing from fp_seq or its length disagrees
 */
function process_paf(a, opt, fp_seq, buf, ecb)
{
	if (a.length == 0) return;
	var read_name = a[0][0], read_len = a[0][1], read_seq = null;
	if (read_len < opt.min_rlen) return;

	if (fp_seq) {
		// advance through the sequence file until this read shows up
		var rec;
		do {
			rec = read_fastx(fp_seq, buf);
		} while (rec != null && rec[0] != read_name);
		if (rec == null)
			throw Error("failed to find sequence for read '" + read_name + "'");
		read_name = rec[0];
		read_seq = rec[1];
		if (read_seq.length != read_len)
			throw Error("inconsistent length for read '" + read_name + "'");
	}

	filter_paf(a, opt);
	if (a.length == 0) return;

	var ev = [];
	for (var r = 0; r < a.length; ++r)
		parse_events(a[r], ev, r, buf);
	// order by start, then by event type
	ev.sort(function(x,y) { return x[0]!=y[0]? x[0]-y[0] : x[2]-y[2] });

	var report_only = (read_seq == null);
	if (report_only) print("SQ", read_name, a[0][1], a.length);

	var b = find_het_sub(ev, a, opt);
	if (opt.ec) flt_utg_for_ec(b, opt);
	else flt_utg_for_bin(b, opt);

	if (!report_only) { // error correction
		if (b.length == 0) return;
		buf.set(read_seq, 0);
		ec_core(b, a.length, ev, buf, ecb);
		print(">" + read_name);
		print(ecb);
		return;
	}

	for (var s = 0; s < b.length; ++s) {
		var stat = b[s], row = a[stat[2]], score = 0, tag;
		// the last AS:i: tag found wins
		for (var col = 10; col < row.length; ++col)
			if ((tag = /^AS:i:(\d+)/.exec(row[col])) != null)
				score = tag[1];
		print("TS", stat[2], stat[0], stat[1], row.slice(5, 9).join("\t"), stat.slice(3, 7).join("\t"), score);
	}
	print("//");
}
|
277
|
-
|
278
|
-
/**
 * Command-line entry point: parse options, then stream the PAF file, grouping
 * consecutive rows that share the same first column and handing each group to
 * process_paf().
 *
 * Options: -l, -b, -d, -c, -r take a value; -E turns on event debugging.
 * The minimum match length is accepted both as -s (the letter shown in the
 * usage text) and as -m (the letter historically in the option string).
 *
 * @param args  command-line words
 * @return 0
 */
function main(args)
{
	var c, opt = { min_rlen:5000, min_blen:5000, min_iden:0.8, min_mlen:5, max_clip_len:500, max_ratio0:0.25, dbg_ev:false };
	while ((c = getopt(args, "l:b:d:m:s:c:r:E")) != null) {
		if (c == 'l') opt.min_rlen = parseInt(getopt.arg, 10);
		else if (c == 'b') opt.min_blen = parseInt(getopt.arg, 10);
		else if (c == 'd') opt.min_iden = parseFloat(getopt.arg);
		// store into min_mlen, the field find_het_sub() actually reads
		else if (c == 'm' || c == 's') opt.min_mlen = parseInt(getopt.arg, 10);
		else if (c == 'c') opt.max_clip_len = parseInt(getopt.arg, 10);
		else if (c == 'r') opt.max_ratio0 = parseFloat(getopt.arg);
		else if (c == 'E') opt.dbg_ev = true;
	}
	if (args.length - getopt.ind < 1) {
		print("Usage: mmphase.js [options] <map-with-cs.paf> [reads.fa]");
		print("Options:");
		print(" -l INT min read length [" + opt.min_rlen + "]");
		print(" -b INT min alignment length [" + opt.min_blen + "]");
		print(" -d FLOAT min identity [" + opt.min_iden + "]");
		print(" -s INT min match length [" + opt.min_mlen + "]");
		print(" -c INT max clip length [" + opt.max_clip_len + "]");
		print(" -r FLOAT initial ratio for haplotype filtering [" + opt.max_ratio0 + "]");
		return 0;
	}

	// a second positional argument (the reads) switches to error-correction mode
	opt.ec = args.length - getopt.ind >= 2;
	if (!opt.ec) {
		print("CC");
		print("CC", "SQ qName qLen nHits");
		print("CC", "TS index qStart qEnd tName tLen tStart tEnd nConsistent lCons nConflictive lConf score");
		print("CC");
	}

	var buf = new Bytes(), ecb = new Bytes();
	var fp_paf = new File(args[getopt.ind]);
	var fp_seq = opt.ec? new File(args[getopt.ind+1]) : null;
	var a = [];
	while (fp_paf.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (a.length > 0 && a[0][0] != t[0]) {
			// first column changed: the previous group is complete
			process_paf(a, opt, fp_seq, buf, ecb);
			a.length = 0;
		}
		for (var i = 1; i <= 3; ++i) t[i] = parseInt(t[i], 10);
		if (t[1] < opt.min_rlen) continue;
		for (var i = 6; i <= 10; ++i) t[i] = parseInt(t[i], 10);
		if (t[10] < opt.min_blen) continue;
		a.push(t);
	}
	if (a.length > 0) // flush the last group
		process_paf(a, opt, fp_seq, buf, ecb);
	if (fp_seq) fp_seq.close();
	fp_paf.close();
	ecb.destroy();
	buf.destroy();
	return 0;
}
|
333
|
-
|
334
|
-
// Script entry: run main() on the command-line arguments and hand its return
// value to exit().
var ret = main(arguments);
exit(ret);
|