minimap2 0.2.22.0 → 0.2.24.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,179 @@
|
|
1
|
+
## <a name="started"></a>Getting Started
|
2
|
+
|
3
|
+
```sh
|
4
|
+
# install minimap2
|
5
|
+
git clone https://github.com/lh3/minimap2
|
6
|
+
cd minimap2 && make
|
7
|
+
# install the k8 javascript shell
|
8
|
+
curl -L https://github.com/attractivechaos/k8/releases/download/v0.2.4/k8-0.2.4.tar.bz2 | tar -jxf -
|
9
|
+
cp k8-0.2.4/k8-`uname -s` k8 # or copy it to a directory on your $PATH
|
10
|
+
# export PATH="$PATH:`pwd`:`pwd`/misc" # run this if k8, minimap2 or paftools.js not on your $PATH
|
11
|
+
minimap2 --cs test/MT-human.fa test/MT-orang.fa | paftools.js view - # view alignment
|
12
|
+
minimap2 -c test/MT-human.fa test/MT-orang.fa | paftools.js stat - # basic alignment statistics
|
13
|
+
minimap2 -c --cs test/MT-human.fa test/MT-orang.fa \
|
14
|
+
| sort -k6,6 -k8,8n | paftools.js call -L15000 - # calling variants from asm-to-ref alignment
|
15
|
+
minimap2 -c test/MT-human.fa test/MT-orang.fa \
|
16
|
+
| paftools.js liftover -l10000 - <(echo -e "MT_orang\t2000\t5000") # liftOver
|
17
|
+
# no test data for the following examples
|
18
|
+
paftools.js junceval -e anno.gtf splice.sam > out.txt # compare splice junctions to annotations
|
19
|
+
paftools.js gff2bed anno.gtf > anno.bed # convert GTF/GFF3 to BED12
|
20
|
+
```
|
21
|
+
|
22
|
+
## Table of Contents
|
23
|
+
|
24
|
+
- [Getting Started](#started)
|
25
|
+
- [Introduction](#intro)
|
26
|
+
- [Evaluation](#eval)
|
27
|
+
- [Evaluating mapping accuracy with simulated reads](#mapeval)
|
28
|
+
- [Evaluating read overlap sensitivity](#oveval)
|
29
|
+
- [Calling Variants from Haploid Assemblies](#asmvar)
|
30
|
+
|
31
|
+
## <a name="intro"></a>Introduction
|
32
|
+
|
33
|
+
paftools.js is a script that processes alignments in the [PAF format][paf],
|
34
|
+
such as converting between formats, evaluating mapping accuracy, lifting over
|
35
|
+
BED files based on alignment, and calling variants from assembly-to-assembly
|
36
|
+
alignment. This script *requires* the [k8 Javascript shell][k8] to run. On
|
37
|
+
Linux or Mac, you can download the precompiled k8 binary with:
|
38
|
+
|
39
|
+
```sh
|
40
|
+
curl -L https://github.com/attractivechaos/k8/releases/download/v0.2.4/k8-0.2.4.tar.bz2 | tar -jxf -
|
41
|
+
cp k8-0.2.4/k8-`uname -s` $HOME/bin/k8 # assuming $HOME/bin in your $PATH
|
42
|
+
```
|
43
|
+
|
44
|
+
It is highly recommended to copy the executable `k8` to a directory on your
|
45
|
+
`$PATH` so that `/usr/bin/env` can find it. Like Python scripts, once you
|
46
|
+
install `k8`, you can launch paftools.js in one of the two ways:
|
47
|
+
|
48
|
+
```sh
|
49
|
+
path/to/paftools.js # only if k8 is on your $PATH
|
50
|
+
k8 path/to/paftools.js
|
51
|
+
```
|
52
|
+
|
53
|
+
In a nutshell, paftools.js has the following commands:
|
54
|
+
|
55
|
+
```
|
56
|
+
Usage: paftools.js <command> [arguments]
|
57
|
+
Commands:
|
58
|
+
view convert PAF to BLAST-like (for eyeballing) or MAF
|
59
|
+
splice2bed convert spliced alignment in PAF/SAM to BED12
|
60
|
+
sam2paf convert SAM to PAF
|
61
|
+
delta2paf convert MUMmer's delta to PAF
|
62
|
+
gff2bed convert GTF/GFF3 to BED12
|
63
|
+
|
64
|
+
stat collect basic mapping information in PAF/SAM
|
65
|
+
liftover simplistic liftOver
|
66
|
+
call call variants from asm-to-ref alignment with the cs tag
|
67
|
+
bedcov compute the number of bases covered
|
68
|
+
|
69
|
+
mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ
|
70
|
+
mason2fq convert mason2-simulated SAM to FASTQ
|
71
|
+
pbsim2fq convert PBSIM-simulated MAF to FASTQ
|
72
|
+
junceval evaluate splice junction consistency with known annotations
|
73
|
+
ov-eval evaluate read overlap sensitivity using read-to-ref mapping
|
74
|
+
```
|
75
|
+
|
76
|
+
paftools.js seamlessly reads both plain text files and gzip'd text files.
|
77
|
+
|
78
|
+
## <a name="eval"></a>Evaluation
|
79
|
+
|
80
|
+
### <a name="mapeval"></a>Evaluating mapping accuracy with simulated reads
|
81
|
+
|
82
|
+
The **pbsim2fq** command of paftools.js converts the MAF output of [pbsim][pbsim]
|
83
|
+
to FASTQ and encodes the true mapping position in the read name in a format like
|
84
|
+
`S1_33!chr1!225258409!225267761!-`. Similarly, the **mason2fq** command
|
85
|
+
converts [mason2][mason2] simulated SAM to FASTQ.
|
86
|
+
|
87
|
+
Command **mapeval** evaluates mapped SAM/PAF. Here is example output:
|
88
|
+
|
89
|
+
```
|
90
|
+
Q 60 32478 0 0.000000000 32478
|
91
|
+
Q 22 16 1 0.000030775 32494
|
92
|
+
Q 21 43 1 0.000061468 32537
|
93
|
+
Q 19 73 1 0.000091996 32610
|
94
|
+
Q 14 66 1 0.000122414 32676
|
95
|
+
Q 10 27 3 0.000214048 32703
|
96
|
+
Q 8 14 1 0.000244521 32717
|
97
|
+
Q 7 13 2 0.000305530 32730
|
98
|
+
Q 6 46 1 0.000335611 32776
|
99
|
+
Q 3 10 1 0.000366010 32786
|
100
|
+
Q 2 20 2 0.000426751 32806
|
101
|
+
Q 1 248 94 0.003267381 33054
|
102
|
+
Q 0 31 17 0.003778147 33085
|
103
|
+
U 3
|
104
|
+
```
|
105
|
+
|
106
|
+
where each Q-line gives the quality threshold, the number of reads mapped with
|
107
|
+
mapping quality equal to or greater than the threshold, number of wrong
|
108
|
+
mappings, accumulative mapping error rate and the accumulative number of
|
109
|
+
mapped reads. The U-line, if present, gives the number of unmapped reads if
|
110
|
+
they are present in the SAM file.
|
111
|
+
|
112
|
+
Suppose the reported mapping coordinates overlap with the true coordinates like
|
113
|
+
the following:
|
114
|
+
|
115
|
+
```
|
116
|
+
truth: --------------------
|
117
|
+
mapper: ----------------------
|
118
|
+
|<- l1 ->|<-- o -->|<-- l2 -->|
|
119
|
+
```
|
120
|
+
|
121
|
+
Let `r=o/(l1+o+l2)`. The reported mapping is considered correct if `r>0.1` by
|
122
|
+
default.
|
123
|
+
|
124
|
+
### <a name="oveval"></a>Evaluating read overlap sensitivity
|
125
|
+
|
126
|
+
Command **ov-eval** takes *sorted* read-to-reference alignment and read
|
127
|
+
overlaps in PAF as input, and evaluates the sensitivity. For example:
|
128
|
+
|
129
|
+
```sh
|
130
|
+
minimap2 -cx map-pb ref.fa reads.fq.gz | sort -k6,6 -k8,8n > reads-to-ref.paf
|
131
|
+
minimap2 -x ava-pb reads.fq.gz reads.fq.gz > ovlp.paf
|
132
|
+
paftools.js ov-eval reads-to-ref.paf ovlp.paf
|
133
|
+
```
|
134
|
+
|
135
|
+
## <a name="asmvar"></a>Calling Variants from Haploid Assemblies
|
136
|
+
|
137
|
+
The **call** command of paftools.js calls variants from coordinate-sorted
|
138
|
+
assembly-to-reference alignment. It calls variants from the [cs tag][cs] and
|
139
|
+
identifies confident/callable regions as those covered by exactly one contig.
|
140
|
+
Here are example command lines:
|
141
|
+
|
142
|
+
```sh
|
143
|
+
minimap2 -cx asm5 -t8 --cs ref.fa asm.fa > asm.paf # keeping this file is recommended; --cs required!
|
144
|
+
sort -k6,6 -k8,8n asm.paf > asm.srt.paf # sort by reference start coordinate
|
145
|
+
k8 paftools.js call asm.srt.paf > asm.var.txt
|
146
|
+
```
|
147
|
+
|
148
|
+
Here is sample output:
|
149
|
+
|
150
|
+
```
|
151
|
+
V chr1 2276040 2276041 1 60 c g LJII01000171.1 1217409 1217410 +
|
152
|
+
V chr1 2280409 2280410 1 60 a g LJII01000171.1 1221778 1221779 +
|
153
|
+
V chr1 2280504 2280505 1 60 a g LJII01000171.1 1221873 1221874 +
|
154
|
+
R chr1 2325140 2436340
|
155
|
+
V chr1 2325287 2325287 1 60 - ct LJII01000171.1 1272894 1272896 +
|
156
|
+
V chr1 2325642 2325644 1 60 tt - LJII01000171.1 1273251 1273251 +
|
157
|
+
V chr1 2326051 2326052 1 60 c t LJII01000171.1 1273658 1273659 +
|
158
|
+
V chr1 2326287 2326288 1 60 c t LJII01000171.1 1273894 1273895 +
|
159
|
+
```
|
160
|
+
|
161
|
+
where a line starting with `R` gives regions covered by one query contig, and a
|
162
|
+
V-line encodes a variant in the following format: chr, start, end, query depth,
|
163
|
+
mapping quality, REF allele, ALT allele, query name, query start, end and the
|
164
|
+
query orientation. Generally, you should only look at variants where column 5
|
165
|
+
is one.
|
166
|
+
|
167
|
+
By default, when calling variants, "paftools.js call" ignores alignments 50kb
|
168
|
+
or shorter; when deriving callable regions, it ignores alignments 10kb or
|
169
|
+
shorter. It uses two thresholds to avoid edge effects. These defaults are
|
170
|
+
designed for long-read assemblies. For short reads, both should be reduced.
|
171
|
+
|
172
|
+
|
173
|
+
|
174
|
+
[paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
|
175
|
+
[cs]: https://github.com/lh3/minimap2#cs
|
176
|
+
[k8]: https://github.com/attractivechaos/k8
|
177
|
+
[maf]: https://genome.ucsc.edu/FAQ/FAQformat#format5
|
178
|
+
[pbsim]: https://github.com/pfaucon/PBSIM-PacBio-Simulator
|
179
|
+
[mason2]: https://github.com/seqan/seqan/tree/master/apps/mason2
|
@@ -0,0 +1,335 @@
|
|
1
|
+
#!/usr/bin/env k8
|
2
|
+
|
3
|
+
/**
 * Minimal getopt(3)-style command-line scanner.
 *
 * Scanner state is kept as properties on the function itself:
 *   getopt.ind   - index of the next element of `args` to examine
 *   getopt.arg   - argument of the option just returned (null if none)
 *   getopt.place - offset inside the current word, or -1 when a new word
 *                  must be started
 *
 * @param args  array of command-line words
 * @param ostr  option string; a letter followed by ':' takes an argument
 * @returns the option letter; '?' for an unknown option or a missing
 *          argument (':' instead when `ostr` starts with ':'); null when
 *          option parsing is over
 */
var getopt = function(args, ostr) {
	// first call: initialise the scanner state
	if (typeof getopt.place == 'undefined') {
		getopt.ind = 0;
		getopt.arg = null;
		getopt.place = -1;
	}

	if (getopt.place == -1) { // start scanning a new word
		if (getopt.ind >= args.length) {
			getopt.place = -1;
			return null;
		}
		getopt.place = 0;
		if (args[getopt.ind].charAt(0) != '-') { // not an option: stop
			getopt.place = -1;
			return null;
		}
		if (args[getopt.ind].length > 1) {
			getopt.place = 1;
			if (args[getopt.ind].charAt(1) == '-') { // found "--"
				++getopt.ind;
				getopt.place = -1;
				return null;
			}
		}
	}

	var word = args[getopt.ind];
	var optopt = word.charAt(getopt.place); // character checked for validity
	++getopt.place;

	// ':' is never a valid option letter
	var oli = optopt == ':' ? -1 : ostr.indexOf(optopt);
	if (oli < 0) {
		if (optopt == '-') return null; // a lone '-' that is not a declared option ends parsing
		if (getopt.place < 0) ++getopt.ind;
		return '?';
	}

	var takes_arg = oli + 1 < ostr.length && ostr.charAt(oli + 1) == ':';
	if (!takes_arg) {
		getopt.arg = null;
		// move on to the next word once this one is exhausted
		if (getopt.place < 0 || getopt.place >= word.length) {
			++getopt.ind;
			getopt.place = -1;
		}
		return optopt;
	}

	if (getopt.place >= 0 && getopt.place < word.length) {
		// argument glued to the option letter, e.g. "-l100"
		getopt.arg = word.substr(getopt.place);
	} else {
		++getopt.ind;
		if (args.length <= getopt.ind) { // no argument available
			getopt.place = -1;
			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
			return '?';
		}
		getopt.arg = args[getopt.ind]; // argument is the next word
	}
	getopt.place = -1;
	++getopt.ind;
	return optopt;
};
|
40
|
+
|
41
|
+
/**
 * Read one record from a FASTA/FASTQ stream, one sequence line per record.
 *
 * @param file  object with readline(buf), which returns a negative number at end of input
 * @param buf   line buffer filled by readline(); read back via toString()
 * @returns [name, sequence], or null at end of input
 * @throws Error if the header does not start with '>' or '@' followed by a
 *         name, or if the sequence line is missing
 */
function read_fastx(file, buf)
{
	if (file.readline(buf) < 0) return null;
	var header = /^([>@])(\S+)/.exec(buf.toString());
	if (header == null)
		throw Error("wrong fastx format");
	if (file.readline(buf) < 0)
		throw Error("missing sequence line");
	var record = [header[2], buf.toString()];
	if (header[1] == '@') {
		// FASTQ: consume the separator line and the quality line
		file.readline(buf);
		file.readline(buf);
	}
	return record;
}
|
58
|
+
|
59
|
+
/**
 * Drop, in place, the PAF records of one read that fail the length, identity
 * or clipping thresholds.
 *
 * Fields used per record: [1] query length, [2]/[3] query start/end,
 * [4] strand, [6] target length, [7]/[8] target start/end, [9] number of
 * matches, [10] alignment block length.
 *
 * @param a    array of PAF records; compacted in place
 * @param opt  options; reads min_blen, min_iden and max_clip_len
 */
function filter_paf(a, opt)
{
	function lesser(x, y) { return x < y ? x : y; }

	var n_kept = 0;
	for (var i = 0; i < a.length; ++i) {
		var hit = a[i];
		if (hit[10] < opt.min_blen) continue;
		if (hit[9] < hit[10] * opt.min_iden) continue;

		// unaligned flanks on the query and on the target
		var q_head = hit[2], q_tail = hit[1] - hit[3];
		var t_head = hit[7], t_tail = hit[6] - hit[8];
		var clip_head, clip_tail;
		if (hit[4] == '+') {
			clip_head = lesser(q_head, t_head);
			clip_tail = lesser(q_tail, t_tail);
		} else { // reverse strand: the query head faces the target tail
			clip_head = lesser(q_head, t_tail);
			clip_tail = lesser(q_tail, t_head);
		}
		if (clip_head > opt.max_clip_len || clip_tail > opt.max_clip_len) continue;
		a[n_kept++] = hit;
	}
	a.length = n_kept;
}
|
80
|
+
|
81
|
+
/**
 * Turn the cs tag of one PAF record into events in query coordinates and
 * append them to `ev`.
 *
 * Event layout: [start, end, type, index, changed_base] where type is
 *   0  identical match (":N")
 *   1  substitution ("*xy"); changed_base is the first letter
 *   2  "+seq"; spans seq.length query bases
 *  -1  "-seq"; spans no query base; changed_base is the whole sequence
 *
 * @param t    split PAF record; t[2]/t[3] are the numeric query start/end
 * @param ev   output array of events
 * @param id   index of this record, stored in each event
 * @param buf  unused
 * @throws Error if the cs string does not end exactly at t[3]
 */
function parse_events(t, ev, id, buf)
{
	var m, cs = null;
	// optional tags start at column 12; the first cs:Z tag wins
	for (var j = 12; j < t.length && cs == null; ++j) {
		m = /^cs:Z:(\S+)/.exec(t[j]);
		if (m != null) cs = m[1].toLowerCase();
	}
	if (cs == null) {
		warn("Warning: no cs tag for read '" + t[0] + "'");
		return;
	}

	var op_re = /(:(\d+))|(([\+\-\*])([a-z]+))/g;
	var pos = t[2]; // current query coordinate
	while ((m = op_re.exec(cs)) != null) {
		if (m[2] != null) { // an identical match ":\d+"
			var run = parseInt(m[2]);
			ev.push([pos, pos + run, 0, id]);
			pos += run;
		} else if (m[4] == '*') {
			ev.push([pos, pos + 1, 1, id, m[5][0]]);
			pos += 1;
		} else if (m[4] == '+') {
			var ins_len = m[5].length;
			ev.push([pos, pos + ins_len, 2, id]);
			pos += ins_len;
		} else if (m[4] == '-') {
			ev.push([pos, pos, -1, id, m[5]]); // consumes no query base
		}
	}
	if (pos != t[3])
		throw Error("inconsistent cs for read '" + t[0] + "'");
}
|
120
|
+
|
121
|
+
/**
 * Scan the sorted events for substitutions that fall inside an identical
 * match of at least opt.min_mlen bases, then summarise each alignment.
 *
 * The "anchor" is the identical-match event with the most recent start
 * coordinate and, among those sharing that start, the largest end.
 *
 * @param ev   events as [start, end, type, index, ...], in the order the
 *             caller sorted them
 * @param a    PAF records of the read; [2]/[3] are query start/end, [9] is
 *             the match count, [10] the block length
 * @param opt  options; reads min_mlen and dbg_ev
 * @returns one row per alignment:
 *          [start, end, index, #consistent, lenConsistent, #conflictive,
 *           lenConflictive, identity, mlen]
 */
function find_het_sub(ev, a, opt)
{
	var n = a.length, i, j;
	var cons = [], conf = []; // per alignment: anchoring matches / contradicted substitutions
	for (i = 0; i < n; ++i) {
		cons.push([]);
		conf.push([]);
	}

	var anchor = null;
	for (i = 0; i < ev.length; ++i) {
		var e = ev[i];
		if (e[2] == 0) {
			if (anchor == null || e[0] != anchor[0] || e[1] > anchor[1])
				anchor = e;
			continue;
		}
		if (e[2] != 1 || anchor == null) continue;
		if (!(e[0] < anchor[1])) continue; // substitution lies beyond the anchor
		if (!(anchor[1] - anchor[0] >= opt.min_mlen)) continue; // anchor too short
		if (opt.dbg_ev) print("EV", anchor.join("\t"), "|", e.join("\t"));
		var owner = cons[anchor[3]];
		// record each anchoring match only once
		if (owner.length == 0 || owner[owner.length-1][0] != anchor[0])
			owner.push([anchor[0], anchor[1]]);
		conf[e[3]].push([e[0], anchor[1] - anchor[0]]);
	}

	var b = [];
	for (i = 0; i < n; ++i) {
		var len_cons = 0, len_conf = 0;
		for (j = 0; j < cons[i].length; ++j)
			len_cons += cons[i][j][1] - cons[i][j][0];
		for (j = 0; j < conf[i].length; ++j)
			len_conf += conf[i][j][1];
		b[i] = [a[i][2], a[i][3], i, cons[i].length, len_cons, conf[i].length, len_conf, a[i][9] / a[i][10], a[i][9]];
	}
	return b;
}
|
152
|
+
|
153
|
+
/**
 * Filter the alignment summaries used for error correction, in place.
 *
 * Step 1 keeps a row if it has no consistent and no conflictive length at
 * all, or if its conflictive length is below opt.max_ratio0 of the total.
 * Step 2 sorts the survivors by start and, unless they form one contiguous
 * run, keeps only the rows overlapping the longest contiguous segment.
 *
 * @param b    rows of [start, end, index, #cons, lenCons, #conf, lenConf, ...]
 * @param opt  options; reads max_ratio0
 */
function flt_utg_for_ec(b, opt)
{
	var i, n_kept = 0;
	for (i = 0; i < b.length; ++i) {
		var len_cons = b[i][4], len_conf = b[i][6];
		var ambiguous = (len_cons == 0 && len_conf == 0); // entirely ambiguous
		if (ambiguous || len_conf < (len_cons + len_conf) * opt.max_ratio0)
			b[n_kept++] = b[i];
	}
	b.length = n_kept;
	if (n_kept == 0) return;

	// find the longest contiguous segment
	b.sort(function(x, y) { return x[0] - y[0]; });
	var seg_st = b[0][0], seg_en = b[0][1];
	var best_st = 0, best_en = 0;
	var furthest = seg_en; // largest end seen over all rows
	for (i = 1; i < b.length; ++i) {
		var st_i = b[i][0], en_i = b[i][1];
		if (st_i > seg_en) { // gap: close the current segment
			if (seg_en - seg_st > best_en - best_st) {
				best_st = seg_st;
				best_en = seg_en;
			}
			seg_st = st_i;
			seg_en = en_i;
		} else if (!(seg_en > en_i)) {
			seg_en = en_i;
		}
		if (!(furthest > en_i)) furthest = en_i;
	}
	if (seg_en - seg_st > best_en - best_st) {
		best_st = seg_st;
		best_en = seg_en;
	}

	// nothing to trim when the last segment spans everything
	if (furthest == seg_en && seg_st == b[0][0]) return;
	n_kept = 0;
	for (i = 0; i < b.length; ++i)
		if (b[i][0] < best_en && b[i][1] > best_st)
			b[n_kept++] = b[i];
	b.length = n_kept;
}
|
186
|
+
|
187
|
+
/**
 * Filter out alignments clearly on the wrong phase, in place.
 *
 * A row is kept when it has no evidence at all (lenCons + lenConf is zero)
 * or when its consistent length is at least opt.max_ratio0 of the total.
 *
 * @param b    rows of [start, end, index, #cons, lenCons, #conf, lenConf, ...]
 * @param opt  options; reads max_ratio0
 */
function flt_utg_for_bin(b, opt)
{
	var n_kept = 0;
	for (var i = 0; i < b.length; ++i) {
		var row = b[i];
		var total = row[4] + row[6];
		if (total == 0 || row[4] >= total * opt.max_ratio0)
			b[n_kept++] = row;
	}
	b.length = n_kept;
}
|
196
|
+
|
197
|
+
/**
 * Error correction: rebuild the read in `ecb` from the events of the
 * selected alignments.
 *
 * Each row of `b`, taken in order, owns the part of its span that extends
 * past the end reached so far; rows that add nothing own no interval. Only
 * events starting inside their alignment's interval are applied.
 *
 * @param b    non-empty rows of [start, end, index, ...]
 * @param n_a  number of alignments, i.e. the range of the index in events
 * @param ev   events as [start, end, type, index, changed_base]
 * @param buf  read bases, indexed by query coordinate
 * @param ecb  output buffer; its capacity, length and content are overwritten
 * @throws Error("BUG!") if the written count and ecb.length disagree
 */
function ec_core(b, n_a, ev, buf, ecb) // error correction
{
	var i, j;
	var own = []; // own[index] = [start, end) handled by that alignment, or null
	for (i = 0; i < n_a; ++i)
		own.push(null);
	var reached = b[0][1];
	own[b[0][2]] = [b[0][0], reached];
	for (i = 1; i < b.length; ++i) {
		if (b[i][1] <= reached) continue; // adds nothing new
		own[b[i][2]] = [reached, b[i][1]];
		reached = b[i][1];
	}

	var n_out = 0;
	ecb.capacity = buf.capacity;
	ecb.length = 0;
	for (i = 0; i < ev.length; ++i) {
		var e = ev[i], range = own[e[3]];
		if (range == null) continue;
		// this is to reduce duplicated events around junctions
		if (!(e[0] >= range[0] && e[0] < range[1])) continue;
		var type = e[2];
		if (type == 0) { // identical match: copy the read bases
			ecb.length += e[1] - e[0];
			for (j = e[0]; j < e[1]; ++j)
				ecb[n_out++] = buf[j];
		} else if (type == 1) { // substitution: write the stored base
			++ecb.length;
			ecb[n_out++] = e[4].charCodeAt(0);
		} else if (type < 0) { // write the stored sequence
			ecb.length += e[4].length;
			for (j = 0; j < e[4].length; ++j)
				ecb[n_out++] = e[4].charCodeAt(j);
		} // type 2 events are skipped
	}
	if (ecb.length != n_out) throw Error("BUG!");
}
|
233
|
+
|
234
|
+
/**
 * Process all PAF records of one read.
 *
 * Without a sequence file, prints one "SQ" line, one "TS" line per retained
 * alignment and a closing "//". With a sequence file, prints the corrected
 * read as a FASTA record.
 *
 * @param a       PAF records of a single read; filtered in place
 * @param opt     options; reads min_rlen and ec, and is passed to the filters
 * @param fp_seq  sequence file, or null
 * @param buf     scratch buffer shared with the caller; overwritten
 * @param ecb     output buffer for the corrected sequence
 * @throws Error if the read is not found in fp_seq or its length differs
 *         from the PAF query length
 */
function process_paf(a, opt, fp_seq, buf, ecb)
{
	if (a.length == 0) return;
	var qname = a[0][0], qlen = a[0][1];
	if (qlen < opt.min_rlen) return;

	var name = qname, seq = null;
	if (fp_seq) {
		// scan forward in the sequence file until the read is found
		var rec;
		do {
			rec = read_fastx(fp_seq, buf);
		} while (rec != null && rec[0] != qname);
		if (rec == null)
			throw Error("failed to find sequence for read '" + qname + "'");
		name = rec[0];
		seq = rec[1];
		if (seq.length != qlen)
			throw Error("inconsistent length for read '" + name + "'");
	}

	filter_paf(a, opt);
	if (a.length == 0) return;

	var ev = [];
	for (var i = 0; i < a.length; ++i)
		parse_events(a[i], ev, i, buf);
	// order by start coordinate, then by event type
	ev.sort(function(x, y) { return x[0] != y[0]? x[0] - y[0] : x[2] - y[2]; });

	var report = (seq == null);
	if (report) print("SQ", name, a[0][1], a.length);
	var b = find_het_sub(ev, a, opt);
	if (opt.ec) flt_utg_for_ec(b, opt);
	else flt_utg_for_bin(b, opt);

	if (!report) { // error correction
		if (b.length == 0) return;
		buf.set(seq, 0);
		ec_core(b, a.length, ev, buf, ecb);
		print(">" + name);
		print(ecb);
		return;
	}

	for (var k = 0; k < b.length; ++k) {
		var m, hit = a[b[k][2]], score = 0;
		// the last AS:i tag found wins; 0 when absent
		for (var j = 10; j < hit.length; ++j)
			if ((m = /^AS:i:(\d+)/.exec(hit[j])) != null)
				score = m[1];
		print("TS", b[k][2], b[k][0], b[k][1], hit.slice(5, 9).join("\t"), b[k].slice(3, 7).join("\t"), score);
	}
	print("//");
}
|
277
|
+
|
278
|
+
/**
 * Command-line entry point: parse the options, then stream the PAF file,
 * grouping consecutive records with the same query name and handing each
 * group to process_paf().
 *
 * The minimum match length is accepted as "-s" (the letter shown in the
 * usage text) and as "-m" (the letter historically accepted by the parser);
 * both set opt.min_mlen, the field find_het_sub() reads.
 *
 * @param args  command-line words, without the script name
 * @returns 0
 */
function main(args)
{
	var c, opt = { min_rlen:5000, min_blen:5000, min_iden:0.8, min_mlen:5, max_clip_len:500, max_ratio0:0.25, dbg_ev:false };
	while ((c = getopt(args, "l:b:d:m:s:c:r:E")) != null) {
		if (c == 'l') opt.min_rlen = parseInt(getopt.arg, 10);
		else if (c == 'b') opt.min_blen = parseInt(getopt.arg, 10);
		else if (c == 'd') opt.min_iden = parseFloat(getopt.arg);
		else if (c == 'm' || c == 's') opt.min_mlen = parseInt(getopt.arg, 10);
		else if (c == 'c') opt.max_clip_len = parseInt(getopt.arg, 10);
		else if (c == 'r') opt.max_ratio0 = parseFloat(getopt.arg);
		else if (c == 'E') opt.dbg_ev = true;
	}
	if (args.length - getopt.ind < 1) {
		print("Usage: mmphase.js [options] <map-with-cs.paf> [reads.fa]");
		print("Options:");
		print(" -l INT min read length [" + opt.min_rlen + "]");
		print(" -b INT min alignment length [" + opt.min_blen + "]");
		print(" -d FLOAT min identity [" + opt.min_iden + "]");
		print(" -s INT min match length [" + opt.min_mlen + "]");
		print(" -c INT max clip length [" + opt.max_clip_len + "]");
		print(" -r FLOAT initial ratio for haplotype filtering [" + opt.max_ratio0 + "]");
		return 0;
	}

	// a second positional argument (the reads) switches on error correction
	opt.ec = args.length - getopt.ind >= 2;
	if (!opt.ec) {
		print("CC");
		print("CC", "SQ qName qLen nHits");
		print("CC", "TS index qStart qEnd tName tLen tStart tEnd nConsistent lCons nConflictive lConf score");
		print("CC");
	}

	var buf = new Bytes(), ecb = new Bytes();
	var fp_paf = new File(args[getopt.ind]);
	var fp_seq = opt.ec? new File(args[getopt.ind+1]) : null;
	var a = [], i;
	while (fp_paf.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (a.length > 0 && a[0][0] != t[0]) { // query name changed: flush the group
			process_paf(a, opt, fp_seq, buf, ecb);
			a.length = 0;
		}
		for (i = 1; i <= 3; ++i) t[i] = parseInt(t[i], 10);
		if (t[1] < opt.min_rlen) continue;
		for (i = 6; i <= 10; ++i) t[i] = parseInt(t[i], 10);
		if (t[10] < opt.min_blen) continue;
		a.push(t);
	}
	if (a.length > 0) // flush the last group
		process_paf(a, opt, fp_seq, buf, ecb);
	if (fp_seq) fp_seq.close();
	fp_paf.close();
	ecb.destroy();
	buf.destroy();
	return 0;
}
|
333
|
+
|
334
|
+
// Script entry: run main() on the command-line words and hand its result
// to exit().
var ret = main(arguments);
exit(ret);
|