minimap2 0.2.22.0 → 0.2.24.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,179 @@
1
+ ## <a name="started"></a>Getting Started
2
+
3
+ ```sh
4
+ # install minimap2
5
+ git clone https://github.com/lh3/minimap2
6
+ cd minimap2 && make
7
+ # install the k8 javascript shell
8
+ curl -L https://github.com/attractivechaos/k8/releases/download/v0.2.4/k8-0.2.4.tar.bz2 | tar -jxf -
9
+ cp k8-0.2.4/k8-`uname -s` k8 # or copy it to a directory on your $PATH
10
+ # export PATH="$PATH:`pwd`:`pwd`/misc" # run this if k8, minimap2 or paftools.js not on your $PATH
11
+ minimap2 --cs test/MT-human.fa test/MT-orang.fa | paftools.js view - # view alignment
12
+ minimap2 -c test/MT-human.fa test/MT-orang.fa | paftools.js stat - # basic alignment statistics
13
+ minimap2 -c --cs test/MT-human.fa test/MT-orang.fa \
14
+ | sort -k6,6 -k8,8n | paftools.js call -L15000 - # calling variants from asm-to-ref alignment
15
+ minimap2 -c test/MT-human.fa test/MT-orang.fa \
16
+ | paftools.js liftover -l10000 - <(echo -e "MT_orang\t2000\t5000") # liftOver
17
+ # no test data for the following examples
18
+ paftools.js junceval -e anno.gtf splice.sam > out.txt # compare splice junctions to annotations
19
+ paftools.js gff2bed anno.gtf > anno.bed # convert GTF/GFF3 to BED12
20
+ ```
21
+
22
+ ## Table of Contents
23
+
24
+ - [Getting Started](#started)
25
+ - [Introduction](#intro)
26
+ - [Evaluation](#eval)
27
+ - [Evaluating mapping accuracy with simulated reads](#mapeval)
28
+ - [Evaluating read overlap sensitivity](#oveval)
29
+ - [Calling Variants from Haploid Assemblies](#asmvar)
30
+
31
+ ## <a name="intro"></a>Introduction
32
+
33
+ paftools.js is a script that processes alignments in the [PAF format][paf],
34
+ such as converting between formats, evaluating mapping accuracy, lifting over
35
+ BED files based on alignment, and calling variants from assembly-to-assembly
36
+ alignment. This script *requires* the [k8 Javascript shell][k8] to run. On
37
+ Linux or Mac, you can download the precompiled k8 binary with:
38
+
39
+ ```sh
40
+ curl -L https://github.com/attractivechaos/k8/releases/download/v0.2.4/k8-0.2.4.tar.bz2 | tar -jxf -
41
+ cp k8-0.2.4/k8-`uname -s` $HOME/bin/k8 # assuming $HOME/bin in your $PATH
42
+ ```
43
+
44
+ It is highly recommended to copy the executable `k8` to a directory on your
45
+ `$PATH` so that `/usr/bin/env` can find it. Like python scripts, once you
46
+ install `k8`, you can launch paftools.js in one of the two ways:
47
+
48
+ ```sh
49
+ path/to/paftools.js # only if k8 is on your $PATH
50
+ k8 path/to/paftools.js
51
+ ```
52
+
53
+ In a nutshell, paftools.js has the following commands:
54
+
55
+ ```
56
+ Usage: paftools.js <command> [arguments]
57
+ Commands:
58
+ view convert PAF to BLAST-like (for eyeballing) or MAF
59
+ splice2bed convert spliced alignment in PAF/SAM to BED12
60
+ sam2paf convert SAM to PAF
61
+ delta2paf convert MUMmer's delta to PAF
62
+ gff2bed convert GTF/GFF3 to BED12
63
+
64
+ stat collect basic mapping information in PAF/SAM
65
+ liftover simplistic liftOver
66
+ call call variants from asm-to-ref alignment with the cs tag
67
+ bedcov compute the number of bases covered
68
+
69
+ mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ
70
+ mason2fq convert mason2-simulated SAM to FASTQ
71
+ pbsim2fq convert PBSIM-simulated MAF to FASTQ
72
+ junceval evaluate splice junction consistency with known annotations
73
+ ov-eval evaluate read overlap sensitivity using read-to-ref mapping
74
+ ```
75
+
76
+ paftools.js seamlessly reads both plain text files and gzip'd text files.
77
+
78
+ ## <a name="eval"></a>Evaluation
79
+
80
+ ### <a name="mapeval"></a>Evaluating mapping accuracy with simulated reads
81
+
82
+ The **pbsim2fq** command of paftools.js converts the MAF output of [pbsim][pbsim]
83
+ to FASTQ and encodes the true mapping position in the read name in a format like
84
+ `S1_33!chr1!225258409!225267761!-`. Similarly, the **mason2fq** command
85
+ converts [mason2][mason2] simulated SAM to FASTQ.
86
+
87
+ Command **mapeval** evaluates mapped SAM/PAF. Here is example output:
88
+
89
+ ```
90
+ Q 60 32478 0 0.000000000 32478
91
+ Q 22 16 1 0.000030775 32494
92
+ Q 21 43 1 0.000061468 32537
93
+ Q 19 73 1 0.000091996 32610
94
+ Q 14 66 1 0.000122414 32676
95
+ Q 10 27 3 0.000214048 32703
96
+ Q 8 14 1 0.000244521 32717
97
+ Q 7 13 2 0.000305530 32730
98
+ Q 6 46 1 0.000335611 32776
99
+ Q 3 10 1 0.000366010 32786
100
+ Q 2 20 2 0.000426751 32806
101
+ Q 1 248 94 0.003267381 33054
102
+ Q 0 31 17 0.003778147 33085
103
+ U 3
104
+ ```
105
+
106
+ where each Q-line gives the quality threshold, the number of reads mapped with
107
+ mapping quality equal to or greater than the threshold, number of wrong
108
+ mappings, accumulative mapping error rate and the accumulative number of
109
+ mapped reads. The U-line, if present, gives the number of unmapped reads if
110
+ they are present in the SAM file.
111
+
112
+ Suppose the reported mapping coordinates overlap with the true coordinates like
113
+ the following:
114
+
115
+ ```
116
+ truth: --------------------
117
+ mapper: ----------------------
118
+ |<- l1 ->|<-- o -->|<-- l2 -->|
119
+ ```
120
+
121
+ Let `r=o/(l1+o+l2)`. The reported mapping is considered correct if `r>0.1` by
122
+ default.
123
+
124
+ ### <a name="oveval"></a>Evaluating read overlap sensitivity
125
+
126
+ Command **ov-eval** takes *sorted* read-to-reference alignment and read
127
+ overlaps in PAF as input, and evaluates the sensitivity. For example:
128
+
129
+ ```sh
130
+ minimap2 -cx map-pb ref.fa reads.fq.gz | sort -k6,6 -k8,8n > reads-to-ref.paf
131
+ minimap2 -x ava-pb reads.fq.gz reads.fq.gz > ovlp.paf
132
+ paftools.js ov-eval reads-to-ref.paf ovlp.paf
133
+ ```
134
+
135
+ ## <a name="asmvar"></a>Calling Variants from Haploid Assemblies
136
+
137
+ The **call** command of paftools.js calls variants from coordinate-sorted
138
+ assembly-to-reference alignment. It calls variants from the [cs tag][cs] and
139
+ identifies confident/callable regions as those covered by exactly one contig.
140
+ Here are example command lines:
141
+
142
+ ```sh
143
+ minimap2 -cx asm5 -t8 --cs ref.fa asm.fa > asm.paf # keeping this file is recommended; --cs required!
144
+ sort -k6,6 -k8,8n asm.paf > asm.srt.paf # sort by reference start coordinate
145
+ k8 paftools.js call asm.srt.paf > asm.var.txt
146
+ ```
147
+
148
+ Here is sample output:
149
+
150
+ ```
151
+ V chr1 2276040 2276041 1 60 c g LJII01000171.1 1217409 1217410 +
152
+ V chr1 2280409 2280410 1 60 a g LJII01000171.1 1221778 1221779 +
153
+ V chr1 2280504 2280505 1 60 a g LJII01000171.1 1221873 1221874 +
154
+ R chr1 2325140 2436340
155
+ V chr1 2325287 2325287 1 60 - ct LJII01000171.1 1272894 1272896 +
156
+ V chr1 2325642 2325644 1 60 tt - LJII01000171.1 1273251 1273251 +
157
+ V chr1 2326051 2326052 1 60 c t LJII01000171.1 1273658 1273659 +
158
+ V chr1 2326287 2326288 1 60 c t LJII01000171.1 1273894 1273895 +
159
+ ```
160
+
161
+ where a line starting with `R` gives regions covered by one query contig, and a
162
+ V-line encodes a variant in the following format: chr, start, end, query depth,
163
+ mapping quality, REF allele, ALT allele, query name, query start, end and the
164
+ query orientation. Generally, you should only look at variants where column 5
165
+ is one.
166
+
167
+ By default, when calling variants, "paftools.js call" ignores alignments 50kb
168
+ or shorter; when deriving callable regions, it ignores alignments 10kb or
169
+ shorter. It uses two thresholds to avoid edge effects. These defaults are
170
+ designed for long-read assemblies. For short reads, both should be reduced.
171
+
172
+
173
+
174
+ [paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
175
+ [cs]: https://github.com/lh3/minimap2#cs
176
+ [k8]: https://github.com/attractivechaos/k8
177
+ [maf]: https://genome.ucsc.edu/FAQ/FAQformat#format5
178
+ [pbsim]: https://github.com/pfaucon/PBSIM-PacBio-Simulator
179
+ [mason2]: https://github.com/seqan/seqan/tree/master/apps/mason2
@@ -0,0 +1,335 @@
1
+ #!/usr/bin/env k8
2
+
3
/**
 * Minimal BSD-style command-line option parser.
 *
 * Scanning state is kept on the function object itself:
 *   getopt.ind   - index into `args` of the argument currently being scanned
 *   getopt.arg   - argument of the last returned option, or null
 *   getopt.place - offset inside args[getopt.ind]; -1 means "start a new argument"
 *
 * @param args  array of command-line arguments
 * @param ostr  option string; a letter followed by ':' takes an argument
 * @return the option letter; '?' for an unknown option or a missing argument
 *         (':' instead for a missing argument when `ostr` starts with ':');
 *         null when there are no more options ("--", a lone "-", or the first
 *         non-option argument)
 */
var getopt = function(args, ostr) {
	var oli; // option letter list index
	if (typeof(getopt.place) == 'undefined')
		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
	if (getopt.place == -1) { // update scanning pointer
		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
			getopt.place = -1;
			return null;
		}
		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
			++getopt.ind;
			getopt.place = -1;
			return null;
		}
	}
	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
		if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
		// The unknown letter was the last one in this argument: move on to the
		// next argument. Without this, the following call would read an empty
		// "option" past the end of the string and consume the next argument.
		if (getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
		return '?';
	}
	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
		getopt.arg = null;
		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
	} else { // need an argument
		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
			getopt.arg = args[getopt.ind].substr(getopt.place); // "-l5": argument attached to the option
		else if (args.length <= ++getopt.ind) { // no arg
			getopt.place = -1;
			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
			return '?';
		} else getopt.arg = args[getopt.ind]; // white space
		getopt.place = -1;
		++getopt.ind;
	}
	return optopt;
};
40
+
41
/**
 * Read the next record from a FASTA/FASTQ stream whose sequence sits on a
 * single line.
 *
 * @param file  object with readline(buf), returning a negative value at end of input
 * @param buf   line buffer filled by readline(); converted with toString()
 * @return [name, sequence], or null when the input is exhausted
 * @throws Error if the header does not start with '>' or '@' followed by a
 *         name, or if the sequence line is missing
 */
function read_fastx(file, buf)
{
	var header, hit, record;
	if (file.readline(buf) < 0)
		return null; // end of input
	header = buf.toString();
	hit = header.match(/^([>@])(\S+)/);
	if (hit == null)
		throw Error("wrong fastx format");
	if (file.readline(buf) < 0)
		throw Error("missing sequence line");
	record = [hit[2], buf.toString()];
	if (hit[1] == '@') {
		// FASTQ: drop the '+' separator line and the quality line
		file.readline(buf);
		file.readline(buf);
	}
	return record;
}
58
+
59
/**
 * Filter, in place, the PAF records of one read. A record is dropped when
 *   - its alignment block length (column 11) is below opt.min_blen, or
 *   - its match count (column 10) is below block length * opt.min_iden, or
 *   - the clipped part at either end exceeds opt.max_clip_len. The clip at an
 *     end is the smaller of the unaligned query and target lengths there,
 *     taking the strand (column 5) into account.
 *
 * @param a    array of parsed PAF records (numeric columns already converted)
 * @param opt  options: min_blen, min_iden, max_clip_len
 */
function filter_paf(a, opt)
{
	var n_kept = 0, total = a.length;
	if (total == 0) return;
	for (var idx = 0; idx < total; ++idx) {
		var rec = a[idx];
		if (rec[10] < opt.min_blen || rec[9] < rec[10] * opt.min_iden) continue;
		var q_head = rec[2], q_tail = rec[1] - rec[3]; // unaligned query bases
		var t_head = rec[7], t_tail = rec[6] - rec[8]; // unaligned target bases
		var clip_left, clip_right;
		if (rec[4] == '+') {
			clip_left = q_head < t_head? q_head : t_head;
			clip_right = q_tail < t_tail? q_tail : t_tail;
		} else { // reverse strand: query start faces target end
			clip_left = q_head < t_tail? q_head : t_tail;
			clip_right = q_tail < t_head? q_tail : t_head;
		}
		if (clip_left > opt.max_clip_len || clip_right > opt.max_clip_len) continue;
		a[n_kept++] = rec;
	}
	a.length = n_kept;
}
80
+
81
/**
 * Decode the cs tag of one PAF record into events on the query coordinate and
 * append them to `ev`. Each event is
 *   [start, end, type, index, changed_base]
 * with type 0 for an identical run (":N"), 1 for a substitution ("*xy",
 * keeping the first base), 2 for a "+seq" operation (spans seq.length on the
 * query) and -1 for a "-seq" operation (zero length on the query, carrying
 * seq). The walk starts at column 3 and must end exactly at column 4.
 *
 * @param t    one parsed PAF record; tags are searched from column 13 on
 * @param ev   output array the events are pushed to
 * @param id   index of this record, stored in every event
 * @param buf  unused
 * @throws Error if the decoded length disagrees with the query end
 */
function parse_events(t, ev, id, buf)
{
	var token_re = /(:(\d+))|(([\+\-\*])([a-z]+))/g;
	var cs = null, hit, col = 12;
	while (cs == null && col < t.length) { // first cs tag wins
		hit = /^cs:Z:(\S+)/.exec(t[col]);
		if (hit != null) cs = hit[1].toLowerCase();
		++col;
	}
	if (cs == null) {
		warn("Warning: no cs tag for read '" + t[0] + "'");
		return;
	}
	var pos = t[2], expected_end = t[3];
	for (hit = token_re.exec(cs); hit != null; hit = token_re.exec(cs)) {
		var span;
		if (hit[2] != null) { // an identical match ":\d+"
			span = parseInt(hit[2], 10);
			ev.push([pos, pos + span, 0, id]);
		} else {
			switch (hit[4]) {
			case '*':
				span = 1;
				ev.push([pos, pos + 1, 1, id, hit[5].charAt(0)]);
				break;
			case '+':
				span = hit[5].length;
				ev.push([pos, pos + span, 2, id]);
				break;
			case '-':
				span = 0;
				ev.push([pos, pos, -1, id, hit[5]]);
				break;
			}
		}
		pos += span;
	}
	if (pos != expected_end)
		throw Error("inconsistent cs for read '" + t[0] + "'");
}
120
+
121
/**
 * Summarise, per alignment, how often it agrees or disagrees with the other
 * alignments of the same read at substitution sites.
 *
 * Events are scanned in order. The "anchor" is the most recent identical-run
 * event, replaced whenever a run starts elsewhere or starts at the same place
 * but ends further. A substitution event that starts before the anchor's end
 * is counted, provided the anchor run is at least opt.min_mlen long: the
 * anchor's alignment gains a consistent block, the substituted alignment gains
 * a conflicting entry.
 *
 * @param ev   events as produced by parse_events(); the caller sorts them
 * @param a    PAF records of the read
 * @param opt  options: min_mlen, dbg_ev
 * @return one entry per alignment:
 *   [start, end, index, #consistent, lenConsistent, #conflictive, lenConflictive, identity, mlen]
 */
function find_het_sub(ev, a, opt)
{
	var n_aln = a.length, cons = [], conf = [], idx, j;
	for (idx = 0; idx < n_aln; ++idx) {
		cons.push([]);
		conf.push([]);
	}
	var anchor = null;
	for (idx = 0; idx < ev.length; ++idx) {
		var cur = ev[idx];
		if (cur[2] == 0) {
			if (anchor == null || cur[0] != anchor[0] || cur[1] > anchor[1])
				anchor = cur;
		} else if (cur[2] == 1 && anchor != null && cur[0] < anchor[1]) {
			var anchor_len = anchor[1] - anchor[0];
			if (anchor_len >= opt.min_mlen) {
				if (opt.dbg_ev) print("EV", anchor.join("\t"), "|", cur.join("\t"));
				var blocks = cons[anchor[3]];
				// record each anchor run only once per alignment
				if (blocks.length == 0 || blocks[blocks.length-1][0] != anchor[0])
					blocks.push([anchor[0], anchor[1]]);
				conf[cur[3]].push([cur[0], anchor_len]);
			}
		}
	}
	var summary = [];
	for (idx = 0; idx < n_aln; ++idx) {
		var cons_len = 0, conf_len = 0;
		for (j = 0; j < cons[idx].length; ++j)
			cons_len += cons[idx][j][1] - cons[idx][j][0];
		for (j = 0; j < conf[idx].length; ++j)
			conf_len += conf[idx][j][1];
		var rec = a[idx];
		summary[idx] = [rec[2], rec[3], idx, cons[idx].length, cons_len, conf[idx].length, conf_len, rec[9] / rec[10], rec[9]];
	}
	return summary;
}
152
+
153
/**
 * Filter, in place, the per-alignment summaries used for error correction.
 *
 * Step 1 keeps an entry when it has no consistent and no conflicting length at
 * all, or when its conflicting length is below opt.max_ratio0 of the sum of
 * the two. Step 2 sorts the survivors by start and merges overlapping
 * intervals into contiguous segments; unless everything forms one segment,
 * only entries overlapping the longest segment are kept.
 *
 * @param b    array of summaries as returned by find_het_sub(); modified
 * @param opt  options: max_ratio0
 */
function flt_utg_for_ec(b, opt)
{
	var i, n_kept = 0;
	for (i = 0; i < b.length; ++i) {
		var cons_len = b[i][4], conf_len = b[i][6];
		var ambiguous = (cons_len == 0 && conf_len == 0); // entirely ambiguous
		if (ambiguous || conf_len < (cons_len + conf_len) * opt.max_ratio0)
			b[n_kept++] = b[i];
	}
	b.length = n_kept;
	if (n_kept == 0) return;

	// find the longest contiguous segment
	b.sort(function(x,y) { return x[0]-y[0] });
	var seg_st = b[0][0], seg_en = b[0][1];
	var best_st = 0, best_en = 0, far_en = seg_en;
	for (i = 1; i < b.length; ++i) {
		var cur_st = b[i][0], cur_en = b[i][1];
		if (cur_st > seg_en) { // gap: close the current segment and open a new one
			if (seg_en - seg_st > best_en - best_st) {
				best_st = seg_st;
				best_en = seg_en;
			}
			seg_st = cur_st;
			seg_en = cur_en;
		} else if (!(seg_en > cur_en)) {
			seg_en = cur_en;
		}
		if (!(far_en > cur_en)) far_en = cur_en;
	}
	if (seg_en - seg_st > best_en - best_st) {
		best_st = seg_st;
		best_en = seg_en;
	}
	if (far_en == seg_en && seg_st == b[0][0]) return; // a single segment: nothing to drop
	n_kept = 0;
	for (i = 0; i < b.length; ++i) {
		if (b[i][0] < best_en && b[i][1] > best_st)
			b[n_kept++] = b[i];
	}
	b.length = n_kept;
}
186
+
187
/**
 * Filter out, in place, alignments clearly on the wrong phase.
 *
 * An entry is kept when it has neither consistent nor conflicting length, or
 * when its consistent length is at least opt.max_ratio0 of the sum of the two.
 *
 * @param b    array of summaries as returned by find_het_sub(); modified
 * @param opt  options: max_ratio0
 */
function flt_utg_for_bin(b, opt)
{
	var survivors = [], i;
	for (i = 0; i < b.length; ++i) {
		var cons_len = b[i][4], total = b[i][4] + b[i][6];
		if (total == 0 || cons_len >= total * opt.max_ratio0)
			survivors.push(b[i]);
	}
	for (i = 0; i < survivors.length; ++i)
		b[i] = survivors[i];
	b.length = survivors.length;
}
196
+
197
/**
 * Error correction: build the corrected sequence in `ecb` from the events.
 *
 * Each alignment in `b` is given the part of the read that extends beyond
 * what earlier entries already cover; alignments that add nothing get no
 * interval. Only events starting inside their alignment's own interval are
 * applied: identical runs copy bytes from `buf`, substitutions write the
 * event's base, type -1 events write their whole sequence, and type 2 events
 * contribute nothing.
 *
 * @param b    filtered summaries; must be non-empty, in the order produced by the caller
 * @param n_a  number of alignments of the read (size of the interval table)
 * @param ev   events carrying the alignment index in slot 3
 * @param buf  byte buffer holding the original read sequence
 * @param ecb  output byte buffer; its capacity, length and content are overwritten
 * @throws Error if the number of bytes written disagrees with ecb.length
 */
function ec_core(b, n_a, ev, buf, ecb) // error correction
{
	var own = [], i, j;
	for (i = 0; i < n_a; ++i)
		own.push(null);
	var covered_to = b[0][1];
	own[b[0][2]] = [b[0][0], covered_to];
	for (i = 1; i < b.length; ++i) {
		var aln_en = b[i][1];
		if (aln_en <= covered_to) continue; // adds no new bases
		own[b[i][2]] = [covered_to, aln_en];
		covered_to = aln_en;
	}
	var n_out = 0;
	ecb.capacity = buf.capacity;
	ecb.length = 0;
	for (i = 0; i < ev.length; ++i) {
		var e = ev[i], range = own[e[3]];
		if (range == null) continue;
		// this is to reduce duplicated events around junctions
		if (!(e[0] >= range[0] && e[0] < range[1])) continue;
		var type = e[2];
		if (type == 0) {
			ecb.length += e[1] - e[0];
			for (j = e[0]; j < e[1]; ++j)
				ecb[n_out++] = buf[j];
		} else if (type == 1) {
			++ecb.length;
			ecb[n_out++] = e[4].charCodeAt(0);
		} else if (type < 0) {
			ecb.length += e[4].length;
			for (j = 0; j < e[4].length; ++j)
				ecb[n_out++] = e[4].charCodeAt(j);
		} // else, skip type == 2
	}
	if (ecb.length != n_out) throw Error("BUG!");
}
233
+
234
/**
 * Process all PAF records of one read.
 *
 * Without a sequence file, prints an "SQ" line for the read, one "TS" line per
 * surviving alignment and a closing "//". With a sequence file, looks up the
 * read by scanning forward in `fp_seq`, corrects it with ec_core() and prints
 * it as a FASTA record.
 *
 * @param a       PAF records of a single read; filtered in place
 * @param opt     options (min_rlen, ec, plus those used by the helpers)
 * @param fp_seq  opened read file, or null for the report-only mode
 * @param buf     scratch byte buffer
 * @param ecb     output byte buffer for the corrected sequence
 * @throws Error if the read is not found in `fp_seq` or its length disagrees
 *         with the PAF query length
 */
function process_paf(a, opt, fp_seq, buf, ecb)
{
	if (a.length == 0) return;
	var read_len = a[0][1], name = a[0][0], seq = null, i, j;
	if (read_len < opt.min_rlen) return;
	if (fp_seq) {
		// the read file is consumed sequentially until the wanted read shows up
		var rec = read_fastx(fp_seq, buf);
		while (rec != null && rec[0] != a[0][0])
			rec = read_fastx(fp_seq, buf);
		if (rec == null)
			throw Error("failed to find sequence for read '" + a[0][0] + "'");
		name = rec[0];
		seq = rec[1];
		if (seq.length != read_len)
			throw Error("inconsistent length for read '" + name + "'");
	}
	filter_paf(a, opt);
	if (a.length == 0) return;

	var ev = [];
	for (i = 0; i < a.length; ++i)
		parse_events(a[i], ev, i, buf);
	// order by start position, then by event type
	ev.sort(function(x,y) { return x[0]!=y[0]? x[0]-y[0] : x[2]-y[2] });

	var report_only = (seq == null);
	if (report_only) print("SQ", name, a[0][1], a.length);
	var b = find_het_sub(ev, a, opt);
	if (opt.ec) {
		flt_utg_for_ec(b, opt);
	} else {
		flt_utg_for_bin(b, opt);
	}

	if (!report_only) { // error correction
		if (b.length == 0) return;
		buf.set(seq, 0);
		ec_core(b, a.length, ev, buf, ecb);
		print(">" + name);
		print(ecb);
		return;
	}
	for (i = 0; i < b.length; ++i) {
		var aln = a[b[i][2]], score = 0, hit;
		for (j = 10; j < aln.length; ++j) {
			hit = /^AS:i:(\d+)/.exec(aln[j]);
			if (hit != null) score = hit[1];
		}
		print("TS", b[i][2], b[i][0], b[i][1], aln.slice(5, 9).join("\t"), b[i].slice(3, 7).join("\t"), score);
	}
	print("//");
}
277
+
278
/**
 * Command-line entry point: parse options, read a PAF file grouped by query
 * name and hand each group of records to process_paf(). With a second
 * positional argument (the reads) the script runs in error-correction mode;
 * otherwise it prints the per-alignment report preceded by "CC" header lines.
 *
 * @param args  command-line arguments
 * @return 0
 */
function main(args)
{
	var c, opt = { min_rlen:5000, min_blen:5000, min_iden:0.8, min_mlen:5, max_clip_len:500, max_ratio0:0.25, dbg_ev:false };
	// "-s" is the letter advertised in the usage text; "-m" is what the parser
	// historically accepted, so both are taken. Both set min_mlen, the field
	// find_het_sub() actually reads (the old code stored into an unused min_slen).
	while ((c = getopt(args, "l:b:d:m:s:c:r:E")) != null) {
		if (c == 'l') opt.min_rlen = parseInt(getopt.arg, 10);
		else if (c == 'b') opt.min_blen = parseInt(getopt.arg, 10);
		else if (c == 'd') opt.min_iden = parseFloat(getopt.arg);
		else if (c == 'm' || c == 's') opt.min_mlen = parseInt(getopt.arg, 10);
		else if (c == 'c') opt.max_clip_len = parseInt(getopt.arg, 10);
		else if (c == 'r') opt.max_ratio0 = parseFloat(getopt.arg);
		else if (c == 'E') opt.dbg_ev = true;
	}
	if (args.length - getopt.ind < 1) {
		print("Usage: mmphase.js [options] <map-with-cs.paf> [reads.fa]");
		print("Options:");
		print(" -l INT min read length [" + opt.min_rlen + "]");
		print(" -b INT min alignment length [" + opt.min_blen + "]");
		print(" -d FLOAT min identity [" + opt.min_iden + "]");
		print(" -s INT min match length [" + opt.min_mlen + "]");
		print(" -c INT max clip length [" + opt.max_clip_len + "]");
		print(" -r FLOAT initial ratio for haplotype filtering [" + opt.max_ratio0 + "]");
		return 0;
	}

	opt.ec = args.length - getopt.ind < 2? false : true;
	if (!opt.ec) {
		print("CC");
		print("CC", "SQ qName qLen nHits");
		print("CC", "TS index qStart qEnd tName tLen tStart tEnd nConsistent lCons nConflictive lConf score");
		print("CC");
	}

	var buf = new Bytes(), ecb = new Bytes();
	var fp_paf = new File(args[getopt.ind]);
	var fp_seq = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : null;
	var a = []; // records of the read currently being collected
	while (fp_paf.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (a.length > 0 && a[0][0] != t[0]) { // query name changed: flush the previous read
			process_paf(a, opt, fp_seq, buf, ecb);
			a.length = 0;
		}
		for (var i = 1; i <= 3; ++i) t[i] = parseInt(t[i], 10);
		if (t[1] < opt.min_rlen) continue;
		for (var i = 6; i <= 10; ++i) t[i] = parseInt(t[i], 10);
		if (t[10] < opt.min_blen) continue;
		a.push(t);
	}
	if (a.length > 0) // flush the last read
		process_paf(a, opt, fp_seq, buf, ecb);
	if (fp_seq) fp_seq.close();
	fp_paf.close();
	ecb.destroy();
	buf.destroy();
	return 0;
}

var ret = main(arguments);
exit(ret);