minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,179 @@
1
+ ## <a name="started"></a>Getting Started
2
+
3
+ ```sh
4
+ # install minimap2
5
+ git clone https://github.com/lh3/minimap2
6
+ cd minimap2 && make
7
+ # install the k8 javascript shell
8
+ curl -L https://github.com/attractivechaos/k8/releases/download/v0.2.4/k8-0.2.4.tar.bz2 | tar -jxf -
9
+ cp k8-0.2.4/k8-`uname -s` k8 # or copy it to a directory on your $PATH
10
+ # export PATH="$PATH:`pwd`:`pwd`/misc" # run this if k8, minimap2 or paftools.js not on your $PATH
11
+ minimap2 --cs test/MT-human.fa test/MT-orang.fa | paftools.js view - # view alignment
12
+ minimap2 -c test/MT-human.fa test/MT-orang.fa | paftools.js stat - # basic alignment statistics
13
+ minimap2 -c --cs test/MT-human.fa test/MT-orang.fa \
14
+ | sort -k6,6 -k8,8n | paftools.js call -L15000 - # calling variants from asm-to-ref alignment
15
+ minimap2 -c test/MT-human.fa test/MT-orang.fa \
16
+ | paftools.js liftover -l10000 - <(echo -e "MT_orang\t2000\t5000") # liftOver
17
+ # no test data for the following examples
18
+ paftools.js junceval -e anno.gtf splice.sam > out.txt # compare splice junctions to annotations
19
+ paftools.js gff2bed anno.gtf > anno.bed # convert GTF/GFF3 to BED12
20
+ ```
21
+
22
+ ## Table of Contents
23
+
24
+ - [Getting Started](#started)
25
+ - [Introduction](#intro)
26
+ - [Evaluation](#eval)
27
+ - [Evaluating mapping accuracy with simulated reads](#mapeval)
28
+ - [Evaluating read overlap sensitivity](#oveval)
29
+ - [Calling Variants from Haploid Assemblies](#asmvar)
30
+
31
+ ## <a name="intro"></a>Introduction
32
+
33
+ paftools.js is a script that processes alignments in the [PAF format][paf],
34
+ such as converting between formats, evaluating mapping accuracy, lifting over
35
+ BED files based on alignment, and calling variants from assembly-to-assembly
36
+ alignment. This script *requires* the [k8 Javascript shell][k8] to run. On
37
+ Linux or Mac, you can download the precompiled k8 binary with:
38
+
39
+ ```sh
40
+ curl -L https://github.com/attractivechaos/k8/releases/download/v0.2.4/k8-0.2.4.tar.bz2 | tar -jxf -
41
+ cp k8-0.2.4/k8-`uname -s` $HOME/bin/k8 # assuming $HOME/bin in your $PATH
42
+ ```
43
+
44
+ It is highly recommended to copy the executable `k8` to a directory on your
45
+ `$PATH` so that `/usr/bin/env` can find it. Like Python scripts, once you
46
+ install `k8`, you can launch paftools.js in one of the two ways:
47
+
48
+ ```sh
49
+ path/to/paftools.js # only if k8 is on your $PATH
50
+ k8 path/to/paftools.js
51
+ ```
52
+
53
+ In a nutshell, paftools.js has the following commands:
54
+
55
+ ```
56
+ Usage: paftools.js <command> [arguments]
57
+ Commands:
58
+ view convert PAF to BLAST-like (for eyeballing) or MAF
59
+ splice2bed convert spliced alignment in PAF/SAM to BED12
60
+ sam2paf convert SAM to PAF
61
+ delta2paf convert MUMmer's delta to PAF
62
+ gff2bed convert GTF/GFF3 to BED12
63
+
64
+ stat collect basic mapping information in PAF/SAM
65
+ liftover simplistic liftOver
66
+ call call variants from asm-to-ref alignment with the cs tag
67
+ bedcov compute the number of bases covered
68
+
69
+ mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ
70
+ mason2fq convert mason2-simulated SAM to FASTQ
71
+ pbsim2fq convert PBSIM-simulated MAF to FASTQ
72
+ junceval evaluate splice junction consistency with known annotations
73
+ ov-eval evaluate read overlap sensitivity using read-to-ref mapping
74
+ ```
75
+
76
+ paftools.js seamlessly reads both plain text files and gzip'd text files.
77
+
78
+ ## <a name="eval"></a>Evaluation
79
+
80
+ ### <a name="mapeval"></a>Evaluating mapping accuracy with simulated reads
81
+
82
+ The **pbsim2fq** command of paftools.js converts the MAF output of [pbsim][pbsim]
83
+ to FASTQ and encodes the true mapping position in the read name in a format like
84
+ `S1_33!chr1!225258409!225267761!-`. Similarly, the **mason2fq** command
85
+ converts [mason2][mason2] simulated SAM to FASTQ.
86
+
87
+ Command **mapeval** evaluates mapped SAM/PAF. Here is example output:
88
+
89
+ ```
90
+ Q 60 32478 0 0.000000000 32478
91
+ Q 22 16 1 0.000030775 32494
92
+ Q 21 43 1 0.000061468 32537
93
+ Q 19 73 1 0.000091996 32610
94
+ Q 14 66 1 0.000122414 32676
95
+ Q 10 27 3 0.000214048 32703
96
+ Q 8 14 1 0.000244521 32717
97
+ Q 7 13 2 0.000305530 32730
98
+ Q 6 46 1 0.000335611 32776
99
+ Q 3 10 1 0.000366010 32786
100
+ Q 2 20 2 0.000426751 32806
101
+ Q 1 248 94 0.003267381 33054
102
+ Q 0 31 17 0.003778147 33085
103
+ U 3
104
+ ```
105
+
106
+ where each Q-line gives the quality threshold, the number of reads mapped with
107
+ mapping quality equal to or greater than the threshold, number of wrong
108
+ mappings, cumulative mapping error rate and the cumulative number of
109
+ mapped reads. The U-line, if present, gives the number of unmapped reads if
110
+ they are present in the SAM file.
111
+
112
+ Suppose the reported mapping coordinates overlap with the true coordinates like
113
+ the following:
114
+
115
+ ```
116
+ truth: --------------------
117
+ mapper: ----------------------
118
+ |<- l1 ->|<-- o -->|<-- l2 -->|
119
+ ```
120
+
121
+ Let `r=o/(l1+o+l2)`. The reported mapping is considered correct if `r>0.1` by
122
+ default.
123
+
124
+ ### <a name="oveval"></a>Evaluating read overlap sensitivity
125
+
126
+ Command **ov-eval** takes *sorted* read-to-reference alignment and read
127
+ overlaps in PAF as input, and evaluates the sensitivity. For example:
128
+
129
+ ```sh
130
+ minimap2 -cx map-pb ref.fa reads.fq.gz | sort -k6,6 -k8,8n > reads-to-ref.paf
131
+ minimap2 -x ava-pb reads.fq.gz reads.fq.gz > ovlp.paf
132
+ k8 paftools.js ov-eval reads-to-ref.paf ovlp.paf
133
+ ```
134
+
135
+ ## <a name="asmvar"></a>Calling Variants from Haploid Assemblies
136
+
137
+ The **call** command of paftools.js calls variants from coordinate-sorted
138
+ assembly-to-reference alignment. It calls variants from the [cs tag][cs] and
139
+ identifies confident/callable regions as those covered by exactly one contig.
140
+ Here are example command lines:
141
+
142
+ ```sh
143
+ minimap2 -cx asm5 -t8 --cs ref.fa asm.fa > asm.paf # keeping this file is recommended; --cs required!
144
+ sort -k6,6 -k8,8n asm.paf > asm.srt.paf # sort by reference start coordinate
145
+ k8 paftools.js call asm.srt.paf > asm.var.txt
146
+ ```
147
+
148
+ Here is sample output:
149
+
150
+ ```
151
+ V chr1 2276040 2276041 1 60 c g LJII01000171.1 1217409 1217410 +
152
+ V chr1 2280409 2280410 1 60 a g LJII01000171.1 1221778 1221779 +
153
+ V chr1 2280504 2280505 1 60 a g LJII01000171.1 1221873 1221874 +
154
+ R chr1 2325140 2436340
155
+ V chr1 2325287 2325287 1 60 - ct LJII01000171.1 1272894 1272896 +
156
+ V chr1 2325642 2325644 1 60 tt - LJII01000171.1 1273251 1273251 +
157
+ V chr1 2326051 2326052 1 60 c t LJII01000171.1 1273658 1273659 +
158
+ V chr1 2326287 2326288 1 60 c t LJII01000171.1 1273894 1273895 +
159
+ ```
160
+
161
+ where a line starting with `R` gives regions covered by one query contig, and a
162
+ V-line encodes a variant in the following format: chr, start, end, query depth,
163
+ mapping quality, REF allele, ALT allele, query name, query start, end and the
164
+ query orientation. Generally, you should only look at variants where column 5
165
+ is one.
166
+
167
+ By default, when calling variants, "paftools.js call" ignores alignments 50kb
168
+ or shorter; when deriving callable regions, it ignores alignments 10kb or
169
+ shorter. It uses two thresholds to avoid edge effects. These defaults are
170
+ designed for long-read assemblies. For short reads, both should be reduced.
171
+
172
+
173
+
174
+ [paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
175
+ [cs]: https://github.com/lh3/minimap2#cs
176
+ [k8]: https://github.com/attractivechaos/k8
177
+ [maf]: https://genome.ucsc.edu/FAQ/FAQformat#format5
178
+ [pbsim]: https://github.com/pfaucon/PBSIM-PacBio-Simulator
179
+ [mason2]: https://github.com/seqan/seqan/tree/master/apps/mason2
@@ -0,0 +1,335 @@
1
+ #!/usr/bin/env k8
2
+
3
/**
 * Minimal getopt(3)-like command-line option scanner.
 *
 * Scanner state is kept on the function object itself:
 *   getopt.ind   - index into `args` of the word currently being scanned
 *   getopt.arg   - argument of the option returned last (null if it takes none)
 *   getopt.place - offset inside the current word, or -1 when a new word must be started
 *
 * @param args  array of command-line words
 * @param ostr  option string; a letter followed by ':' takes an argument
 * @return the option letter; '?' for an unknown option or a missing argument
 *         (':' for a missing argument when `ostr` starts with ':'); null when
 *         option scanning stops (non-option word, "--", or end of `args`)
 */
var getopt = function(args, ostr) {
	// lazily initialise the scanner state on first use
	if (typeof(getopt.place) == 'undefined') {
		getopt.ind = 0;
		getopt.arg = null;
		getopt.place = -1;
	}

	if (getopt.place == -1) { // start scanning a new word
		if (getopt.ind >= args.length) {
			getopt.place = -1;
			return null;
		}
		var first = args[getopt.ind];
		if (first.charAt(0) != '-') { // not an option: stop
			getopt.place = -1;
			return null;
		}
		getopt.place = 0;
		if (first.length > 1) {
			getopt.place = 1;
			if (first.charAt(1) == '-') { // found "--"
				++getopt.ind;
				getopt.place = -1;
				return null;
			}
		}
	}

	var word = args[getopt.ind];
	var letter = word.charAt(getopt.place); // character checked for validity
	getopt.place += 1;

	var at = (letter == ':') ? -1 : ostr.indexOf(letter);
	if (at < 0) {
		// a bare '-' that is not a declared option ends option scanning
		if (letter == '-') return null;
		if (getopt.place < 0) ++getopt.ind;
		return '?';
	}

	var wantsArg = at + 1 < ostr.length && ostr.charAt(at + 1) == ':';
	if (!wantsArg) {
		getopt.arg = null;
		// move on to the next word once the current cluster is exhausted
		if (getopt.place < 0 || getopt.place >= word.length) {
			++getopt.ind;
			getopt.place = -1;
		}
		return letter;
	}

	if (getopt.place >= 0 && getopt.place < word.length) {
		// argument glued to the option, e.g. "-b5"
		getopt.arg = word.substr(getopt.place);
	} else {
		++getopt.ind;
		if (args.length <= getopt.ind) { // no argument left
			getopt.place = -1;
			return (ostr.length > 0 && ostr.charAt(0) == ':') ? ':' : '?';
		}
		getopt.arg = args[getopt.ind]; // argument is the following word
	}
	getopt.place = -1;
	++getopt.ind;
	return letter;
};
40
+
41
/**
 * Read the next record from a FASTA/FASTQ stream.
 *
 * Only one sequence line is read per record. For a FASTQ record ('@' header)
 * the two lines following the sequence are consumed and discarded.
 *
 * @param file  reader exposing readline(buf), which returns a negative value at end of input
 * @param buf   line buffer filled by readline(); its toString() yields the line
 * @return [name, sequence], or null when no header line is left
 * @throws Error if the header line does not start with '>' or '@' followed by
 *         a name, or if the sequence line is missing
 */
function read_fastx(file, buf)
{
	if (file.readline(buf) < 0) return null;
	var header = /^([>@])(\S+)/.exec(buf.toString());
	if (header == null)
		throw Error("wrong fastx format");
	if (file.readline(buf) < 0)
		throw Error("missing sequence line");
	var record = [header[2], buf.toString()];
	if (header[1] == '@') { // FASTQ: skip the separator and quality lines
		file.readline(buf);
		file.readline(buf);
	}
	return record;
}
58
+
59
/**
 * Drop, in place, the PAF records of one read that are too short, too
 * divergent, or clipped too much at either end.
 *
 * Columns used (0-based): 1 query length, 2/3 query start/end, 4 strand,
 * 6 target length, 7/8 target start/end, 9 matching bases, 10 block length.
 *
 * @param a    array of split PAF records with numeric columns already parsed; compacted in place
 * @param opt  options; reads min_blen, min_iden and max_clip_len
 */
function filter_paf(a, opt)
{
	if (a.length == 0) return;
	var lesser = function(x, y) { return x < y ? x : y; };
	var kept = 0;
	for (var i = 0; i < a.length; ++i) {
		var rec = a[i];
		if (rec[10] < opt.min_blen) continue;
		if (rec[9] < rec[10] * opt.min_iden) continue;
		// unaligned lengths before/after the alignment on the query and on the target
		var qHead = rec[2], qTail = rec[1] - rec[3];
		var tHead = rec[7], tTail = rec[6] - rec[8];
		var clipStart, clipEnd;
		if (rec[4] == '+') {
			clipStart = lesser(qHead, tHead);
			clipEnd = lesser(qTail, tTail);
		} else { // any other strand value: the query start is paired with the target end
			clipStart = lesser(qHead, tTail);
			clipEnd = lesser(qTail, tHead);
		}
		if (clipStart > opt.max_clip_len || clipEnd > opt.max_clip_len) continue;
		a[kept++] = rec;
	}
	a.length = kept;
}
80
+
81
/**
 * Decode the cs tag of one PAF record into events appended to `ev`.
 *
 * Each event is [start, end, type, index, changed_base] where start/end are
 * positions counted from PAF column 3 (t[2]) and `type` is:
 *    0  identical match (":N"), spans N positions
 *    1  substitution ("*xy"), spans 1 position; changed_base is x
 *    2  "+seq" operation, spans seq.length positions; no changed_base
 *   -1  "-seq" operation, spans 0 positions; changed_base is seq
 *
 * The first tag from column 12 onwards that matches "cs:Z:" is used, and it is
 * lower-cased before parsing. A record without one triggers a warning and
 * contributes no event.
 *
 * @param t    split PAF record with numeric columns already parsed
 * @param ev   output array of events, appended to
 * @param id   index stored in every event generated for this record
 * @param buf  unused
 * @throws Error if the positions implied by the cs string do not end at t[3]
 */
function parse_events(t, ev, id, buf)
{
	var cs = null;
	for (var j = 12; j < t.length && cs == null; ++j) {
		var tag = /^cs:Z:(\S+)/.exec(t[j]);
		if (tag != null) cs = tag[1].toLowerCase();
	}
	if (cs == null) {
		warn("Warning: no cs tag for read '" + t[0] + "'");
		return;
	}
	var opRe = /(:(\d+))|(([\+\-\*])([a-z]+))/g;
	var pos = t[2], op, step;
	while ((op = opRe.exec(cs)) != null) {
		if (op[2] != null) { // an identical match ":\d+"
			step = parseInt(op[2]);
			ev.push([pos, pos + step, 0, id]);
		} else if (op[4] == '*') {
			step = 1;
			ev.push([pos, pos + 1, 1, id, op[5][0]]);
		} else if (op[4] == '+') {
			step = op[5].length;
			ev.push([pos, pos + step, 2, id]);
		} else if (op[4] == '-') {
			step = 0;
			ev.push([pos, pos, -1, id, op[5]]);
		}
		pos += step;
	}
	if (pos != t[3])
		throw Error("inconsistent cs for read '" + t[0] + "'");
}
120
+
121
/**
 * Pair substitution events with the match event that currently covers them
 * and summarise, per alignment, how often it sits on each side of such a pair.
 *
 * While walking `ev` in order, the function tracks an "anchor": the most
 * recent match event (type 0), preferring, among matches sharing a start, the
 * one that ends last. A substitution (type 1) starting before the anchor's end
 * is counted only when the anchor spans at least opt.min_mlen positions:
 *   - the anchor's alignment gets the anchor recorded once per distinct anchor start;
 *   - the substitution's alignment gets [position, anchor length] recorded.
 *
 * @param ev   events as built by parse_events(); the caller passes them sorted by start, then type
 * @param a    PAF records of the read; event index fields refer to positions in this array
 * @param opt  options; reads min_mlen and dbg_ev (prints an "EV" line per counted pair)
 * @return one entry per record of `a`:
 *         [start, end, index, #consistent, lenConsistent, #conflictive, lenConflictive, identity, mlen]
 */
function find_het_sub(ev, a, opt)
{
	var nHits = a.length, anchorIdx = -1, agree = [], clash = [];
	for (var i = 0; i < nHits; ++i) {
		agree.push([]);
		clash.push([]);
	}
	for (var i = 0; i < ev.length; ++i) {
		var e = ev[i];
		if (e[2] == 0) {
			if (anchorIdx < 0 || e[0] != ev[anchorIdx][0] || e[1] > ev[anchorIdx][1])
				anchorIdx = i;
			continue;
		}
		if (e[2] != 1 || anchorIdx < 0) continue;
		var anchor = ev[anchorIdx];
		if (!(e[0] < anchor[1])) continue; // substitution lies beyond the anchor
		if (!(anchor[1] - anchor[0] >= opt.min_mlen)) continue; // anchor too short to trust
		if (opt.dbg_ev) print("EV", anchor.join("\t"), "|", e.join("\t"));
		var list = agree[anchor[3]];
		if (list.length == 0 || list[list.length-1][0] != anchor[0])
			list.push([anchor[0], anchor[1]]);
		clash[e[3]].push([e[0], anchor[1] - anchor[0]]);
	}
	var b = [];
	for (var i = 0; i < nHits; ++i) {
		var agreeLen = 0, clashLen = 0;
		for (var j = 0; j < agree[i].length; ++j)
			agreeLen += agree[i][j][1] - agree[i][j][0];
		for (var j = 0; j < clash[i].length; ++j)
			clashLen += clash[i][j][1];
		var hit = a[i];
		b.push([hit[2], hit[3], i, agree[i].length, agreeLen, clash[i].length, clashLen, hit[9] / hit[10], hit[9]]);
	}
	return b;
}
152
+
153
/**
 * Filter alignment summaries for error correction, in place.
 *
 * Step 1 keeps an entry when it has neither consistent nor conflictive length
 * (entirely ambiguous), or when its conflictive length is below
 * opt.max_ratio0 times the sum of both.
 * Step 2 sorts the survivors by start and merges overlapping intervals into
 * runs. Unless the survivors form a single run, only entries overlapping the
 * longest run are kept.
 *
 * @param b    summaries from find_het_sub(): [start, end, index, #consistent,
 *             lenConsistent, #conflictive, lenConflictive, ...]; filtered and sorted in place
 * @param opt  options; reads max_ratio0
 */
function flt_utg_for_ec(b, opt)
{
	var kept = 0, i;
	for (i = 0; i < b.length; ++i) {
		var agreeLen = b[i][4], clashLen = b[i][6];
		var ambiguous = agreeLen == 0 && clashLen == 0; // no evidence either way
		if (ambiguous || clashLen < (agreeLen + clashLen) * opt.max_ratio0)
			b[kept++] = b[i];
	}
	b.length = kept;
	if (b.length == 0) return;

	// find the longest contiguous segment
	b.sort(function(p, q) { return p[0] - q[0] });
	var firstStart = b[0][0];
	var runStart = firstStart, runEnd = b[0][1];
	var bestStart = 0, bestEnd = 0, farthestEnd = runEnd;
	for (i = 1; i < b.length; ++i) {
		var s = b[i][0], e = b[i][1];
		if (s > runEnd) { // gap: close the current run and open a new one
			if (runEnd - runStart > bestEnd - bestStart) {
				bestStart = runStart;
				bestEnd = runEnd;
			}
			runStart = s;
			runEnd = e;
		} else if (!(runEnd > e)) {
			runEnd = e;
		}
		if (!(farthestEnd > e)) farthestEnd = e;
	}
	if (runEnd - runStart > bestEnd - bestStart) {
		bestStart = runStart;
		bestEnd = runEnd;
	}

	// nothing to trim when the last run started at the first entry and reaches the farthest end
	if (farthestEnd == runEnd && runStart == firstStart) return;
	kept = 0;
	for (i = 0; i < b.length; ++i)
		if (b[i][0] < bestEnd && b[i][1] > bestStart)
			b[kept++] = b[i];
	b.length = kept;
}
186
+
187
/**
 * Filter out alignments clearly on the wrong phase, in place.
 *
 * An entry survives when it carries no phasing evidence at all (consistent and
 * conflictive lengths sum to zero) or when its consistent length is at least
 * opt.max_ratio0 times the sum of both lengths.
 *
 * @param b    summaries from find_het_sub(); index 4 is the consistent length,
 *             index 6 the conflictive length; compacted in place
 * @param opt  options; reads max_ratio0
 */
function flt_utg_for_bin(b, opt)
{
	var kept = 0;
	for (var i = 0; i < b.length; ++i) {
		var entry = b[i];
		var evidence = entry[4] + entry[6];
		if (evidence == 0 || entry[4] >= evidence * opt.max_ratio0)
			b[kept++] = entry;
	}
	b.length = kept;
}
196
+
197
/**
 * Error correction: rebuild the read into `ecb` from the events of the
 * selected alignments.
 *
 * Each selected alignment is given a window: the first entry of `b` gets its
 * full [start, end); every later entry that ends beyond the end reached so far
 * gets [end so far, its end). Only events whose start falls inside the window
 * of their own alignment are applied, which reduces duplicated events around
 * junctions.
 *
 * Applied events: type 0 copies buf[start..end) unchanged; type 1 emits the
 * event's changed base; a negative type emits the event's sequence; type 2
 * emits nothing.
 *
 * @param b    non-empty alignment summaries [start, end, index, ...]; the caller passes them sorted by start
 * @param n_a  number of alignments the event indices may refer to
 * @param ev   events [start, end, type, index, changed_base]
 * @param buf  byte buffer holding the original read; `capacity` is read from it
 * @param ecb  output byte buffer; its capacity and length are reset here
 * @throws Error("BUG!") if the output length and the number of bytes written disagree
 */
function ec_core(b, n_a, ev, buf, ecb) // error correction
{
	var win = [], i, j;
	for (i = 0; i < n_a; ++i)
		win.push(null);
	win[b[0][2]] = [b[0][0], b[0][1]];
	var reached = b[0][1];
	for (i = 1; i < b.length; ++i) {
		if (b[i][1] <= reached) continue; // adds nothing beyond what is already covered
		win[b[i][2]] = [reached, b[i][1]];
		reached = b[i][1];
	}

	var written = 0;
	ecb.capacity = buf.capacity;
	ecb.length = 0;
	for (i = 0; i < ev.length; ++i) {
		var e = ev[i], w = win[e[3]];
		if (w == null) continue;
		if (!(e[0] >= w[0] && e[0] < w[1])) continue;
		var type = e[2];
		// the length is grown before the bytes are stored, as in the original
		if (type == 0) {
			ecb.length += e[1] - e[0];
			for (j = e[0]; j < e[1]; ++j)
				ecb[written++] = buf[j];
		} else if (type == 1) {
			++ecb.length;
			ecb[written++] = e[4].charCodeAt(0);
		} else if (type < 0) {
			ecb.length += e[4].length;
			for (j = 0; j < e[4].length; ++j)
				ecb[written++] = e[4].charCodeAt(j);
		} // else, skip type == 2
	}
	if (ecb.length != written) throw Error("BUG!");
}
233
+
234
/**
 * Process all PAF records of one read.
 *
 * Without a sequence file the function prints a report: one "SQ" line, one
 * "TS" line per surviving alignment, and a closing "//" line. With a sequence
 * file it looks the read up, runs error correction, and prints the corrected
 * read as a FASTA record.
 *
 * @param a       PAF records sharing one query name; filtered in place
 * @param opt     options; reads min_rlen and ec here and is passed on to the filters
 * @param fp_seq  open sequence file, or null for report mode; read forward until the read name is found
 * @param buf     scratch byte buffer
 * @param ecb     byte buffer receiving the corrected sequence
 * @throws Error if the read is not found in fp_seq or its length differs from PAF column 2
 */
function process_paf(a, opt, fp_seq, buf, ecb)
{
	if (a.length == 0) return;
	var qlen = a[0][1], name = a[0][0], seq = null;
	if (qlen < opt.min_rlen) return;
	if (fp_seq) {
		// scan forward through the sequence file until this read shows up
		var rec = read_fastx(fp_seq, buf);
		while (rec != null && rec[0] != a[0][0])
			rec = read_fastx(fp_seq, buf);
		if (rec == null)
			throw Error("failed to find sequence for read '" + a[0][0] + "'");
		name = rec[0];
		seq = rec[1];
		if (seq.length != qlen)
			throw Error("inconsistent length for read '" + name + "'");
	}
	filter_paf(a, opt);
	if (a.length == 0) return;

	var ev = [], i;
	for (i = 0; i < a.length; ++i)
		parse_events(a[i], ev, i, buf);
	// order events by start, then by type
	ev.sort(function(p, q) { return p[0] != q[0] ? p[0] - q[0] : p[2] - q[2] });

	var reportOnly = (seq == null);
	if (reportOnly) print("SQ", name, a[0][1], a.length);
	var b = find_het_sub(ev, a, opt);
	if (opt.ec) flt_utg_for_ec(b, opt);
	else flt_utg_for_bin(b, opt);

	if (!reportOnly) { // error correction
		if (b.length == 0) return;
		buf.set(seq, 0);
		ec_core(b, a.length, ev, buf, ecb);
		print(">" + name);
		print(ecb);
		return;
	}
	for (i = 0; i < b.length; ++i) {
		var hit = a[b[i][2]], score = 0, tag;
		// the last AS:i: tag found wins; 0 when there is none
		for (var j = 10; j < hit.length; ++j) {
			tag = /^AS:i:(\d+)/.exec(hit[j]);
			if (tag != null) score = tag[1];
		}
		print("TS", b[i][2], b[i][0], b[i][1], hit.slice(5, 9).join("\t"), b[i].slice(3, 7).join("\t"), score);
	}
	print("//");
}
277
+
278
/**
 * Command-line entry point of mmphase.js.
 *
 * Parses the options, then streams the PAF file, grouping consecutive records
 * that share a query name and handing each group to process_paf(). With a
 * second positional argument (the reads file) the script runs in error
 * correction mode; otherwise it prints a report preceded by "CC" header lines.
 *
 * Changes relative to the previous revision:
 *   - "-m" now sets opt.min_mlen, the field find_het_sub() reads; it used to
 *     set opt.min_slen, which nothing reads, so the threshold could not be changed.
 *   - "-s", the spelling the usage text used to advertise, is accepted as an
 *     alias of "-m"; the usage text now shows "-m".
 *   - the trailing group is processed only when it is non-empty
 *     (the old test `a.length >= 0` was always true).
 *   - numeric fields are parsed with an explicit radix of 10.
 *   - 0 is returned on the normal path as well as on the usage path.
 *
 * @param args  command-line words: options, then <map-with-cs.paf> [reads.fa]
 * @return 0
 */
function main(args)
{
	var c, opt = { min_rlen:5000, min_blen:5000, min_iden:0.8, min_mlen:5, max_clip_len:500, max_ratio0:0.25, dbg_ev:false };
	while ((c = getopt(args, "l:b:d:m:s:c:r:E")) != null) {
		if (c == 'l') opt.min_rlen = parseInt(getopt.arg, 10);
		else if (c == 'b') opt.min_blen = parseInt(getopt.arg, 10);
		else if (c == 'd') opt.min_iden = parseFloat(getopt.arg);
		else if (c == 'm' || c == 's') opt.min_mlen = parseInt(getopt.arg, 10);
		else if (c == 'c') opt.max_clip_len = parseInt(getopt.arg, 10);
		else if (c == 'r') opt.max_ratio0 = parseFloat(getopt.arg);
		else if (c == 'E') opt.dbg_ev = true;
	}
	if (args.length - getopt.ind < 1) {
		print("Usage: mmphase.js [options] <map-with-cs.paf> [reads.fa]");
		print("Options:");
		print(" -l INT min read length [" + opt.min_rlen + "]");
		print(" -b INT min alignment length [" + opt.min_blen + "]");
		print(" -d FLOAT min identity [" + opt.min_iden + "]");
		print(" -m INT min match length [" + opt.min_mlen + "]");
		print(" -c INT max clip length [" + opt.max_clip_len + "]");
		print(" -r FLOAT initial ratio for haplotype filtering [" + opt.max_ratio0 + "]");
		return 0;
	}

	// a second positional argument (the reads file) switches on error correction
	opt.ec = args.length - getopt.ind < 2? false : true;
	if (!opt.ec) {
		print("CC");
		print("CC", "SQ qName qLen nHits");
		print("CC", "TS index qStart qEnd tName tLen tStart tEnd nConsistent lCons nConflictive lConf score");
		print("CC");
	}

	var buf = new Bytes(), ecb = new Bytes();
	var fp_paf = new File(args[getopt.ind]);
	var fp_seq = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : null;
	var a = [];
	while (fp_paf.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		// a new query name closes the current group
		if (a.length > 0 && a[0][0] != t[0]) {
			process_paf(a, opt, fp_seq, buf, ecb);
			a.length = 0;
		}
		for (var i = 1; i <= 3; ++i) t[i] = parseInt(t[i], 10);
		if (t[1] < opt.min_rlen) continue;
		for (var i = 6; i <= 10; ++i) t[i] = parseInt(t[i], 10);
		if (t[10] < opt.min_blen) continue;
		a.push(t);
	}
	if (a.length > 0) // flush the last group
		process_paf(a, opt, fp_seq, buf, ecb);
	if (fp_seq) fp_seq.close();
	fp_paf.close();
	ecb.destroy();
	buf.destroy();
	return 0;
}
333
+
334
// Script entry point: run main() on the command-line words and exit with its return value.
var ret = main(arguments);
exit(ret);