bio-bwa 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bntseq.h
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
/* The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008 Genome Research Ltd (GRL).
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
20
|
+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
21
|
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
22
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
+
SOFTWARE.
|
24
|
+
*/
|
25
|
+
|
26
|
+
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
27
|
+
|
28
|
+
#ifndef BWT_BNTSEQ_H
|
29
|
+
#define BWT_BNTSEQ_H
|
30
|
+
|
31
|
+
#include <stdint.h>
|
32
|
+
#include <zlib.h>
|
33
|
+
|
34
|
+
#ifndef BWA_UBYTE
|
35
|
+
#define BWA_UBYTE
|
36
|
+
typedef uint8_t ubyte_t;
|
37
|
+
#endif
|
38
|
+
|
39
|
+
typedef struct { /* Per-sequence record; bntseq_t.anns is an array of these. */
|
40
|
+
int64_t offset; /* presumably the start of this sequence within the concatenated/packed reference -- TODO confirm in bntseq.c */
|
41
|
+
int32_t len; /* presumably the sequence length in bases -- TODO confirm */
|
42
|
+
int32_t n_ambs; /* presumably the number of ambiguous regions in this sequence -- TODO confirm */
|
43
|
+
uint32_t gi; /* NOTE(review): looks like a numeric sequence identifier; meaning not visible in this header */
|
44
|
+
char *name, *anno; /* two C strings (name and annotation, judging by the identifiers); ownership is not visible here */
|
45
|
+
} bntann1_t;
|
46
|
+
|
47
|
+
typedef struct { /* Per-"hole" record; bntseq_t.ambs is an array of these. */
|
48
|
+
int64_t offset; /* presumably where the ambiguous run starts in the packed reference -- TODO confirm in bntseq.c */
|
49
|
+
int32_t len; /* presumably the length of the run -- TODO confirm */
|
50
|
+
char amb; /* a single character; presumably the ambiguous base letter that was replaced -- TODO confirm */
|
51
|
+
} bntamb1_t;
|
52
|
+
|
53
|
+
typedef struct { /* Top-level handle passed to the bns_* functions declared in this header. */
|
54
|
+
int64_t l_pac; /* presumably the total length of the packed sequence -- TODO confirm in bntseq.c */
|
55
|
+
int32_t n_seqs; /* number of entries in anns */
|
56
|
+
uint32_t seed; /* NOTE(review): purpose not visible in this header; looks like a random seed -- confirm */
|
57
|
+
bntann1_t *anns; // n_seqs elements
|
58
|
+
int32_t n_holes; /* number of entries in ambs */
|
59
|
+
bntamb1_t *ambs; // n_holes elements
|
60
|
+
FILE *fp_pac; /* NOTE(review): FILE needs <stdio.h>; the only includes visible in this header are <stdint.h> and <zlib.h>, so the includer may have to pull in <stdio.h> first -- confirm */
|
61
|
+
} bntseq_t;
|
62
|
+
|
63
|
+
extern unsigned char nst_nt4_table[256];
|
64
|
+
|
65
|
+
#ifdef __cplusplus
|
66
|
+
extern "C" {
|
67
|
+
#endif
|
68
|
+
|
69
|
+
void bns_dump(const bntseq_t *bns, const char *prefix); /* takes a read-only bntseq_t and a file-name prefix; presumably writes the metadata files -- confirm in bntseq.c */
|
70
|
+
bntseq_t *bns_restore(const char *prefix); /* returns a bntseq_t built from a prefix; presumably derives the file names and calls bns_restore_core -- confirm */
|
71
|
+
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); /* same return type as bns_restore, but the three file names are given explicitly */
|
72
|
+
void bns_destroy(bntseq_t *bns); /* presumably releases a bntseq_t obtained from bns_restore* -- confirm what it frees/closes */
|
73
|
+
void bns_fasta2bntseq(gzFile fp_fa, const char *prefix); /* input is a zlib gzFile handle; outputs are presumably named from prefix -- confirm */
|
74
|
+
int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq); /* writes through real_seq; meaning of the int return value is not visible here -- confirm */
|
75
|
+
|
76
|
+
#ifdef __cplusplus
|
77
|
+
}
|
78
|
+
#endif
|
79
|
+
|
80
|
+
#endif
|
data/ext/bwa.1
ADDED
@@ -0,0 +1,562 @@
|
|
1
|
+
.TH bwa 1 "24 January 2011" "bwa-0.5.9" "Bioinformatics tools"
|
2
|
+
.SH NAME
|
3
|
+
.PP
|
4
|
+
bwa - Burrows-Wheeler Alignment Tool
|
5
|
+
.SH SYNOPSIS
|
6
|
+
.PP
|
7
|
+
bwa index -a bwtsw database.fasta
|
8
|
+
.PP
|
9
|
+
bwa aln database.fasta short_read.fastq > aln_sa.sai
|
10
|
+
.PP
|
11
|
+
bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam
|
12
|
+
.PP
|
13
|
+
bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam
|
14
|
+
.PP
|
15
|
+
bwa bwasw database.fasta long_read.fastq > aln.sam
|
16
|
+
|
17
|
+
.SH DESCRIPTION
|
18
|
+
.PP
|
19
|
+
BWA is a fast light-weighted tool that aligns relatively short sequences
|
20
|
+
(queries) to a sequence database (target), such as the human reference
|
21
|
+
genome. It implements two different algorithms, both based on
|
22
|
+
Burrows-Wheeler Transform (BWT). The first algorithm is designed for
|
23
|
+
short queries up to ~200bp with low error rate (<3%). It does gapped
|
24
|
+
global alignment w.r.t. queries, supports paired-end reads, and is one
|
25
|
+
of the fastest short read alignment algorithms to date while also
|
26
|
+
visiting suboptimal hits. The second algorithm, BWA-SW, is designed for
|
27
|
+
long reads with more errors. It performs heuristic Smith-Waterman-like
|
28
|
+
alignment to find high-scoring local hits (and thus chimera). On
|
29
|
+
low-error short queries, BWA-SW is slower and less accurate than the
|
30
|
+
first algorithm, but on long queries, it is better.
|
31
|
+
.PP
|
32
|
+
For both algorithms, the database file in the FASTA format must be
|
33
|
+
first indexed with the
|
34
|
+
.B `index'
|
35
|
+
command, which typically takes a few hours. The first algorithm is
|
36
|
+
implemented via the
|
37
|
+
.B `aln'
|
38
|
+
command, which finds the suffix array (SA) coordinates of good hits of
|
39
|
+
each individual read, and the
|
40
|
+
.B `samse/sampe'
|
41
|
+
command, which converts SA coordinates to chromosomal coordinates and
|
42
|
+
pairs reads (for `sampe'). The second algorithm is invoked by the
|
43
|
+
.B `bwasw'
|
44
|
+
command. It works for single-end reads only.
|
45
|
+
|
46
|
+
.SH COMMANDS AND OPTIONS
|
47
|
+
.TP
|
48
|
+
.B index
|
49
|
+
bwa index [-p prefix] [-a algoType] [-c] <in.db.fasta>
|
50
|
+
|
51
|
+
Index database sequences in the FASTA format.
|
52
|
+
|
53
|
+
.B OPTIONS:
|
54
|
+
.RS
|
55
|
+
.TP 10
|
56
|
+
.B -c
|
57
|
+
Build color-space index. The input fasta should be in nucleotide space.
|
58
|
+
.TP
|
59
|
+
.BI -p \ STR
|
60
|
+
Prefix of the output database [same as db filename]
|
61
|
+
.TP
|
62
|
+
.BI -a \ STR
|
63
|
+
Algorithm for constructing BWT index. Available options are:
|
64
|
+
.RS
|
65
|
+
.TP
|
66
|
+
.B is
|
67
|
+
IS linear-time algorithm for constructing suffix array. It requires
|
68
|
+
5.37N memory where N is the size of the database. IS is moderately fast,
|
69
|
+
but does not work with database larger than 2GB. IS is the default
|
70
|
+
algorithm due to its simplicity. The current codes for IS algorithm are
|
71
|
+
reimplemented by Yuta Mori.
|
72
|
+
.TP
|
73
|
+
.B bwtsw
|
74
|
+
Algorithm implemented in BWT-SW. This method works with the whole human
|
75
|
+
genome, but it does not work with database smaller than 10MB and it is
|
76
|
+
usually slower than IS.
|
77
|
+
.RE
|
78
|
+
.RE
|
79
|
+
|
80
|
+
.TP
|
81
|
+
.B aln
|
82
|
+
bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
|
83
|
+
nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M misMsc]
|
84
|
+
[-O gapOsc] [-E gapEsc] [-q trimQual] <in.db.fasta> <in.query.fq> >
|
85
|
+
<out.sai>
|
86
|
+
|
87
|
+
Find the SA coordinates of the input reads. Maximum
|
88
|
+
.I maxSeedDiff
|
89
|
+
differences are allowed in the first
|
90
|
+
.I seedLen
|
91
|
+
subsequence and maximum
|
92
|
+
.I maxDiff
|
93
|
+
differences are allowed in the whole sequence.
|
94
|
+
|
95
|
+
.B OPTIONS:
|
96
|
+
.RS
|
97
|
+
.TP 10
|
98
|
+
.BI -n \ NUM
|
99
|
+
Maximum edit distance if the value is INT, or the fraction of missing
|
100
|
+
alignments given 2% uniform base error rate if FLOAT. In the latter
|
101
|
+
case, the maximum edit distance is automatically chosen for different
|
102
|
+
read lengths. [0.04]
|
103
|
+
.TP
|
104
|
+
.BI -o \ INT
|
105
|
+
Maximum number of gap opens [1]
|
106
|
+
.TP
|
107
|
+
.BI -e \ INT
|
108
|
+
Maximum number of gap extensions, -1 for k-difference mode (disallowing
|
109
|
+
long gaps) [-1]
|
110
|
+
.TP
|
111
|
+
.BI -d \ INT
|
112
|
+
Disallow a long deletion within INT bp towards the 3'-end [16]
|
113
|
+
.TP
|
114
|
+
.BI -i \ INT
|
115
|
+
Disallow an indel within INT bp towards the ends [5]
|
116
|
+
.TP
|
117
|
+
.BI -l \ INT
|
118
|
+
Take the first INT subsequence as seed. If INT is larger than the query
|
119
|
+
sequence, seeding will be disabled. For long reads, this option is
|
120
|
+
typically ranged from 25 to 35 for `-k 2'. [inf]
|
121
|
+
.TP
|
122
|
+
.BI -k \ INT
|
123
|
+
Maximum edit distance in the seed [2]
|
124
|
+
.TP
|
125
|
+
.BI -t \ INT
|
126
|
+
Number of threads (multi-threading mode) [1]
|
127
|
+
.TP
|
128
|
+
.BI -M \ INT
|
129
|
+
Mismatch penalty. BWA will not search for suboptimal hits with a score
|
130
|
+
lower than (bestScore-misMsc). [3]
|
131
|
+
.TP
|
132
|
+
.BI -O \ INT
|
133
|
+
Gap open penalty [11]
|
134
|
+
.TP
|
135
|
+
.BI -E \ INT
|
136
|
+
Gap extension penalty [4]
|
137
|
+
.TP
|
138
|
+
.BI -R \ INT
|
139
|
+
Proceed with suboptimal alignments if there are no more than INT equally
|
140
|
+
best hits. This option only affects paired-end mapping. Increasing this
|
141
|
+
threshold helps to improve the pairing accuracy at the cost of speed,
|
142
|
+
especially for short reads (~32bp).
|
143
|
+
.TP
|
144
|
+
.B -c
|
145
|
+
Reverse query but not complement it, which is required for alignment in
|
146
|
+
the color space.
|
147
|
+
.TP
|
148
|
+
.B -N
|
149
|
+
Disable iterative search. All hits with no more than
|
150
|
+
.I maxDiff
|
151
|
+
differences will be found. This mode is much slower than the default.
|
152
|
+
.TP
|
153
|
+
.BI -q \ INT
|
154
|
+
Parameter for read trimming. BWA trims a read down to
|
155
|
+
argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l<INT where l is the original
|
156
|
+
read length. [0]
|
157
|
+
.TP
|
158
|
+
.B -I
|
159
|
+
The input is in the Illumina 1.3+ read format (quality equals ASCII-64).
|
160
|
+
.TP
|
161
|
+
.BI -B \ INT
|
162
|
+
Length of barcode starting from the 5'-end. When
|
163
|
+
.I INT
|
164
|
+
is positive, the barcode of each read will be trimmed before mapping and will
|
165
|
+
be written at the
|
166
|
+
.B BC
|
167
|
+
SAM tag. For paired-end reads, the barcodes from both ends are concatenated. [0]
|
168
|
+
.TP
|
169
|
+
.B -b
|
170
|
+
Specify that the input read sequence file is in the BAM format. For paired-end
|
171
|
+
data, two ends in a pair must be grouped together and options
|
172
|
+
.B -1
|
173
|
+
or
|
174
|
+
.B -2
|
175
|
+
are usually applied to specify which end should be mapped. Typical
|
176
|
+
command lines for mapping pair-end data in the BAM format are:
|
177
|
+
|
178
|
+
bwa aln ref.fa -b1 reads.bam > 1.sai
|
179
|
+
bwa aln ref.fa -b2 reads.bam > 2.sai
|
180
|
+
bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam > aln.sam
|
181
|
+
.TP
|
182
|
+
.B -0
|
183
|
+
When
|
184
|
+
.B -b
|
185
|
+
is specified, only use single-end reads in mapping.
|
186
|
+
.TP
|
187
|
+
.B -1
|
188
|
+
When
|
189
|
+
.B -b
|
190
|
+
is specified, only use the first read in a read pair in mapping (skip
|
191
|
+
single-end reads and the second reads).
|
192
|
+
.TP
|
193
|
+
.B -2
|
194
|
+
When
|
195
|
+
.B -b
|
196
|
+
is specified, only use the second read in a read pair in mapping.
|
197
|
+
.B
|
198
|
+
.RE
|
199
|
+
|
200
|
+
.TP
|
201
|
+
.B samse
|
202
|
+
bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam>
|
203
|
+
|
204
|
+
Generate alignments in the SAM format given single-end reads. Repetitive
|
205
|
+
hits will be randomly chosen.
|
206
|
+
|
207
|
+
.B OPTIONS:
|
208
|
+
.RS
|
209
|
+
.TP 10
|
210
|
+
.BI -n \ INT
|
211
|
+
Maximum number of alignments to output in the XA tag for reads paired
|
212
|
+
properly. If a read has more than INT hits, the XA tag will not be
|
213
|
+
written. [3]
|
214
|
+
.TP
|
215
|
+
.BI -r \ STR
|
216
|
+
Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null]
|
217
|
+
.RE
|
218
|
+
|
219
|
+
.TP
|
220
|
+
.B sampe
|
221
|
+
bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N maxHitDis]
|
222
|
+
[-P] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam>
|
223
|
+
|
224
|
+
Generate alignments in the SAM format given paired-end reads. Repetitive
|
225
|
+
read pairs will be placed randomly.
|
226
|
+
|
227
|
+
.B OPTIONS:
|
228
|
+
.RS
|
229
|
+
.TP 8
|
230
|
+
.BI -a \ INT
|
231
|
+
Maximum insert size for a read pair to be considered being mapped
|
232
|
+
properly. Since 0.4.5, this option is only used when there are not
|
233
|
+
enough good alignments to infer the distribution of insert sizes. [500]
|
234
|
+
.TP
|
235
|
+
.BI -o \ INT
|
236
|
+
Maximum occurrences of a read for pairing. A read with more occurrences
|
237
|
+
will be treated as a single-end read. Reducing this parameter helps
|
238
|
+
faster pairing. [100000]
|
239
|
+
.TP
|
240
|
+
.B -P
|
241
|
+
Load the entire FM-index into memory to reduce disk operations
|
242
|
+
(base-space reads only). With this option, at least 1.25N bytes of
|
243
|
+
memory are required, where N is the length of the genome.
|
244
|
+
.TP
|
245
|
+
.BI -n \ INT
|
246
|
+
Maximum number of alignments to output in the XA tag for reads paired
|
247
|
+
properly. If a read has more than INT hits, the XA tag will not be
|
248
|
+
written. [3]
|
249
|
+
.TP
|
250
|
+
.BI -N \ INT
|
251
|
+
Maximum number of alignments to output in the XA tag for discordant
|
252
|
+
read pairs (excluding singletons). If a read has more than INT hits, the
|
253
|
+
XA tag will not be written. [10]
|
254
|
+
.TP
|
255
|
+
.BI -r \ STR
|
256
|
+
Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null]
|
257
|
+
.RE
|
258
|
+
|
259
|
+
.TP
|
260
|
+
.B bwasw
|
261
|
+
bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t
|
262
|
+
nThreads] [-w bandWidth] [-T thres] [-s hspIntv] [-z zBest] [-N
|
263
|
+
nHspRev] [-c thresCoef] <in.db.fasta> <in.fq>
|
264
|
+
|
265
|
+
Align query sequences in the <in.fq> file.
|
266
|
+
|
267
|
+
.B OPTIONS:
|
268
|
+
.RS
|
269
|
+
.TP 10
|
270
|
+
.BI -a \ INT
|
271
|
+
Score of a match [1]
|
272
|
+
.TP
|
273
|
+
.BI -b \ INT
|
274
|
+
Mismatch penalty [3]
|
275
|
+
.TP
|
276
|
+
.BI -q \ INT
|
277
|
+
Gap open penalty [5]
|
278
|
+
.TP
|
279
|
+
.BI -r \ INT
|
280
|
+
Gap extension penalty. The penalty for a contiguous gap of size k is
|
281
|
+
q+k*r. [2]
|
282
|
+
.TP
|
283
|
+
.BI -t \ INT
|
284
|
+
Number of threads in the multi-threading mode [1]
|
285
|
+
.TP
|
286
|
+
.BI -w \ INT
|
287
|
+
Band width in the banded alignment [33]
|
288
|
+
.TP
|
289
|
+
.BI -T \ INT
|
290
|
+
Minimum score threshold divided by a [37]
|
291
|
+
.TP
|
292
|
+
.BI -c \ FLOAT
|
293
|
+
Coefficient for threshold adjustment according to query length. Given an
|
294
|
+
l-long query, the threshold for a hit to be retained is
|
295
|
+
a*max{T,c*log(l)}. [5.5]
|
296
|
+
.TP
|
297
|
+
.BI -z \ INT
|
298
|
+
Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]
|
299
|
+
.TP
|
300
|
+
.BI -s \ INT
|
301
|
+
Maximum SA interval size for initiating a seed. Higher -s increases
|
302
|
+
accuracy at the cost of speed. [3]
|
303
|
+
.TP
|
304
|
+
.BI -N \ INT
|
305
|
+
Minimum number of seeds supporting the resultant alignment to skip
|
306
|
+
reverse alignment. [5]
|
307
|
+
.RE
|
308
|
+
|
309
|
+
.SH SAM ALIGNMENT FORMAT
|
310
|
+
.PP
|
311
|
+
The output of the
|
312
|
+
.B `aln'
|
313
|
+
command is binary and designed for BWA use only. BWA outputs the final
|
314
|
+
alignment in the SAM (Sequence Alignment/Map) format. Each line consists
|
315
|
+
of:
|
316
|
+
|
317
|
+
.TS
|
318
|
+
center box;
|
319
|
+
cb | cb | cb
|
320
|
+
n | l | l .
|
321
|
+
Col Field Description
|
322
|
+
_
|
323
|
+
1 QNAME Query (pair) NAME
|
324
|
+
2 FLAG bitwise FLAG
|
325
|
+
3 RNAME Reference sequence NAME
|
326
|
+
4 POS 1-based leftmost POSition/coordinate of clipped sequence
|
327
|
+
5 MAPQ MAPping Quality (Phred-scaled)
|
328
|
+
6 CIGAR extended CIGAR string
|
329
|
+
7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME)
|
330
|
+
8 MPOS 1-based Mate POSition
|
331
|
+
9 ISIZE Inferred insert SIZE
|
332
|
+
10 SEQ query SEQuence on the same strand as the reference
|
333
|
+
11 QUAL query QUALity (ASCII-33 gives the Phred base quality)
|
334
|
+
12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE
|
335
|
+
.TE
|
336
|
+
|
337
|
+
.PP
|
338
|
+
Each bit in the FLAG field is defined as:
|
339
|
+
|
340
|
+
.TS
|
341
|
+
center box;
|
342
|
+
cb | cb | cb
|
343
|
+
c | l | l .
|
344
|
+
Chr Flag Description
|
345
|
+
_
|
346
|
+
p 0x0001 the read is paired in sequencing
|
347
|
+
P 0x0002 the read is mapped in a proper pair
|
348
|
+
u 0x0004 the query sequence itself is unmapped
|
349
|
+
U 0x0008 the mate is unmapped
|
350
|
+
r 0x0010 strand of the query (1 for reverse)
|
351
|
+
R 0x0020 strand of the mate
|
352
|
+
1 0x0040 the read is the first read in a pair
|
353
|
+
2 0x0080 the read is the second read in a pair
|
354
|
+
s 0x0100 the alignment is not primary
|
355
|
+
f 0x0200 QC failure
|
356
|
+
d 0x0400 optical or PCR duplicate
|
357
|
+
.TE
|
358
|
+
|
359
|
+
.PP
|
360
|
+
Please check <http://samtools.sourceforge.net> for the format
|
361
|
+
specification and the tools for post-processing the alignment.
|
362
|
+
|
363
|
+
BWA generates the following optional fields. Tags starting with `X' are
|
364
|
+
specific to BWA.
|
365
|
+
|
366
|
+
.TS
|
367
|
+
center box;
|
368
|
+
cb | cb
|
369
|
+
cB | l .
|
370
|
+
Tag Meaning
|
371
|
+
_
|
372
|
+
NM Edit distance
|
373
|
+
MD Mismatching positions/bases
|
374
|
+
AS Alignment score
|
375
|
+
BC Barcode sequence
|
376
|
+
_
|
377
|
+
X0 Number of best hits
|
378
|
+
X1 Number of suboptimal hits found by BWA
|
379
|
+
XN Number of ambiguous bases in the reference
|
380
|
+
XM Number of mismatches in the alignment
|
381
|
+
XO Number of gap opens
|
382
|
+
XG Number of gap extensions
|
383
|
+
XT Type: Unique/Repeat/N/Mate-sw
|
384
|
+
XA Alternative hits; format: (chr,pos,CIGAR,NM;)*
|
385
|
+
_
|
386
|
+
XS Suboptimal alignment score
|
387
|
+
XF Support from forward/reverse alignment
|
388
|
+
XE Number of supporting seeds
|
389
|
+
.TE
|
390
|
+
|
391
|
+
.PP
|
392
|
+
Note that XO and XG are generated by BWT search while the CIGAR string
|
393
|
+
by Smith-Waterman alignment. These two tags may be inconsistent with the
|
394
|
+
CIGAR string. This is not a bug.
|
395
|
+
|
396
|
+
.SH NOTES ON SHORT-READ ALIGNMENT
|
397
|
+
.SS Alignment Accuracy
|
398
|
+
.PP
|
399
|
+
When seeding is disabled, BWA guarantees to find an alignment
|
400
|
+
containing maximum
|
401
|
+
.I maxDiff
|
402
|
+
differences including
|
403
|
+
.I maxGapO
|
404
|
+
gap opens which do not occur within
|
405
|
+
.I nIndelEnd
|
406
|
+
bp towards either end of the query. Longer gaps may be found if
|
407
|
+
.I maxGapE
|
408
|
+
is positive, but it is not guaranteed to find all hits. When seeding is
|
409
|
+
enabled, BWA further requires that the first
|
410
|
+
.I seedLen
|
411
|
+
subsequence contains no more than
|
412
|
+
.I maxSeedDiff
|
413
|
+
differences.
|
414
|
+
.PP
|
415
|
+
When gapped alignment is disabled, BWA is expected to generate the same
|
416
|
+
alignment as Eland, the Illumina alignment program. However, as BWA
|
417
|
+
changes `N' in the database sequence to random nucleotides, hits to these
|
418
|
+
random sequences will also be counted. As a consequence, BWA may mark a
|
419
|
+
unique hit as a repeat, if the random sequences happen to be identical
|
420
|
+
to the sequences which should be unique in the database. This random
|
421
|
+
behaviour will be avoided in future releases.
|
422
|
+
.PP
|
423
|
+
By default, if the best hit is not so repetitive (controlled by -R), BWA
|
424
|
+
also finds all hits containing one more mismatch; otherwise, BWA finds all
|
425
|
+
equally best hits only. Base quality is NOT considered in evaluating
|
426
|
+
hits. In paired-end alignment, BWA pairs all hits it found. It further
|
427
|
+
performs Smith-Waterman alignment for unmapped reads with mates mapped
|
428
|
+
to rescue mapped mates, and for high-quality anomalous pairs to fix
|
429
|
+
potential alignment errors.
|
430
|
+
|
431
|
+
.SS Estimating Insert Size Distribution
|
432
|
+
.PP
|
433
|
+
BWA estimates the insert size distribution per 256*1024 read pairs. It
|
434
|
+
first collects pairs of reads with both ends mapped with a single-end
|
435
|
+
quality 20 or higher and then calculates median (Q2), lower and higher
|
436
|
+
quartile (Q1 and Q3). It estimates the mean and the variance of the
|
437
|
+
insert size distribution from pairs whose insert sizes are within
|
438
|
+
interval [Q1-2(Q3-Q1), Q3+2(Q3-Q1)]. The maximum distance x for a pair
|
439
|
+
considered to be properly paired (SAM flag 0x2) is calculated by solving
|
440
|
+
equation Phi((x-mu)/sigma)=x/L*p0, where mu is the mean, sigma is the
|
441
|
+
standard error of the insert size distribution, L is the length of the
|
442
|
+
genome, p0 is prior of anomalous pair and Phi() is the standard
|
443
|
+
cumulative distribution function. For mapping Illumina short-insert
|
444
|
+
reads to the human genome, x is about 6-7 sigma away from the
|
445
|
+
mean. Quartiles, mean, variance and x will be printed to the standard
|
446
|
+
error output.
|
447
|
+
|
448
|
+
.SS Memory Requirement
|
449
|
+
.PP
|
450
|
+
With bwtsw algorithm, 2.5GB memory is required for indexing the complete
|
451
|
+
human genome sequences. For short reads, the
|
452
|
+
.B `aln'
|
453
|
+
command uses ~2.3GB memory and the
|
454
|
+
.B `sampe'
|
455
|
+
command uses ~3.5GB.
|
456
|
+
|
457
|
+
.SS Speed
|
458
|
+
.PP
|
459
|
+
Indexing the human genome sequences takes 3 hours with bwtsw
|
460
|
+
algorithm. Indexing smaller genomes with IS or divsufsort algorithms is
|
461
|
+
several times faster, but requires more memory.
|
462
|
+
.PP
|
463
|
+
Speed of alignment is largely determined by the error rate of the query
|
464
|
+
sequences (r). Firstly, BWA runs much faster for near perfect hits than
|
465
|
+
for hits with many differences, and it stops searching for a hit with
|
466
|
+
l+2 differences if an l-difference hit is found. This means BWA will be
|
467
|
+
very slow if r is high because in this case BWA has to visit hits with
|
468
|
+
many differences and looking for these hits is expensive. Secondly, the
|
469
|
+
alignment algorithm behind makes the speed sensitive to [k log(N)/m],
|
470
|
+
where k is the maximum allowed differences, N the size of database and m
|
471
|
+
the length of a query. In practice, we choose k w.r.t. r and therefore r
|
472
|
+
is the leading factor. I would not recommend to use BWA on data with
|
473
|
+
r>0.02.
|
474
|
+
.PP
|
475
|
+
Pairing is slower for shorter reads. This is mainly because shorter
|
476
|
+
reads have more spurious hits and converting SA coordinates to
|
477
|
+
chromosomal coordinates is very costly.
|
478
|
+
.PP
|
479
|
+
In a practical experiment, BWA is able to map 2 million 32bp reads to a
|
480
|
+
bacterial genome in several minutes, map the same amount of reads to
|
481
|
+
human X chromosome in 8-15 minutes and to the human genome in 15-25
|
482
|
+
minutes. This result implies that the speed of BWA is insensitive to the
|
483
|
+
size of database and therefore BWA is more efficient when the database
|
484
|
+
is sufficiently large. On smaller genomes, hash based algorithms are
|
485
|
+
usually much faster.
|
486
|
+
|
487
|
+
.SH NOTES ON LONG-READ ALIGNMENT
|
488
|
+
.PP
|
489
|
+
Command
|
490
|
+
.B `bwasw'
|
491
|
+
is designed for long-read alignment. The algorithm behind, BWA-SW, is
|
492
|
+
similar to BWT-SW, but does not guarantee to find all local hits due to
|
493
|
+
the heuristic acceleration. It tends to be faster and more accurate if
|
494
|
+
the resultant alignment is supported by more seeds, and therefore
|
495
|
+
BWA-SW usually performs better on long queries than on short ones.
|
496
|
+
|
497
|
+
On 350-1000bp reads, BWA-SW is several to tens of times faster than the
|
498
|
+
existing programs. Its accuracy is comparable to SSAHA2, more accurate
|
499
|
+
than BLAT. Like BLAT, BWA-SW also finds chimera which may pose a
|
500
|
+
challenge to SSAHA2. On 10-100kbp queries where chimera detection is
|
501
|
+
important, BWA-SW is over 10X faster than BLAT while being more
|
502
|
+
sensitive.
|
503
|
+
|
504
|
+
BWA-SW can also be used to align ~100bp reads, but it is slower than
|
505
|
+
the short-read algorithm. Its sensitivity and accuracy are lower than
|
506
|
+
SSAHA2 especially when the sequencing error rate is above 2%. This is
|
507
|
+
the trade-off of the 30X speed up in comparison to SSAHA2's -454 mode.
|
508
|
+
|
509
|
+
.SH SEE ALSO
|
510
|
+
BWA website <http://bio-bwa.sourceforge.net>, Samtools website
|
511
|
+
<http://samtools.sourceforge.net>
|
512
|
+
|
513
|
+
.SH AUTHOR
|
514
|
+
Heng Li at the Sanger Institute wrote the key source codes and
|
515
|
+
integrated the following codes for BWT construction: bwtsw
|
516
|
+
<http://i.cs.hku.hk/~ckwong3/bwtsw/>, implemented by Chi-Kwong Wong at
|
517
|
+
the University of Hong Kong and IS
|
518
|
+
<http://yuta.256.googlepages.com/sais> originally proposed by Nong Ge
|
519
|
+
<http://www.cs.sysu.edu.cn/nong/> at the Sun Yat-Sen University and
|
520
|
+
implemented by Yuta Mori.
|
521
|
+
|
522
|
+
.SH LICENSE AND CITATION
|
523
|
+
.PP
|
524
|
+
The full BWA package is distributed under GPLv3 as it uses source codes
|
525
|
+
from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS
|
526
|
+
libraries are distributed under the MIT license.
|
527
|
+
.PP
|
528
|
+
If you use the short-read alignment component, please cite the following
|
529
|
+
paper:
|
530
|
+
.PP
|
531
|
+
Li H. and Durbin R. (2009) Fast and accurate short read alignment with
|
532
|
+
Burrows-Wheeler transform. Bioinformatics, 25, 1754-60. [PMID: 19451168]
|
533
|
+
.PP
|
534
|
+
If you use the long-read component (BWA-SW), please cite:
|
535
|
+
.PP
|
536
|
+
Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
|
537
|
+
Burrows-Wheeler transform. Bioinformatics. [PMID: 20080505]
|
538
|
+
|
539
|
+
.SH HISTORY
|
540
|
+
BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
|
541
|
+
and mimics its binary file formats; BWA-SW resembles BWT-SW in several
|
542
|
+
ways. The initial idea about BWT-based alignment also came from the
|
543
|
+
group who developed BWT-SW. At the same time, BWA is different enough
|
544
|
+
from BWT-SW. The short-read alignment algorithm bears no similarity to
|
545
|
+
Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW, it
|
546
|
+
introduces heuristics that can hardly be applied to the original
|
547
|
+
algorithm. In all, BWA does not guarantee to find all local hits as what
|
548
|
+
BWT-SW is designed to do, but it is much faster than BWT-SW on both
|
549
|
+
short and long query sequences.
|
550
|
+
|
551
|
+
I started to write the first piece of codes on 24 May 2008 and got the
|
552
|
+
initial stable version on 02 June 2008. During this period, I was
|
553
|
+
acquainted that Professor Tak-Wah Lam, the first author of BWT-SW paper,
|
554
|
+
was collaborating with Beijing Genomics Institute on SOAP2, the successor
|
555
|
+
to SOAP (Short Oligonucleotide Analysis Package). SOAP2 has come out in
|
556
|
+
November 2008. According to the SourceForge download page, the third
|
557
|
+
BWT-based short read aligner, bowtie, was first released in August
|
558
|
+
2008. At the time of writing this manual, at least three more BWT-based
|
559
|
+
short-read aligners are being implemented.
|
560
|
+
|
561
|
+
The BWA-SW algorithm is a new component of BWA. It was conceived in
|
562
|
+
November 2008 and implemented ten months later.
|