bio-bwa 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/bntseq.h ADDED
@@ -0,0 +1,80 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2008 Genome Research Ltd (GRL).
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
+
28
+ #ifndef BWT_BNTSEQ_H
29
+ #define BWT_BNTSEQ_H
30
+
31
+ #include <stdint.h>
32
+ #include <zlib.h>
33
+
34
+ #ifndef BWA_UBYTE
35
+ #define BWA_UBYTE
36
+ typedef uint8_t ubyte_t;
37
+ #endif
38
+
39
+ typedef struct { // element type of bntseq_t.anns (that array holds n_seqs of these)
40
+ int64_t offset; // presumably where this sequence starts in the packed coordinate space -- TODO confirm in bntseq.c
41
+ int32_t len; // presumably the sequence length -- TODO confirm units
42
+ int32_t n_ambs; // presumably a count of ambiguous regions in this sequence -- TODO confirm
43
+ uint32_t gi; // NOTE(review): looks like a numeric sequence identifier -- confirm how it is populated
44
+ char *name, *anno; // two C strings; ownership is not visible from this header -- verify who frees them
45
+ } bntann1_t;
46
+
47
+ typedef struct { // element type of bntseq_t.ambs (that array holds n_holes of these)
48
+ int64_t offset; // presumably where this run starts in the packed coordinate space -- TODO confirm
49
+ int32_t len; // presumably the length of the run -- TODO confirm
50
+ char amb; // presumably the original ambiguous character for the run -- TODO confirm
51
+ } bntamb1_t;
52
+
53
+ typedef struct { // top-level record: owns an array of bntann1_t and an array of bntamb1_t plus a FILE handle
54
+ int64_t l_pac; // presumably the total length of the packed sequence -- TODO confirm
55
+ int32_t n_seqs; // number of elements in anns
56
+ uint32_t seed; // NOTE(review): looks like a random-number seed -- confirm where it is consumed
57
+ bntann1_t *anns; // n_seqs elements
58
+ int32_t n_holes; // number of elements in ambs
59
+ bntamb1_t *ambs; // n_holes elements
60
+ FILE *fp_pac; // NOTE(review): FILE needs <stdio.h>, but this header includes only <stdint.h> and <zlib.h> -- confirm every includer pulls in <stdio.h> first
61
+ } bntseq_t;
62
+
63
+ extern unsigned char nst_nt4_table[256];
64
+
65
+ #ifdef __cplusplus
66
+ extern "C" {
67
+ #endif
68
+
69
+ void bns_dump(const bntseq_t *bns, const char *prefix); // does not modify *bns (const); presumably writes files named from prefix -- TODO confirm in bntseq.c
70
+ bntseq_t *bns_restore(const char *prefix); // returns a bntseq_t pointer; presumably released with bns_destroy -- TODO confirm
71
+ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); // same return type as bns_restore, but takes three explicit file names
72
+ void bns_destroy(bntseq_t *bns); // presumably frees bns and what it owns -- TODO confirm whether fp_pac is closed
73
+ void bns_fasta2bntseq(gzFile fp_fa, const char *prefix); // reads from a zlib gzFile; outputs presumably named from prefix -- TODO confirm
74
+ int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq); // writes through real_seq; meaning of the int return is not visible here -- verify against callers
75
+
76
+ #ifdef __cplusplus
77
+ }
78
+ #endif
79
+
80
+ #endif
data/ext/bwa.1 ADDED
@@ -0,0 +1,562 @@
1
+ .TH bwa 1 "24 January 2011" "bwa-0.5.9" "Bioinformatics tools"
2
+ .SH NAME
3
+ .PP
4
+ bwa - Burrows-Wheeler Alignment Tool
5
+ .SH SYNOPSIS
6
+ .PP
7
+ bwa index -a bwtsw database.fasta
8
+ .PP
9
+ bwa aln database.fasta short_read.fastq > aln_sa.sai
10
+ .PP
11
+ bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam
12
+ .PP
13
+ bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam
14
+ .PP
15
+ bwa bwasw database.fasta long_read.fastq > aln.sam
16
+
17
+ .SH DESCRIPTION
18
+ .PP
19
+ BWA is a fast light-weighted tool that aligns relatively short sequences
20
+ (queries) to a sequence database (target), such as the human reference
21
+ genome. It implements two different algorithms, both based on
22
+ Burrows-Wheeler Transform (BWT). The first algorithm is designed for
23
+ short queries up to ~200bp with low error rate (<3%). It does gapped
24
+ global alignment w.r.t. queries, supports paired-end reads, and is one
25
+ of the fastest short read alignment algorithms to date while also
26
+ visiting suboptimal hits. The second algorithm, BWA-SW, is designed for
27
+ long reads with more errors. It performs heuristic Smith-Waterman-like
28
+ alignment to find high-scoring local hits (and thus chimera). On
29
+ low-error short queries, BWA-SW is slower and less accurate than the
30
+ first algorithm, but on long queries, it is better.
31
+ .PP
32
+ For both algorithms, the database file in the FASTA format must be
33
+ first indexed with the
34
+ .B `index'
35
+ command, which typically takes a few hours. The first algorithm is
36
+ implemented via the
37
+ .B `aln'
38
+ command, which finds the suffix array (SA) coordinates of good hits of
39
+ each individual read, and the
40
+ .B `samse/sampe'
41
+ command, which converts SA coordinates to chromosomal coordinate and
42
+ pairs reads (for `sampe'). The second algorithm is invoked by the
43
+ .B `bwasw'
44
+ command. It works for single-end reads only.
45
+
46
+ .SH COMMANDS AND OPTIONS
47
+ .TP
48
+ .B index
49
+ bwa index [-p prefix] [-a algoType] [-c] <in.db.fasta>
50
+
51
+ Index database sequences in the FASTA format.
52
+
53
+ .B OPTIONS:
54
+ .RS
55
+ .TP 10
56
+ .B -c
57
+ Build color-space index. The input fasta should be in nucleotide space.
58
+ .TP
59
+ .BI -p \ STR
60
+ Prefix of the output database [same as db filename]
61
+ .TP
62
+ .BI -a \ STR
63
+ Algorithm for constructing BWT index. Available options are:
64
+ .RS
65
+ .TP
66
+ .B is
67
+ IS linear-time algorithm for constructing suffix array. It requires
68
+ 5.37N memory where N is the size of the database. IS is moderately fast,
69
+ but does not work with database larger than 2GB. IS is the default
70
+ algorithm due to its simplicity. The current codes for IS algorithm are
71
+ reimplemented by Yuta Mori.
72
+ .TP
73
+ .B bwtsw
74
+ Algorithm implemented in BWT-SW. This method works with the whole human
75
+ genome, but it does not work with database smaller than 10MB and it is
76
+ usually slower than IS.
77
+ .RE
78
+ .RE
79
+
80
+ .TP
81
+ .B aln
82
+ bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
83
+ nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M misMsc]
84
+ [-O gapOsc] [-E gapEsc] [-q trimQual] <in.db.fasta> <in.query.fq> >
85
+ <out.sai>
86
+
87
+ Find the SA coordinates of the input reads. Maximum
88
+ .I maxSeedDiff
89
+ differences are allowed in the first
90
+ .I seedLen
91
+ subsequence and maximum
92
+ .I maxDiff
93
+ differences are allowed in the whole sequence.
94
+
95
+ .B OPTIONS:
96
+ .RS
97
+ .TP 10
98
+ .BI -n \ NUM
99
+ Maximum edit distance if the value is INT, or the fraction of missing
100
+ alignments given 2% uniform base error rate if FLOAT. In the latter
101
+ case, the maximum edit distance is automatically chosen for different
102
+ read lengths. [0.04]
103
+ .TP
104
+ .BI -o \ INT
105
+ Maximum number of gap opens [1]
106
+ .TP
107
+ .BI -e \ INT
108
+ Maximum number of gap extensions, -1 for k-difference mode (disallowing
109
+ long gaps) [-1]
110
+ .TP
111
+ .BI -d \ INT
112
+ Disallow a long deletion within INT bp towards the 3'-end [16]
113
+ .TP
114
+ .BI -i \ INT
115
+ Disallow an indel within INT bp towards the ends [5]
116
+ .TP
117
+ .BI -l \ INT
118
+ Take the first INT subsequence as seed. If INT is larger than the query
119
+ sequence, seeding will be disabled. For long reads, this option is
120
+ typically ranged from 25 to 35 for `-k 2'. [inf]
121
+ .TP
122
+ .BI -k \ INT
123
+ Maximum edit distance in the seed [2]
124
+ .TP
125
+ .BI -t \ INT
126
+ Number of threads (multi-threading mode) [1]
127
+ .TP
128
+ .BI -M \ INT
129
+ Mismatch penalty. BWA will not search for suboptimal hits with a score
130
+ lower than (bestScore-misMsc). [3]
131
+ .TP
132
+ .BI -O \ INT
133
+ Gap open penalty [11]
134
+ .TP
135
+ .BI -E \ INT
136
+ Gap extension penalty [4]
137
+ .TP
138
+ .BI -R \ INT
139
+ Proceed with suboptimal alignments if there are no more than INT equally
140
+ best hits. This option only affects paired-end mapping. Increasing this
141
+ threshold helps to improve the pairing accuracy at the cost of speed,
142
+ especially for short reads (~32bp).
143
+ .TP
144
+ .B -c
145
+ Reverse query but not complement it, which is required for alignment in
146
+ the color space.
147
+ .TP
148
+ .B -N
149
+ Disable iterative search. All hits with no more than
150
+ .I maxDiff
151
+ differences will be found. This mode is much slower than the default.
152
+ .TP
153
+ .BI -q \ INT
154
+ Parameter for read trimming. BWA trims a read down to
155
+ argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l<INT where l is the original
156
+ read length. [0]
157
+ .TP
158
+ .B -I
159
+ The input is in the Illumina 1.3+ read format (quality equals ASCII-64).
160
+ .TP
161
+ .BI -B \ INT
162
+ Length of barcode starting from the 5'-end. When
163
+ .I INT
164
+ is positive, the barcode of each read will be trimmed before mapping and will
165
+ be written at the
166
+ .B BC
167
+ SAM tag. For paired-end reads, the barcode from both ends are concatenated. [0]
168
+ .TP
169
+ .B -b
170
+ Specify the input read sequence file is the BAM format. For paired-end
171
+ data, two ends in a pair must be grouped together and options
172
+ .B -1
173
+ or
174
+ .B -2
175
+ are usually applied to specify which end should be mapped. Typical
176
+ command lines for mapping pair-end data in the BAM format are:
177
+
178
+ bwa aln ref.fa -b1 reads.bam > 1.sai
179
+ bwa aln ref.fa -b2 reads.bam > 2.sai
180
+ bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam > aln.sam
181
+ .TP
182
+ .B -0
183
+ When
184
+ .B -b
185
+ is specified, only use single-end reads in mapping.
186
+ .TP
187
+ .B -1
188
+ When
189
+ .B -b
190
+ is specified, only use the first read in a read pair in mapping (skip
191
+ single-end reads and the second reads).
192
+ .TP
193
+ .B -2
194
+ When
195
+ .B -b
196
+ is specified, only use the second read in a read pair in mapping.
197
+ .B
198
+ .RE
199
+
200
+ .TP
201
+ .B samse
202
+ bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam>
203
+
204
+ Generate alignments in the SAM format given single-end reads. Repetitive
205
+ hits will be randomly chosen.
206
+
207
+ .B OPTIONS:
208
+ .RS
209
+ .TP 10
210
+ .BI -n \ INT
211
+ Maximum number of alignments to output in the XA tag for reads paired
212
+ properly. If a read has more than INT hits, the XA tag will not be
213
+ written. [3]
214
+ .TP
215
+ .BI -r \ STR
216
+ Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null]
217
+ .RE
218
+
219
+ .TP
220
+ .B sampe
221
+ bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N maxHitDis]
222
+ [-P] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam>
223
+
224
+ Generate alignments in the SAM format given paired-end reads. Repetitive
225
+ read pairs will be placed randomly.
226
+
227
+ .B OPTIONS:
228
+ .RS
229
+ .TP 8
230
+ .BI -a \ INT
231
+ Maximum insert size for a read pair to be considered being mapped
232
+ properly. Since 0.4.5, this option is only used when there are not
233
+ enough good alignment to infer the distribution of insert sizes. [500]
234
+ .TP
235
+ .BI -o \ INT
236
+ Maximum occurrences of a read for pairing. A read with more occurrences
237
+ will be treated as a single-end read. Reducing this parameter helps
238
+ faster pairing. [100000]
239
+ .TP
240
+ .B -P
241
+ Load the entire FM-index into memory to reduce disk operations
242
+ (base-space reads only). With this option, at least 1.25N bytes of
243
+ memory are required, where N is the length of the genome.
244
+ .TP
245
+ .BI -n \ INT
246
+ Maximum number of alignments to output in the XA tag for reads paired
247
+ properly. If a read has more than INT hits, the XA tag will not be
248
+ written. [3]
249
+ .TP
250
+ .BI -N \ INT
251
+ Maximum number of alignments to output in the XA tag for discordant
252
+ read pairs (excluding singletons). If a read has more than INT hits, the
253
+ XA tag will not be written. [10]
254
+ .TP
255
+ .BI -r \ STR
256
+ Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null]
257
+ .RE
258
+
259
+ .TP
260
+ .B bwasw
261
+ bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t
262
+ nThreads] [-w bandWidth] [-T thres] [-s hspIntv] [-z zBest] [-N
263
+ nHspRev] [-c thresCoef] <in.db.fasta> <in.fq>
264
+
265
+ Align query sequences in the <in.fq> file.
266
+
267
+ .B OPTIONS:
268
+ .RS
269
+ .TP 10
270
+ .BI -a \ INT
271
+ Score of a match [1]
272
+ .TP
273
+ .BI -b \ INT
274
+ Mismatch penalty [3]
275
+ .TP
276
+ .BI -q \ INT
277
+ Gap open penalty [5]
278
+ .TP
279
+ .BI -r \ INT
280
+ Gap extension penalty. The penalty for a contiguous gap of size k is
281
+ q+k*r. [2]
282
+ .TP
283
+ .BI -t \ INT
284
+ Number of threads in the multi-threading mode [1]
285
+ .TP
286
+ .BI -w \ INT
287
+ Band width in the banded alignment [33]
288
+ .TP
289
+ .BI -T \ INT
290
+ Minimum score threshold divided by a [37]
291
+ .TP
292
+ .BI -c \ FLOAT
293
+ Coefficient for threshold adjustment according to query length. Given an
294
+ l-long query, the threshold for a hit to be retained is
295
+ a*max{T,c*log(l)}. [5.5]
296
+ .TP
297
+ .BI -z \ INT
298
+ Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]
299
+ .TP
300
+ .BI -s \ INT
301
+ Maximum SA interval size for initiating a seed. Higher -s increases
302
+ accuracy at the cost of speed. [3]
303
+ .TP
304
+ .BI -N \ INT
305
+ Minimum number of seeds supporting the resultant alignment to skip
306
+ reverse alignment. [5]
307
+ .RE
308
+
309
+ .SH SAM ALIGNMENT FORMAT
310
+ .PP
311
+ The output of the
312
+ .B `aln'
313
+ command is binary and designed for BWA use only. BWA outputs the final
314
+ alignment in the SAM (Sequence Alignment/Map) format. Each line consists
315
+ of:
316
+
317
+ .TS
318
+ center box;
319
+ cb | cb | cb
320
+ n | l | l .
321
+ Col Field Description
322
+ _
323
+ 1 QNAME Query (pair) NAME
324
+ 2 FLAG bitwise FLAG
325
+ 3 RNAME Reference sequence NAME
326
+ 4 POS 1-based leftmost POSition/coordinate of clipped sequence
327
+ 5 MAPQ MAPping Quality (Phred-scaled)
328
+ 6 CIGAR extended CIGAR string
329
+ 7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME)
330
+ 8 MPOS 1-based Mate POSistion
331
+ 9 ISIZE Inferred insert SIZE
332
+ 10 SEQ query SEQuence on the same strand as the reference
333
+ 11 QUAL query QUALity (ASCII-33 gives the Phred base quality)
334
+ 12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE
335
+ .TE
336
+
337
+ .PP
338
+ Each bit in the FLAG field is defined as:
339
+
340
+ .TS
341
+ center box;
342
+ cb | cb | cb
343
+ c | l | l .
344
+ Chr Flag Description
345
+ _
346
+ p 0x0001 the read is paired in sequencing
347
+ P 0x0002 the read is mapped in a proper pair
348
+ u 0x0004 the query sequence itself is unmapped
349
+ U 0x0008 the mate is unmapped
350
+ r 0x0010 strand of the query (1 for reverse)
351
+ R 0x0020 strand of the mate
352
+ 1 0x0040 the read is the first read in a pair
353
+ 2 0x0080 the read is the second read in a pair
354
+ s 0x0100 the alignment is not primary
355
+ f 0x0200 QC failure
356
+ d 0x0400 optical or PCR duplicate
357
+ .TE
358
+
359
+ .PP
360
+ Please check <http://samtools.sourceforge.net> for the format
361
+ specification and the tools for post-processing the alignment.
362
+
363
+ BWA generates the following optional fields. Tags starting with `X' are
364
+ specific to BWA.
365
+
366
+ .TS
367
+ center box;
368
+ cb | cb
369
+ cB | l .
370
+ Tag Meaning
371
+ _
372
+ NM Edit distance
373
+ MD Mismatching positions/bases
374
+ AS Alignment score
375
+ BC Barcode sequence
376
+ _
377
+ X0 Number of best hits
378
+ X1 Number of suboptimal hits found by BWA
379
+ XN Number of ambiguous bases in the reference
380
+ XM Number of mismatches in the alignment
381
+ XO Number of gap opens
382
+ XG Number of gap extensions
383
+ XT Type: Unique/Repeat/N/Mate-sw
384
+ XA Alternative hits; format: (chr,pos,CIGAR,NM;)*
385
+ _
386
+ XS Suboptimal alignment score
387
+ XF Support from forward/reverse alignment
388
+ XE Number of supporting seeds
389
+ .TE
390
+
391
+ .PP
392
+ Note that XO and XG are generated by BWT search while the CIGAR string
393
+ by Smith-Waterman alignment. These two tags may be inconsistent with the
394
+ CIGAR string. This is not a bug.
395
+
396
+ .SH NOTES ON SHORT-READ ALIGNMENT
397
+ .SS Alignment Accuracy
398
+ .PP
399
+ When seeding is disabled, BWA guarantees to find an alignment
400
+ containing maximum
401
+ .I maxDiff
402
+ differences including
403
+ .I maxGapO
404
+ gap opens which do not occur within
405
+ .I nIndelEnd
406
+ bp towards either end of the query. Longer gaps may be found if
407
+ .I maxGapE
408
+ is positive, but it is not guaranteed to find all hits. When seeding is
409
+ enabled, BWA further requires that the first
410
+ .I seedLen
411
+ subsequence contains no more than
412
+ .I maxSeedDiff
413
+ differences.
414
+ .PP
415
+ When gapped alignment is disabled, BWA is expected to generate the same
416
+ alignment as Eland, the Illumina alignment program. However, as BWA
417
+ changes `N' in the database sequence to random nucleotides, hits to these
418
+ random sequences will also be counted. As a consequence, BWA may mark a
419
+ unique hit as a repeat, if the random sequences happen to be identical
420
+ to the sequences which should be unique in the database. This random
421
+ behaviour will be avoided in future releases.
422
+ .PP
423
+ By default, if the best hit is not so repetitive (controlled by -R), BWA
424
+ also finds all hits containing one more mismatch; otherwise, BWA finds all
425
+ equally best hits only. Base quality is NOT considered in evaluating
426
+ hits. In paired-end alignment, BWA pairs all hits it found. It further
427
+ performs Smith-Waterman alignment for unmapped reads with mates mapped
428
+ to rescue mapped mates, and for high-quality anomalous pairs to fix
429
+ potential alignment errors.
430
+
431
+ .SS Estimating Insert Size Distribution
432
+ .PP
433
+ BWA estimates the insert size distribution per 256*1024 read pairs. It
434
+ first collects pairs of reads with both ends mapped with a single-end
435
+ quality 20 or higher and then calculates median (Q2), lower and higher
436
+ quartile (Q1 and Q3). It estimates the mean and the variance of the
437
+ insert size distribution from pairs whose insert sizes are within
438
+ interval [Q1-2(Q3-Q1), Q3+2(Q3-Q1)]. The maximum distance x for a pair
439
+ considered to be properly paired (SAM flag 0x2) is calculated by solving
440
+ equation Phi((x-mu)/sigma)=x/L*p0, where mu is the mean, sigma is the
441
+ standard error of the insert size distribution, L is the length of the
442
+ genome, p0 is prior of anomalous pair and Phi() is the standard
443
+ cumulative distribution function. For mapping Illumina short-insert
444
+ reads to the human genome, x is about 6-7 sigma away from the
445
+ mean. Quartiles, mean, variance and x will be printed to the standard
446
+ error output.
447
+
448
+ .SS Memory Requirement
449
+ .PP
450
+ With bwtsw algorithm, 2.5GB memory is required for indexing the complete
451
+ human genome sequences. For short reads, the
452
+ .B `aln'
453
+ command uses ~2.3GB memory and the
454
+ .B `sampe'
455
+ command uses ~3.5GB.
456
+
457
+ .SS Speed
458
+ .PP
459
+ Indexing the human genome sequences takes 3 hours with bwtsw
460
+ algorithm. Indexing smaller genomes with IS or divsufsort algorithms is
461
+ several times faster, but requires more memory.
462
+ .PP
463
+ Speed of alignment is largely determined by the error rate of the query
464
+ sequences (r). Firstly, BWA runs much faster for near perfect hits than
465
+ for hits with many differences, and it stops searching for a hit with
466
+ l+2 differences if a l-difference hit is found. This means BWA will be
467
+ very slow if r is high because in this case BWA has to visit hits with
468
+ many differences and looking for these hits is expensive. Secondly, the
469
+ alignment algorithm behind makes the speed sensitive to [k log(N)/m],
470
+ where k is the maximum allowed differences, N the size of database and m
471
+ the length of a query. In practice, we choose k w.r.t. r and therefore r
472
+ is the leading factor. I would not recommend to use BWA on data with
473
+ r>0.02.
474
+ .PP
475
+ Pairing is slower for shorter reads. This is mainly because shorter
476
+ reads have more spurious hits and converting SA coordinates to
477
+ chromosomal coordinates is very costly.
478
+ .PP
479
+ In a practical experiment, BWA is able to map 2 million 32bp reads to a
480
+ bacterial genome in several minutes, map the same amount of reads to
481
+ human X chromosome in 8-15 minutes and to the human genome in 15-25
482
+ minutes. This result implies that the speed of BWA is insensitive to the
483
+ size of database and therefore BWA is more efficient when the database
484
+ is sufficiently large. On smaller genomes, hash based algorithms are
485
+ usually much faster.
486
+
487
+ .SH NOTES ON LONG-READ ALIGNMENT
488
+ .PP
489
+ Command
490
+ .B `bwasw'
491
+ is designed for long-read alignment. The algorithm behind, BWA-SW, is
492
+ similar to BWT-SW, but does not guarantee to find all local hits due to
493
+ the heuristic acceleration. It tends to be faster and more accurate if
494
+ the resultant alignment is supported by more seeds, and therefore
495
+ BWA-SW usually performs better on long queries than on short ones.
496
+
497
+ On 350-1000bp reads, BWA-SW is several to tens of times faster than the
498
+ existing programs. Its accuracy is comparable to SSAHA2, more accurate
499
+ than BLAT. Like BLAT, BWA-SW also finds chimera which may pose a
500
+ challenge to SSAHA2. On 10-100kbp queries where chimera detection is
501
+ important, BWA-SW is over 10X faster than BLAT while being more
502
+ sensitive.
503
+
504
+ BWA-SW can also be used to align ~100bp reads, but it is slower than
505
+ the short-read algorithm. Its sensitivity and accuracy are lower than
506
+ SSAHA2 especially when the sequencing error rate is above 2%. This is
507
+ the trade-off of the 30X speed up in comparison to SSAHA2's -454 mode.
508
+
509
+ .SH SEE ALSO
510
+ BWA website <http://bio-bwa.sourceforge.net>, Samtools website
511
+ <http://samtools.sourceforge.net>
512
+
513
+ .SH AUTHOR
514
+ Heng Li at the Sanger Institute wrote the key source codes and
515
+ integrated the following codes for BWT construction: bwtsw
516
+ <http://i.cs.hku.hk/~ckwong3/bwtsw/>, implemented by Chi-Kwong Wong at
517
+ the University of Hong Kong and IS
518
+ <http://yuta.256.googlepages.com/sais> originally proposed by Nong Ge
519
+ <http://www.cs.sysu.edu.cn/nong/> at the Sun Yat-Sen University and
520
+ implemented by Yuta Mori.
521
+
522
+ .SH LICENSE AND CITATION
523
+ .PP
524
+ The full BWA package is distributed under GPLv3 as it uses source codes
525
+ from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS
526
+ libraries are distributed under the MIT license.
527
+ .PP
528
+ If you use the short-read alignment component, please cite the following
529
+ paper:
530
+ .PP
531
+ Li H. and Durbin R. (2009) Fast and accurate short read alignment with
532
+ Burrows-Wheeler transform. Bioinformatics, 25, 1754-60. [PMID: 19451168]
533
+ .PP
534
+ If you use the long-read component (BWA-SW), please cite:
535
+ .PP
536
+ Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
537
+ Burrows-Wheeler transform. Bioinformatics. [PMID: 20080505]
538
+
539
+ .SH HISTORY
540
+ BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
541
+ and mimics its binary file formats; BWA-SW resembles BWT-SW in several
542
+ ways. The initial idea about BWT-based alignment also came from the
543
+ group who developed BWT-SW. At the same time, BWA is different enough
544
+ from BWT-SW. The short-read alignment algorithm bears no similarity to
545
+ Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW, it
546
+ introduces heuristics that can hardly be applied to the original
547
+ algorithm. In all, BWA does not guarantee to find all local hits as what
548
+ BWT-SW is designed to do, but it is much faster than BWT-SW on both
549
+ short and long query sequences.
550
+
551
+ I started to write the first piece of codes on 24 May 2008 and got the
552
+ initial stable version on 02 June 2008. During this period, I was
553
+ informed that Professor Tak-Wah Lam, the first author of the BWT-SW paper,
554
+ was collaborating with Beijing Genomics Institute on SOAP2, the successor
555
+ to SOAP (Short Oligonucleotide Analysis Package). SOAP2 has come out in
556
+ November 2008. According to the SourceForge download page, the third
557
+ BWT-based short read aligner, bowtie, was first released in August
558
+ 2008. At the time of writing this manual, at least three more BWT-based
559
+ short-read aligners are being implemented.
560
+
561
+ The BWA-SW algorithm is a new component of BWA. It was conceived in
562
+ November 2008 and implemented ten months later.