bio-bwa 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
BWTConstruct.h BWT-Index Construction
|
4
|
+
|
5
|
+
This module constructs BWT and auxiliary data structures.
|
6
|
+
|
7
|
+
Copyright (C) 2004, Wong Chi Kwong.
|
8
|
+
|
9
|
+
This program is free software; you can redistribute it and/or
|
10
|
+
modify it under the terms of the GNU General Public License
|
11
|
+
as published by the Free Software Foundation; either version 2
|
12
|
+
of the License, or (at your option) any later version.
|
13
|
+
|
14
|
+
This program is distributed in the hope that it will be useful,
|
15
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
GNU General Public License for more details.
|
18
|
+
|
19
|
+
You should have received a copy of the GNU General Public License
|
20
|
+
along with this program; if not, write to the Free Software
|
21
|
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
22
|
+
|
23
|
+
*/
|
24
|
+
|
25
|
+
#ifndef BWT_GEN_H
|
26
|
+
#define BWT_GEN_H
|
27
|
+
|
28
|
+
/* Text packing: 4 symbols at 2 bits each, i.e. 16 per 32-bit word and 4 per byte. */
#define ALPHABET_SIZE                   4
#define BIT_PER_CHAR                    2
#define CHAR_PER_WORD                   16
#define CHAR_PER_BYTE                   4

/* Word and byte geometry used by the packing constants above. */
#define BITS_IN_WORD                    32
#define BITS_IN_BYTE                    8
#define BYTES_IN_WORD                   4

/* A word with every bit set, and the size (in words) of the DNA occurrence-count table. */
#define ALL_ONE_MASK                    0xFFFFFFFF
#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD  65536

/* Occurrence values: 16 bits each, two per word, with minor/major sampling intervals. */
#define BITS_PER_OCC_VALUE              16
#define OCC_VALUE_PER_WORD              2
#define OCC_INTERVAL                    256
#define OCC_INTERVAL_MAJOR              65536

/* Boolean constants. */
#define TRUE                            1
#define FALSE                           0

/* NOTE(review): by its name, the item count at which BWTInc sorting uses insertion sort - confirm at the use site. */
#define BWTINC_INSERT_SORT_NUM_ITEM     7
|
49
|
+
|
50
|
+
/*
 * Small arithmetic helpers. They are macros, so arguments may be evaluated
 * more than once: do not pass expressions with side effects.
 */

/* Average of two integers without overflowing the sum: shared bits plus half of the differing bits. */
#define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 )
#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) )
#define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) )

/* Median of three values. Every argument is parenthesized so that operands
   such as `x | y` are compared as a whole. */
#define med3(a, b, c) ( (a) < (b) ? ((b) < (c) ? (b) : (a) < (c) ? (c) : (a)) \
                                  : ((b) > (c) ? (b) : (a) > (c) ? (c) : (a)) )

/* Exchange a and b using t as scratch. Wrapped in do/while(0) so that
   `if (cond) swap(a, b, t);` guards the whole exchange. Use as a statement
   followed by a semicolon. */
#define swap(a, b, t) do { (t) = (a); (a) = (b); (b) = (t); } while (0)

/* Clear the top / bottom `offset` bits of value by shifting them out and back. */
#define truncateLeft(value, offset) ( (value) << (offset) >> (offset) )
#define truncateRight(value, offset) ( (value) >> (offset) << (offset) )

/* True when sum has no bits set outside 0x01010100 (everything under the
   mask 0xfefefeff is zero). */
#define DNA_OCC_SUM_EXCEPTION(sum) (((sum) & 0xfefefeff) == 0)
|
58
|
+
|
59
|
+
/* A pair of suffix-array indices. By the field names, the first and last
   index of a range - whether the end is inclusive is not visible here. */
typedef struct SaIndexRange {
    unsigned startSaIndex;
    unsigned endSaIndex;
} SaIndexRange;
|
63
|
+
|
64
|
+
typedef struct BWT {
|
65
|
+
unsigned int textLength; // length of the text
|
66
|
+
unsigned int saInterval; // interval between two SA values stored explicitly
|
67
|
+
unsigned int inverseSaInterval; // interval between two inverse SA stored explicitly
|
68
|
+
unsigned int inverseSa0; // SA-1[0]
|
69
|
+
unsigned int *cumulativeFreq; // cumulative frequency
|
70
|
+
unsigned int *bwtCode; // BWT code
|
71
|
+
unsigned int *occValue; // Occurrence values stored explicitly
|
72
|
+
unsigned int *occValueMajor; // Occurrence values stored explicitly
|
73
|
+
unsigned int *saValue; // SA values stored explicitly
|
74
|
+
unsigned int *inverseSa; // Inverse SA stored explicitly
|
75
|
+
SaIndexRange *saIndexRange; // SA index range
|
76
|
+
int saIndexRangeNumOfChar; // Number of characters indexed in SA index range
|
77
|
+
unsigned int *saValueOnBoundary; // Pre-calculated frequently referred data
|
78
|
+
unsigned int *decodeTable; // For decoding BWT by table lookup
|
79
|
+
unsigned int decodeTableGenerated; // == TRUE if decode table is generated on load and will be freed
|
80
|
+
unsigned int bwtSizeInWord; // Temporary variable to hold the memory allocated
|
81
|
+
unsigned int occSizeInWord; // Temporary variable to hold the memory allocated
|
82
|
+
unsigned int occMajorSizeInWord; // Temporary variable to hold the memory allocated
|
83
|
+
unsigned int saValueSize; // Temporary variable to hold the memory allocated
|
84
|
+
unsigned int inverseSaSize; // Temporary variable to hold the memory allocated
|
85
|
+
unsigned int saIndexRangeSize; // Temporary variable to hold the memory allocated
|
86
|
+
} BWT;
|
87
|
+
|
88
|
+
typedef struct BWTInc {
|
89
|
+
BWT *bwt;
|
90
|
+
unsigned int numberOfIterationDone;
|
91
|
+
unsigned int *cumulativeCountInCurrentBuild;
|
92
|
+
unsigned int availableWord;
|
93
|
+
unsigned int targetTextLength;
|
94
|
+
float targetNBit;
|
95
|
+
unsigned int buildSize;
|
96
|
+
unsigned int initialMaxBuildSize;
|
97
|
+
unsigned int incMaxBuildSize;
|
98
|
+
unsigned int firstCharInLastIteration;
|
99
|
+
unsigned int *workingMemory;
|
100
|
+
unsigned int *packedText;
|
101
|
+
unsigned char *textBuffer;
|
102
|
+
unsigned int *packedShift;
|
103
|
+
} BWTInc;
|
104
|
+
|
105
|
+
#endif
|
data/ext/bwt_lite.c
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include "bwt_lite.h"
|
5
|
+
|
6
|
+
int is_sa(const uint8_t *T, uint32_t *SA, int n);
|
7
|
+
int is_bwt(uint8_t *T, int n);
|
8
|
+
|
9
|
+
bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq)
|
10
|
+
{
|
11
|
+
bwtl_t *b;
|
12
|
+
int i;
|
13
|
+
b = (bwtl_t*)calloc(1, sizeof(bwtl_t));
|
14
|
+
b->seq_len = len;
|
15
|
+
|
16
|
+
{ // calculate b->bwt
|
17
|
+
uint8_t *s;
|
18
|
+
b->sa = (uint32_t*)calloc(len + 1, 4);
|
19
|
+
is_sa(seq, b->sa, len);
|
20
|
+
s = (uint8_t*)calloc(len + 1, 1);
|
21
|
+
for (i = 0; i <= len; ++i) {
|
22
|
+
if (b->sa[i] == 0) b->primary = i;
|
23
|
+
else s[i] = seq[b->sa[i] - 1];
|
24
|
+
}
|
25
|
+
for (i = b->primary; i < len; ++i) s[i] = s[i + 1];
|
26
|
+
b->bwt_size = (len + 15) / 16;
|
27
|
+
b->bwt = (uint32_t*)calloc(b->bwt_size, 4);
|
28
|
+
for (i = 0; i < len; ++i)
|
29
|
+
b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 1);
|
30
|
+
free(s);
|
31
|
+
}
|
32
|
+
{ // calculate b->occ
|
33
|
+
uint32_t c[4];
|
34
|
+
b->n_occ = (len + 15) / 16 * 4;
|
35
|
+
b->occ = (uint32_t*)calloc(b->n_occ, 4);
|
36
|
+
memset(c, 0, 16);
|
37
|
+
for (i = 0; i < len; ++i) {
|
38
|
+
if (i % 16 == 0)
|
39
|
+
memcpy(b->occ + (i/16) * 4, c, 16);
|
40
|
+
++c[bwtl_B0(b, i)];
|
41
|
+
}
|
42
|
+
memcpy(b->L2+1, c, 16);
|
43
|
+
for (i = 2; i < 5; ++i) b->L2[i] += b->L2[i-1];
|
44
|
+
}
|
45
|
+
{ // generate cnt_table
|
46
|
+
for (i = 0; i != 256; ++i) {
|
47
|
+
u_int32_t j, x = 0;
|
48
|
+
for (j = 0; j != 4; ++j)
|
49
|
+
x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
|
50
|
+
b->cnt_table[i] = x;
|
51
|
+
}
|
52
|
+
}
|
53
|
+
return b;
|
54
|
+
}
|
55
|
+
inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
|
56
|
+
{
|
57
|
+
uint32_t n, b;
|
58
|
+
if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
|
59
|
+
if (k == (uint32_t)(-1)) return 0;
|
60
|
+
if (k >= bwt->primary) --k; // because $ is not in bwt
|
61
|
+
n = bwt->occ[k/16<<2|c];
|
62
|
+
b = bwt->bwt[k/16] & ~((1U<<((15-(k&15))<<1)) - 1);
|
63
|
+
n += (bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff]
|
64
|
+
+ bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]) >> (c<<3) & 0xff;
|
65
|
+
if (c == 0) n -= 15 - (k&15); // corrected for the masked bits
|
66
|
+
return n;
|
67
|
+
}
|
68
|
+
inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
|
69
|
+
{
|
70
|
+
uint32_t x, b;
|
71
|
+
if (k == (uint32_t)(-1)) {
|
72
|
+
memset(cnt, 0, 16);
|
73
|
+
return;
|
74
|
+
}
|
75
|
+
if (k >= bwt->primary) --k; // because $ is not in bwt
|
76
|
+
memcpy(cnt, bwt->occ + (k>>4<<2), 16);
|
77
|
+
b = bwt->bwt[k>>4] & ~((1U<<((~k&15)<<1)) - 1);
|
78
|
+
x = bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff]
|
79
|
+
+ bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24];
|
80
|
+
x -= 15 - (k&15);
|
81
|
+
cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
|
82
|
+
}
|
83
|
+
/*
 * Compute the four symbol counts at two rows: cntk receives the counts for
 * row k and cntl those for row l (two independent bwtl_occ4() calls).
 *
 * Defined without `inline`: under C99 rules a plain `inline` definition
 * does not emit an external symbol, so callers in other translation units
 * would fail to link.
 */
void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4])
{
    bwtl_occ4(bwt, k, cntk);
    bwtl_occ4(bwt, l, cntl);
}
|
88
|
+
/* Release an index and the three arrays it owns. A null pointer is ignored. */
void bwtl_destroy(bwtl_t *bwt)
{
    if (!bwt)
        return;

    free(bwt->occ);
    free(bwt->bwt);
    free(bwt->sa);
    free(bwt);
}
|
data/ext/bwt_lite.h
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#ifndef BWT_LITE_H_
|
2
|
+
#define BWT_LITE_H_
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
/* Lightweight in-memory BWT index for one sequence. */
typedef struct {
    uint32_t seq_len;        /* number of symbols in the indexed sequence */
    uint32_t bwt_size;       /* number of 32-bit words in bwt */
    uint32_t n_occ;          /* number of entries in occ */
    uint32_t primary;        /* suffix-array row whose value is 0 */
    uint32_t *bwt;           /* packed BWT, 16 two-bit symbols per word */
    uint32_t *occ;           /* occurrence checkpoints, 4 counters per word of bwt */
    uint32_t *sa;            /* suffix array */
    uint32_t L2[5];          /* cumulative symbol counts */
    uint32_t cnt_table[256]; /* per-byte symbol counts, one packed byte per symbol */
} bwtl_t;
|
12
|
+
|
13
|
+
/* Symbol k of the packed BWT: word k/16, with symbol 0 of each word in the two highest bits. */
#define bwtl_B0(b, k) ((((b)->bwt[(k) >> 4]) >> ((~(k) & 0xf) << 1)) & 3)
|
14
|
+
|
15
|
+
#ifdef __cplusplus
|
16
|
+
extern "C" {
|
17
|
+
#endif
|
18
|
+
|
19
|
+
/* Build an index for seq[0..len-1]; release the result with bwtl_destroy(). */
bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq);

/*
 * Occurrence queries. They are declared as ordinary external functions:
 * an `inline` declaration with external linkage requires a definition in
 * every translation unit that includes this header, which only the
 * implementation file provides.
 */

/* Number of occurrences of symbol c in BWT rows 0..k. */
uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
/* Occurrences of all four symbols in BWT rows 0..k, written to cnt[0..3]. */
void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
/* bwtl_occ4() for two rows at once: cntk for row k, cntl for row l. */
void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);

/* Free an index returned by bwtl_seq2bwtl(). */
void bwtl_destroy(bwtl_t *bwt);
|
24
|
+
|
25
|
+
#ifdef __cplusplus
|
26
|
+
}
|
27
|
+
#endif
|
28
|
+
|
29
|
+
#endif
|
data/ext/bwtaln.c
ADDED
@@ -0,0 +1,345 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <unistd.h>
|
3
|
+
#include <math.h>
|
4
|
+
#include <stdlib.h>
|
5
|
+
#include <string.h>
|
6
|
+
#include <time.h>
|
7
|
+
#include <stdint.h>
|
8
|
+
#ifdef HAVE_CONFIG_H
|
9
|
+
#include "config.h"
|
10
|
+
#endif
|
11
|
+
#include "bwtaln.h"
|
12
|
+
#include "bwtgap.h"
|
13
|
+
#include "utils.h"
|
14
|
+
|
15
|
+
#ifdef HAVE_PTHREAD
|
16
|
+
#define THREAD_BLOCK_SIZE 1024
|
17
|
+
#include <pthread.h>
|
18
|
+
static pthread_mutex_t g_seq_lock = PTHREAD_MUTEX_INITIALIZER;
|
19
|
+
#endif
|
20
|
+
|
21
|
+
gap_opt_t *gap_init_opt()
|
22
|
+
{
|
23
|
+
gap_opt_t *o;
|
24
|
+
o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t));
|
25
|
+
/* IMPORTANT: s_mm*10 should be about the average base error
|
26
|
+
rate. Voilating this requirement will break pairing! */
|
27
|
+
o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4;
|
28
|
+
o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6;
|
29
|
+
o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000;
|
30
|
+
o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD;
|
31
|
+
o->seed_len = 32; o->max_seed_diff = 2;
|
32
|
+
o->fnr = 0.04;
|
33
|
+
o->n_threads = 1;
|
34
|
+
o->max_top2 = 30;
|
35
|
+
o->trim_qual = 0;
|
36
|
+
return o;
|
37
|
+
}
|
38
|
+
|
39
|
+
int bwa_cal_maxdiff(int l, double err, double thres)
|
40
|
+
{
|
41
|
+
double elambda = exp(-l * err);
|
42
|
+
double sum, y = 1.0;
|
43
|
+
int k, x = 1;
|
44
|
+
for (k = 1, sum = elambda; k < 1000; ++k) {
|
45
|
+
y *= l * err;
|
46
|
+
x *= k;
|
47
|
+
sum += elambda * y / x;
|
48
|
+
if (1.0 - sum < thres) return k;
|
49
|
+
}
|
50
|
+
return 2;
|
51
|
+
}
|
52
|
+
|
53
|
+
// width must be filled as zero
|
54
|
+
static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width)
|
55
|
+
{
|
56
|
+
bwtint_t k, l, ok, ol;
|
57
|
+
int i, bid;
|
58
|
+
bid = 0;
|
59
|
+
k = 0; l = rbwt->seq_len;
|
60
|
+
for (i = 0; i < len; ++i) {
|
61
|
+
ubyte_t c = str[i];
|
62
|
+
if (c < 4) {
|
63
|
+
bwt_2occ(rbwt, k - 1, l, c, &ok, &ol);
|
64
|
+
k = rbwt->L2[c] + ok + 1;
|
65
|
+
l = rbwt->L2[c] + ol;
|
66
|
+
}
|
67
|
+
if (k > l || c > 3) { // then restart
|
68
|
+
k = 0;
|
69
|
+
l = rbwt->seq_len;
|
70
|
+
++bid;
|
71
|
+
}
|
72
|
+
width[i].w = l - k + 1;
|
73
|
+
width[i].bid = bid;
|
74
|
+
}
|
75
|
+
width[len].w = 0;
|
76
|
+
width[len].bid = ++bid;
|
77
|
+
return bid;
|
78
|
+
}
|
79
|
+
|
80
|
+
/*
 * Align every read of a batch against the forward/reverse BWT pair and
 * store the hits in seqs[i].aln / seqs[i].n_aln.
 *
 * tid:    id of the calling worker; with more than one thread, reads are
 *         claimed in blocks of THREAD_BLOCK_SIZE through seqs[i].tid
 * bwt:    the two indices handed to bwt_cal_width() and bwt_match_gap()
 * n_seqs: number of reads in seqs
 * seqs:   the reads; name, seq, rseq and qual of each processed read are
 *         freed and set to 0 once it has been aligned
 * opt:    options; a private copy is adjusted per read (max_diff, seed_len)
 */
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
{
    gap_opt_t local_opt = *opt;
    gap_stack_t *stack;
    bwt_width_t *w[2], *seed_w[2];
    const ubyte_t *seq[2];
    int idx, longest = 0, width_cap = 0;

    // initiate priority stack, sized for the longest read of the batch
    for (idx = 0; idx != n_seqs; ++idx) {
        if (seqs[idx].len > longest) longest = seqs[idx].len;
    }
    if (opt->fnr > 0.0)
        local_opt.max_diff = bwa_cal_maxdiff(longest, BWA_AVG_ERR, opt->fnr);
    if (local_opt.max_diff < local_opt.max_gapo)
        local_opt.max_gapo = local_opt.max_diff;
    stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);

    seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
    seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
    w[0] = 0;
    w[1] = 0;

    for (idx = 0; idx != n_seqs; ++idx) {
        bwa_seq_t *read = seqs + idx;
        int use_seed;
#ifdef HAVE_PTHREAD
        if (opt->n_threads > 1) {
            int skip = 0;
            pthread_mutex_lock(&g_seq_lock);
            if (read->tid < 0) { // unassigned: claim a block for this worker
                int j;
                for (j = idx; j < n_seqs && j < idx + THREAD_BLOCK_SIZE; ++j)
                    seqs[j].tid = tid;
            } else if (read->tid != tid) { // owned by another worker
                skip = 1;
            }
            pthread_mutex_unlock(&g_seq_lock);
            if (skip) continue;
        }
#endif
        read->sa = 0;
        read->type = BWA_TYPE_NO_MATCH;
        read->c1 = read->c2 = 0;
        read->n_aln = 0;
        read->aln = 0;
        seq[0] = read->seq;
        seq[1] = read->rseq;

        // grow (and clear) the width buffers when a longer read shows up
        if (width_cap < read->len) {
            width_cap = read->len;
            w[0] = (bwt_width_t*)realloc(w[0], (width_cap + 1) * sizeof(bwt_width_t));
            w[1] = (bwt_width_t*)realloc(w[1], (width_cap + 1) * sizeof(bwt_width_t));
            memset(w[0], 0, (width_cap + 1) * sizeof(bwt_width_t));
            memset(w[1], 0, (width_cap + 1) * sizeof(bwt_width_t));
        }
        bwt_cal_width(bwt[0], read->len, seq[0], w[0]);
        bwt_cal_width(bwt[1], read->len, seq[1], w[1]);

        if (opt->fnr > 0.0)
            local_opt.max_diff = bwa_cal_maxdiff(read->len, BWA_AVG_ERR, opt->fnr);

        // seeding only applies to reads longer than the seed
        use_seed = read->len > opt->seed_len;
        if (use_seed) {
            local_opt.seed_len = opt->seed_len;
            bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (read->len - opt->seed_len), seed_w[0]);
            bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (read->len - opt->seed_len), seed_w[1]);
        } else {
            local_opt.seed_len = 0x7fffffff;
        }

        // core function
        read->aln = bwt_match_gap(bwt, read->len, seq, w, use_seed? seed_w : 0, &local_opt, &read->n_aln, stack);

        // the alignment is stored; the read's text is no longer needed
        free(read->name);
        free(read->seq);
        free(read->rseq);
        free(read->qual);
        read->name = 0;
        read->seq = read->rseq = read->qual = 0;
    }

    free(seed_w[0]);
    free(seed_w[1]);
    free(w[0]);
    free(w[1]);
    gap_destroy_stack(stack);
}
|
141
|
+
|
142
|
+
#ifdef HAVE_PTHREAD
|
143
|
+
typedef struct {
|
144
|
+
int tid;
|
145
|
+
bwt_t *bwt[2];
|
146
|
+
int n_seqs;
|
147
|
+
bwa_seq_t *seqs;
|
148
|
+
const gap_opt_t *opt;
|
149
|
+
} thread_aux_t;
|
150
|
+
|
151
|
+
static void *worker(void *data)
|
152
|
+
{
|
153
|
+
thread_aux_t *d = (thread_aux_t*)data;
|
154
|
+
bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt);
|
155
|
+
return 0;
|
156
|
+
}
|
157
|
+
#endif
|
158
|
+
|
159
|
+
bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa)
|
160
|
+
{
|
161
|
+
bwa_seqio_t *ks;
|
162
|
+
if (mode & BWA_MODE_BAM) { // open BAM
|
163
|
+
int which = 0;
|
164
|
+
if (mode & BWA_MODE_BAM_SE) which |= 4;
|
165
|
+
if (mode & BWA_MODE_BAM_READ1) which |= 1;
|
166
|
+
if (mode & BWA_MODE_BAM_READ2) which |= 2;
|
167
|
+
if (which == 0) which = 7; // then read all reads
|
168
|
+
ks = bwa_bam_open(fn_fa, which);
|
169
|
+
} else ks = bwa_seq_open(fn_fa);
|
170
|
+
return ks;
|
171
|
+
}
|
172
|
+
|
173
|
+
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
|
174
|
+
{
|
175
|
+
int i, n_seqs, tot_seqs = 0;
|
176
|
+
bwa_seq_t *seqs;
|
177
|
+
bwa_seqio_t *ks;
|
178
|
+
clock_t t;
|
179
|
+
bwt_t *bwt[2];
|
180
|
+
|
181
|
+
// initialization
|
182
|
+
ks = bwa_open_reads(opt->mode, fn_fa);
|
183
|
+
|
184
|
+
{ // load BWT
|
185
|
+
char *str = (char*)calloc(strlen(prefix) + 10, 1);
|
186
|
+
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
|
187
|
+
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
|
188
|
+
free(str);
|
189
|
+
}
|
190
|
+
|
191
|
+
// core loop
|
192
|
+
fwrite(opt, sizeof(gap_opt_t), 1, stdout);
|
193
|
+
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) {
|
194
|
+
tot_seqs += n_seqs;
|
195
|
+
t = clock();
|
196
|
+
|
197
|
+
fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... ");
|
198
|
+
|
199
|
+
#ifdef HAVE_PTHREAD
|
200
|
+
if (opt->n_threads <= 1) { // no multi-threading at all
|
201
|
+
bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
|
202
|
+
} else {
|
203
|
+
pthread_t *tid;
|
204
|
+
pthread_attr_t attr;
|
205
|
+
thread_aux_t *data;
|
206
|
+
int j;
|
207
|
+
pthread_attr_init(&attr);
|
208
|
+
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
|
209
|
+
data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
|
210
|
+
tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
|
211
|
+
for (j = 0; j < opt->n_threads; ++j) {
|
212
|
+
data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1];
|
213
|
+
data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt;
|
214
|
+
pthread_create(&tid[j], &attr, worker, data + j);
|
215
|
+
}
|
216
|
+
for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
|
217
|
+
free(data); free(tid);
|
218
|
+
}
|
219
|
+
#else
|
220
|
+
bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
|
221
|
+
#endif
|
222
|
+
|
223
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
224
|
+
|
225
|
+
t = clock();
|
226
|
+
fprintf(stderr, "[bwa_aln_core] write to the disk... ");
|
227
|
+
for (i = 0; i < n_seqs; ++i) {
|
228
|
+
bwa_seq_t *p = seqs + i;
|
229
|
+
fwrite(&p->n_aln, 4, 1, stdout);
|
230
|
+
if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout);
|
231
|
+
}
|
232
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
233
|
+
|
234
|
+
bwa_free_read_seq(n_seqs, seqs);
|
235
|
+
fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
|
236
|
+
}
|
237
|
+
|
238
|
+
// destroy
|
239
|
+
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
|
240
|
+
bwa_seq_close(ks);
|
241
|
+
}
|
242
|
+
|
243
|
+
int bwa_aln(int argc, char *argv[])
|
244
|
+
{
|
245
|
+
int c, opte = -1;
|
246
|
+
gap_opt_t *opt;
|
247
|
+
optind = 1;
|
248
|
+
opt = gap_init_opt();
|
249
|
+
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) {
|
250
|
+
switch (c) {
|
251
|
+
case 'n':
|
252
|
+
if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
|
253
|
+
else opt->max_diff = atoi(optarg), opt->fnr = -1.0;
|
254
|
+
break;
|
255
|
+
case 'o': opt->max_gapo = atoi(optarg); break;
|
256
|
+
case 'e': opte = atoi(optarg); break;
|
257
|
+
case 'M': opt->s_mm = atoi(optarg); break;
|
258
|
+
case 'O': opt->s_gapo = atoi(optarg); break;
|
259
|
+
case 'E': opt->s_gape = atoi(optarg); break;
|
260
|
+
case 'd': opt->max_del_occ = atoi(optarg); break;
|
261
|
+
case 'i': opt->indel_end_skip = atoi(optarg); break;
|
262
|
+
case 'l': opt->seed_len = atoi(optarg); break;
|
263
|
+
case 'k': opt->max_seed_diff = atoi(optarg); break;
|
264
|
+
case 'm': opt->max_entries = atoi(optarg); break;
|
265
|
+
case 't': opt->n_threads = atoi(optarg); break;
|
266
|
+
case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
|
267
|
+
case 'R': opt->max_top2 = atoi(optarg); break;
|
268
|
+
case 'q': opt->trim_qual = atoi(optarg); break;
|
269
|
+
case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
|
270
|
+
case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
|
271
|
+
case 'f': xreopen(optarg, "wb", stdout); break;
|
272
|
+
case 'b': opt->mode |= BWA_MODE_BAM; break;
|
273
|
+
case '0': opt->mode |= BWA_MODE_BAM_SE; break;
|
274
|
+
case '1': opt->mode |= BWA_MODE_BAM_READ1; break;
|
275
|
+
case '2': opt->mode |= BWA_MODE_BAM_READ2; break;
|
276
|
+
case 'I': opt->mode |= BWA_MODE_IL13; break;
|
277
|
+
case 'B': opt->mode |= atoi(optarg) << 24; break;
|
278
|
+
default: return 1;
|
279
|
+
}
|
280
|
+
}
|
281
|
+
if (opte > 0) {
|
282
|
+
opt->max_gape = opte;
|
283
|
+
opt->mode &= ~BWA_MODE_GAPE;
|
284
|
+
}
|
285
|
+
|
286
|
+
if (optind + 2 > argc) {
|
287
|
+
fprintf(stderr, "\n");
|
288
|
+
fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n");
|
289
|
+
fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
|
290
|
+
BWA_AVG_ERR, opt->fnr);
|
291
|
+
fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo);
|
292
|
+
fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n");
|
293
|
+
fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip);
|
294
|
+
fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ);
|
295
|
+
fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len);
|
296
|
+
fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff);
|
297
|
+
fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries);
|
298
|
+
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
299
|
+
fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm);
|
300
|
+
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo);
|
301
|
+
fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape);
|
302
|
+
fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
|
303
|
+
fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
|
304
|
+
fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
|
305
|
+
fprintf(stderr, " -B INT length of barcode\n");
|
306
|
+
fprintf(stderr, " -c input sequences are in the color space\n");
|
307
|
+
fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
|
308
|
+
fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
|
309
|
+
fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n");
|
310
|
+
fprintf(stderr, " -b the input read file is in the BAM format\n");
|
311
|
+
fprintf(stderr, " -0 use single-end reads only (effective with -b)\n");
|
312
|
+
fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n");
|
313
|
+
fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n");
|
314
|
+
fprintf(stderr, "\n");
|
315
|
+
return 1;
|
316
|
+
}
|
317
|
+
if (opt->fnr > 0.0) {
|
318
|
+
int i, k;
|
319
|
+
for (i = 17, k = 0; i <= 250; ++i) {
|
320
|
+
int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
|
321
|
+
if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l);
|
322
|
+
k = l;
|
323
|
+
}
|
324
|
+
}
|
325
|
+
bwa_aln_core(argv[optind], argv[optind+1], opt);
|
326
|
+
free(opt);
|
327
|
+
fflush(stdout);
|
328
|
+
xreopen("/dev/tty","w",stdout);
|
329
|
+
return 0;
|
330
|
+
}
|
331
|
+
|
332
|
+
/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
|
333
|
+
__cigar_op and __cigar_len while keeping stdaln stand alone */
|
334
|
+
bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
|
335
|
+
{
|
336
|
+
uint32_t *cigar32;
|
337
|
+
bwa_cigar_t *cigar;
|
338
|
+
int i;
|
339
|
+
cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar);
|
340
|
+
cigar = (bwa_cigar_t*)cigar32;
|
341
|
+
for (i = 0; i < *n_cigar; ++i)
|
342
|
+
cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) );
|
343
|
+
return cigar;
|
344
|
+
}
|
345
|
+
|