bio-bwa 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
BWTConstruct.h BWT-Index Construction
|
4
|
+
|
5
|
+
This module constructs BWT and auxiliary data structures.
|
6
|
+
|
7
|
+
Copyright (C) 2004, Wong Chi Kwong.
|
8
|
+
|
9
|
+
This program is free software; you can redistribute it and/or
|
10
|
+
modify it under the terms of the GNU General Public License
|
11
|
+
as published by the Free Software Foundation; either version 2
|
12
|
+
of the License, or (at your option) any later version.
|
13
|
+
|
14
|
+
This program is distributed in the hope that it will be useful,
|
15
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
GNU General Public License for more details.
|
18
|
+
|
19
|
+
You should have received a copy of the GNU General Public License
|
20
|
+
along with this program; if not, write to the Free Software
|
21
|
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
22
|
+
|
23
|
+
*/
|
24
|
+
|
25
|
+
#ifndef BWT_GEN_H
|
26
|
+
#define BWT_GEN_H
|
27
|
+
|
28
|
+
/*
 * Packing geometry used by the BWT construction code. The values are
 * mutually consistent with 2-bit symbols stored in 32-bit words
 * (CHAR_PER_WORD * BIT_PER_CHAR == BITS_IN_WORD).
 */
#define ALPHABET_SIZE 4
#define BIT_PER_CHAR 2
#define CHAR_PER_WORD 16
#define CHAR_PER_BYTE 4

#define BITS_IN_WORD 32
#define BITS_IN_BYTE 8
#define BYTES_IN_WORD 4

#define ALL_ONE_MASK 0xFFFFFFFF
#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536

#define BITS_PER_OCC_VALUE 16
#define OCC_VALUE_PER_WORD 2
#define OCC_INTERVAL 256
#define OCC_INTERVAL_MAJOR 65536

#define TRUE 1
#define FALSE 0

#define BWTINC_INSERT_SORT_NUM_ITEM 7

/*
 * Function-like helpers. Every argument is parenthesized so that expression
 * arguments (e.g. `x & 3`, `a | b`) keep their meaning after expansion.
 * Arguments may still be evaluated more than once: do not pass expressions
 * with side effects.
 */

/* Midpoint of two values computed without forming value1 + value2. */
#define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 )
#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) )
#define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) )
/* Median of three values. */
#define med3(a, b, c) ( (a)<(b) ? ((b)<(c) ? (b) : (a)<(c) ? (c) : (a)) : ((b)>(c) ? (b) : (a)>(c) ? (c) : (a)))
/*
 * Exchange a and b using t as scratch. The expansion is a sequence of bare
 * statements (beginning with an empty one), exactly as before, so existing
 * call sites keep compiling; it must not be used as the sole body of an
 * unbraced if/else/loop.
 */
#define swap(a, b, t); (t) = (a); (a) = (b); (b) = (t);
/* Clear the `offset` most-significant / least-significant bits of value. */
#define truncateLeft(value, offset) ( (value) << (offset) >> (offset) )
#define truncateRight(value, offset) ( (value) >> (offset) << (offset) )
/* True when no bit of sum selected by the mask 0xfefefeff is set. */
#define DNA_OCC_SUM_EXCEPTION(sum) (((sum) & 0xfefefeff) == 0)
|
58
|
+
|
59
|
+
/* A pair of suffix-array indices named as the start and end of a range. */
typedef struct SaIndexRange {
	/* first index of the range */
	unsigned int startSaIndex;
	/* last index of the range */
	unsigned int endSaIndex;
} SaIndexRange;
|
63
|
+
|
64
|
+
typedef struct BWT {
|
65
|
+
unsigned int textLength; // length of the text
|
66
|
+
unsigned int saInterval; // interval between two SA values stored explicitly
|
67
|
+
unsigned int inverseSaInterval; // interval between two inverse SA stored explicitly
|
68
|
+
unsigned int inverseSa0; // SA-1[0]
|
69
|
+
unsigned int *cumulativeFreq; // cumulative frequency
|
70
|
+
unsigned int *bwtCode; // BWT code
|
71
|
+
unsigned int *occValue; // Occurrence values stored explicitly
|
72
|
+
unsigned int *occValueMajor; // Occurrence values stored explicitly
|
73
|
+
unsigned int *saValue; // SA values stored explicitly
|
74
|
+
unsigned int *inverseSa; // Inverse SA stored explicitly
|
75
|
+
SaIndexRange *saIndexRange; // SA index range
|
76
|
+
int saIndexRangeNumOfChar; // Number of characters indexed in SA index range
|
77
|
+
unsigned int *saValueOnBoundary; // Pre-calculated frequently referred data
|
78
|
+
unsigned int *decodeTable; // For decoding BWT by table lookup
|
79
|
+
unsigned int decodeTableGenerated; // == TRUE if decode table is generated on load and will be freed
|
80
|
+
unsigned int bwtSizeInWord; // Temporary variable to hold the memory allocated
|
81
|
+
unsigned int occSizeInWord; // Temporary variable to hold the memory allocated
|
82
|
+
unsigned int occMajorSizeInWord; // Temporary variable to hold the memory allocated
|
83
|
+
unsigned int saValueSize; // Temporary variable to hold the memory allocated
|
84
|
+
unsigned int inverseSaSize; // Temporary variable to hold the memory allocated
|
85
|
+
unsigned int saIndexRangeSize; // Temporary variable to hold the memory allocated
|
86
|
+
} BWT;
|
87
|
+
|
88
|
+
typedef struct BWTInc {
|
89
|
+
BWT *bwt;
|
90
|
+
unsigned int numberOfIterationDone;
|
91
|
+
unsigned int *cumulativeCountInCurrentBuild;
|
92
|
+
unsigned int availableWord;
|
93
|
+
unsigned int targetTextLength;
|
94
|
+
float targetNBit;
|
95
|
+
unsigned int buildSize;
|
96
|
+
unsigned int initialMaxBuildSize;
|
97
|
+
unsigned int incMaxBuildSize;
|
98
|
+
unsigned int firstCharInLastIteration;
|
99
|
+
unsigned int *workingMemory;
|
100
|
+
unsigned int *packedText;
|
101
|
+
unsigned char *textBuffer;
|
102
|
+
unsigned int *packedShift;
|
103
|
+
} BWTInc;
|
104
|
+
|
105
|
+
#endif
|
data/ext/bwt_lite.c
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include "bwt_lite.h"
|
5
|
+
|
6
|
+
int is_sa(const uint8_t *T, uint32_t *SA, int n);
|
7
|
+
int is_bwt(uint8_t *T, int n);
|
8
|
+
|
9
|
+
bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq)
|
10
|
+
{
|
11
|
+
bwtl_t *b;
|
12
|
+
int i;
|
13
|
+
b = (bwtl_t*)calloc(1, sizeof(bwtl_t));
|
14
|
+
b->seq_len = len;
|
15
|
+
|
16
|
+
{ // calculate b->bwt
|
17
|
+
uint8_t *s;
|
18
|
+
b->sa = (uint32_t*)calloc(len + 1, 4);
|
19
|
+
is_sa(seq, b->sa, len);
|
20
|
+
s = (uint8_t*)calloc(len + 1, 1);
|
21
|
+
for (i = 0; i <= len; ++i) {
|
22
|
+
if (b->sa[i] == 0) b->primary = i;
|
23
|
+
else s[i] = seq[b->sa[i] - 1];
|
24
|
+
}
|
25
|
+
for (i = b->primary; i < len; ++i) s[i] = s[i + 1];
|
26
|
+
b->bwt_size = (len + 15) / 16;
|
27
|
+
b->bwt = (uint32_t*)calloc(b->bwt_size, 4);
|
28
|
+
for (i = 0; i < len; ++i)
|
29
|
+
b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 1);
|
30
|
+
free(s);
|
31
|
+
}
|
32
|
+
{ // calculate b->occ
|
33
|
+
uint32_t c[4];
|
34
|
+
b->n_occ = (len + 15) / 16 * 4;
|
35
|
+
b->occ = (uint32_t*)calloc(b->n_occ, 4);
|
36
|
+
memset(c, 0, 16);
|
37
|
+
for (i = 0; i < len; ++i) {
|
38
|
+
if (i % 16 == 0)
|
39
|
+
memcpy(b->occ + (i/16) * 4, c, 16);
|
40
|
+
++c[bwtl_B0(b, i)];
|
41
|
+
}
|
42
|
+
memcpy(b->L2+1, c, 16);
|
43
|
+
for (i = 2; i < 5; ++i) b->L2[i] += b->L2[i-1];
|
44
|
+
}
|
45
|
+
{ // generate cnt_table
|
46
|
+
for (i = 0; i != 256; ++i) {
|
47
|
+
u_int32_t j, x = 0;
|
48
|
+
for (j = 0; j != 4; ++j)
|
49
|
+
x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
|
50
|
+
b->cnt_table[i] = x;
|
51
|
+
}
|
52
|
+
}
|
53
|
+
return b;
|
54
|
+
}
|
55
|
+
inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
|
56
|
+
{
|
57
|
+
uint32_t n, b;
|
58
|
+
if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
|
59
|
+
if (k == (uint32_t)(-1)) return 0;
|
60
|
+
if (k >= bwt->primary) --k; // because $ is not in bwt
|
61
|
+
n = bwt->occ[k/16<<2|c];
|
62
|
+
b = bwt->bwt[k/16] & ~((1U<<((15-(k&15))<<1)) - 1);
|
63
|
+
n += (bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff]
|
64
|
+
+ bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]) >> (c<<3) & 0xff;
|
65
|
+
if (c == 0) n -= 15 - (k&15); // corrected for the masked bits
|
66
|
+
return n;
|
67
|
+
}
|
68
|
+
inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
|
69
|
+
{
|
70
|
+
uint32_t x, b;
|
71
|
+
if (k == (uint32_t)(-1)) {
|
72
|
+
memset(cnt, 0, 16);
|
73
|
+
return;
|
74
|
+
}
|
75
|
+
if (k >= bwt->primary) --k; // because $ is not in bwt
|
76
|
+
memcpy(cnt, bwt->occ + (k>>4<<2), 16);
|
77
|
+
b = bwt->bwt[k>>4] & ~((1U<<((~k&15)<<1)) - 1);
|
78
|
+
x = bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff]
|
79
|
+
+ bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24];
|
80
|
+
x -= 15 - (k&15);
|
81
|
+
cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
|
82
|
+
}
|
83
|
+
/*
 * Convenience wrapper: fill cntk with the four occurrence counts at position
 * k and cntl with those at position l, via two bwtl_occ4() calls.
 *
 * Defined without `inline` on purpose: under C99/C11 rules a plain `inline`
 * definition emits no external symbol, so callers in other translation
 * units would fail to link.
 */
void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4])
{
	bwtl_occ4(bwt, k, cntk);
	bwtl_occ4(bwt, l, cntl);
}
|
88
|
+
/* Release an index and the three arrays it owns; a null pointer is ignored. */
void bwtl_destroy(bwtl_t *bwt)
{
	if (!bwt) return;
	free(bwt->occ);
	free(bwt->bwt);
	free(bwt->sa);
	free(bwt);
}
|
data/ext/bwt_lite.h
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#ifndef BWT_LITE_H_
|
2
|
+
#define BWT_LITE_H_
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
/* Light-weight BWT index built by bwtl_seq2bwtl(). */
typedef struct {
	uint32_t seq_len, bwt_size, n_occ; /* sequence length; number of words in bwt; number of entries in occ */
	uint32_t primary;                  /* index at which the suffix-array value is 0 */
	uint32_t *bwt, *occ, *sa, L2[5];   /* packed 2-bit BWT; occurrence checkpoints; suffix array; cumulative counts */
	uint32_t cnt_table[256];           /* per-byte symbol counts, one count per result byte */
} bwtl_t;

/* Symbol k of the packed BWT: 16 symbols per word, symbol 0 in the top two bits. */
#define bwtl_B0(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)

#ifdef __cplusplus
extern "C" {
#endif

	/* Build an index for seq[0..len-1]; release the result with bwtl_destroy(). */
	bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq);
	/*
	 * The three query functions are declared as ordinary external functions.
	 * Declaring them `inline` here without a definition in the header yields
	 * undefined references under C99/C11 inline semantics.
	 */
	uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
	void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
	void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);
	/* Free an index; a null pointer is accepted. */
	void bwtl_destroy(bwtl_t *bwt);

#ifdef __cplusplus
}
#endif
|
28
|
+
|
29
|
+
#endif
|
data/ext/bwtaln.c
ADDED
@@ -0,0 +1,345 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <unistd.h>
|
3
|
+
#include <math.h>
|
4
|
+
#include <stdlib.h>
|
5
|
+
#include <string.h>
|
6
|
+
#include <time.h>
|
7
|
+
#include <stdint.h>
|
8
|
+
#ifdef HAVE_CONFIG_H
|
9
|
+
#include "config.h"
|
10
|
+
#endif
|
11
|
+
#include "bwtaln.h"
|
12
|
+
#include "bwtgap.h"
|
13
|
+
#include "utils.h"
|
14
|
+
|
15
|
+
#ifdef HAVE_PTHREAD
|
16
|
+
#define THREAD_BLOCK_SIZE 1024
|
17
|
+
#include <pthread.h>
|
18
|
+
static pthread_mutex_t g_seq_lock = PTHREAD_MUTEX_INITIALIZER;
|
19
|
+
#endif
|
20
|
+
|
21
|
+
gap_opt_t *gap_init_opt()
|
22
|
+
{
|
23
|
+
gap_opt_t *o;
|
24
|
+
o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t));
|
25
|
+
/* IMPORTANT: s_mm*10 should be about the average base error
|
26
|
+
rate. Voilating this requirement will break pairing! */
|
27
|
+
o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4;
|
28
|
+
o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6;
|
29
|
+
o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000;
|
30
|
+
o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD;
|
31
|
+
o->seed_len = 32; o->max_seed_diff = 2;
|
32
|
+
o->fnr = 0.04;
|
33
|
+
o->n_threads = 1;
|
34
|
+
o->max_top2 = 30;
|
35
|
+
o->trim_qual = 0;
|
36
|
+
return o;
|
37
|
+
}
|
38
|
+
|
39
|
+
int bwa_cal_maxdiff(int l, double err, double thres)
|
40
|
+
{
|
41
|
+
double elambda = exp(-l * err);
|
42
|
+
double sum, y = 1.0;
|
43
|
+
int k, x = 1;
|
44
|
+
for (k = 1, sum = elambda; k < 1000; ++k) {
|
45
|
+
y *= l * err;
|
46
|
+
x *= k;
|
47
|
+
sum += elambda * y / x;
|
48
|
+
if (1.0 - sum < thres) return k;
|
49
|
+
}
|
50
|
+
return 2;
|
51
|
+
}
|
52
|
+
|
53
|
+
// width must be filled as zero
|
54
|
+
static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width)
|
55
|
+
{
|
56
|
+
bwtint_t k, l, ok, ol;
|
57
|
+
int i, bid;
|
58
|
+
bid = 0;
|
59
|
+
k = 0; l = rbwt->seq_len;
|
60
|
+
for (i = 0; i < len; ++i) {
|
61
|
+
ubyte_t c = str[i];
|
62
|
+
if (c < 4) {
|
63
|
+
bwt_2occ(rbwt, k - 1, l, c, &ok, &ol);
|
64
|
+
k = rbwt->L2[c] + ok + 1;
|
65
|
+
l = rbwt->L2[c] + ol;
|
66
|
+
}
|
67
|
+
if (k > l || c > 3) { // then restart
|
68
|
+
k = 0;
|
69
|
+
l = rbwt->seq_len;
|
70
|
+
++bid;
|
71
|
+
}
|
72
|
+
width[i].w = l - k + 1;
|
73
|
+
width[i].bid = bid;
|
74
|
+
}
|
75
|
+
width[len].w = 0;
|
76
|
+
width[len].bid = ++bid;
|
77
|
+
return bid;
|
78
|
+
}
|
79
|
+
|
80
|
+
/*
 * Align every read of seqs[0..n_seqs-1] that belongs to worker `tid` and
 * store the hits in the read (aln / n_aln). A read's name, seq, rseq and
 * qual buffers are freed and nulled once it has been processed.
 *
 * With HAVE_PTHREAD and opt->n_threads > 1, reads are handed out on demand:
 * under g_seq_lock a worker that reaches a read whose tid is negative claims
 * it and the following reads (up to THREAD_BLOCK_SIZE in total), and skips
 * reads claimed by another worker.
 */
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
{
	int idx, width_cap = 0, longest;
	gap_stack_t *stack;
	bwt_width_t *width[2], *seed_width[2];
	const ubyte_t *seq[2];
	gap_opt_t local_opt = *opt;

	// size the priority stack from the longest read in the batch
	longest = 0;
	for (idx = 0; idx != n_seqs; ++idx) {
		if (seqs[idx].len > longest) longest = seqs[idx].len;
	}
	if (opt->fnr > 0.0)
		local_opt.max_diff = bwa_cal_maxdiff(longest, BWA_AVG_ERR, opt->fnr);
	if (local_opt.max_diff < local_opt.max_gapo)
		local_opt.max_gapo = local_opt.max_diff;
	stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);

	seed_width[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
	seed_width[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
	width[0] = 0;
	width[1] = 0;

	for (idx = 0; idx != n_seqs; ++idx) {
		bwa_seq_t *read = seqs + idx;
		int use_seed;
#ifdef HAVE_PTHREAD
		if (opt->n_threads > 1) {
			int skip = 0;
			pthread_mutex_lock(&g_seq_lock);
			if (read->tid < 0) { // unassigned: claim a block starting here
				int j;
				for (j = idx; j < n_seqs && j < idx + THREAD_BLOCK_SIZE; ++j)
					seqs[j].tid = tid;
			} else if (read->tid != tid) {
				skip = 1; // owned by another worker
			}
			pthread_mutex_unlock(&g_seq_lock);
			if (skip) continue;
		}
#endif
		read->sa = 0;
		read->type = BWA_TYPE_NO_MATCH;
		read->c1 = read->c2 = 0;
		read->n_aln = 0;
		read->aln = 0;
		seq[0] = read->seq;
		seq[1] = read->rseq;

		// grow (and clear) the width buffers when a longer read shows up
		if (width_cap < read->len) {
			width_cap = read->len;
			width[0] = (bwt_width_t*)realloc(width[0], (width_cap + 1) * sizeof(bwt_width_t));
			width[1] = (bwt_width_t*)realloc(width[1], (width_cap + 1) * sizeof(bwt_width_t));
			memset(width[0], 0, (width_cap + 1) * sizeof(bwt_width_t));
			memset(width[1], 0, (width_cap + 1) * sizeof(bwt_width_t));
		}
		bwt_cal_width(bwt[0], read->len, seq[0], width[0]);
		bwt_cal_width(bwt[1], read->len, seq[1], width[1]);

		if (opt->fnr > 0.0)
			local_opt.max_diff = bwa_cal_maxdiff(read->len, BWA_AVG_ERR, opt->fnr);

		// seeding only applies to reads longer than the seed
		use_seed = (read->len > opt->seed_len);
		if (use_seed) {
			local_opt.seed_len = opt->seed_len;
			bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (read->len - opt->seed_len), seed_width[0]);
			bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (read->len - opt->seed_len), seed_width[1]);
		} else {
			local_opt.seed_len = 0x7fffffff;
		}

		// core function
		read->aln = bwt_match_gap(bwt, read->len, seq, width, use_seed? seed_width : 0, &local_opt, &read->n_aln, stack);

		// the alignment is stored; the read's text buffers are no longer needed
		free(read->name);
		free(read->seq);
		free(read->rseq);
		free(read->qual);
		read->name = 0;
		read->seq = read->rseq = read->qual = 0;
	}

	free(seed_width[0]);
	free(seed_width[1]);
	free(width[0]);
	free(width[1]);
	gap_destroy_stack(stack);
}
|
141
|
+
|
142
|
+
#ifdef HAVE_PTHREAD
|
143
|
+
typedef struct {
|
144
|
+
int tid;
|
145
|
+
bwt_t *bwt[2];
|
146
|
+
int n_seqs;
|
147
|
+
bwa_seq_t *seqs;
|
148
|
+
const gap_opt_t *opt;
|
149
|
+
} thread_aux_t;
|
150
|
+
|
151
|
+
static void *worker(void *data)
|
152
|
+
{
|
153
|
+
thread_aux_t *d = (thread_aux_t*)data;
|
154
|
+
bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt);
|
155
|
+
return 0;
|
156
|
+
}
|
157
|
+
#endif
|
158
|
+
|
159
|
+
bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa)
|
160
|
+
{
|
161
|
+
bwa_seqio_t *ks;
|
162
|
+
if (mode & BWA_MODE_BAM) { // open BAM
|
163
|
+
int which = 0;
|
164
|
+
if (mode & BWA_MODE_BAM_SE) which |= 4;
|
165
|
+
if (mode & BWA_MODE_BAM_READ1) which |= 1;
|
166
|
+
if (mode & BWA_MODE_BAM_READ2) which |= 2;
|
167
|
+
if (which == 0) which = 7; // then read all reads
|
168
|
+
ks = bwa_bam_open(fn_fa, which);
|
169
|
+
} else ks = bwa_seq_open(fn_fa);
|
170
|
+
return ks;
|
171
|
+
}
|
172
|
+
|
173
|
+
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
|
174
|
+
{
|
175
|
+
int i, n_seqs, tot_seqs = 0;
|
176
|
+
bwa_seq_t *seqs;
|
177
|
+
bwa_seqio_t *ks;
|
178
|
+
clock_t t;
|
179
|
+
bwt_t *bwt[2];
|
180
|
+
|
181
|
+
// initialization
|
182
|
+
ks = bwa_open_reads(opt->mode, fn_fa);
|
183
|
+
|
184
|
+
{ // load BWT
|
185
|
+
char *str = (char*)calloc(strlen(prefix) + 10, 1);
|
186
|
+
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
|
187
|
+
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
|
188
|
+
free(str);
|
189
|
+
}
|
190
|
+
|
191
|
+
// core loop
|
192
|
+
fwrite(opt, sizeof(gap_opt_t), 1, stdout);
|
193
|
+
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) {
|
194
|
+
tot_seqs += n_seqs;
|
195
|
+
t = clock();
|
196
|
+
|
197
|
+
fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... ");
|
198
|
+
|
199
|
+
#ifdef HAVE_PTHREAD
|
200
|
+
if (opt->n_threads <= 1) { // no multi-threading at all
|
201
|
+
bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
|
202
|
+
} else {
|
203
|
+
pthread_t *tid;
|
204
|
+
pthread_attr_t attr;
|
205
|
+
thread_aux_t *data;
|
206
|
+
int j;
|
207
|
+
pthread_attr_init(&attr);
|
208
|
+
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
|
209
|
+
data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
|
210
|
+
tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
|
211
|
+
for (j = 0; j < opt->n_threads; ++j) {
|
212
|
+
data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1];
|
213
|
+
data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt;
|
214
|
+
pthread_create(&tid[j], &attr, worker, data + j);
|
215
|
+
}
|
216
|
+
for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
|
217
|
+
free(data); free(tid);
|
218
|
+
}
|
219
|
+
#else
|
220
|
+
bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
|
221
|
+
#endif
|
222
|
+
|
223
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
224
|
+
|
225
|
+
t = clock();
|
226
|
+
fprintf(stderr, "[bwa_aln_core] write to the disk... ");
|
227
|
+
for (i = 0; i < n_seqs; ++i) {
|
228
|
+
bwa_seq_t *p = seqs + i;
|
229
|
+
fwrite(&p->n_aln, 4, 1, stdout);
|
230
|
+
if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout);
|
231
|
+
}
|
232
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
233
|
+
|
234
|
+
bwa_free_read_seq(n_seqs, seqs);
|
235
|
+
fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
|
236
|
+
}
|
237
|
+
|
238
|
+
// destroy
|
239
|
+
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
|
240
|
+
bwa_seq_close(ks);
|
241
|
+
}
|
242
|
+
|
243
|
+
int bwa_aln(int argc, char *argv[])
|
244
|
+
{
|
245
|
+
int c, opte = -1;
|
246
|
+
gap_opt_t *opt;
|
247
|
+
optind = 1;
|
248
|
+
opt = gap_init_opt();
|
249
|
+
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) {
|
250
|
+
switch (c) {
|
251
|
+
case 'n':
|
252
|
+
if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
|
253
|
+
else opt->max_diff = atoi(optarg), opt->fnr = -1.0;
|
254
|
+
break;
|
255
|
+
case 'o': opt->max_gapo = atoi(optarg); break;
|
256
|
+
case 'e': opte = atoi(optarg); break;
|
257
|
+
case 'M': opt->s_mm = atoi(optarg); break;
|
258
|
+
case 'O': opt->s_gapo = atoi(optarg); break;
|
259
|
+
case 'E': opt->s_gape = atoi(optarg); break;
|
260
|
+
case 'd': opt->max_del_occ = atoi(optarg); break;
|
261
|
+
case 'i': opt->indel_end_skip = atoi(optarg); break;
|
262
|
+
case 'l': opt->seed_len = atoi(optarg); break;
|
263
|
+
case 'k': opt->max_seed_diff = atoi(optarg); break;
|
264
|
+
case 'm': opt->max_entries = atoi(optarg); break;
|
265
|
+
case 't': opt->n_threads = atoi(optarg); break;
|
266
|
+
case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
|
267
|
+
case 'R': opt->max_top2 = atoi(optarg); break;
|
268
|
+
case 'q': opt->trim_qual = atoi(optarg); break;
|
269
|
+
case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
|
270
|
+
case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
|
271
|
+
case 'f': xreopen(optarg, "wb", stdout); break;
|
272
|
+
case 'b': opt->mode |= BWA_MODE_BAM; break;
|
273
|
+
case '0': opt->mode |= BWA_MODE_BAM_SE; break;
|
274
|
+
case '1': opt->mode |= BWA_MODE_BAM_READ1; break;
|
275
|
+
case '2': opt->mode |= BWA_MODE_BAM_READ2; break;
|
276
|
+
case 'I': opt->mode |= BWA_MODE_IL13; break;
|
277
|
+
case 'B': opt->mode |= atoi(optarg) << 24; break;
|
278
|
+
default: return 1;
|
279
|
+
}
|
280
|
+
}
|
281
|
+
if (opte > 0) {
|
282
|
+
opt->max_gape = opte;
|
283
|
+
opt->mode &= ~BWA_MODE_GAPE;
|
284
|
+
}
|
285
|
+
|
286
|
+
if (optind + 2 > argc) {
|
287
|
+
fprintf(stderr, "\n");
|
288
|
+
fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n");
|
289
|
+
fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
|
290
|
+
BWA_AVG_ERR, opt->fnr);
|
291
|
+
fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo);
|
292
|
+
fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n");
|
293
|
+
fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip);
|
294
|
+
fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ);
|
295
|
+
fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len);
|
296
|
+
fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff);
|
297
|
+
fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries);
|
298
|
+
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
299
|
+
fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm);
|
300
|
+
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo);
|
301
|
+
fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape);
|
302
|
+
fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
|
303
|
+
fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
|
304
|
+
fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
|
305
|
+
fprintf(stderr, " -B INT length of barcode\n");
|
306
|
+
fprintf(stderr, " -c input sequences are in the color space\n");
|
307
|
+
fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
|
308
|
+
fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
|
309
|
+
fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n");
|
310
|
+
fprintf(stderr, " -b the input read file is in the BAM format\n");
|
311
|
+
fprintf(stderr, " -0 use single-end reads only (effective with -b)\n");
|
312
|
+
fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n");
|
313
|
+
fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n");
|
314
|
+
fprintf(stderr, "\n");
|
315
|
+
return 1;
|
316
|
+
}
|
317
|
+
if (opt->fnr > 0.0) {
|
318
|
+
int i, k;
|
319
|
+
for (i = 17, k = 0; i <= 250; ++i) {
|
320
|
+
int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
|
321
|
+
if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l);
|
322
|
+
k = l;
|
323
|
+
}
|
324
|
+
}
|
325
|
+
bwa_aln_core(argv[optind], argv[optind+1], opt);
|
326
|
+
free(opt);
|
327
|
+
fflush(stdout);
|
328
|
+
xreopen("/dev/tty","w",stdout);
|
329
|
+
return 0;
|
330
|
+
}
|
331
|
+
|
332
|
+
/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
|
333
|
+
__cigar_op and __cigar_len while keeping stdaln stand alone */
|
334
|
+
bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
|
335
|
+
{
|
336
|
+
uint32_t *cigar32;
|
337
|
+
bwa_cigar_t *cigar;
|
338
|
+
int i;
|
339
|
+
cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar);
|
340
|
+
cigar = (bwa_cigar_t*)cigar32;
|
341
|
+
for (i = 0; i < *n_cigar; ++i)
|
342
|
+
cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) );
|
343
|
+
return cigar;
|
344
|
+
}
|
345
|
+
|