bio-bwa 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwtaln.h
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
#ifndef BWTALN_H
#define BWTALN_H

/*
 * Shared types, flags and entry points used by the gapped-alignment code:
 * per-read records, candidate hits, option blocks and the sequence I/O API.
 */

#include <stdint.h>
#include "bwt.h"

/* Values stored in the 2-bit bwa_seq_t.type field. */
#define BWA_TYPE_NO_MATCH 0
#define BWA_TYPE_UNIQUE 1
#define BWA_TYPE_REPEAT 2
#define BWA_TYPE_MATESW 3

/* SAM flag bits. */
#define SAM_FPD   1 // paired
#define SAM_FPP   2 // properly paired
#define SAM_FSU   4 // self-unmapped
#define SAM_FMU   8 // mate-unmapped
#define SAM_FSR  16 // self on the reverse strand
#define SAM_FMR  32 // mate on the reverse strand
#define SAM_FR1  64 // this is read one
#define SAM_FR2 128 // this is read two
#define SAM_FSC 256 // secondary alignment

#define BWA_AVG_ERR 0.02
#define BWA_MIN_RDLEN 35 // for read trimming

/* Extract the 2-bit value at index k from a byte array packing four values
 * per byte, with the lowest index in the most significant bit pair. */
#ifndef bns_pac
#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3)
#endif

/* One (w, bid) pair per read position; consumed by bwt_match_gap() to prune
 * the search. NOTE(review): presumably an interval width and a lower bound
 * on the number of differences -- confirm against the code that fills it. */
typedef struct {
	bwtint_t w;
	int bid;
} bwt_width_t;

/* One candidate hit reported by bwt_match_gap(). */
typedef struct {
	uint32_t n_mm:8, n_gapo:8, n_gape:8, a:1; // mismatches, gap opens, gap extensions, strand index
	bwtint_t k, l;                            // SA interval of the hit
	int score;                                // weighted score (see aln_score in bwtgap.c)
} bwt_aln1_t;

/* A CIGAR element packed in 16 bits: operation in the top 2 bits, length in
 * the low 14 bits. */
typedef uint16_t bwa_cigar_t;
/* rgoya: If changing order of bytes, beware of operations like:
 *     s->cigar[0] += s->full_len - s->len;
 */
#define CIGAR_OP_SHIFT 14
#define CIGAR_LN_MASK 0x3fff

#define __cigar_op(__cigar) ((__cigar)>>CIGAR_OP_SHIFT)
#define __cigar_len(__cigar) ((__cigar)&CIGAR_LN_MASK)
#define __cigar_create(__op, __len) ((__op)<<CIGAR_OP_SHIFT | (__len))

/* One additional hit of a read, stored in bwa_seq_t.multi. */
typedef struct {
	uint32_t pos;
	uint32_t n_cigar:15, gap:8, mm:8, strand:1;
	bwa_cigar_t *cigar;
} bwt_multi1_t;

/* Per-read record: the sequence itself plus everything learned about it. */
typedef struct {
	char *name;
	ubyte_t *seq, *rseq, *qual;
	uint32_t len:20, strand:1, type:2, dummy:1, extra_flag:8;
	uint32_t n_mm:8, n_gapo:8, n_gape:8, mapQ:8;
	int score;
	int clip_len;
	// alignments in SA coordinates
	int n_aln;
	bwt_aln1_t *aln;
	// multiple hits
	int n_multi;
	bwt_multi1_t *multi;
	// alignment information
	bwtint_t sa, pos;
	uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ
	int n_cigar;
	bwa_cigar_t *cigar;
	// for multi-threading only
	int tid;
	// barcode
	char bc[16]; // null terminated; up to 15 bases
	// NM and MD tags
	uint32_t full_len:20, nm:12;
	char *md;
} bwa_seq_t;

/* Bits of gap_opt_t.mode. */
#define BWA_MODE_GAPE       0x01
#define BWA_MODE_COMPREAD   0x02
#define BWA_MODE_LOGGAP     0x04
#define BWA_MODE_NONSTOP    0x10
#define BWA_MODE_BAM        0x20
#define BWA_MODE_BAM_SE     0x40
#define BWA_MODE_BAM_READ1  0x80
#define BWA_MODE_BAM_READ2  0x100
#define BWA_MODE_IL13       0x200

/* Options of the gapped aligner. */
typedef struct {
	int s_mm, s_gapo, s_gape; // penalties for a mismatch, a gap open and a gap extension
	int mode; // bit 24-31 are the barcode length
	int indel_end_skip, max_del_occ, max_entries;
	float fnr;
	int max_diff, max_gapo, max_gape;
	int max_seed_diff, seed_len;
	int n_threads;
	int max_top2;
	int trim_qual;
} gap_opt_t;

/* Values of pe_opt_t.type. */
#define BWA_PET_STD   1
#define BWA_PET_SOLID 2

/* Options of the paired-end stage. */
typedef struct {
	int max_isize, force_isize;
	int max_occ;
	int n_multi, N_multi;
	int type, is_sw, is_preload;
	double ap_prior;
} pe_opt_t;

/* Opaque sequence reader; defined by the sequence I/O implementation. */
struct __bwa_seqio_t;
typedef struct __bwa_seqio_t bwa_seqio_t;

#ifdef __cplusplus
extern "C" {
#endif

	/* Declared with (void): in C an empty parameter list is not a prototype
	 * and would disable argument checking at call sites. */
	gap_opt_t *gap_init_opt(void);
	void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt);

	bwa_seqio_t *bwa_seq_open(const char *fn);
	bwa_seqio_t *bwa_bam_open(const char *fn, int which);
	void bwa_seq_close(bwa_seqio_t *bs);
	void seq_reverse(int len, ubyte_t *seq, int is_comp);
	bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int mode, int trim_qual);
	void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs);

	int bwa_cal_maxdiff(int l, double err, double thres);
	void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt);

	void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac);


	/* rgoya: Temporary clone of aln_path2cigar to accommodate bwa_cigar_t,
	   __cigar_op and __cigar_len while keeping stdaln stand alone */
#include "stdaln.h"

	bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar);

#ifdef __cplusplus
}
#endif

#endif
|
data/ext/bwtgap.c
ADDED
@@ -0,0 +1,264 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include "bwtgap.h"
|
5
|
+
#include "bwtaln.h"
|
6
|
+
|
7
|
+
/* States stored in gap_entry_t.state: match/mismatch, insertion, deletion. */
enum {
	STATE_M = 0,
	STATE_I = 1,
	STATE_D = 2
};

/*
 * Weighted score of a partial alignment with m mismatches, o gap opens and
 * e gap extensions; p points to an option block providing the s_mm, s_gapo
 * and s_gape penalties.
 */
#define aln_score(m,o,e,p) \
	((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape)
|
12
|
+
|
13
|
+
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt)
|
14
|
+
{
|
15
|
+
int i;
|
16
|
+
gap_stack_t *stack;
|
17
|
+
stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t));
|
18
|
+
stack->n_stacks = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);
|
19
|
+
stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t));
|
20
|
+
for (i = 0; i != stack->n_stacks; ++i) {
|
21
|
+
gap_stack1_t *p = stack->stacks + i;
|
22
|
+
p->m_entries = 4;
|
23
|
+
p->stack = (gap_entry_t*)calloc(p->m_entries, sizeof(gap_entry_t));
|
24
|
+
}
|
25
|
+
return stack;
|
26
|
+
}
|
27
|
+
|
28
|
+
/*
 * Release a stack created by gap_init_stack(): the entry array of every
 * bucket, then the bucket array, then the stack object itself.
 */
void gap_destroy_stack(gap_stack_t *stack)
{
	int bucket = 0;

	while (bucket != stack->n_stacks) {
		free(stack->stacks[bucket].stack);
		++bucket;
	}
	free(stack->stacks);
	free(stack);
}
|
35
|
+
|
36
|
+
/*
 * Empty the stack without releasing memory: all counters drop to zero and
 * `best` is set to n_stacks, the value used while no entry is queued.
 */
static void gap_reset_stack(gap_stack_t *stack)
{
	int bucket = 0;

	while (bucket != stack->n_stacks) {
		stack->stacks[bucket].n_entries = 0;
		++bucket;
	}
	stack->best = stack->n_stacks;
	stack->n_entries = 0;
}
|
44
|
+
|
45
|
+
/*
 * Queue a partial alignment on the bucket matching its score.
 *
 *   stack    search stack; the bucket is chosen by aln_score() of the counts
 *   a        strand index, stored in bit 20 of `info`
 *   i        position in the read, stored in the low bits of `info`
 *   k, l     SA interval of the partial alignment
 *   n_mm, n_gapo, n_gape
 *            mismatches, gap opens and gap extensions so far
 *   state    STATE_M, STATE_I or STATE_D
 *   is_diff  non-zero if this step introduced a difference
 *   opt      supplies the penalties used by aln_score()
 *
 * A full bucket is doubled in size. If it cannot be grown, a message is
 * printed to stderr and the process exits.
 */
static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape,
							int state, int is_diff, const gap_opt_t *opt)
{
	int score;
	gap_entry_t *p;
	gap_stack1_t *q;

	score = aln_score(n_mm, n_gapo, n_gape, opt);
	q = stack->stacks + score;
	if (q->n_entries == q->m_entries) {
		/* grow through a temporary so the old array is not lost if
		 * realloc() fails */
		int m_grown = q->m_entries << 1;
		gap_entry_t *grown = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * m_grown);
		if (grown == NULL) {
			fprintf(stderr, "[%s] fail to allocate memory\n", __func__);
			exit(EXIT_FAILURE);
		}
		q->stack = grown;
		q->m_entries = m_grown;
	}
	p = q->stack + q->n_entries;
	p->info = (uint32_t)score<<21 | a<<20 | i; p->k = k; p->l = l;
	p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state;
	p->n_seed_mm = 0;
	/* Always assign last_diff_pos: the slot is either reused from an entry
	 * popped earlier or fresh from realloc(), so leaving it untouched would
	 * hand a stale or indeterminate value to gap_shadow(). */
	p->last_diff_pos = is_diff? i : 0;
	++(q->n_entries);
	++(stack->n_entries);
	if (stack->best > score) stack->best = score;
}
|
65
|
+
|
66
|
+
/*
 * Remove the most recently pushed entry of the best (lowest-score) bucket
 * and copy it to *e, then update `best`:
 *   - no entry left anywhere: best becomes n_stacks;
 *   - the bucket just drained: best moves to the next non-empty bucket.
 * The caller must make sure the stack is not empty.
 */
static inline void gap_pop(gap_stack_t *stack, gap_entry_t *e)
{
	gap_stack1_t *bucket = &stack->stacks[stack->best];

	bucket->n_entries -= 1;
	*e = bucket->stack[bucket->n_entries];
	stack->n_entries -= 1;

	if (stack->n_entries == 0) {
		stack->best = stack->n_stacks;
		return;
	}
	if (bucket->n_entries == 0) { // reset best
		int next = stack->best + 1;
		while (next < stack->n_stacks && stack->stacks[next].n_entries == 0)
			++next;
		stack->best = next;
	}
}
|
80
|
+
|
81
|
+
/*
 * Adjust the width table after a hit covering x suffix-array rows.
 *
 * For every position before last_diff_pos:
 *   - a width larger than x is reduced by x;
 *   - a width equal to x gets bid = 1 and a new width of max - c, where c
 *     counts how many positions have hit this case so far (starting at 1);
 *   - a smaller width is left alone (not expected to happen).
 * `len` is accepted for interface compatibility but not used.
 */
static inline void gap_shadow(int x, int len, bwtint_t max, int last_diff_pos, bwt_width_t *w)
{
	int pos;
	int n_equal = 0;

	for (pos = 0; pos < last_diff_pos; ++pos) {
		bwt_width_t *cur = &w[pos];
		if (cur->w > x) {
			cur->w -= x;
		} else if (cur->w == x) {
			++n_equal;
			cur->bid = 1;
			cur->w = max - n_equal;
		}
	}
}
|
92
|
+
|
93
|
+
/*
 * Index of the highest set bit of v, i.e. floor(log2(v)) for v > 0.
 * Returns 0 when v is 0.
 */
static inline int int_log2(uint32_t v)
{
	int bits = 0;
	int shift;

	/* binary search over the bit position: 16, 8, 4, 2, 1 */
	for (shift = 16; shift != 0; shift >>= 1) {
		if (v >> shift) {
			v >>= shift;
			bits += shift;
		}
	}
	return bits;
}
|
103
|
+
|
104
|
+
/*
 * Best-first search for gapped/mismatched hits of one read.
 *
 *   bwts    two BWT indexes; an entry on strand `a` is extended with bwts[1-a]
 *   len     read length
 *   seq     the read in two encodings, indexed by strand; values above 3 are
 *           treated as ambiguous bases
 *   w       per-strand width tables used for pruning (modified by gap_shadow)
 *   seed_w  per-strand width tables for the seed, or NULL to disable seeding
 *   opt     penalties, limits and mode bits
 *   _n_aln  receives the number of hits returned
 *   stack   scratch stack from gap_init_stack(); reset on entry
 *
 * Returns a heap-allocated array of *_n_aln hits. Nothing in this function
 * frees it, so the caller is expected to release it with free(). The array is
 * returned (empty) even when the read has more ambiguous bases than
 * opt->max_diff. If memory cannot be obtained, a message is printed to
 * stderr and the process exits.
 */
bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2],
						  bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack)
{
	int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt);
	int best_diff = opt->max_diff + 1, max_diff = opt->max_diff;
	int best_cnt = 0;
	int j, _j, n_aln, m_aln;
	bwt_aln1_t *aln;

	m_aln = 4; n_aln = 0;
	aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t));
	if (aln == NULL) {
		fprintf(stderr, "[%s] fail to allocate memory\n", __func__);
		exit(EXIT_FAILURE);
	}

	// check whether there are too many N
	for (j = _j = 0; j < len; ++j)
		if (seq[0][j] > 3) ++_j;
	if (_j > max_diff) {
		*_n_aln = n_aln;
		return aln;
	}

	gap_reset_stack(stack); // reset stack
	// seed the search with the full SA interval, once per strand
	gap_push(stack, 0, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt);
	gap_push(stack, 1, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt);

	while (stack->n_entries) {
		gap_entry_t e;
		int a, i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp;
		bwtint_t k, l, cnt_k[4], cnt_l[4], occ;
		const bwt_t *bwt;
		const ubyte_t *str;
		const bwt_width_t *seed_width = 0;
		bwt_width_t *width;

		if (stack->n_entries > opt->max_entries) break;
		gap_pop(stack, &e); // get the best entry
		k = e.k; l = e.l; // SA interval
		// info is packed as score<<21 | a<<20 | i, so the position occupies
		// the low 20 bits; a 16-bit mask would truncate long reads
		a = e.info>>20&1; i = e.info&0xfffff; // strand, length
		if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed

		// m: number of differences still allowed for this entry
		m = max_diff - (e.n_mm + e.n_gapo);
		if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape;
		if (m < 0) continue;
		bwt = bwts[1-a]; str = seq[a]; width = w[a];
		if (seed_w) { // apply seeding
			seed_width = seed_w[a];
			m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo);
			if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape;
		}
		if (i > 0 && m < width[i-1].bid) continue;

		// check whether a hit is found
		hit_found = 0;
		if (i == 0) hit_found = 1;
		else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed
			if (bwt_match_exact_alt(bwt, i, str, &k, &l)) hit_found = 1;
			else continue; // no hit, skip
		}

		if (hit_found) { // action for found hits
			int score = aln_score(e.n_mm, e.n_gapo, e.n_gape, opt);
			int do_add = 1;
			if (n_aln == 0) {
				best_score = score;
				best_diff = e.n_mm + e.n_gapo;
				if (opt->mode & BWA_MODE_GAPE) best_diff += e.n_gape;
				if (!(opt->mode & BWA_MODE_NONSTOP))
					max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour
			}
			if (score == best_score) best_cnt += l - k + 1;
			else if (best_cnt > opt->max_top2) break; // top2b behaviour
			if (e.n_gapo) { // check whether the hit has been found. this may happen when a gap occurs in a tandem repeat
				for (j = 0; j != n_aln; ++j)
					if (aln[j].k == k && aln[j].l == l) break;
				if (j < n_aln) do_add = 0;
			}
			if (do_add) { // append
				bwt_aln1_t *p;
				gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width);
				if (n_aln == m_aln) {
					// double the array through a temporary so a failed
					// realloc() is detected, then zero the new half
					bwt_aln1_t *grown = (bwt_aln1_t*)realloc(aln, (size_t)m_aln * 2 * sizeof(bwt_aln1_t));
					if (grown == NULL) {
						fprintf(stderr, "[%s] fail to allocate memory\n", __func__);
						exit(EXIT_FAILURE);
					}
					aln = grown;
					memset(aln + m_aln, 0, m_aln * sizeof(bwt_aln1_t));
					m_aln <<= 1;
				}
				p = aln + n_aln;
				p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->a = a;
				p->k = k; p->l = l;
				p->score = score;
				++n_aln;
			}
			continue;
		}

		--i;
		bwt_2occ4(bwt, k - 1, l, cnt_k, cnt_l); // retrieve Occ values
		occ = l - k + 1;
		// test whether diff is allowed
		allow_diff = allow_M = 1;
		if (i > 0) {
			int ii = i - (len - opt->seed_len);
			if (width[i-1].bid > m-1) allow_diff = 0;
			else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0;
			if (seed_w && ii > 0) {
				if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0;
				else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1
						 && seed_width[ii-1].w == seed_width[ii].w) allow_M = 0;
			}
		}
		// indels
		tmp = (opt->mode & BWA_MODE_LOGGAP)? int_log2(e.n_gape + e.n_gapo)/2+1 : e.n_gapo + e.n_gape;
		if (allow_diff && i >= opt->indel_end_skip + tmp && len - i >= opt->indel_end_skip + tmp) {
			if (e.state == STATE_M) { // gap open
				if (e.n_gapo < opt->max_gapo) { // gap open is allowed
					// insertion
					gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt);
					// deletion
					for (j = 0; j != 4; ++j) {
						k = bwt->L2[j] + cnt_k[j] + 1;
						l = bwt->L2[j] + cnt_l[j];
						if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt);
					}
				}
			} else if (e.state == STATE_I) { // extension of an insertion
				if (e.n_gape < opt->max_gape) // gap extension is allowed
					gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt);
			} else if (e.state == STATE_D) { // extension of a deletion
				if (e.n_gape < opt->max_gape) { // gap extension is allowed
					if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) {
						for (j = 0; j != 4; ++j) {
							k = bwt->L2[j] + cnt_k[j] + 1;
							l = bwt->L2[j] + cnt_l[j];
							if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt);
						}
					}
				}
			}
		}
		// mismatches
		if (allow_diff && allow_M) { // mismatch is allowed
			// j = 1..3 tries the three other bases; j = 4 wraps around to
			// the read's own base, which is a match unless it is ambiguous
			for (j = 1; j <= 4; ++j) {
				int c = (str[i] + j) & 3;
				int is_mm = (j != 4 || str[i] > 3);
				k = bwt->L2[c] + cnt_k[c] + 1;
				l = bwt->L2[c] + cnt_l[c];
				if (k <= l) gap_push(stack, a, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt);
			}
		} else if (str[i] < 4) { // try exact match only
			int c = str[i] & 3;
			k = bwt->L2[c] + cnt_k[c] + 1;
			l = bwt->L2[c] + cnt_l[c];
			if (k <= l) gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt);
		}
	}

	*_n_aln = n_aln;
	return aln;
}
|
data/ext/bwtgap.h
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
#ifndef BWTGAP_H_
#define BWTGAP_H_

/*
 * Search stack and entry point of the gapped BWT matcher.
 */

#include <stdint.h>
#include "bwt.h"
#include "bwtaln.h"

/* One partial alignment queued on the search stack. The standard uint32_t
 * is used rather than the BSD-only u_int32_t so that the header does not
 * depend on <sys/types.h>; the layout is unchanged. */
typedef struct { // recursion stack
	uint32_t info; // score<<21 | a<<20 | i
	uint32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6;
	bwtint_t k, l; // (k,l) is the SA region of [i,n-1]
	int last_diff_pos;
} gap_entry_t;

/* A growable array of entries that all share the same score. */
typedef struct {
	int n_entries, m_entries; // used and allocated number of entries
	gap_entry_t *stack;
} gap_stack1_t;

/* The whole search stack: one bucket per score. */
typedef struct {
	int n_stacks, best, n_entries; // number of buckets, lowest non-empty bucket, total entries
	gap_stack1_t *stacks;
} gap_stack_t;

#ifdef __cplusplus
extern "C" {
#endif

	gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt);
	void gap_destroy_stack(gap_stack_t *stack);
	bwt_aln1_t *bwt_match_gap(bwt_t *const bwt[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2],
							  bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack);
	void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);

#ifdef __cplusplus
}
#endif

#endif
|