bio-bwa 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwtaln.h
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
#ifndef BWTALN_H
|
2
|
+
#define BWTALN_H
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
#include "bwt.h"
|
6
|
+
|
7
|
+
/* Alignment type codes. */
#define BWA_TYPE_NO_MATCH 0
#define BWA_TYPE_UNIQUE   1
#define BWA_TYPE_REPEAT   2
#define BWA_TYPE_MATESW   3

/* SAM flag bits; each constant is a single bit so they can be OR-ed together. */
#define SAM_FPD 0x001 /* paired */
#define SAM_FPP 0x002 /* properly paired */
#define SAM_FSU 0x004 /* self-unmapped */
#define SAM_FMU 0x008 /* mate-unmapped */
#define SAM_FSR 0x010 /* self on the reverse strand */
#define SAM_FMR 0x020 /* mate on the reverse strand */
#define SAM_FR1 0x040 /* this is read one */
#define SAM_FR2 0x080 /* this is read two */
#define SAM_FSC 0x100 /* secondary alignment */

#define BWA_AVG_ERR   0.02
#define BWA_MIN_RDLEN 35 /* for read trimming */
|
24
|
+
|
25
|
+
/*
 * bns_pac(pac, k): extract the 2-bit code stored at index k of the packed
 * byte array pac.  Four codes share one byte; index k with (k & 3) == 0 lives
 * in the two most significant bits and (k & 3) == 3 in the two least
 * significant ones.  k is expanded twice, so it must not have side effects.
 * Defined only when no other header has provided the macro already.
 */
#ifndef bns_pac
#define bns_pac(pac, k) (((pac)[(k) >> 2] >> ((~(k) & 3) << 1)) & 3)
#endif
|
28
|
+
|
29
|
+
typedef struct {
|
30
|
+
bwtint_t w;
|
31
|
+
int bid;
|
32
|
+
} bwt_width_t;
|
33
|
+
|
34
|
+
typedef struct {
|
35
|
+
uint32_t n_mm:8, n_gapo:8, n_gape:8, a:1;
|
36
|
+
bwtint_t k, l;
|
37
|
+
int score;
|
38
|
+
} bwt_aln1_t;
|
39
|
+
|
40
|
+
/*
 * A CIGAR element packed into 16 bits: the operation occupies the bits at and
 * above CIGAR_OP_SHIFT, the length the CIGAR_LN_MASK bits below it.
 *
 * rgoya: If changing order of bytes, beware of operations like:
 *     s->cigar[0] += s->full_len - s->len;
 * which do arithmetic directly on the packed value.
 */
typedef uint16_t bwa_cigar_t;

#define CIGAR_OP_SHIFT 14
#define CIGAR_LN_MASK  0x3fff

/* Accessors; __cigar_create does not mask __len, so the caller must keep it within CIGAR_LN_MASK. */
#define __cigar_op(__cigar)         ((__cigar) >> CIGAR_OP_SHIFT)
#define __cigar_len(__cigar)        ((__cigar) & CIGAR_LN_MASK)
#define __cigar_create(__op, __len) (((__op) << CIGAR_OP_SHIFT) | (__len))
|
50
|
+
|
51
|
+
typedef struct {
|
52
|
+
uint32_t pos;
|
53
|
+
uint32_t n_cigar:15, gap:8, mm:8, strand:1;
|
54
|
+
bwa_cigar_t *cigar;
|
55
|
+
} bwt_multi1_t;
|
56
|
+
|
57
|
+
typedef struct {
|
58
|
+
char *name;
|
59
|
+
ubyte_t *seq, *rseq, *qual;
|
60
|
+
uint32_t len:20, strand:1, type:2, dummy:1, extra_flag:8;
|
61
|
+
uint32_t n_mm:8, n_gapo:8, n_gape:8, mapQ:8;
|
62
|
+
int score;
|
63
|
+
int clip_len;
|
64
|
+
// alignments in SA coordinates
|
65
|
+
int n_aln;
|
66
|
+
bwt_aln1_t *aln;
|
67
|
+
// multiple hits
|
68
|
+
int n_multi;
|
69
|
+
bwt_multi1_t *multi;
|
70
|
+
// alignment information
|
71
|
+
bwtint_t sa, pos;
|
72
|
+
uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ
|
73
|
+
int n_cigar;
|
74
|
+
bwa_cigar_t *cigar;
|
75
|
+
// for multi-threading only
|
76
|
+
int tid;
|
77
|
+
// barcode
|
78
|
+
char bc[16]; // null terminated; up to 15 bases
|
79
|
+
// NM and MD tags
|
80
|
+
uint32_t full_len:20, nm:12;
|
81
|
+
char *md;
|
82
|
+
} bwa_seq_t;
|
83
|
+
|
84
|
+
/* Bit flags tested against gap_opt_t.mode; bit 0x08 is not assigned here. */
#define BWA_MODE_GAPE      0x0001
#define BWA_MODE_COMPREAD  0x0002
#define BWA_MODE_LOGGAP    0x0004
#define BWA_MODE_NONSTOP   0x0010
#define BWA_MODE_BAM       0x0020
#define BWA_MODE_BAM_SE    0x0040
#define BWA_MODE_BAM_READ1 0x0080
#define BWA_MODE_BAM_READ2 0x0100
#define BWA_MODE_IL13      0x0200
|
93
|
+
|
94
|
+
/* Options controlling the gapped search. */
typedef struct {
	int s_mm;           /* penalty per mismatch */
	int s_gapo;         /* penalty per gap open */
	int s_gape;         /* penalty per gap extension */
	int mode;           /* BWA_MODE_* flags; bit 24-31 are the barcode length */
	int indel_end_skip; /* minimum distance from either read end before an indel is tried */
	int max_del_occ;    /* SA-interval size below which a deletion may still be extended */
	int max_entries;    /* the search stops once the stack holds more entries than this */
	float fnr;
	int max_diff;       /* maximum number of differences */
	int max_gapo;       /* maximum number of gap opens */
	int max_gape;       /* maximum number of gap extensions */
	int max_seed_diff;  /* maximum number of differences inside the seed */
	int seed_len;       /* seed length */
	int n_threads;
	int max_top2;       /* the search stops at a worse-scoring hit once more best hits than this are counted */
	int trim_qual;
} gap_opt_t;
|
105
|
+
|
106
|
+
/* Paired-end type codes (presumably the values of pe_opt_t.type -- confirm at the use sites). */
#define BWA_PET_STD   1
#define BWA_PET_SOLID 2
|
108
|
+
|
109
|
+
/* Options for paired-end processing. */
typedef struct {
	int max_isize;
	int force_isize;
	int max_occ;
	int n_multi;
	int N_multi;
	int type;
	int is_sw;
	int is_preload;
	double ap_prior;
} pe_opt_t;
|
116
|
+
|
117
|
+
/* Opaque sequence-reader handle; the struct is left incomplete in this header. */
typedef struct __bwa_seqio_t bwa_seqio_t;
|
119
|
+
|
120
|
+
#ifdef __cplusplus
|
121
|
+
extern "C" {
|
122
|
+
#endif
|
123
|
+
|
124
|
+
gap_opt_t *gap_init_opt();
|
125
|
+
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt);
|
126
|
+
|
127
|
+
bwa_seqio_t *bwa_seq_open(const char *fn);
|
128
|
+
bwa_seqio_t *bwa_bam_open(const char *fn, int which);
|
129
|
+
void bwa_seq_close(bwa_seqio_t *bs);
|
130
|
+
void seq_reverse(int len, ubyte_t *seq, int is_comp);
|
131
|
+
bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int mode, int trim_qual);
|
132
|
+
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs);
|
133
|
+
|
134
|
+
int bwa_cal_maxdiff(int l, double err, double thres);
|
135
|
+
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt);
|
136
|
+
|
137
|
+
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac);
|
138
|
+
|
139
|
+
|
140
|
+
/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
|
141
|
+
__cigar_op and __cigar_len while keeping stdaln stand alone */
|
142
|
+
#include "stdaln.h"
|
143
|
+
|
144
|
+
bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar);
|
145
|
+
|
146
|
+
#ifdef __cplusplus
|
147
|
+
}
|
148
|
+
#endif
|
149
|
+
|
150
|
+
#endif
|
data/ext/bwtgap.c
ADDED
@@ -0,0 +1,264 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include "bwtgap.h"
|
5
|
+
#include "bwtaln.h"
|
6
|
+
|
7
|
+
/* Search states: the last step was a match/mismatch, an insertion or a deletion. */
enum {
	STATE_M = 0,
	STATE_I = 1,
	STATE_D = 2
};

/*
 * Penalty score of m mismatches, o gap opens and e gap extensions, weighted by
 * the s_mm / s_gapo / s_gape members of the option struct that p points to.
 */
#define aln_score(m, o, e, p) \
	((m) * (p)->s_mm + (o) * (p)->s_gapo + (e) * (p)->s_gape)
|
12
|
+
|
13
|
+
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt)
|
14
|
+
{
|
15
|
+
int i;
|
16
|
+
gap_stack_t *stack;
|
17
|
+
stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t));
|
18
|
+
stack->n_stacks = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);
|
19
|
+
stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t));
|
20
|
+
for (i = 0; i != stack->n_stacks; ++i) {
|
21
|
+
gap_stack1_t *p = stack->stacks + i;
|
22
|
+
p->m_entries = 4;
|
23
|
+
p->stack = (gap_entry_t*)calloc(p->m_entries, sizeof(gap_entry_t));
|
24
|
+
}
|
25
|
+
return stack;
|
26
|
+
}
|
27
|
+
|
28
|
+
/*
 * Release a stack created by gap_init_stack(): every bucket's entry array,
 * the bucket array, and the stack itself.  Like free(), a NULL argument is
 * accepted and ignored.
 */
void gap_destroy_stack(gap_stack_t *stack)
{
	int i;
	if (stack == NULL) return;
	if (stack->stacks != NULL) {
		for (i = 0; i != stack->n_stacks; ++i) free(stack->stacks[i].stack);
		free(stack->stacks);
	}
	free(stack);
}
|
35
|
+
|
36
|
+
/*
 * Empty every bucket without releasing memory.  `best` is set to n_stacks,
 * the value used throughout this file to mean "no bucket holds an entry".
 */
static void gap_reset_stack(gap_stack_t *stack)
{
	int idx = 0;
	while (idx != stack->n_stacks) {
		stack->stacks[idx].n_entries = 0;
		++idx;
	}
	stack->n_entries = 0;
	stack->best = stack->n_stacks;
}
|
44
|
+
|
45
|
+
/*
 * Push one search state onto the bucket that matches its score.
 *
 * stack    destination; the bucket index is aln_score(n_mm, n_gapo, n_gape, opt)
 * a        strand index, stored in bit 20 of `info`
 * i        position in the read, stored in the low 20 bits of `info`
 * k, l     SA interval of the state
 * n_mm, n_gapo, n_gape   difference counts accumulated so far
 * state    STATE_M, STATE_I or STATE_D
 * is_diff  non-zero when the step that produced this state was a difference
 *
 * A full bucket is doubled.  The process is aborted if that allocation fails;
 * the function has no way to report an error to its caller.
 */
static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape,
							int state, int is_diff, const gap_opt_t *opt)
{
	int score;
	gap_entry_t *p;
	gap_stack1_t *q;
	score = aln_score(n_mm, n_gapo, n_gape, opt);
	q = stack->stacks + score;
	if (q->n_entries == q->m_entries) {
		// grow through a temporary so the old array is not lost on failure
		gap_entry_t *grown = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * ((size_t)q->m_entries << 1));
		if (grown == NULL) {
			fprintf(stderr, "[gap_push] out of memory\n");
			abort();
		}
		q->stack = grown;
		q->m_entries <<= 1;
	}
	p = q->stack + q->n_entries;
	p->info = ((uint32_t)score << 21) | (a << 20) | i; p->k = k; p->l = l;
	p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state;
	// Always write the field: slots are recycled after gap_pop() and are not
	// zeroed by realloc(), and gap_shadow() later uses this value as a loop bound.
	p->last_diff_pos = is_diff? i : 0;
	++(q->n_entries);
	++(stack->n_entries);
	if (stack->best > score) stack->best = score;
}
|
65
|
+
|
66
|
+
/*
 * Remove the most recently pushed entry of the best (lowest-index non-empty)
 * bucket and copy it to *e, then update `best`.  The caller must make sure
 * the stack holds at least one entry.
 */
static inline void gap_pop(gap_stack_t *stack, gap_entry_t *e)
{
	gap_stack1_t *bucket = &stack->stacks[stack->best];

	*e = bucket->stack[--bucket->n_entries];
	--stack->n_entries;

	if (stack->n_entries == 0) { // nothing left anywhere
		stack->best = stack->n_stacks;
		return;
	}
	if (bucket->n_entries == 0) { // this bucket ran dry: advance to the next non-empty one
		int next = stack->best + 1;
		while (next < stack->n_stacks && stack->stacks[next].n_entries == 0)
			++next;
		stack->best = next;
	}
}
|
80
|
+
|
81
|
+
/*
 * Adjust the first last_diff_pos records of w after a hit whose SA interval
 * holds x positions: a width larger than x is reduced by x; a width equal to
 * x gets bid = 1 and a width counted down from max (max-1, max-2, ...).
 * Widths smaller than x are left untouched (not expected to occur).
 * `len` is not used.
 */
static inline void gap_shadow(int x, int len, bwtint_t max, int last_diff_pos, bwt_width_t *w)
{
	int pos, n_replaced = 0;
	for (pos = 0; pos < last_diff_pos; ++pos) {
		bwt_width_t *cur = &w[pos];
		if (cur->w > x) {
			cur->w -= x;
			continue;
		}
		if (cur->w == x) {
			++n_replaced;
			cur->bid = 1;
			cur->w = max - n_replaced;
		}
	}
}
|
92
|
+
|
93
|
+
/*
 * Index of the highest set bit of v, i.e. floor(log2(v)); returns 0 for v == 0.
 * Halving search: test the upper half of the remaining bits at widths
 * 16, 8, 4, 2 and 1, shifting it down whenever it is non-zero.
 */
static inline int int_log2(uint32_t v)
{
	int result = 0;
	int shift;
	for (shift = 16; shift != 0; shift >>= 1) {
		if (v >> shift) {
			v >>= shift;
			result |= shift;
		}
	}
	return result;
}
|
103
|
+
|
104
|
+
/*
 * Best-first gapped search of one read against the BWT index.
 *
 * bwts     the two indexes; a state on strand index a is extended on bwts[1-a]
 * len      read length
 * seq      seq[a] is the read as seen from strand index a; codes above 3 are
 *          ambiguous bases and always count as mismatches
 * w        w[a] holds the per-position bounds for strand index a; entries are
 *          modified through gap_shadow() whenever a hit is recorded
 * seed_w   like w but for the seed; may be NULL to disable seeding
 * opt      penalties, limits and BWA_MODE_* flags
 * _n_aln   out: number of hits in the returned array
 * stack    work stack from gap_init_stack(); it is reset on entry
 *
 * Returns a heap array (calloc/realloc) of *_n_aln hits which this function
 * never frees, or NULL with *_n_aln == 0 if the initial allocation fails.
 * If the array cannot be grown, the search stops and the hits collected so far
 * are returned.
 */
bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2],
						  bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack)
{
	int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt);
	int best_diff = opt->max_diff + 1, max_diff = opt->max_diff;
	int best_cnt = 0;
	int j, _j, n_aln, m_aln;
	bwt_aln1_t *aln;

	m_aln = 4; n_aln = 0;
	aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t));
	if (aln == NULL) {
		*_n_aln = 0;
		return NULL;
	}

	// check whether there are too many N
	for (j = _j = 0; j < len; ++j)
		if (seq[0][j] > 3) ++_j;
	if (_j > max_diff) {
		*_n_aln = n_aln;
		return aln;
	}

	gap_reset_stack(stack); // reset stack
	// seed the search with the full SA interval, once per strand index
	gap_push(stack, 0, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt);
	gap_push(stack, 1, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt);

	while (stack->n_entries) {
		gap_entry_t e;
		int a, i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp;
		bwtint_t k, l, cnt_k[4], cnt_l[4], occ;
		const bwt_t *bwt;
		const ubyte_t *str;
		const bwt_width_t *seed_width = 0;
		bwt_width_t *width;

		if (stack->n_entries > opt->max_entries) break;
		gap_pop(stack, &e); // get the best entry
		k = e.k; l = e.l; // SA interval
		// info is packed as score<<21 | a<<20 | i, so the position occupies the low 20 bits
		a = e.info>>20&1; i = e.info&0xfffff; // strand, length
		if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed

		m = max_diff - (e.n_mm + e.n_gapo); // differences still allowed
		if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape;
		if (m < 0) continue;
		bwt = bwts[1-a]; str = seq[a]; width = w[a];
		if (seed_w) { // apply seeding
			seed_width = seed_w[a];
			m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo);
			if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape;
		}
		if (i > 0 && m < width[i-1].bid) continue;

		// check whether a hit is found
		hit_found = 0;
		if (i == 0) hit_found = 1;
		else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed
			if (bwt_match_exact_alt(bwt, i, str, &k, &l)) hit_found = 1;
			else continue; // no hit, skip
		}

		if (hit_found) { // action for found hits
			int score = aln_score(e.n_mm, e.n_gapo, e.n_gape, opt);
			int do_add = 1;
			if (n_aln == 0) {
				best_score = score;
				best_diff = e.n_mm + e.n_gapo;
				if (opt->mode & BWA_MODE_GAPE) best_diff += e.n_gape;
				if (!(opt->mode & BWA_MODE_NONSTOP))
					max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour
			}
			if (score == best_score) best_cnt += l - k + 1;
			else if (best_cnt > opt->max_top2) break; // top2b behaviour
			if (e.n_gapo) { // check whether the hit has been found. this may happen when a gap occurs in a tandem repeat
				for (j = 0; j != n_aln; ++j)
					if (aln[j].k == k && aln[j].l == l) break;
				if (j < n_aln) do_add = 0;
			}
			if (do_add) { // append
				bwt_aln1_t *p;
				gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width);
				if (n_aln == m_aln) {
					// grow through a temporary so the hits collected so far survive a failure
					bwt_aln1_t *grown = (bwt_aln1_t*)realloc(aln, 2 * (size_t)m_aln * sizeof(bwt_aln1_t));
					if (grown == NULL) {
						fprintf(stderr, "[bwt_match_gap] out of memory; returning %d hits found so far\n", n_aln);
						break;
					}
					aln = grown;
					memset(aln + m_aln, 0, m_aln * sizeof(bwt_aln1_t)); // zero the new upper half
					m_aln <<= 1;
				}
				p = aln + n_aln;
				p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->a = a;
				p->k = k; p->l = l;
				p->score = score;
				++n_aln;
			}
			continue;
		}

		--i;
		bwt_2occ4(bwt, k - 1, l, cnt_k, cnt_l); // retrieve Occ values
		occ = l - k + 1;
		// test whether diff is allowed
		allow_diff = allow_M = 1;
		if (i > 0) {
			int ii = i - (len - opt->seed_len);
			if (width[i-1].bid > m-1) allow_diff = 0;
			else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0;
			if (seed_w && ii > 0) {
				if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0;
				else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1
						 && seed_width[ii-1].w == seed_width[ii].w) allow_M = 0;
			}
		}
		// indels
		tmp = (opt->mode & BWA_MODE_LOGGAP)? int_log2(e.n_gape + e.n_gapo)/2+1 : e.n_gapo + e.n_gape;
		if (allow_diff && i >= opt->indel_end_skip + tmp && len - i >= opt->indel_end_skip + tmp) {
			if (e.state == STATE_M) { // gap open
				if (e.n_gapo < opt->max_gapo) { // gap open is allowed
					// insertion
					gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt);
					// deletion
					for (j = 0; j != 4; ++j) {
						k = bwt->L2[j] + cnt_k[j] + 1;
						l = bwt->L2[j] + cnt_l[j];
						if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt);
					}
				}
			} else if (e.state == STATE_I) { // extension of an insertion
				if (e.n_gape < opt->max_gape) // gap extension is allowed
					gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt);
			} else if (e.state == STATE_D) { // extension of a deletion
				if (e.n_gape < opt->max_gape) { // gap extension is allowed
					if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) {
						for (j = 0; j != 4; ++j) {
							k = bwt->L2[j] + cnt_k[j] + 1;
							l = bwt->L2[j] + cnt_l[j];
							if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt);
						}
					}
				}
			}
		}
		// mismatches
		if (allow_diff && allow_M) { // mismatch is allowed
			for (j = 1; j <= 4; ++j) {
				int c = (str[i] + j) & 3; // j == 4 brings c back to the read's own base
				int is_mm = (j != 4 || str[i] > 3);
				k = bwt->L2[c] + cnt_k[c] + 1;
				l = bwt->L2[c] + cnt_l[c];
				if (k <= l) gap_push(stack, a, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt);
			}
		} else if (str[i] < 4) { // try exact match only
			int c = str[i] & 3;
			k = bwt->L2[c] + cnt_k[c] + 1;
			l = bwt->L2[c] + cnt_l[c];
			if (k <= l) gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt);
		}
	}

	*_n_aln = n_aln;
	return aln;
}
|
data/ext/bwtgap.h
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
#ifndef BWTGAP_H_
|
2
|
+
#define BWTGAP_H_
|
3
|
+
|
4
|
+
#include "bwt.h"
|
5
|
+
#include "bwtaln.h"
|
6
|
+
|
7
|
+
typedef struct { // recursion stack
|
8
|
+
u_int32_t info; // score<<21 | a<<20 | i
|
9
|
+
u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6;
|
10
|
+
bwtint_t k, l; // (k,l) is the SA region of [i,n-1]
|
11
|
+
int last_diff_pos;
|
12
|
+
} gap_entry_t;
|
13
|
+
|
14
|
+
typedef struct {
|
15
|
+
int n_entries, m_entries;
|
16
|
+
gap_entry_t *stack;
|
17
|
+
} gap_stack1_t;
|
18
|
+
|
19
|
+
typedef struct {
|
20
|
+
int n_stacks, best, n_entries;
|
21
|
+
gap_stack1_t *stacks;
|
22
|
+
} gap_stack_t;
|
23
|
+
|
24
|
+
#ifdef __cplusplus
|
25
|
+
extern "C" {
|
26
|
+
#endif
|
27
|
+
|
28
|
+
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt);
|
29
|
+
void gap_destroy_stack(gap_stack_t *stack);
|
30
|
+
bwt_aln1_t *bwt_match_gap(bwt_t *const bwt[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2],
|
31
|
+
bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack);
|
32
|
+
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
|
33
|
+
|
34
|
+
#ifdef __cplusplus
|
35
|
+
}
|
36
|
+
#endif
|
37
|
+
|
38
|
+
#endif
|