bio-bwa 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwape.c
ADDED
@@ -0,0 +1,807 @@
|
|
1
|
+
#include <unistd.h>
|
2
|
+
#include <math.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <time.h>
|
5
|
+
#include <stdio.h>
|
6
|
+
#include <string.h>
|
7
|
+
#include "bwtaln.h"
|
8
|
+
#include "kvec.h"
|
9
|
+
#include "bntseq.h"
|
10
|
+
#include "utils.h"
|
11
|
+
#include "stdaln.h"
|
12
|
+
|
13
|
+
// Cached list of reference coordinates for one suffix-array interval.
// This is the value type of the int64-keyed hash map declared with
// KHASH_MAP_INIT_INT64(64, poslist_t) and stored in g_hash.
typedef struct {
|
14
|
+
int n; // number of entries in a[]; filled with (l - k + 1) of the cached interval
|
15
|
+
bwtint_t *a; // malloc'ed array of n coordinates, one per suffix-array index in the interval
|
16
|
+
} poslist_t;
|
17
|
+
|
18
|
+
// Insert-size statistics produced by infer_isize() for one batch of read pairs.
// avg and std are set to -1.0 and the three bounds to 0 when inference fails.
typedef struct {
|
19
|
+
double avg, std, ap_prior; // mean and standard deviation of the insert sizes kept within [low, high]; ap_prior is derived from the fraction of pairs above high_bayesian, floored at the caller-supplied prior
|
20
|
+
bwtint_t low, high, high_bayesian; // [low, high] are the quartile-based outlier bounds; high_bayesian is the maximum insert size accepted by pairing()
|
21
|
+
} isize_info_t;
|
22
|
+
|
23
|
+
#include "khash.h"
|
24
|
+
KHASH_MAP_INIT_INT64(64, poslist_t)
|
25
|
+
|
26
|
+
#include "ksort.h"
|
27
|
+
KSORT_INIT_GENERIC(uint64_t)
|
28
|
+
|
29
|
+
// Scratch buffers reused for every read pair while pairing.
typedef struct {
|
30
|
+
kvec_t(uint64_t) arr; // packed hits: coordinate<<32 | alignment index<<1 | read end (0 or 1); sorted in pairing()
|
31
|
+
kvec_t(uint64_t) pos[2]; // NOTE(review): only destroyed in the code visible here; no visible reader or writer
|
32
|
+
kvec_t(bwt_aln1_t) aln[2]; // alignment records of each end, as read from the two alignment streams
|
33
|
+
} pe_data_t;
|
34
|
+
|
35
|
+
#define MIN_HASH_WIDTH 1000
|
36
|
+
|
37
|
+
extern int g_log_n[256]; // in bwase.c
|
38
|
+
static kh_64_t *g_hash;
|
39
|
+
|
40
|
+
void bwase_initialize();
|
41
|
+
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi);
|
42
|
+
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
|
43
|
+
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
|
44
|
+
int bwa_approx_mapQ(const bwa_seq_t *p, int mm);
|
45
|
+
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2);
|
46
|
+
bntseq_t *bwa_open_nt(const char *prefix);
|
47
|
+
void bwa_print_sam_SQ(const bntseq_t *bns);
|
48
|
+
void bwa_print_sam_PG();
|
49
|
+
|
50
|
+
pe_opt_t *bwa_init_pe_opt()
|
51
|
+
{
|
52
|
+
pe_opt_t *po;
|
53
|
+
po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t));
|
54
|
+
po->max_isize = 500;
|
55
|
+
po->force_isize = 0;
|
56
|
+
po->max_occ = 100000;
|
57
|
+
po->n_multi = 3;
|
58
|
+
po->N_multi = 10;
|
59
|
+
po->type = BWA_PET_STD;
|
60
|
+
po->is_sw = 1;
|
61
|
+
po->ap_prior = 1e-5;
|
62
|
+
return po;
|
63
|
+
}
|
64
|
+
|
65
|
+
/**
 * Mix the bits of a 64-bit key with a fixed sequence of shift/add/xor
 * rounds and return the scrambled value.
 *
 * All arithmetic is unsigned 64-bit, so additions wrap modulo 2^64.
 */
static inline uint64_t hash_64(uint64_t key)
{
	uint64_t h = key;
	h = h + ~(h << 32);
	h = h ^ (h >> 22);
	h = h + ~(h << 13);
	h = h ^ (h >> 8);
	h = h + (h << 3);
	h = h ^ (h >> 15);
	h = h + ~(h << 27);
	h = h ^ (h >> 31);
	return h;
}
|
77
|
+
/*
|
78
|
+
static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
|
79
|
+
{
|
80
|
+
const double a = 0.140012;
|
81
|
+
double b, c;
|
82
|
+
b = log(x * (2 - x));
|
83
|
+
c = 2./M_PI/a + b / 2.;
|
84
|
+
return sqrt(sqrt(c * c - b / a) - c);
|
85
|
+
}
|
86
|
+
*/
|
87
|
+
|
88
|
+
// for normal distribution, this is about 3std
|
89
|
+
#define OUTLIER_BOUND 2.0
|
90
|
+
|
91
|
+
/**
 * Estimate the insert-size distribution of one batch of read pairs.
 *
 * Only pairs whose two ends both have mapQ >= 20 and whose span is below
 * 100000 are used, and at least 20 such pairs are required. Outlier bounds
 * are derived from the quartiles (OUTLIER_BOUND times the inter-quartile
 * range on each side, with the lower bound never below the longest read).
 * Mean, standard deviation, skewness and kurtosis are computed from the
 * spans inside those bounds.
 *
 * @param n_seqs    number of pairs in seqs[0] / seqs[1]
 * @param seqs      the two ends of every pair; seqs[0][i] is paired with seqs[1][i]
 * @param ii        output; on failure avg/std are -1.0 and the bounds are 0
 * @param ap_prior  prior used for the maximum insert size and as the floor
 *                  of ii->ap_prior
 * @param L         divisor applied to ap_prior (the caller passes the
 *                  sequence length of the index)
 * @return 0 on success, -1 when the insert size cannot be inferred.
 */
static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L)
{
	uint64_t x, *isizes, n_ap = 0;
	int n, i, tot, p25, p75, p50, max_len = 1, tmp;
	double sum_sq = 0.0, skewness = 0.0, kurtosis = 0.0, y;

	ii->avg = ii->std = -1.0;
	ii->low = ii->high = ii->high_bayesian = 0;
	isizes = (uint64_t*)calloc(n_seqs, sizeof(uint64_t));
	if (isizes == 0 && n_seqs > 0) { // calloc(0, ...) may legitimately return NULL
		fprintf(stderr, "[infer_isize] fail to infer insert size: out of memory\n");
		return -1;
	}
	for (i = 0, tot = 0; i != n_seqs; ++i) {
		bwa_seq_t *p[2];
		p[0] = seqs[0] + i; p[1] = seqs[1] + i;
		if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) {
			x = (p[0]->pos < p[1]->pos)? p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos;
			if (x < 100000) isizes[tot++] = x;
		}
		if (p[0]->len > max_len) max_len = p[0]->len;
		if (p[1]->len > max_len) max_len = p[1]->len;
	}
	if (tot < 20) {
		fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n");
		free(isizes);
		return -1;
	}
	ks_introsort(uint64_t, tot, isizes);
	p25 = isizes[(int)(tot*0.25 + 0.5)];
	p50 = isizes[(int)(tot*0.50 + 0.5)];
	p75 = isizes[(int)(tot*0.75 + 0.5)];
	tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
	ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned
	ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
	for (i = 0, x = n = 0; i < tot; ++i)
		if (isizes[i] >= ii->low && isizes[i] <= ii->high)
			++n, x += isizes[i];
	if (n == 0) { // nothing inside [low, high]: the mean would be 0/0 and poison everything below
		free(isizes);
		fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75);
		ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0;
		fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n");
		return -1;
	}
	ii->avg = (double)x / n;
	// Accumulate the squared deviations in a local that starts at zero;
	// ii->std still holds the -1.0 "unset" marker at this point.
	for (i = 0; i < tot; ++i) {
		if (isizes[i] >= ii->low && isizes[i] <= ii->high) {
			double dev = isizes[i] - ii->avg;
			double sq = dev * dev;
			sum_sq += sq;
			skewness += sq * dev;
			kurtosis += sq * sq;
		}
	}
	kurtosis = kurtosis/n / (sum_sq / n * sum_sq / n) - 3;
	ii->std = sqrt(sum_sq / n); // it would be better as n-1, but n is usually very large
	skewness = skewness / n / (ii->std * ii->std * ii->std);
	// Smallest y (in sigma) at which a proper pair is less likely than an
	// anomalous one under the prior; y is also reported at the end.
	for (y = 1.0; y < 10.0; y += 0.01)
		if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break;
	ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499);
	for (i = 0; i < tot; ++i)
		if (isizes[i] > ii->high_bayesian) ++n_ap;
	ii->ap_prior = .01 * (n_ap + .01) / tot;
	if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior;
	free(isizes);
	fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75);
	// A zero spread (all kept spans identical) is rejected as well: later
	// scoring divides by ii->std.
	if (isnan(ii->std) || ii->std <= 0.0 || p75 > 100000) {
		ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0;
		fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n");
		return -1;
	}
	// The width of bwtint_t is not visible here, so widen explicitly to keep
	// the format specifiers and the arguments in agreement.
	fprintf(stderr, "[infer_isize] low and high boundaries: %lld and %lld for estimating avg and std\n", (long long)ii->low, (long long)ii->high);
	fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std);
	fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior);
	fprintf(stderr, "[infer_isize] inferred maximum insert size: %lld (%.2lf sigma)\n", (long long)ii->high_bayesian, y);
	return 0;
}
|
160
|
+
|
161
|
+
/**
 * Pick the best properly oriented pair of hits for one read pair and update
 * both reads accordingly.
 *
 * d->arr holds every candidate hit of both ends packed as
 * coordinate<<32 | alignment index<<1 | read end. The array is sorted and
 * scanned once; for each end the two most recent "push" hits are remembered
 * and every "check" hit is scored against the remembered hits of the other
 * end. A pair is accepted when its span is at least the longer read length
 * and at most ii->high_bayesian (or opt->max_isize when ii->high is 0).
 * Lower scores are better.
 *
 * @param p     the two reads of the pair; pos, strand, score, gap/mismatch
 *              counts, mapQ, seQ and extra_flag may be rewritten
 * @param d     scratch data; d->arr is sorted in place
 * @param opt   paired-end options (type and max_isize are read)
 * @param s_mm  mismatch penalty used to judge the sub-optimal pair
 * @param ii    insert-size estimate; ii->high == 0 means "not available"
 * @return the number of ends with mapQ > 0 whose position or strand changed.
 *
 * Exits the process for pairing types other than BWA_PET_STD and
 * BWA_PET_SOLID.
 */
static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii)
{
	int j, o_n, subo_n, cnt_chg = 0, max_len;
	size_t i;
	uint64_t last_pos[2][2], o_pos[2], subo_score, o_score;
	max_len = p[0]->full_len;
	if (max_len < p[1]->full_len) max_len = p[1]->full_len;

	// here v>=u. When ii is set, we check insert size with ii; otherwise with opt->max_isize
#define __pairing_aux(u,v) do { \
		bwtint_t l = ((v)>>32) + p[(v)&1]->len - ((u)>>32); \
		if ((u) != (uint64_t)-1 && (v)>>32 > (u)>>32 && l >= max_len \
			&& ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \
		{ \
			uint64_t s = d->aln[(v)&1].a[(uint32_t)(v)>>1].score + d->aln[(u)&1].a[(uint32_t)(u)>>1].score; \
			s *= 10; \
			if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \
			s = s<<32 | (uint32_t)hash_64((u)>>32<<32 | (v)>>32); \
			if (s>>32 == o_score>>32) ++o_n; \
			else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \
			else ++subo_n; \
			if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u)&1] = (u), o_pos[(v)&1] = (v); \
			else if (s < subo_score) subo_score = s; \
		} \
	} while (0)

#define __pairing_aux2(q, w) do { \
		const bwt_aln1_t *r = d->aln[(w)&1].a + ((uint32_t)(w)>>1); \
		(q)->extra_flag |= SAM_FPP; \
		if ((q)->pos != (w)>>32 || (q)->strand != r->a) { \
			(q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = r->a; \
			(q)->score = r->score; \
			(q)->pos = (w)>>32; \
			if ((q)->mapQ > 0) ++cnt_chg; \
		} \
	} while (0)

	o_score = subo_score = (uint64_t)-1;
	o_n = subo_n = 0;
	ks_introsort(uint64_t, d->arr.n, d->arr.a);
	for (j = 0; j < 2; ++j) last_pos[j][0] = last_pos[j][1] = (uint64_t)-1;
	if (opt->type != BWA_PET_STD && opt->type != BWA_PET_SOLID) {
		fprintf(stderr, "[paring] not implemented yet!\n");
		exit(1);
	}
	// The two supported library types share the same scan and differ only in
	// which hits trigger a check against the remembered hits of the mate.
	for (i = 0; i < d->arr.n; ++i) {
		uint64_t x = d->arr.a[i];
		int strand = d->aln[x&1].a[(uint32_t)x>>1].a;
		int is_check = (opt->type == BWA_PET_STD)
			? (strand == 1)          // reverse strand, then check
			: (int)((strand^x)&1);   // SOLiD: strand differs from the read-end bit
		if (is_check) {
			int y = 1 - (x&1);
			__pairing_aux(last_pos[y][1], x);
			__pairing_aux(last_pos[y][0], x);
		} else { // otherwise push: remember the two most recent hits of this end
			last_pos[x&1][0] = last_pos[x&1][1];
			last_pos[x&1][1] = x;
		}
	}
	// set pairing
	//fprintf(stderr, "[%d, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n);
	if (o_score != (uint64_t)-1) {
		int mapQ_p = 0; // this is the maximum mapping quality when one end is moved
		int rr[2];
		//fprintf(stderr, "%d, %d\n", o_n, subo_n);
		if (o_n == 1) {
			if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair
			else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair
			else {
				int n = subo_n > 255? 255 : subo_n;
				mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n];
				if (mapQ_p < 0) mapQ_p = 0;
			}
		}
		rr[0] = d->aln[o_pos[0]&1].a[(uint32_t)o_pos[0]>>1].a;
		rr[1] = d->aln[o_pos[1]&1].a[(uint32_t)o_pos[1]>>1].a;
		if ((p[0]->pos == o_pos[0]>>32 && p[0]->strand == rr[0]) && (p[1]->pos == o_pos[1]>>32 && p[1]->strand == rr[1])) { // both ends not moved
			if (p[0]->mapQ > 0 && p[1]->mapQ > 0) {
				int mapQ = p[0]->mapQ + p[1]->mapQ;
				if (mapQ > 60) mapQ = 60;
				p[0]->mapQ = p[1]->mapQ = mapQ;
			} else {
				if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ;
				if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? mapQ_p + 7 : p[0]->mapQ;
			}
		} else if (p[0]->pos == o_pos[0]>>32 && p[0]->strand == rr[0]) { // [1] moved
			p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ;
			if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p;
		} else if (p[1]->pos == o_pos[1]>>32 && p[1]->strand == rr[1]) { // [0] moved
			p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ;
			if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p;
		} else { // both ends moved
			p[0]->seQ = p[1]->seQ = 0;
			mapQ_p -= 20;
			if (mapQ_p < 0) mapQ_p = 0;
			p[0]->mapQ = p[1]->mapQ = mapQ_p;
		}
		__pairing_aux2(p[0], o_pos[0]);
		__pairing_aux2(p[1], o_pos[1]);
	}
	return cnt_chg;
}
|
275
|
+
|
276
|
+
// Per-read backup of the alignment records read from an alignment stream,
// so that they can be copied back into the shared scratch buffer later.
typedef struct {
|
277
|
+
kvec_t(bwt_aln1_t) aln; // copy of the alignment records of one read
|
278
|
+
} aln_buf_t;
|
279
|
+
|
280
|
+
int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
|
281
|
+
const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii)
|
282
|
+
{
|
283
|
+
int i, j, cnt_chg = 0;
|
284
|
+
char str[1024];
|
285
|
+
bwt_t *bwt[2];
|
286
|
+
pe_data_t *d;
|
287
|
+
aln_buf_t *buf[2];
|
288
|
+
|
289
|
+
d = (pe_data_t*)calloc(1, sizeof(pe_data_t));
|
290
|
+
buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
|
291
|
+
buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
|
292
|
+
|
293
|
+
if (_bwt[0] == 0) { // load forward SA
|
294
|
+
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
|
295
|
+
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
|
296
|
+
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
|
297
|
+
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
|
298
|
+
} else bwt[0] = _bwt[0], bwt[1] = _bwt[1];
|
299
|
+
|
300
|
+
// SE
|
301
|
+
for (i = 0; i != n_seqs; ++i) {
|
302
|
+
bwa_seq_t *p[2];
|
303
|
+
for (j = 0; j < 2; ++j) {
|
304
|
+
int n_aln;
|
305
|
+
p[j] = seqs[j] + i;
|
306
|
+
p[j]->n_multi = 0;
|
307
|
+
p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2);
|
308
|
+
fread(&n_aln, 4, 1, fp_sa[j]);
|
309
|
+
if (n_aln > kv_max(d->aln[j]))
|
310
|
+
kv_resize(bwt_aln1_t, d->aln[j], n_aln);
|
311
|
+
d->aln[j].n = n_aln;
|
312
|
+
fread(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]);
|
313
|
+
kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j]
|
314
|
+
// generate SE alignment and mapping quality
|
315
|
+
bwa_aln2seq(n_aln, d->aln[j].a, p[j]);
|
316
|
+
if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) {
|
317
|
+
int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff;
|
318
|
+
p[j]->pos = p[j]->strand? bwt_sa(bwt[0], p[j]->sa)
|
319
|
+
: bwt[1]->seq_len - (bwt_sa(bwt[1], p[j]->sa) + p[j]->len);
|
320
|
+
p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff);
|
321
|
+
}
|
322
|
+
}
|
323
|
+
}
|
324
|
+
|
325
|
+
// infer isize
|
326
|
+
infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt[0]->seq_len);
|
327
|
+
if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii;
|
328
|
+
if (opt->force_isize) {
|
329
|
+
fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__);
|
330
|
+
ii->low = ii->high = 0; ii->avg = ii->std = -1.0;
|
331
|
+
}
|
332
|
+
|
333
|
+
// PE
|
334
|
+
for (i = 0; i != n_seqs; ++i) {
|
335
|
+
bwa_seq_t *p[2];
|
336
|
+
for (j = 0; j < 2; ++j) {
|
337
|
+
p[j] = seqs[j] + i;
|
338
|
+
kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln);
|
339
|
+
}
|
340
|
+
if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
|
341
|
+
&& (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
|
342
|
+
{ // only when both ends mapped
|
343
|
+
uint64_t x;
|
344
|
+
int j, k, n_occ[2];
|
345
|
+
for (j = 0; j < 2; ++j) {
|
346
|
+
n_occ[j] = 0;
|
347
|
+
for (k = 0; k < d->aln[j].n; ++k)
|
348
|
+
n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1;
|
349
|
+
}
|
350
|
+
if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue;
|
351
|
+
d->arr.n = 0;
|
352
|
+
for (j = 0; j < 2; ++j) {
|
353
|
+
for (k = 0; k < d->aln[j].n; ++k) {
|
354
|
+
bwt_aln1_t *r = d->aln[j].a + k;
|
355
|
+
bwtint_t l;
|
356
|
+
if (r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
|
357
|
+
uint64_t key = (uint64_t)r->k<<32 | r->l;
|
358
|
+
int ret;
|
359
|
+
khint_t iter = kh_put(64, g_hash, key, &ret);
|
360
|
+
if (ret) { // not in the hash table; ret must equal 1 as we never remove elements
|
361
|
+
poslist_t *z = &kh_val(g_hash, iter);
|
362
|
+
z->n = r->l - r->k + 1;
|
363
|
+
z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n);
|
364
|
+
for (l = r->k; l <= r->l; ++l)
|
365
|
+
z->a[l - r->k] = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len);
|
366
|
+
}
|
367
|
+
for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
|
368
|
+
x = kh_val(g_hash, iter).a[l];
|
369
|
+
x = x<<32 | k<<1 | j;
|
370
|
+
kv_push(uint64_t, d->arr, x);
|
371
|
+
}
|
372
|
+
} else { // then calculate on the fly
|
373
|
+
for (l = r->k; l <= r->l; ++l) {
|
374
|
+
x = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len);
|
375
|
+
x = x<<32 | k<<1 | j;
|
376
|
+
kv_push(uint64_t, d->arr, x);
|
377
|
+
}
|
378
|
+
}
|
379
|
+
}
|
380
|
+
}
|
381
|
+
cnt_chg += pairing(p, d, opt, gopt->s_mm, ii);
|
382
|
+
}
|
383
|
+
|
384
|
+
if (opt->N_multi || opt->n_multi) {
|
385
|
+
for (j = 0; j < 2; ++j) {
|
386
|
+
if (p[j]->type != BWA_TYPE_NO_MATCH) {
|
387
|
+
int k;
|
388
|
+
if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) {
|
389
|
+
bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi);
|
390
|
+
} else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi);
|
391
|
+
for (k = 0; k < p[j]->n_multi; ++k) {
|
392
|
+
bwt_multi1_t *q = p[j]->multi + k;
|
393
|
+
q->pos = q->strand? bwt_sa(bwt[0], q->pos) : bwt[1]->seq_len - (bwt_sa(bwt[1], q->pos) + p[j]->len);
|
394
|
+
}
|
395
|
+
}
|
396
|
+
}
|
397
|
+
}
|
398
|
+
}
|
399
|
+
|
400
|
+
// free
|
401
|
+
for (i = 0; i < n_seqs; ++i) {
|
402
|
+
kv_destroy(buf[0][i].aln);
|
403
|
+
kv_destroy(buf[1][i].aln);
|
404
|
+
}
|
405
|
+
free(buf[0]); free(buf[1]);
|
406
|
+
if (_bwt[0] == 0) {
|
407
|
+
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
|
408
|
+
}
|
409
|
+
kv_destroy(d->arr);
|
410
|
+
kv_destroy(d->pos[0]); kv_destroy(d->pos[1]);
|
411
|
+
kv_destroy(d->aln[0]); kv_destroy(d->aln[1]);
|
412
|
+
free(d);
|
413
|
+
return cnt_chg;
|
414
|
+
}
|
415
|
+
|
416
|
+
#define SW_MIN_MATCH_LEN 20
|
417
|
+
#define SW_MIN_MAPQ 17
|
418
|
+
|
419
|
+
// cnt = n_mm<<16 | n_gapo<<8 | n_gape
|
420
|
+
/**
 * Locally align a read against a window of the packed reference.
 *
 * @param l_pac    length of the packed reference
 * @param pacseq   packed reference, four bases per byte
 * @param len      read length
 * @param seq      read bases; values >= 4 are counted as ambiguous
 * @param beg      in: start of the reference window; out: start of the
 *                 alignment on the reference (only updated on success)
 * @param reglen   length of the reference window
 * @param n_cigar  out: number of CIGAR operations; 0 when the alignment is
 *                 rejected after it has been attempted
 * @param _cnt     out on success: n_mm<<16 | n_gapo<<8 | n_gape
 * @return a malloc'ed CIGAR owned by the caller, or NULL when no acceptable
 *         alignment exists. Unaligned read ends are emitted with op code 3.
 *
 * NULL is returned without attempting the alignment when the window is
 * shorter than SW_MIN_MATCH_LEN, does not fit before the end of the
 * reference, or the read has too many ambiguous bases. NULL is also
 * returned when memory cannot be allocated.
 */
bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen,
						 int *n_cigar, uint32_t *_cnt)
{
	bwa_cigar_t *cigar = 0;
	ubyte_t *ref_seq;
	bwtint_t k, x, y, l;
	int path_len, ret;
	AlnParam ap = aln_param_bwa;
	path_t *path, *p;

	// check whether there are too many N's
	if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0;
	for (k = 0, x = 0; k < len; ++k)
		if (seq[k] >= 4) ++x;
	if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0;

	// get reference subsequence
	ref_seq = (ubyte_t*)calloc(reglen, 1);
	if (ref_seq == 0) {
		fprintf(stderr, "[bwa_sw_core] out of memory\n");
		*n_cigar = 0;
		return 0;
	}
	for (k = *beg, l = 0; l < reglen && k < l_pac; ++k)
		ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
	path = (path_t*)calloc(l+len, sizeof(path_t));
	if (path == 0) {
		fprintf(stderr, "[bwa_sw_core] out of memory\n");
		free(ref_seq); *n_cigar = 0;
		return 0;
	}

	// do alignment
	ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, 0);
	if (ret < 0) {
		free(path); free(cigar); free(ref_seq); *n_cigar = 0;
		return 0;
	}
	cigar = bwa_aln_path2cigar(path, path_len, n_cigar);

	// check whether the alignment is good enough
	for (k = 0, x = y = 0; k < *n_cigar; ++k) {
		bwa_cigar_t c = cigar[k];
		if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c);
		else if (__cigar_op(c) == FROM_D) x += __cigar_len(c);
		else y += __cigar_len(c);
	}
	if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough
		free(path); free(cigar); free(ref_seq);
		*n_cigar = 0;
		return 0;
	}

	{ // update cigar and coordinate;
		int start, end;
		bwa_cigar_t *grown;
		p = path + path_len - 1;
		start = (p->j? p->j : 1) - 1;
		end = path->j;
		// Grow first and keep the old pointer until the call has succeeded,
		// so that nothing leaks and *beg stays untouched on failure.
		grown = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2));
		if (grown == 0) {
			fprintf(stderr, "[bwa_sw_core] out of memory\n");
			free(path); free(cigar); free(ref_seq);
			*n_cigar = 0;
			return 0;
		}
		cigar = grown;
		*beg += (p->i? p->i : 1) - 1;
		if (start) { // unaligned bases at the start of the read
			memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar));
			cigar[0] = __cigar_create(3, start);
			++(*n_cigar);
		}
		if (end < len) { // unaligned bases at the end of the read
			/*cigar[*n_cigar] = 3<<14 | (len - end);*/
			cigar[*n_cigar] = __cigar_create(3, (len - end));
			++(*n_cigar);
		}
	}

	{ // set *cnt
		int n_mm, n_gapo, n_gape;
		n_mm = n_gapo = n_gape = 0;
		p = path + path_len - 1;
		x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0;
		for (k = 0; k < *n_cigar; ++k) {
			bwa_cigar_t c = cigar[k];
			if (__cigar_op(c) == FROM_M) {
				for (l = 0; l < (__cigar_len(c)); ++l)
					if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm;
				x += __cigar_len(c), y += __cigar_len(c);
			} else if (__cigar_op(c) == FROM_D) {
				x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
			} else if (__cigar_op(c) == FROM_I) {
				y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
			}
		}
		*_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape;
	}

	free(ref_seq); free(path);
	return cigar;
}
|
505
|
+
|
506
|
+
/**
 * Rescue unpaired reads by Smith-Waterman alignment next to their mate.
 *
 * For every pair that is not flagged as properly paired and has at least
 * one end with mapQ >= SW_MIN_MAPQ, each mapped end is used in turn as the
 * anchor: the window where its mate is expected (from ii->avg and ii->std)
 * is aligned with bwa_sw_core(). If the mate already has a hit, the new
 * alignment is kept only when its score, including an insert-size penalty,
 * is not worse than the old one. The accepted end gets a new position,
 * CIGAR, strand, counts and mapping quality, and both ends are flagged as
 * properly paired.
 *
 * @param bns      reference metadata; l_pac and fp_pac are used
 * @param _pacseq  preloaded packed reference, or NULL to read it from
 *                 bns->fp_pac
 * @param n_seqs   number of pairs
 * @param seqs     the two ends of every pair, updated in place
 * @param popt     paired-end options; nothing is aligned when is_sw is 0
 * @param ii       insert-size estimate; nothing is aligned when avg < 0
 * @return the packed reference: _pacseq itself when it was supplied,
 *         otherwise a newly allocated buffer.
 *
 * Exits the process when the packed reference cannot be allocated.
 */
ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii)
{
	ubyte_t *pacseq;
	int i;
	uint64_t n_tot[2], n_mapped[2];

	// load reference sequence
	if (_pacseq == 0) {
		size_t pac_size = bns->l_pac/4+1;
		pacseq = (ubyte_t*)calloc(pac_size, 1);
		if (pacseq == 0) {
			fprintf(stderr, "[bwa_paired_sw] out of memory\n");
			exit(1);
		}
		rewind(bns->fp_pac);
		if (fread(pacseq, 1, pac_size, bns->fp_pac) != pac_size)
			fprintf(stderr, "[bwa_paired_sw] warning: fewer bytes than expected were read from the packed reference\n");
	} else pacseq = (ubyte_t*)_pacseq;
	if (!popt->is_sw || ii->avg < 0.0) return pacseq;

	/* In the following, _pref points to the reference read
	 * which must be aligned; _pmate points to its mate which is
	 * considered to be modified. */

#define __set_rght_coor(_a, _b, _pref, _pmate) do { \
		(_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \
		(_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \
		if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \
		if ((_b) > bns->l_pac) (_b) = bns->l_pac; \
	} while (0)

#define __set_left_coor(_a, _b, _pref, _pmate) do { \
		(_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \
		(_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \
		if ((_a) < 0) (_a) = 0; \
		if ((_b) > _pref->pos) (_b) = _pref->pos; \
	} while (0)

#define __set_fixed(_pref, _pmate, _beg, _cnt) do { \
		_pmate->type = BWA_TYPE_MATESW; \
		_pmate->pos = _beg; \
		_pmate->seQ = _pref->seQ; \
		_pmate->strand = (popt->type == BWA_PET_STD)? 1 - _pref->strand : _pref->strand; \
		_pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \
		_pmate->extra_flag |= SAM_FPP; \
		_pref->extra_flag |= SAM_FPP; \
	} while (0)

	// perform mate alignment
	n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p[2];
		p[0] = seqs[0] + i; p[1] = seqs[1] + i;
		if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ
			int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2];
			int64_t beg[2], end[2];
			bwa_cigar_t *cigar[2];
			uint32_t cnt[2];

			mq_adjust[0] = mq_adjust[1] = 255; // not effective
			is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 1 : 0;

			++n_tot[is_singleton];
			cigar[0] = cigar[1] = 0;
			n_cigar[0] = n_cigar[1] = 0;
			if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered
			for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
				ubyte_t *seq;
				if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
				if (popt->type == BWA_PET_STD) {
					if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
						__set_rght_coor(beg[k], end[k], p[1-k], p[k]);
						seq = p[k]->rseq;
					} else { // then the mate is on forward stand and has smaller coordinate
						__set_left_coor(beg[k], end[k], p[1-k], p[k]);
						seq = p[k]->seq;
						seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly
					}
				} else { // BWA_PET_SOLID
					if (p[1-k]->strand == 0) { // R3-F3 pairing
						if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
						else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
						seq = p[k]->rseq;
						seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed
					} else { // F3-R3 pairing
						if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
						else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
						seq = p[k]->seq;
					}
				}
				// perform SW alignment
				cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
				if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k]
					int s_old, clip = 0, s_new;
					if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]);
					if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]);
					s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499);
					s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. + .499);
					s_old += -4.343 * log(ii->ap_prior / bns->l_pac);
					// The rounding term belongs outside log(): the penalty is the
					// scaled log-probability of a 1.5-sigma insert size, rounded.
					s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5)) + .499); // assume the mapped isize is 1.5\sigma
					if (s_old < s_new) { // reject SW alignment
						mq_adjust[k] = s_new - s_old;
						free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0;
					} else mq_adjust[k] = s_old - s_new;
				}
				// now revserse sequence back such that p[*]->seq looks untouched
				if (popt->type == BWA_PET_STD) {
					if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0);
				} else {
					if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0);
				}
			}
			k = -1; // no read to be changed
			if (cigar[0] && cigar[1]) {
				k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed
				mapQ = abs(p[1]->mapQ - p[0]->mapQ);
			} else if (cigar[0]) k = 0, mapQ = p[1]->mapQ;
			else if (cigar[1]) k = 1, mapQ = p[0]->mapQ;
			if (k >= 0 && p[k]->pos != beg[k]) {
				++n_mapped[is_singleton];
				{ // recalculate mapping quality
					int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8;
					if (tmp <= 0) tmp = 1;
					if (mapQ > tmp) mapQ = tmp;
					p[k]->mapQ = p[1-k]->mapQ = mapQ;
					p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? p[1-k]->seQ : mapQ;
					if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k];
					if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k];
				}
				// update CIGAR
				free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0;
				p[k]->n_cigar = n_cigar[k];
				// update the rest of information
				__set_fixed(p[1-k], p[k], beg[k], cnt[k]);
			}
			free(cigar[0]); free(cigar[1]);
		}
	}
	fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n",
			(long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ);
	fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n",
			(long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ);
	return pacseq;
}
|
644
|
+
|
645
|
+
/**
 * Convert two .sai alignment files and their read files into paired-end SAM.
 *
 * Reads are processed in batches of up to 0x40000 per end. Each batch goes
 * through coordinate conversion (bwa_cal_pac_pos_pe), mate rescue
 * (bwa_paired_sw), gapped-alignment refinement (bwa_refine_gapped) and is
 * then printed with bwa_print_sam1, once per end.
 *
 * @param prefix  index prefix; passed to bns_restore() and, when preloading,
 *                used to build the ".bwt", ".sa", ".rbwt" and ".rsa" names
 * @param fn_sa   the two .sai files; each starts with a gap_opt_t header
 * @param fn_fa   the two read files, opened through bwa_open_reads()
 * @param popt    paired-end options; popt->type is set to BWA_PET_SOLID when
 *                the second header's mode lacks BWA_MODE_COMPREAD
 *
 * If a .sai header cannot be read, or memory for the preloaded index cannot
 * be allocated, a message is written to stderr and the function returns
 * after releasing what it has acquired, without printing any SAM output.
 */
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
{
	extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
	int i, j, n_seqs, tot_seqs = 0;
	bwa_seq_t *seqs[2];
	bwa_seqio_t *ks[2];
	clock_t t;
	bntseq_t *bns, *ntbns = 0;
	FILE *fp_sa[2];
	gap_opt_t opt, opt0;
	khint_t iter;
	isize_info_t last_ii; // this is for the last batch of reads
	bwt_t *bwt[2];
	uint8_t *pac;

	// initialization
	bwase_initialize(); // initialize g_log_n[] in bwase.c
	pac = 0; bwt[0] = bwt[1] = 0;
	ks[0] = ks[1] = 0; // so the teardown can tell which readers were opened
	for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
	bns = bns_restore(prefix);
	srand48(bns->seed);
	fp_sa[0] = xopen(fn_sa[0], "r");
	fp_sa[1] = xopen(fn_sa[1], "r");
	g_hash = kh_init(64);
	last_ii.avg = -1.0;

	// Each .sai file begins with the gap_opt_t used to produce it. A failed
	// read would leave the struct uninitialised, so stop instead of using it.
	if (fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]) != 1) {
		fprintf(stderr, "[bwa_sai2sam_pe_core] fail to read the header of '%s'\n", fn_sa[0]);
		goto destroy;
	}
	ks[0] = bwa_open_reads(opt.mode, fn_fa[0]);
	opt0 = opt; // keep the first end's options; opt is reused for the second end
	if (fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]) != 1) { // overwritten!
		fprintf(stderr, "[bwa_sai2sam_pe_core] fail to read the header of '%s'\n", fn_sa[1]);
		goto destroy;
	}
	ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
	if (!(opt.mode & BWA_MODE_COMPREAD)) {
		popt->type = BWA_PET_SOLID;
		ntbns = bwa_open_nt(prefix);
	} else { // for Illumina alignment only
		if (popt->is_preload) {
			// Size the name buffer from the prefix so that a long prefix
			// cannot overflow it; ".rbwt" is the longest suffix appended.
			size_t str_cap = strlen(prefix) + sizeof(".rbwt");
			char *str = (char*)malloc(str_cap);
			if (str == 0) {
				fprintf(stderr, "[bwa_sai2sam_pe_core] fail to allocate memory for index file names\n");
				goto destroy;
			}
			snprintf(str, str_cap, "%s.bwt", prefix); bwt[0] = bwt_restore_bwt(str);
			snprintf(str, str_cap, "%s.sa", prefix); bwt_restore_sa(str, bwt[0]);
			snprintf(str, str_cap, "%s.rbwt", prefix); bwt[1] = bwt_restore_bwt(str);
			snprintf(str, str_cap, "%s.rsa", prefix); bwt_restore_sa(str, bwt[1]);
			free(str);
			pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
			if (pac == 0) {
				fprintf(stderr, "[bwa_sai2sam_pe_core] fail to allocate memory for the packed sequence\n");
				goto destroy;
			}
			rewind(bns->fp_pac);
			// The buffer is zero-filled, so a short read is reported but not fatal.
			if (fread(pac, 1, bns->l_pac/4+1, bns->fp_pac) != (size_t)(bns->l_pac/4+1))
				fprintf(stderr, "[bwa_sai2sam_pe_core] warning: the packed sequence was only partially read\n");
		}
	}

	// core loop
	bwa_print_sam_SQ(bns);
	bwa_print_sam_PG();
	while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
		int cnt_chg;
		isize_info_t ii;
		ubyte_t *pacseq;

		seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual);
		tot_seqs += n_seqs;
		t = clock();

		fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n");
		cnt_chg = bwa_cal_pac_pos_pe(prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
		fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
		fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg);

		fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n");
		pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii);
		fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();

		fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
		for (j = 0; j < 2; ++j)
			bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns);
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
		// pacseq is released here only when no preloaded pac was supplied
		if (pac == 0) free(pacseq);

		fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... ");
		for (i = 0; i < n_seqs; ++i) {
			bwa_seq_t *p[2];
			p[0] = seqs[0] + i; p[1] = seqs[1] + i;
			// when either end has a barcode, both ends get the concatenation
			if (p[0]->bc[0] || p[1]->bc[0]) {
				strcat(p[0]->bc, p[1]->bc);
				strcpy(p[1]->bc, p[0]->bc);
			}
			bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2);
			bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2);
		}
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();

		for (j = 0; j < 2; ++j)
			bwa_free_read_seq(n_seqs, seqs[j]);
		fprintf(stderr, "[bwa_sai2sam_pe_core] %d sequences have been processed.\n", tot_seqs);
		last_ii = ii; // handed to bwa_cal_pac_pos_pe for the next batch
	}

destroy:
	// destroy
	bns_destroy(bns);
	if (ntbns) bns_destroy(ntbns);
	for (i = 0; i < 2; ++i) {
		if (ks[i]) bwa_seq_close(ks[i]);
		fclose(fp_sa[i]);
	}
	for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter)
		if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a);
	kh_destroy(64, g_hash);
	free(pac);
	if (bwt[0]) bwt_destroy(bwt[0]);
	if (bwt[1]) bwt_destroy(bwt[1]);
}
|
752
|
+
|
753
|
+
int bwa_sai2sam_pe(int argc, char *argv[])
|
754
|
+
{
|
755
|
+
extern char *bwa_rg_line, *bwa_rg_id;
|
756
|
+
extern int bwa_set_rg(const char *s);
|
757
|
+
int c;
|
758
|
+
pe_opt_t *popt;
|
759
|
+
popt = bwa_init_pe_opt();
|
760
|
+
optind = 1;
|
761
|
+
while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
|
762
|
+
switch (c) {
|
763
|
+
case 'r':
|
764
|
+
if (bwa_set_rg(optarg) < 0) {
|
765
|
+
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
|
766
|
+
return 1;
|
767
|
+
}
|
768
|
+
break;
|
769
|
+
case 'a': popt->max_isize = atoi(optarg); break;
|
770
|
+
case 'o': popt->max_occ = atoi(optarg); break;
|
771
|
+
case 's': popt->is_sw = 0; break;
|
772
|
+
case 'P': popt->is_preload = 1; break;
|
773
|
+
case 'n': popt->n_multi = atoi(optarg); break;
|
774
|
+
case 'N': popt->N_multi = atoi(optarg); break;
|
775
|
+
case 'c': popt->ap_prior = atof(optarg); break;
|
776
|
+
case 'f': xreopen(optarg, "w", stdout); break;
|
777
|
+
case 'A': popt->force_isize = 1; break;
|
778
|
+
default: return 1;
|
779
|
+
}
|
780
|
+
}
|
781
|
+
|
782
|
+
if (optind + 5 > argc) {
|
783
|
+
fprintf(stderr, "\n");
|
784
|
+
fprintf(stderr, "Usage: bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
|
785
|
+
fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize);
|
786
|
+
fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ);
|
787
|
+
fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi);
|
788
|
+
fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi);
|
789
|
+
fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior);
|
790
|
+
fprintf(stderr, " -f FILE sam file to output results to [stdout]\n");
|
791
|
+
fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n");
|
792
|
+
fprintf(stderr, " -P preload index into memory (for base-space reads only)\n");
|
793
|
+
fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n");
|
794
|
+
fprintf(stderr, " -A disable insert size estimate (force -s)\n\n");
|
795
|
+
fprintf(stderr, "Notes: 1. For SOLiD reads, <in1.fq> corresponds R3 reads and <in2.fq> to F3.\n");
|
796
|
+
fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n");
|
797
|
+
fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n");
|
798
|
+
fprintf(stderr, "\n");
|
799
|
+
return 1;
|
800
|
+
}
|
801
|
+
bwa_sai2sam_pe_core(argv[optind], argv + optind + 1, argv + optind+3, popt);
|
802
|
+
free(bwa_rg_line); free(bwa_rg_id);
|
803
|
+
free(popt);
|
804
|
+
fflush(stdout);
|
805
|
+
xreopen("/dev/tty","w",stdout);
|
806
|
+
return 0;
|
807
|
+
}
|