bio-bwa 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwape.c
ADDED
@@ -0,0 +1,807 @@
|
|
1
|
+
#include <unistd.h>
|
2
|
+
#include <math.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <time.h>
|
5
|
+
#include <stdio.h>
|
6
|
+
#include <string.h>
|
7
|
+
#include "bwtaln.h"
|
8
|
+
#include "kvec.h"
|
9
|
+
#include "bntseq.h"
|
10
|
+
#include "utils.h"
|
11
|
+
#include "stdaln.h"
|
12
|
+
|
13
|
+
typedef struct {
|
14
|
+
int n;
|
15
|
+
bwtint_t *a;
|
16
|
+
} poslist_t;
|
17
|
+
|
18
|
+
typedef struct {
|
19
|
+
double avg, std, ap_prior;
|
20
|
+
bwtint_t low, high, high_bayesian;
|
21
|
+
} isize_info_t;
|
22
|
+
|
23
|
+
#include "khash.h"
|
24
|
+
KHASH_MAP_INIT_INT64(64, poslist_t)
|
25
|
+
|
26
|
+
#include "ksort.h"
|
27
|
+
KSORT_INIT_GENERIC(uint64_t)
|
28
|
+
|
29
|
+
typedef struct {
|
30
|
+
kvec_t(uint64_t) arr;
|
31
|
+
kvec_t(uint64_t) pos[2];
|
32
|
+
kvec_t(bwt_aln1_t) aln[2];
|
33
|
+
} pe_data_t;
|
34
|
+
|
35
|
+
#define MIN_HASH_WIDTH 1000
|
36
|
+
|
37
|
+
extern int g_log_n[256]; // in bwase.c
|
38
|
+
static kh_64_t *g_hash;
|
39
|
+
|
40
|
+
void bwase_initialize();
|
41
|
+
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi);
|
42
|
+
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
|
43
|
+
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
|
44
|
+
int bwa_approx_mapQ(const bwa_seq_t *p, int mm);
|
45
|
+
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2);
|
46
|
+
bntseq_t *bwa_open_nt(const char *prefix);
|
47
|
+
void bwa_print_sam_SQ(const bntseq_t *bns);
|
48
|
+
void bwa_print_sam_PG();
|
49
|
+
|
50
|
+
pe_opt_t *bwa_init_pe_opt()
|
51
|
+
{
|
52
|
+
pe_opt_t *po;
|
53
|
+
po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t));
|
54
|
+
po->max_isize = 500;
|
55
|
+
po->force_isize = 0;
|
56
|
+
po->max_occ = 100000;
|
57
|
+
po->n_multi = 3;
|
58
|
+
po->N_multi = 10;
|
59
|
+
po->type = BWA_PET_STD;
|
60
|
+
po->is_sw = 1;
|
61
|
+
po->ap_prior = 1e-5;
|
62
|
+
return po;
|
63
|
+
}
|
64
|
+
|
65
|
+
/*
 * Mix the bits of a 64-bit key with a fixed sequence of shift/add/xor steps
 * and return the scrambled value.  The result depends only on the argument.
 */
static inline uint64_t hash_64(uint64_t key)
{
	uint64_t h = key;

	h = h + ~(h << 32);
	h = h ^ (h >> 22);
	h = h + ~(h << 13);
	h = h ^ (h >> 8);
	h = h + (h << 3);
	h = h ^ (h >> 15);
	h = h + ~(h << 27);
	h = h ^ (h >> 31);
	return h;
}
|
77
|
+
/*
|
78
|
+
static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
|
79
|
+
{
|
80
|
+
const double a = 0.140012;
|
81
|
+
double b, c;
|
82
|
+
b = log(x * (2 - x));
|
83
|
+
c = 2./M_PI/a + b / 2.;
|
84
|
+
return sqrt(sqrt(c * c - b / a) - c);
|
85
|
+
}
|
86
|
+
*/
|
87
|
+
|
88
|
+
// for normal distribution, this is about 3std
|
89
|
+
#define OUTLIER_BOUND 2.0
|
90
|
+
|
91
|
+
/*
 * Estimate the insert-size distribution of one batch of read pairs.
 *
 * Only pairs whose two ends both have mapQ >= 20 and whose outer distance is
 * below 100000 contribute.  The 25/50/75 percentiles define an outlier fence
 * (OUTLIER_BOUND times the inter-quartile range); mean and standard deviation
 * are computed from the sizes inside that fence.
 *
 * n_seqs    number of pairs in seqs[0] / seqs[1]
 * seqs      the two ends of every pair, index-aligned
 * ii        output; on failure avg = std = -1.0 and low = high =
 *           high_bayesian = 0
 * ap_prior  lower limit for the anomalous-pair prior stored in ii->ap_prior
 * L         reference length used when deriving ii->high_bayesian
 *
 * Returns 0 on success and -1 when no usable estimate can be made (fewer than
 * 20 good pairs, out of memory, or a degenerate distribution).
 */
static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L)
{
	uint64_t x, *isizes, n_ap = 0;
	int n, i, tot, p25, p75, p50, max_len = 1, tmp;
	double skewness = 0.0, kurtosis = 0.0, sum_sq = 0.0, var = 0.0, y;

	// start from the "no estimate" state so that every failure path reports it
	ii->avg = ii->std = -1.0;
	ii->low = ii->high = ii->high_bayesian = 0;
	isizes = (uint64_t*)calloc(n_seqs, sizeof(uint64_t));
	if (isizes == 0 && n_seqs > 0) {
		fprintf(stderr, "[infer_isize] fail to infer insert size: out of memory\n");
		return -1;
	}
	for (i = 0, tot = 0; i < n_seqs; ++i) {
		bwa_seq_t *p[2];
		p[0] = seqs[0] + i; p[1] = seqs[1] + i;
		if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) {
			// outer distance: from the start of the leftmost end to the end of the rightmost one
			x = (p[0]->pos < p[1]->pos)? p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos;
			if (x < 100000) isizes[tot++] = x;
		}
		if (p[0]->len > max_len) max_len = p[0]->len;
		if (p[1]->len > max_len) max_len = p[1]->len;
	}
	if (tot < 20) {
		fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n");
		free(isizes);
		return -1;
	}
	ks_introsort(uint64_t, tot, isizes);
	p25 = isizes[(int)(tot*0.25 + 0.5)];
	p50 = isizes[(int)(tot*0.50 + 0.5)];
	p75 = isizes[(int)(tot*0.75 + 0.5)];
	tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
	ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned
	ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
	for (i = 0, x = n = 0; i < tot; ++i)
		if (isizes[i] >= ii->low && isizes[i] <= ii->high)
			++n, x += isizes[i];
	if (n > 0) {
		ii->avg = (double)x / n;
		// The squares are summed in a local that starts at zero; ii->std still
		// holds the -1.0 sentinel at this point and must not be used as the
		// accumulator.
		for (i = 0; i < tot; ++i) {
			if (isizes[i] >= ii->low && isizes[i] <= ii->high) {
				double dev = isizes[i] - ii->avg;
				double sq = dev * dev;
				sum_sq += sq;
				skewness += sq * dev;
				kurtosis += sq * sq;
			}
		}
		var = sum_sq / n; // it would be better as n-1, but n is usually very large
		kurtosis = kurtosis / n / (var * var) - 3;
		ii->std = sqrt(var);
		skewness = skewness / n / (ii->std * ii->std * ii->std);
	}
	// Reject degenerate estimates before any floating-point value is converted
	// to an integer: no pair inside the fence, a NaN spread, or a zero spread
	// (pairing() divides by ii->std).
	if (n == 0 || isnan(ii->std) || ii->std <= 0.0 || p75 > 100000) {
		free(isizes);
		fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75);
		ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0;
		fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n");
		return -1;
	}
	// smallest y (in units of std) at which the normal tail probability drops
	// below the chance of an anomalous pair landing inside avg + y*std
	for (y = 1.0; y < 10.0; y += 0.01)
		if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break;
	ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499);
	for (i = 0; i < tot; ++i)
		if (isizes[i] > ii->high_bayesian) ++n_ap;
	ii->ap_prior = .01 * (n_ap + .01) / tot;
	if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior;
	free(isizes);
	fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75);
	// bwtint_t is unsigned; widen explicitly so the format matches the argument
	fprintf(stderr, "[infer_isize] low and high boundaries: %llu and %llu for estimating avg and std\n",
			(unsigned long long)ii->low, (unsigned long long)ii->high);
	fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std);
	fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior);
	fprintf(stderr, "[infer_isize] inferred maximum insert size: %llu (%.2lf sigma)\n",
			(unsigned long long)ii->high_bayesian, y);
	return 0;
}
|
160
|
+
|
161
|
+
/*
 * Choose the best consistent placement of the two ends of one read pair.
 *
 * p     the two ends; pos, strand, score, mismatch/gap counts, mapQ, seQ and
 *       extra_flag may be rewritten
 * d     d->arr holds every candidate hit of both ends packed as
 *       coordinate<<32 | index into d->aln[end].a<<1 | end; it is sorted here
 * opt   pairing type and, when no estimate is available, opt->max_isize
 * s_mm  mismatch penalty used to decide whether the second-best pair is "poor"
 * ii    insert-size estimate; ii->high == 0 means "no estimate"
 *
 * Returns the number of ends that were moved to a different hit while their
 * (already updated) mapping quality is positive.  Exits the process for
 * pairing types other than BWA_PET_STD and BWA_PET_SOLID.
 */
static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii)
{
	int i, j, o_n, subo_n, cnt_chg = 0, max_len;
	uint64_t last_pos[2][2], o_pos[2], subo_score, o_score;
	max_len = p[0]->full_len;
	if (max_len < p[1]->full_len) max_len = p[1]->full_len;

	// here v>=u. When ii is set, we check insert size with ii; otherwise with opt->max_isize
	// The pair score s keeps 10 * (sum of both alignment scores), plus an
	// insert-size penalty when an estimate exists, in its upper 32 bits; the
	// lower 32 bits hold a hash of the two coordinates so that ties between
	// equally scored pairs are broken deterministically.
#define __pairing_aux(u,v) do { \
		bwtint_t l = ((v)>>32) + p[(v)&1]->len - ((u)>>32); \
		if ((u) != (uint64_t)-1 && (v)>>32 > (u)>>32 && l >= max_len \
			&& ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \
		{ \
			uint64_t s = d->aln[(v)&1].a[(uint32_t)(v)>>1].score + d->aln[(u)&1].a[(uint32_t)(u)>>1].score; \
			s *= 10; \
			if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \
			s = s<<32 | (uint32_t)hash_64((u)>>32<<32 | (v)>>32); \
			if (s>>32 == o_score>>32) ++o_n; \
			else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \
			else ++subo_n; \
			if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u)&1] = (u), o_pos[(v)&1] = (v); \
			else if (s < subo_score) subo_score = s; \
		} \
	} while (0)

	// Move end q onto hit w: mark it as properly paired and, if the hit differs
	// from the current placement, copy the hit's alignment details.
#define __pairing_aux2(q, w) do { \
		const bwt_aln1_t *r = d->aln[(w)&1].a + ((uint32_t)(w)>>1); \
		(q)->extra_flag |= SAM_FPP; \
		if ((q)->pos != (w)>>32 || (q)->strand != r->a) { \
			(q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = r->a; \
			(q)->score = r->score; \
			(q)->pos = (w)>>32; \
			if ((q)->mapQ > 0) ++cnt_chg; \
		} \
	} while (0)

	o_score = subo_score = (uint64_t)-1;
	o_n = subo_n = 0;
	ks_introsort(uint64_t, d->arr.n, d->arr.a);
	// last_pos[end] remembers the two most recent "pushed" hits of that end
	for (j = 0; j < 2; ++j) last_pos[j][0] = last_pos[j][1] = (uint64_t)-1;
	if (opt->type == BWA_PET_STD) {
		for (i = 0; i < d->arr.n; ++i) {
			uint64_t x = d->arr.a[i];
			int strand = d->aln[x&1].a[(uint32_t)x>>1].a;
			if (strand == 1) { // reverse strand, then check
				int y = 1 - (x&1);
				__pairing_aux(last_pos[y][1], x);
				__pairing_aux(last_pos[y][0], x);
			} else { // forward strand, then push
				last_pos[x&1][0] = last_pos[x&1][1];
				last_pos[x&1][1] = x;
			}
		}
	} else if (opt->type == BWA_PET_SOLID) {
		for (i = 0; i < d->arr.n; ++i) {
			uint64_t x = d->arr.a[i];
			int strand = d->aln[x&1].a[(uint32_t)x>>1].a;
			if ((strand^x)&1) { // check against the pushed hits of the other end
				int y = 1 - (x&1);
				__pairing_aux(last_pos[y][1], x);
				__pairing_aux(last_pos[y][0], x);
			} else { // push
				last_pos[x&1][0] = last_pos[x&1][1];
				last_pos[x&1][1] = x;
			}
		}
	} else {
		fprintf(stderr, "[pairing] not implemented yet!\n");
		exit(1);
	}
	// set pairing
	//fprintf(stderr, "[%d, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n);
	if (o_score != (uint64_t)-1) {
		int mapQ_p = 0; // this is the maximum mapping quality when one end is moved
		int rr[2];
		//fprintf(stderr, "%d, %d\n", o_n, subo_n);
		if (o_n == 1) {
			if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair
			else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair
			else {
				int n = subo_n > 255? 255 : subo_n;
				mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n];
				if (mapQ_p < 0) mapQ_p = 0;
			}
		}
		// strands of the two hits that form the best pair
		rr[0] = d->aln[o_pos[0]&1].a[(uint32_t)o_pos[0]>>1].a;
		rr[1] = d->aln[o_pos[1]&1].a[(uint32_t)o_pos[1]>>1].a;
		if ((p[0]->pos == o_pos[0]>>32 && p[0]->strand == rr[0]) && (p[1]->pos == o_pos[1]>>32 && p[1]->strand == rr[1])) { // both ends not moved
			if (p[0]->mapQ > 0 && p[1]->mapQ > 0) {
				int mapQ = p[0]->mapQ + p[1]->mapQ;
				if (mapQ > 60) mapQ = 60;
				p[0]->mapQ = p[1]->mapQ = mapQ;
			} else {
				if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ;
				if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? mapQ_p + 7 : p[0]->mapQ;
			}
		} else if (p[0]->pos == o_pos[0]>>32 && p[0]->strand == rr[0]) { // [1] moved
			p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ;
			if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p;
		} else if (p[1]->pos == o_pos[1]>>32 && p[1]->strand == rr[1]) { // [0] moved
			p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ;
			if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p;
		} else { // both ends moved
			p[0]->seQ = p[1]->seQ = 0;
			mapQ_p -= 20;
			if (mapQ_p < 0) mapQ_p = 0;
			p[0]->mapQ = p[1]->mapQ = mapQ_p;
		}
		__pairing_aux2(p[0], o_pos[0]);
		__pairing_aux2(p[1], o_pos[1]);
	}
	return cnt_chg;
}
|
275
|
+
|
276
|
+
typedef struct {
|
277
|
+
kvec_t(bwt_aln1_t) aln;
|
278
|
+
} aln_buf_t;
|
279
|
+
|
280
|
+
int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
|
281
|
+
const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii)
|
282
|
+
{
|
283
|
+
int i, j, cnt_chg = 0;
|
284
|
+
char str[1024];
|
285
|
+
bwt_t *bwt[2];
|
286
|
+
pe_data_t *d;
|
287
|
+
aln_buf_t *buf[2];
|
288
|
+
|
289
|
+
d = (pe_data_t*)calloc(1, sizeof(pe_data_t));
|
290
|
+
buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
|
291
|
+
buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
|
292
|
+
|
293
|
+
if (_bwt[0] == 0) { // load forward SA
|
294
|
+
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
|
295
|
+
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
|
296
|
+
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
|
297
|
+
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
|
298
|
+
} else bwt[0] = _bwt[0], bwt[1] = _bwt[1];
|
299
|
+
|
300
|
+
// SE
|
301
|
+
for (i = 0; i != n_seqs; ++i) {
|
302
|
+
bwa_seq_t *p[2];
|
303
|
+
for (j = 0; j < 2; ++j) {
|
304
|
+
int n_aln;
|
305
|
+
p[j] = seqs[j] + i;
|
306
|
+
p[j]->n_multi = 0;
|
307
|
+
p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2);
|
308
|
+
fread(&n_aln, 4, 1, fp_sa[j]);
|
309
|
+
if (n_aln > kv_max(d->aln[j]))
|
310
|
+
kv_resize(bwt_aln1_t, d->aln[j], n_aln);
|
311
|
+
d->aln[j].n = n_aln;
|
312
|
+
fread(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]);
|
313
|
+
kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j]
|
314
|
+
// generate SE alignment and mapping quality
|
315
|
+
bwa_aln2seq(n_aln, d->aln[j].a, p[j]);
|
316
|
+
if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) {
|
317
|
+
int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff;
|
318
|
+
p[j]->pos = p[j]->strand? bwt_sa(bwt[0], p[j]->sa)
|
319
|
+
: bwt[1]->seq_len - (bwt_sa(bwt[1], p[j]->sa) + p[j]->len);
|
320
|
+
p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff);
|
321
|
+
}
|
322
|
+
}
|
323
|
+
}
|
324
|
+
|
325
|
+
// infer isize
|
326
|
+
infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt[0]->seq_len);
|
327
|
+
if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii;
|
328
|
+
if (opt->force_isize) {
|
329
|
+
fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__);
|
330
|
+
ii->low = ii->high = 0; ii->avg = ii->std = -1.0;
|
331
|
+
}
|
332
|
+
|
333
|
+
// PE
|
334
|
+
for (i = 0; i != n_seqs; ++i) {
|
335
|
+
bwa_seq_t *p[2];
|
336
|
+
for (j = 0; j < 2; ++j) {
|
337
|
+
p[j] = seqs[j] + i;
|
338
|
+
kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln);
|
339
|
+
}
|
340
|
+
if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
|
341
|
+
&& (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
|
342
|
+
{ // only when both ends mapped
|
343
|
+
uint64_t x;
|
344
|
+
int j, k, n_occ[2];
|
345
|
+
for (j = 0; j < 2; ++j) {
|
346
|
+
n_occ[j] = 0;
|
347
|
+
for (k = 0; k < d->aln[j].n; ++k)
|
348
|
+
n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1;
|
349
|
+
}
|
350
|
+
if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue;
|
351
|
+
d->arr.n = 0;
|
352
|
+
for (j = 0; j < 2; ++j) {
|
353
|
+
for (k = 0; k < d->aln[j].n; ++k) {
|
354
|
+
bwt_aln1_t *r = d->aln[j].a + k;
|
355
|
+
bwtint_t l;
|
356
|
+
if (r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
|
357
|
+
uint64_t key = (uint64_t)r->k<<32 | r->l;
|
358
|
+
int ret;
|
359
|
+
khint_t iter = kh_put(64, g_hash, key, &ret);
|
360
|
+
if (ret) { // not in the hash table; ret must equal 1 as we never remove elements
|
361
|
+
poslist_t *z = &kh_val(g_hash, iter);
|
362
|
+
z->n = r->l - r->k + 1;
|
363
|
+
z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n);
|
364
|
+
for (l = r->k; l <= r->l; ++l)
|
365
|
+
z->a[l - r->k] = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len);
|
366
|
+
}
|
367
|
+
for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
|
368
|
+
x = kh_val(g_hash, iter).a[l];
|
369
|
+
x = x<<32 | k<<1 | j;
|
370
|
+
kv_push(uint64_t, d->arr, x);
|
371
|
+
}
|
372
|
+
} else { // then calculate on the fly
|
373
|
+
for (l = r->k; l <= r->l; ++l) {
|
374
|
+
x = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len);
|
375
|
+
x = x<<32 | k<<1 | j;
|
376
|
+
kv_push(uint64_t, d->arr, x);
|
377
|
+
}
|
378
|
+
}
|
379
|
+
}
|
380
|
+
}
|
381
|
+
cnt_chg += pairing(p, d, opt, gopt->s_mm, ii);
|
382
|
+
}
|
383
|
+
|
384
|
+
if (opt->N_multi || opt->n_multi) {
|
385
|
+
for (j = 0; j < 2; ++j) {
|
386
|
+
if (p[j]->type != BWA_TYPE_NO_MATCH) {
|
387
|
+
int k;
|
388
|
+
if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) {
|
389
|
+
bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi);
|
390
|
+
} else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi);
|
391
|
+
for (k = 0; k < p[j]->n_multi; ++k) {
|
392
|
+
bwt_multi1_t *q = p[j]->multi + k;
|
393
|
+
q->pos = q->strand? bwt_sa(bwt[0], q->pos) : bwt[1]->seq_len - (bwt_sa(bwt[1], q->pos) + p[j]->len);
|
394
|
+
}
|
395
|
+
}
|
396
|
+
}
|
397
|
+
}
|
398
|
+
}
|
399
|
+
|
400
|
+
// free
|
401
|
+
for (i = 0; i < n_seqs; ++i) {
|
402
|
+
kv_destroy(buf[0][i].aln);
|
403
|
+
kv_destroy(buf[1][i].aln);
|
404
|
+
}
|
405
|
+
free(buf[0]); free(buf[1]);
|
406
|
+
if (_bwt[0] == 0) {
|
407
|
+
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
|
408
|
+
}
|
409
|
+
kv_destroy(d->arr);
|
410
|
+
kv_destroy(d->pos[0]); kv_destroy(d->pos[1]);
|
411
|
+
kv_destroy(d->aln[0]); kv_destroy(d->aln[1]);
|
412
|
+
free(d);
|
413
|
+
return cnt_chg;
|
414
|
+
}
|
415
|
+
|
416
|
+
#define SW_MIN_MATCH_LEN 20
|
417
|
+
#define SW_MIN_MAPQ 17
|
418
|
+
|
419
|
+
// cnt = n_mm<<16 | n_gapo<<8 | n_gape
|
420
|
+
/*
 * Locally align a read against a window of the packed reference.
 *
 * l_pac    length of the packed reference
 * pacseq   packed reference, four bases per byte
 * len,seq  read length and read bases (values >= 4 are treated as ambiguous)
 * beg      in: window start; out: start of the alignment on the reference
 *          (only changed when an alignment is returned)
 * reglen   window length
 * n_cigar  out: number of returned CIGAR operations
 * _cnt     out: n_mm<<16 | n_gapo<<8 | n_gape of the returned alignment
 *
 * Returns a heap-allocated CIGAR that the caller must free, or 0 when the
 * window or read is unsuitable, the alignment is too short, or memory runs
 * out.  Unaligned read ends are reported with operation code 3.
 */
bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen,
					  int *n_cigar, uint32_t *_cnt)
{
	bwa_cigar_t *cigar = 0, *grown;
	ubyte_t *ref_seq;
	bwtint_t k, x, y, l;
	int path_len, ret;
	AlnParam ap = aln_param_bwa;
	path_t *path, *p;

	// check whether there are too many N's
	if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0;
	for (k = 0, x = 0; k < len; ++k)
		if (seq[k] >= 4) ++x;
	if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0;

	// get reference subsequence
	ref_seq = (ubyte_t*)calloc(reglen, 1);
	if (ref_seq == 0) {
		fprintf(stderr, "[bwa_sw_core] out of memory\n");
		*n_cigar = 0;
		return 0;
	}
	for (k = *beg, l = 0; l < reglen && k < l_pac; ++k)
		ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; // unpack one 2-bit base
	path = (path_t*)calloc(l+len, sizeof(path_t));
	if (path == 0) {
		fprintf(stderr, "[bwa_sw_core] out of memory\n");
		free(ref_seq); *n_cigar = 0;
		return 0;
	}

	// do alignment
	ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, 0);
	if (ret < 0) {
		free(path); free(ref_seq); *n_cigar = 0;
		return 0;
	}
	cigar = bwa_aln_path2cigar(path, path_len, n_cigar);

	// check whether the alignment is good enough
	// x: aligned length on the reference; y: aligned length on the read
	for (k = 0, x = y = 0; k < *n_cigar; ++k) {
		bwa_cigar_t c = cigar[k];
		if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c);
		else if (__cigar_op(c) == FROM_D) x += __cigar_len(c);
		else y += __cigar_len(c);
	}
	if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough
		free(path); free(cigar); free(ref_seq);
		*n_cigar = 0;
		return 0;
	}

	{ // update cigar and coordinate;
		int start, end;
		// room for one extra operation at each end; the original buffer stays
		// valid if realloc() fails, so it is released explicitly
		grown = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2));
		if (grown == 0) {
			fprintf(stderr, "[bwa_sw_core] out of memory\n");
			free(path); free(cigar); free(ref_seq);
			*n_cigar = 0;
			return 0;
		}
		cigar = grown;
		p = path + path_len - 1; // the last path element is the start of the alignment
		*beg += (p->i? p->i : 1) - 1;
		start = (p->j? p->j : 1) - 1;
		end = path->j;
		if (start) {
			memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar));
			cigar[0] = __cigar_create(3, start);
			++(*n_cigar);
		}
		if (end < len) {
			cigar[*n_cigar] = __cigar_create(3, (len - end));
			++(*n_cigar);
		}
	}

	{ // set *cnt
		int n_mm, n_gapo, n_gape;
		n_mm = n_gapo = n_gape = 0;
		p = path + path_len - 1;
		x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0;
		for (k = 0; k < *n_cigar; ++k) {
			bwa_cigar_t c = cigar[k];
			if (__cigar_op(c) == FROM_M) {
				for (l = 0; l < (__cigar_len(c)); ++l)
					if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm;
				x += __cigar_len(c), y += __cigar_len(c);
			} else if (__cigar_op(c) == FROM_D) {
				x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
			} else if (__cigar_op(c) == FROM_I) {
				y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
			}
		}
		*_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape;
	}

	free(ref_seq); free(path);
	return cigar;
}
|
505
|
+
|
506
|
+
/*
 * Try to place the mate of a confidently mapped read with Smith-Waterman
 * alignment inside the window predicted by the insert-size estimate.
 *
 * bns      reference metadata; bns->fp_pac is read when _pacseq is NULL
 * _pacseq  preloaded packed reference, or NULL to load it here
 * n_seqs   number of pairs
 * seqs     the two ends of every pair, index-aligned; updated in place
 * popt     paired-end options (is_sw and type are read)
 * ii       insert-size estimate; nothing is aligned when ii->avg < 0
 *
 * Returns the packed reference: _pacseq itself when it was supplied,
 * otherwise a buffer allocated here that the caller has to release.
 * Terminates the process if the reference cannot be allocated or read.
 */
ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii)
{
	ubyte_t *pacseq;
	int i;
	uint64_t n_tot[2], n_mapped[2];

	// load reference sequence
	if (_pacseq == 0) {
		size_t pac_bytes = bns->l_pac/4+1;
		pacseq = (ubyte_t*)calloc(pac_bytes, 1);
		if (pacseq == 0) {
			fprintf(stderr, "[bwa_paired_sw] out of memory\n");
			exit(1);
		}
		rewind(bns->fp_pac);
		if (fread(pacseq, 1, pac_bytes, bns->fp_pac) != pac_bytes) {
			fprintf(stderr, "[bwa_paired_sw] fail to read the packed reference sequence\n");
			exit(1);
		}
	} else pacseq = (ubyte_t*)_pacseq;
	if (!popt->is_sw || ii->avg < 0.0) return pacseq;

	// perform mate alignment
	n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p[2];
		p[0] = seqs[0] + i; p[1] = seqs[1] + i;
		if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ
			int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2];
			int64_t beg[2], end[2];
			bwa_cigar_t *cigar[2];
			uint32_t cnt[2];

			/* In the following, _pref points to the reference read
			 * which must be aligned; _pmate points to its mate which is
			 * considered to be modified. */

			// window to the right of _pref: avg +/- 3 std, clipped to the
			// end of _pref and the end of the reference
#define __set_rght_coor(_a, _b, _pref, _pmate) do {						\
				(_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \
				(_b) = (_a) + 6 * ii->std + 2 * _pmate->len;			\
				if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \
				if ((_b) > bns->l_pac) (_b) = bns->l_pac;				\
			} while (0)

			// window to the left of _pref, clipped to the reference start
			// and the start of _pref
#define __set_left_coor(_a, _b, _pref, _pmate) do {						\
				(_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \
				(_b) = (_a) + 6 * ii->std + 2 * _pmate->len;			\
				if ((_a) < 0) (_a) = 0;									\
				if ((_b) > _pref->pos) (_b) = _pref->pos;				\
			} while (0)

			// record the Smith-Waterman placement on _pmate and flag both ends as paired
#define __set_fixed(_pref, _pmate, _beg, _cnt) do {						\
				_pmate->type = BWA_TYPE_MATESW;							\
				_pmate->pos = _beg;										\
				_pmate->seQ = _pref->seQ;								\
				_pmate->strand = (popt->type == BWA_PET_STD)? 1 - _pref->strand : _pref->strand; \
				_pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \
				_pmate->extra_flag |= SAM_FPP;							\
				_pref->extra_flag |= SAM_FPP;							\
			} while (0)

			mq_adjust[0] = mq_adjust[1] = 255; // not effective
			is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 1 : 0;

			++n_tot[is_singleton];
			cigar[0] = cigar[1] = 0;
			n_cigar[0] = n_cigar[1] = 0;
			if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered
			for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
				ubyte_t *seq;
				if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
				if (popt->type == BWA_PET_STD) {
					if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
						__set_rght_coor(beg[k], end[k], p[1-k], p[k]);
						seq = p[k]->rseq;
					} else { // then the mate is on forward stand and has smaller coordinate
						__set_left_coor(beg[k], end[k], p[1-k], p[k]);
						seq = p[k]->seq;
						seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly
					}
				} else { // BWA_PET_SOLID
					if (p[1-k]->strand == 0) { // R3-F3 pairing
						if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
						else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
						seq = p[k]->rseq;
						seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed
					} else { // F3-R3 pairing
						if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
						else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
						seq = p[k]->seq;
					}
				}
				// perform SW alignment
				cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
				if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k]
					int s_old, clip = 0, s_new;
					if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]);
					if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]);
					s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499);
					s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. + .499);
					// the existing placement pays the price of an anomalous pair
					s_old += -4.343 * log(ii->ap_prior / bns->l_pac);
					// the new placement pays the insert-size penalty; .499 rounds
					// the scaled log and therefore belongs outside log()
					s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5)) + .499); // assume the mapped isize is 1.5\sigma
					if (s_old < s_new) { // reject SW alignment
						mq_adjust[k] = s_new - s_old;
						free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0;
					} else mq_adjust[k] = s_old - s_new;
				}
				// now revserse sequence back such that p[*]->seq looks untouched
				if (popt->type == BWA_PET_STD) {
					if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0);
				} else {
					if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0);
				}
			}
			k = -1; // no read to be changed
			if (cigar[0] && cigar[1]) {
				k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed
				mapQ = abs(p[1]->mapQ - p[0]->mapQ);
			} else if (cigar[0]) k = 0, mapQ = p[1]->mapQ;
			else if (cigar[1]) k = 1, mapQ = p[0]->mapQ;
			if (k >= 0 && p[k]->pos != beg[k]) {
				++n_mapped[is_singleton];
				{ // recalculate mapping quality
					int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8;
					if (tmp <= 0) tmp = 1;
					if (mapQ > tmp) mapQ = tmp;
					p[k]->mapQ = p[1-k]->mapQ = mapQ;
					p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? p[1-k]->seQ : mapQ;
					if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k];
					if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k];
				}
				// update CIGAR
				free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0; // ownership moves to the read
				p[k]->n_cigar = n_cigar[k];
				// update the rest of information
				__set_fixed(p[1-k], p[k], beg[k], cnt[k]);
			}
			free(cigar[0]); free(cigar[1]);
		}
	}
	fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n",
			(long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ);
	fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n",
			(long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ);
	return pacseq;
}
|
644
|
+
|
645
|
+
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
|
646
|
+
{
|
647
|
+
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
|
648
|
+
int i, j, n_seqs, tot_seqs = 0;
|
649
|
+
bwa_seq_t *seqs[2];
|
650
|
+
bwa_seqio_t *ks[2];
|
651
|
+
clock_t t;
|
652
|
+
bntseq_t *bns, *ntbns = 0;
|
653
|
+
FILE *fp_sa[2];
|
654
|
+
gap_opt_t opt, opt0;
|
655
|
+
khint_t iter;
|
656
|
+
isize_info_t last_ii; // this is for the last batch of reads
|
657
|
+
char str[1024];
|
658
|
+
bwt_t *bwt[2];
|
659
|
+
uint8_t *pac;
|
660
|
+
|
661
|
+
// initialization
|
662
|
+
bwase_initialize(); // initialize g_log_n[] in bwase.c
|
663
|
+
pac = 0; bwt[0] = bwt[1] = 0;
|
664
|
+
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
|
665
|
+
bns = bns_restore(prefix);
|
666
|
+
srand48(bns->seed);
|
667
|
+
fp_sa[0] = xopen(fn_sa[0], "r");
|
668
|
+
fp_sa[1] = xopen(fn_sa[1], "r");
|
669
|
+
g_hash = kh_init(64);
|
670
|
+
last_ii.avg = -1.0;
|
671
|
+
|
672
|
+
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]);
|
673
|
+
ks[0] = bwa_open_reads(opt.mode, fn_fa[0]);
|
674
|
+
opt0 = opt;
|
675
|
+
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
|
676
|
+
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
|
677
|
+
if (!(opt.mode & BWA_MODE_COMPREAD)) {
|
678
|
+
popt->type = BWA_PET_SOLID;
|
679
|
+
ntbns = bwa_open_nt(prefix);
|
680
|
+
} else { // for Illumina alignment only
|
681
|
+
if (popt->is_preload) {
|
682
|
+
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
|
683
|
+
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
|
684
|
+
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
|
685
|
+
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
|
686
|
+
pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
|
687
|
+
rewind(bns->fp_pac);
|
688
|
+
fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
|
689
|
+
}
|
690
|
+
}
|
691
|
+
|
692
|
+
// core loop
|
693
|
+
bwa_print_sam_SQ(bns);
|
694
|
+
bwa_print_sam_PG();
|
695
|
+
while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
|
696
|
+
int cnt_chg;
|
697
|
+
isize_info_t ii;
|
698
|
+
ubyte_t *pacseq;
|
699
|
+
|
700
|
+
seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual);
|
701
|
+
tot_seqs += n_seqs;
|
702
|
+
t = clock();
|
703
|
+
|
704
|
+
fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n");
|
705
|
+
cnt_chg = bwa_cal_pac_pos_pe(prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
|
706
|
+
fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
707
|
+
fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg);
|
708
|
+
|
709
|
+
fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n");
|
710
|
+
pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii);
|
711
|
+
fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
712
|
+
|
713
|
+
fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
|
714
|
+
for (j = 0; j < 2; ++j)
|
715
|
+
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns);
|
716
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
717
|
+
if (pac == 0) free(pacseq);
|
718
|
+
|
719
|
+
fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... ");
|
720
|
+
for (i = 0; i < n_seqs; ++i) {
|
721
|
+
bwa_seq_t *p[2];
|
722
|
+
p[0] = seqs[0] + i; p[1] = seqs[1] + i;
|
723
|
+
if (p[0]->bc[0] || p[1]->bc[0]) {
|
724
|
+
strcat(p[0]->bc, p[1]->bc);
|
725
|
+
strcpy(p[1]->bc, p[0]->bc);
|
726
|
+
}
|
727
|
+
bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2);
|
728
|
+
bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2);
|
729
|
+
}
|
730
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
731
|
+
|
732
|
+
for (j = 0; j < 2; ++j)
|
733
|
+
bwa_free_read_seq(n_seqs, seqs[j]);
|
734
|
+
fprintf(stderr, "[bwa_sai2sam_pe_core] %d sequences have been processed.\n", tot_seqs);
|
735
|
+
last_ii = ii;
|
736
|
+
}
|
737
|
+
|
738
|
+
// destroy
|
739
|
+
bns_destroy(bns);
|
740
|
+
if (ntbns) bns_destroy(ntbns);
|
741
|
+
for (i = 0; i < 2; ++i) {
|
742
|
+
bwa_seq_close(ks[i]);
|
743
|
+
fclose(fp_sa[i]);
|
744
|
+
}
|
745
|
+
for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter)
|
746
|
+
if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a);
|
747
|
+
kh_destroy(64, g_hash);
|
748
|
+
if (pac) {
|
749
|
+
free(pac); bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
|
750
|
+
}
|
751
|
+
}
|
752
|
+
|
753
|
+
int bwa_sai2sam_pe(int argc, char *argv[])
|
754
|
+
{
|
755
|
+
extern char *bwa_rg_line, *bwa_rg_id;
|
756
|
+
extern int bwa_set_rg(const char *s);
|
757
|
+
int c;
|
758
|
+
pe_opt_t *popt;
|
759
|
+
popt = bwa_init_pe_opt();
|
760
|
+
optind = 1;
|
761
|
+
while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
|
762
|
+
switch (c) {
|
763
|
+
case 'r':
|
764
|
+
if (bwa_set_rg(optarg) < 0) {
|
765
|
+
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
|
766
|
+
return 1;
|
767
|
+
}
|
768
|
+
break;
|
769
|
+
case 'a': popt->max_isize = atoi(optarg); break;
|
770
|
+
case 'o': popt->max_occ = atoi(optarg); break;
|
771
|
+
case 's': popt->is_sw = 0; break;
|
772
|
+
case 'P': popt->is_preload = 1; break;
|
773
|
+
case 'n': popt->n_multi = atoi(optarg); break;
|
774
|
+
case 'N': popt->N_multi = atoi(optarg); break;
|
775
|
+
case 'c': popt->ap_prior = atof(optarg); break;
|
776
|
+
case 'f': xreopen(optarg, "w", stdout); break;
|
777
|
+
case 'A': popt->force_isize = 1; break;
|
778
|
+
default: return 1;
|
779
|
+
}
|
780
|
+
}
|
781
|
+
|
782
|
+
if (optind + 5 > argc) {
|
783
|
+
fprintf(stderr, "\n");
|
784
|
+
fprintf(stderr, "Usage: bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
|
785
|
+
fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize);
|
786
|
+
fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ);
|
787
|
+
fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi);
|
788
|
+
fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi);
|
789
|
+
fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior);
|
790
|
+
fprintf(stderr, " -f FILE sam file to output results to [stdout]\n");
|
791
|
+
fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n");
|
792
|
+
fprintf(stderr, " -P preload index into memory (for base-space reads only)\n");
|
793
|
+
fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n");
|
794
|
+
fprintf(stderr, " -A disable insert size estimate (force -s)\n\n");
|
795
|
+
fprintf(stderr, "Notes: 1. For SOLiD reads, <in1.fq> corresponds R3 reads and <in2.fq> to F3.\n");
|
796
|
+
fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n");
|
797
|
+
fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n");
|
798
|
+
fprintf(stderr, "\n");
|
799
|
+
return 1;
|
800
|
+
}
|
801
|
+
bwa_sai2sam_pe_core(argv[optind], argv + optind + 1, argv + optind+3, popt);
|
802
|
+
free(bwa_rg_line); free(bwa_rg_id);
|
803
|
+
free(popt);
|
804
|
+
fflush(stdout);
|
805
|
+
xreopen("/dev/tty","w",stdout);
|
806
|
+
return 0;
|
807
|
+
}
|