bio-bwa 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/bwape.c ADDED
@@ -0,0 +1,807 @@
1
+ #include <unistd.h>
2
+ #include <math.h>
3
+ #include <stdlib.h>
4
+ #include <time.h>
5
+ #include <stdio.h>
6
+ #include <string.h>
7
+ #include "bwtaln.h"
8
+ #include "kvec.h"
9
+ #include "bntseq.h"
10
+ #include "utils.h"
11
+ #include "stdaln.h"
12
+
13
/* Value type of the 64-bit-keyed hash instantiated below with KHASH_MAP_INIT_INT64:
 * a cached list of coordinates for one (k, l) interval.  In this file it is filled
 * in bwa_cal_pac_pos_pe() and released in bwa_sai2sam_pe_core(). */
+ typedef struct {
14
/* n: number of entries stored in a[] */
+ int n;
15
/* a: malloc()'ed array of n coordinates */
+ bwtint_t *a;
16
+ } poslist_t;
17
+
18
/* Insert-size estimate for one batch of read pairs, produced by infer_isize().
 * avg and std are set to -1.0 and the three bounds to 0 when no estimate is available. */
+ typedef struct {
18
/* avg, std: mean and standard deviation of the insert sizes lying in [low, high];
 * ap_prior: prior derived from the fraction of pairs longer than high_bayesian,
 * never smaller than the value passed to infer_isize() */
+ double avg, std, ap_prior;
19
/* low, high: percentile-based bounds used when estimating avg and std;
 * high_bayesian: inferred maximum insert size */
+ bwtint_t low, high, high_bayesian;
20
+ } isize_info_t;
22
+
23
+ #include "khash.h"
24
+ KHASH_MAP_INIT_INT64(64, poslist_t)
25
+
26
+ #include "ksort.h"
27
+ KSORT_INIT_GENERIC(uint64_t)
28
+
29
/* Scratch buffers reused for every read pair while pairing. */
+ typedef struct {
30
/* arr: candidate hits of both ends, each packed as
 * (coordinate << 32) | (index into aln[end] << 1) | end; sorted in pairing() */
+ kvec_t(uint64_t) arr;
31
/* pos: only destroyed in the code visible in this file */
+ kvec_t(uint64_t) pos[2];
32
/* aln: alignment records of the current pair, one vector per end */
+ kvec_t(bwt_aln1_t) aln[2];
33
+ } pe_data_t;
34
+
35
+ #define MIN_HASH_WIDTH 1000
36
+
37
+ extern int g_log_n[256]; // in bwase.c
38
+ static kh_64_t *g_hash;
39
+
40
+ void bwase_initialize();
41
+ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi);
42
+ void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
43
+ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
44
+ int bwa_approx_mapQ(const bwa_seq_t *p, int mm);
45
+ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2);
46
+ bntseq_t *bwa_open_nt(const char *prefix);
47
+ void bwa_print_sam_SQ(const bntseq_t *bns);
48
+ void bwa_print_sam_PG();
49
+
50
+ pe_opt_t *bwa_init_pe_opt()
51
+ {
52
+ pe_opt_t *po;
53
+ po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t));
54
+ po->max_isize = 500;
55
+ po->force_isize = 0;
56
+ po->max_occ = 100000;
57
+ po->n_multi = 3;
58
+ po->N_multi = 10;
59
+ po->type = BWA_PET_STD;
60
+ po->is_sw = 1;
61
+ po->ap_prior = 1e-5;
62
+ return po;
63
+ }
64
+
65
/*
 * Mix the bits of a 64-bit key with a fixed sequence of shift/add/xor steps.
 * pairing() keeps the low 32 bits of the result as a tie-breaker between
 * equally scored pairs.
 */
static inline uint64_t hash_64(uint64_t key)
{
	uint64_t v = key;

	v = v + ~(v << 32);
	v = v ^ (v >> 22);
	v = v + ~(v << 13);
	v = v ^ (v >> 8);
	v = v + (v << 3);
	v = v ^ (v >> 15);
	v = v + ~(v << 27);
	v = v ^ (v >> 31);
	return v;
}
77
+ /*
78
+ static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
79
+ {
80
+ const double a = 0.140012;
81
+ double b, c;
82
+ b = log(x * (2 - x));
83
+ c = 2./M_PI/a + b / 2.;
84
+ return sqrt(sqrt(c * c - b / a) - c);
85
+ }
86
+ */
87
+
88
+ // for normal distribution, this is about 3std
89
+ #define OUTLIER_BOUND 2.0
90
+
91
+ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L)
92
+ {
93
+ uint64_t x, *isizes, n_ap = 0;
94
+ int n, i, tot, p25, p75, p50, max_len = 1, tmp;
95
+ double skewness = 0.0, kurtosis = 0.0, y;
96
+
97
+ ii->avg = ii->std = -1.0;
98
+ ii->low = ii->high = ii->high_bayesian = 0;
99
+ isizes = (uint64_t*)calloc(n_seqs, 8);
100
+ for (i = 0, tot = 0; i != n_seqs; ++i) {
101
+ bwa_seq_t *p[2];
102
+ p[0] = seqs[0] + i; p[1] = seqs[1] + i;
103
+ if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) {
104
+ x = (p[0]->pos < p[1]->pos)? p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos;
105
+ if (x < 100000) isizes[tot++] = x;
106
+ }
107
+ if (p[0]->len > max_len) max_len = p[0]->len;
108
+ if (p[1]->len > max_len) max_len = p[1]->len;
109
+ }
110
+ if (tot < 20) {
111
+ fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n");
112
+ free(isizes);
113
+ return -1;
114
+ }
115
+ ks_introsort(uint64_t, tot, isizes);
116
+ p25 = isizes[(int)(tot*0.25 + 0.5)];
117
+ p50 = isizes[(int)(tot*0.50 + 0.5)];
118
+ p75 = isizes[(int)(tot*0.75 + 0.5)];
119
+ tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
120
+ ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned
121
+ ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
122
+ for (i = 0, x = n = 0; i < tot; ++i)
123
+ if (isizes[i] >= ii->low && isizes[i] <= ii->high)
124
+ ++n, x += isizes[i];
125
+ ii->avg = (double)x / n;
126
+ for (i = 0; i < tot; ++i) {
127
+ if (isizes[i] >= ii->low && isizes[i] <= ii->high) {
128
+ double tmp = (isizes[i] - ii->avg) * (isizes[i] - ii->avg);
129
+ ii->std += tmp;
130
+ skewness += tmp * (isizes[i] - ii->avg);
131
+ kurtosis += tmp * tmp;
132
+ }
133
+ }
134
+ kurtosis = kurtosis/n / (ii->std / n * ii->std / n) - 3;
135
+ ii->std = sqrt(ii->std / n); // it would be better as n-1, but n is usually very large
136
+ skewness = skewness / n / (ii->std * ii->std * ii->std);
137
+ for (y = 1.0; y < 10.0; y += 0.01)
138
+ if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break;
139
+ ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499);
140
+ for (i = 0; i < tot; ++i)
141
+ if (isizes[i] > ii->high_bayesian) ++n_ap;
142
+ ii->ap_prior = .01 * (n_ap + .01) / tot;
143
+ if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior;
144
+ free(isizes);
145
+ fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75);
146
+ if (isnan(ii->std) || p75 > 100000) {
147
+ ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0;
148
+ fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n");
149
+ return -1;
150
+ }
151
+ for (y = 1.0; y < 10.0; y += 0.01)
152
+ if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break;
153
+ ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499);
154
+ fprintf(stderr, "[infer_isize] low and high boundaries: %d and %d for estimating avg and std\n", ii->low, ii->high);
155
+ fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std);
156
+ fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior);
157
+ fprintf(stderr, "[infer_isize] inferred maximum insert size: %d (%.2lf sigma)\n", ii->high_bayesian, y);
158
+ return 0;
159
+ }
160
+
161
+ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii)
162
+ {
163
+ int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len;
164
+ uint64_t last_pos[2][2], o_pos[2], subo_score, o_score;
165
+ max_len = p[0]->full_len;
166
+ if (max_len < p[1]->full_len) max_len = p[1]->full_len;
167
+ if (low_bound < max_len) low_bound = max_len;
168
+
169
+ // here v>=u. When ii is set, we check insert size with ii; otherwise with opt->max_isize
170
+ #define __pairing_aux(u,v) do { \
171
+ bwtint_t l = ((v)>>32) + p[(v)&1]->len - ((u)>>32); \
172
+ if ((u) != (uint64_t)-1 && (v)>>32 > (u)>>32 && l >= max_len \
173
+ && ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \
174
+ { \
175
+ uint64_t s = d->aln[(v)&1].a[(uint32_t)(v)>>1].score + d->aln[(u)&1].a[(uint32_t)(u)>>1].score; \
176
+ s *= 10; \
177
+ if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \
178
+ s = s<<32 | (uint32_t)hash_64((u)>>32<<32 | (v)>>32); \
179
+ if (s>>32 == o_score>>32) ++o_n; \
180
+ else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \
181
+ else ++subo_n; \
182
+ if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u)&1] = (u), o_pos[(v)&1] = (v); \
183
+ else if (s < subo_score) subo_score = s; \
184
+ } \
185
+ } while (0)
186
+
187
+ #define __pairing_aux2(q, w) do { \
188
+ const bwt_aln1_t *r = d->aln[(w)&1].a + ((uint32_t)(w)>>1); \
189
+ (q)->extra_flag |= SAM_FPP; \
190
+ if ((q)->pos != (w)>>32 || (q)->strand != r->a) { \
191
+ (q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = r->a; \
192
+ (q)->score = r->score; \
193
+ (q)->pos = (w)>>32; \
194
+ if ((q)->mapQ > 0) ++cnt_chg; \
195
+ } \
196
+ } while (0)
197
+
198
+ o_score = subo_score = (uint64_t)-1;
199
+ o_n = subo_n = 0;
200
+ ks_introsort(uint64_t, d->arr.n, d->arr.a);
201
+ for (j = 0; j < 2; ++j) last_pos[j][0] = last_pos[j][1] = (uint64_t)-1;
202
+ if (opt->type == BWA_PET_STD) {
203
+ for (i = 0; i < d->arr.n; ++i) {
204
+ uint64_t x = d->arr.a[i];
205
+ int strand = d->aln[x&1].a[(uint32_t)x>>1].a;
206
+ if (strand == 1) { // reverse strand, then check
207
+ int y = 1 - (x&1);
208
+ __pairing_aux(last_pos[y][1], x);
209
+ __pairing_aux(last_pos[y][0], x);
210
+ } else { // forward strand, then push
211
+ last_pos[x&1][0] = last_pos[x&1][1];
212
+ last_pos[x&1][1] = x;
213
+ }
214
+ }
215
+ } else if (opt->type == BWA_PET_SOLID) {
216
+ for (i = 0; i < d->arr.n; ++i) {
217
+ uint64_t x = d->arr.a[i];
218
+ int strand = d->aln[x&1].a[(uint32_t)x>>1].a;
219
+ if ((strand^x)&1) { // push
220
+ int y = 1 - (x&1);
221
+ __pairing_aux(last_pos[y][1], x);
222
+ __pairing_aux(last_pos[y][0], x);
223
+ } else { // check
224
+ last_pos[x&1][0] = last_pos[x&1][1];
225
+ last_pos[x&1][1] = x;
226
+ }
227
+ }
228
+ } else {
229
+ fprintf(stderr, "[paring] not implemented yet!\n");
230
+ exit(1);
231
+ }
232
+ // set pairing
233
+ //fprintf(stderr, "[%d, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n);
234
+ if (o_score != (uint64_t)-1) {
235
+ int mapQ_p = 0; // this is the maximum mapping quality when one end is moved
236
+ int rr[2];
237
+ //fprintf(stderr, "%d, %d\n", o_n, subo_n);
238
+ if (o_n == 1) {
239
+ if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair
240
+ else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair
241
+ else {
242
+ int n = subo_n > 255? 255 : subo_n;
243
+ mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n];
244
+ if (mapQ_p < 0) mapQ_p = 0;
245
+ }
246
+ }
247
+ rr[0] = d->aln[o_pos[0]&1].a[(uint32_t)o_pos[0]>>1].a;
248
+ rr[1] = d->aln[o_pos[1]&1].a[(uint32_t)o_pos[1]>>1].a;
249
+ if ((p[0]->pos == o_pos[0]>>32 && p[0]->strand == rr[0]) && (p[1]->pos == o_pos[1]>>32 && p[1]->strand == rr[1])) { // both ends not moved
250
+ if (p[0]->mapQ > 0 && p[1]->mapQ > 0) {
251
+ int mapQ = p[0]->mapQ + p[1]->mapQ;
252
+ if (mapQ > 60) mapQ = 60;
253
+ p[0]->mapQ = p[1]->mapQ = mapQ;
254
+ } else {
255
+ if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ;
256
+ if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? mapQ_p + 7 : p[0]->mapQ;
257
+ }
258
+ } else if (p[0]->pos == o_pos[0]>>32 && p[0]->strand == rr[0]) { // [1] moved
259
+ p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ;
260
+ if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p;
261
+ } else if (p[1]->pos == o_pos[1]>>32 && p[1]->strand == rr[1]) { // [0] moved
262
+ p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ;
263
+ if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p;
264
+ } else { // both ends moved
265
+ p[0]->seQ = p[1]->seQ = 0;
266
+ mapQ_p -= 20;
267
+ if (mapQ_p < 0) mapQ_p = 0;
268
+ p[0]->mapQ = p[1]->mapQ = mapQ_p;
269
+ }
270
+ __pairing_aux2(p[0], o_pos[0]);
271
+ __pairing_aux2(p[1], o_pos[1]);
272
+ }
273
+ return cnt_chg;
274
+ }
275
+
276
/* Per-read copy of the alignment records read from the alignment file;
 * bwa_cal_pac_pos_pe() saves them in its first pass and restores them in its second. */
+ typedef struct {
277
+ kvec_t(bwt_aln1_t) aln;
278
+ } aln_buf_t;
279
+
280
+ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
281
+ const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii)
282
+ {
283
+ int i, j, cnt_chg = 0;
284
+ char str[1024];
285
+ bwt_t *bwt[2];
286
+ pe_data_t *d;
287
+ aln_buf_t *buf[2];
288
+
289
+ d = (pe_data_t*)calloc(1, sizeof(pe_data_t));
290
+ buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
291
+ buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
292
+
293
+ if (_bwt[0] == 0) { // load forward SA
294
+ strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
295
+ strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
296
+ strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
297
+ strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
298
+ } else bwt[0] = _bwt[0], bwt[1] = _bwt[1];
299
+
300
+ // SE
301
+ for (i = 0; i != n_seqs; ++i) {
302
+ bwa_seq_t *p[2];
303
+ for (j = 0; j < 2; ++j) {
304
+ int n_aln;
305
+ p[j] = seqs[j] + i;
306
+ p[j]->n_multi = 0;
307
+ p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2);
308
+ fread(&n_aln, 4, 1, fp_sa[j]);
309
+ if (n_aln > kv_max(d->aln[j]))
310
+ kv_resize(bwt_aln1_t, d->aln[j], n_aln);
311
+ d->aln[j].n = n_aln;
312
+ fread(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]);
313
+ kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j]
314
+ // generate SE alignment and mapping quality
315
+ bwa_aln2seq(n_aln, d->aln[j].a, p[j]);
316
+ if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) {
317
+ int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff;
318
+ p[j]->pos = p[j]->strand? bwt_sa(bwt[0], p[j]->sa)
319
+ : bwt[1]->seq_len - (bwt_sa(bwt[1], p[j]->sa) + p[j]->len);
320
+ p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff);
321
+ }
322
+ }
323
+ }
324
+
325
+ // infer isize
326
+ infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt[0]->seq_len);
327
+ if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii;
328
+ if (opt->force_isize) {
329
+ fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__);
330
+ ii->low = ii->high = 0; ii->avg = ii->std = -1.0;
331
+ }
332
+
333
+ // PE
334
+ for (i = 0; i != n_seqs; ++i) {
335
+ bwa_seq_t *p[2];
336
+ for (j = 0; j < 2; ++j) {
337
+ p[j] = seqs[j] + i;
338
+ kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln);
339
+ }
340
+ if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
341
+ && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
342
+ { // only when both ends mapped
343
+ uint64_t x;
344
+ int j, k, n_occ[2];
345
+ for (j = 0; j < 2; ++j) {
346
+ n_occ[j] = 0;
347
+ for (k = 0; k < d->aln[j].n; ++k)
348
+ n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1;
349
+ }
350
+ if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue;
351
+ d->arr.n = 0;
352
+ for (j = 0; j < 2; ++j) {
353
+ for (k = 0; k < d->aln[j].n; ++k) {
354
+ bwt_aln1_t *r = d->aln[j].a + k;
355
+ bwtint_t l;
356
+ if (r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
357
+ uint64_t key = (uint64_t)r->k<<32 | r->l;
358
+ int ret;
359
+ khint_t iter = kh_put(64, g_hash, key, &ret);
360
+ if (ret) { // not in the hash table; ret must equal 1 as we never remove elements
361
+ poslist_t *z = &kh_val(g_hash, iter);
362
+ z->n = r->l - r->k + 1;
363
+ z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n);
364
+ for (l = r->k; l <= r->l; ++l)
365
+ z->a[l - r->k] = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len);
366
+ }
367
+ for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
368
+ x = kh_val(g_hash, iter).a[l];
369
+ x = x<<32 | k<<1 | j;
370
+ kv_push(uint64_t, d->arr, x);
371
+ }
372
+ } else { // then calculate on the fly
373
+ for (l = r->k; l <= r->l; ++l) {
374
+ x = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len);
375
+ x = x<<32 | k<<1 | j;
376
+ kv_push(uint64_t, d->arr, x);
377
+ }
378
+ }
379
+ }
380
+ }
381
+ cnt_chg += pairing(p, d, opt, gopt->s_mm, ii);
382
+ }
383
+
384
+ if (opt->N_multi || opt->n_multi) {
385
+ for (j = 0; j < 2; ++j) {
386
+ if (p[j]->type != BWA_TYPE_NO_MATCH) {
387
+ int k;
388
+ if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) {
389
+ bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi);
390
+ } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi);
391
+ for (k = 0; k < p[j]->n_multi; ++k) {
392
+ bwt_multi1_t *q = p[j]->multi + k;
393
+ q->pos = q->strand? bwt_sa(bwt[0], q->pos) : bwt[1]->seq_len - (bwt_sa(bwt[1], q->pos) + p[j]->len);
394
+ }
395
+ }
396
+ }
397
+ }
398
+ }
399
+
400
+ // free
401
+ for (i = 0; i < n_seqs; ++i) {
402
+ kv_destroy(buf[0][i].aln);
403
+ kv_destroy(buf[1][i].aln);
404
+ }
405
+ free(buf[0]); free(buf[1]);
406
+ if (_bwt[0] == 0) {
407
+ bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
408
+ }
409
+ kv_destroy(d->arr);
410
+ kv_destroy(d->pos[0]); kv_destroy(d->pos[1]);
411
+ kv_destroy(d->aln[0]); kv_destroy(d->aln[1]);
412
+ free(d);
413
+ return cnt_chg;
414
+ }
415
+
416
+ #define SW_MIN_MATCH_LEN 20
417
+ #define SW_MIN_MAPQ 17
418
+
419
+ // cnt = n_mm<<16 | n_gapo<<8 | n_gape
420
+ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen,
421
+ int *n_cigar, uint32_t *_cnt)
422
+ {
423
+ bwa_cigar_t *cigar = 0;
424
+ ubyte_t *ref_seq;
425
+ bwtint_t k, x, y, l;
426
+ int path_len, ret;
427
+ AlnParam ap = aln_param_bwa;
428
+ path_t *path, *p;
429
+
430
+ // check whether there are too many N's
431
+ if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0;
432
+ for (k = 0, x = 0; k < len; ++k)
433
+ if (seq[k] >= 4) ++x;
434
+ if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0;
435
+
436
+ // get reference subsequence
437
+ ref_seq = (ubyte_t*)calloc(reglen, 1);
438
+ for (k = *beg, l = 0; l < reglen && k < l_pac; ++k)
439
+ ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
440
+ path = (path_t*)calloc(l+len, sizeof(path_t));
441
+
442
+ // do alignment
443
+ ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, 0);
444
+ if (ret < 0) {
445
+ free(path); free(cigar); free(ref_seq); *n_cigar = 0;
446
+ return 0;
447
+ }
448
+ cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
449
+
450
+ // check whether the alignment is good enough
451
+ for (k = 0, x = y = 0; k < *n_cigar; ++k) {
452
+ bwa_cigar_t c = cigar[k];
453
+ if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c);
454
+ else if (__cigar_op(c) == FROM_D) x += __cigar_len(c);
455
+ else y += __cigar_len(c);
456
+ }
457
+ if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough
458
+ free(path); free(cigar); free(ref_seq);
459
+ *n_cigar = 0;
460
+ return 0;
461
+ }
462
+
463
+ { // update cigar and coordinate;
464
+ int start, end;
465
+ p = path + path_len - 1;
466
+ *beg += (p->i? p->i : 1) - 1;
467
+ start = (p->j? p->j : 1) - 1;
468
+ end = path->j;
469
+ cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2));
470
+ if (start) {
471
+ memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar));
472
+ cigar[0] = __cigar_create(3, start);
473
+ ++(*n_cigar);
474
+ }
475
+ if (end < len) {
476
+ /*cigar[*n_cigar] = 3<<14 | (len - end);*/
477
+ cigar[*n_cigar] = __cigar_create(3, (len - end));
478
+ ++(*n_cigar);
479
+ }
480
+ }
481
+
482
+ { // set *cnt
483
+ int n_mm, n_gapo, n_gape;
484
+ n_mm = n_gapo = n_gape = 0;
485
+ p = path + path_len - 1;
486
+ x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0;
487
+ for (k = 0; k < *n_cigar; ++k) {
488
+ bwa_cigar_t c = cigar[k];
489
+ if (__cigar_op(c) == FROM_M) {
490
+ for (l = 0; l < (__cigar_len(c)); ++l)
491
+ if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm;
492
+ x += __cigar_len(c), y += __cigar_len(c);
493
+ } else if (__cigar_op(c) == FROM_D) {
494
+ x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
495
+ } else if (__cigar_op(c) == FROM_I) {
496
+ y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
497
+ }
498
+ }
499
+ *_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape;
500
+ }
501
+
502
+ free(ref_seq); free(path);
503
+ return cigar;
504
+ }
505
+
506
+ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii)
507
+ {
508
+ ubyte_t *pacseq;
509
+ int i;
510
+ uint64_t n_tot[2], n_mapped[2];
511
+
512
+ // load reference sequence
513
+ if (_pacseq == 0) {
514
+ pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
515
+ rewind(bns->fp_pac);
516
+ fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
517
+ } else pacseq = (ubyte_t*)_pacseq;
518
+ if (!popt->is_sw || ii->avg < 0.0) return pacseq;
519
+
520
+ // perform mate alignment
521
+ n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0;
522
+ for (i = 0; i != n_seqs; ++i) {
523
+ bwa_seq_t *p[2];
524
+ p[0] = seqs[0] + i; p[1] = seqs[1] + i;
525
+ if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ
526
+ int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2];
527
+ int64_t beg[2], end[2];
528
+ bwa_cigar_t *cigar[2];
529
+ uint32_t cnt[2];
530
+
531
+ /* In the following, _pref points to the reference read
532
+ * which must be aligned; _pmate points to its mate which is
533
+ * considered to be modified. */
534
+
535
+ #define __set_rght_coor(_a, _b, _pref, _pmate) do { \
536
+ (_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \
537
+ (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \
538
+ if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \
539
+ if ((_b) > bns->l_pac) (_b) = bns->l_pac; \
540
+ } while (0)
541
+
542
+ #define __set_left_coor(_a, _b, _pref, _pmate) do { \
543
+ (_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \
544
+ (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \
545
+ if ((_a) < 0) (_a) = 0; \
546
+ if ((_b) > _pref->pos) (_b) = _pref->pos; \
547
+ } while (0)
548
+
549
+ #define __set_fixed(_pref, _pmate, _beg, _cnt) do { \
550
+ _pmate->type = BWA_TYPE_MATESW; \
551
+ _pmate->pos = _beg; \
552
+ _pmate->seQ = _pref->seQ; \
553
+ _pmate->strand = (popt->type == BWA_PET_STD)? 1 - _pref->strand : _pref->strand; \
554
+ _pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \
555
+ _pmate->extra_flag |= SAM_FPP; \
556
+ _pref->extra_flag |= SAM_FPP; \
557
+ } while (0)
558
+
559
+ mq_adjust[0] = mq_adjust[1] = 255; // not effective
560
+ is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 1 : 0;
561
+
562
+ ++n_tot[is_singleton];
563
+ cigar[0] = cigar[1] = 0;
564
+ n_cigar[0] = n_cigar[1] = 0;
565
+ if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered
566
+ for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
567
+ ubyte_t *seq;
568
+ if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
569
+ if (popt->type == BWA_PET_STD) {
570
+ if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
571
+ __set_rght_coor(beg[k], end[k], p[1-k], p[k]);
572
+ seq = p[k]->rseq;
573
+ } else { // then the mate is on forward stand and has smaller coordinate
574
+ __set_left_coor(beg[k], end[k], p[1-k], p[k]);
575
+ seq = p[k]->seq;
576
+ seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly
577
+ }
578
+ } else { // BWA_PET_SOLID
579
+ if (p[1-k]->strand == 0) { // R3-F3 pairing
580
+ if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
581
+ else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
582
+ seq = p[k]->rseq;
583
+ seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed
584
+ } else { // F3-R3 pairing
585
+ if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
586
+ else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
587
+ seq = p[k]->seq;
588
+ }
589
+ }
590
+ // perform SW alignment
591
+ cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
592
+ if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k]
593
+ int s_old, clip = 0, s_new;
594
+ if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]);
595
+ if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]);
596
+ s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499);
597
+ s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. + .499);
598
+ s_old += -4.343 * log(ii->ap_prior / bns->l_pac);
599
+ s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5) + .499)); // assume the mapped isize is 1.5\sigma
600
+ if (s_old < s_new) { // reject SW alignment
601
+ mq_adjust[k] = s_new - s_old;
602
+ free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0;
603
+ } else mq_adjust[k] = s_old - s_new;
604
+ }
605
+ // now revserse sequence back such that p[*]->seq looks untouched
606
+ if (popt->type == BWA_PET_STD) {
607
+ if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0);
608
+ } else {
609
+ if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0);
610
+ }
611
+ }
612
+ k = -1; // no read to be changed
613
+ if (cigar[0] && cigar[1]) {
614
+ k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed
615
+ mapQ = abs(p[1]->mapQ - p[0]->mapQ);
616
+ } else if (cigar[0]) k = 0, mapQ = p[1]->mapQ;
617
+ else if (cigar[1]) k = 1, mapQ = p[0]->mapQ;
618
+ if (k >= 0 && p[k]->pos != beg[k]) {
619
+ ++n_mapped[is_singleton];
620
+ { // recalculate mapping quality
621
+ int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8;
622
+ if (tmp <= 0) tmp = 1;
623
+ if (mapQ > tmp) mapQ = tmp;
624
+ p[k]->mapQ = p[1-k]->mapQ = mapQ;
625
+ p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? p[1-k]->seQ : mapQ;
626
+ if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k];
627
+ if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k];
628
+ }
629
+ // update CIGAR
630
+ free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0;
631
+ p[k]->n_cigar = n_cigar[k];
632
+ // update the rest of information
633
+ __set_fixed(p[1-k], p[k], beg[k], cnt[k]);
634
+ }
635
+ free(cigar[0]); free(cigar[1]);
636
+ }
637
+ }
638
+ fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n",
639
+ (long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ);
640
+ fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n",
641
+ (long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ);
642
+ return pacseq;
643
+ }
644
+
645
+ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
646
+ {
647
+ extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
648
+ int i, j, n_seqs, tot_seqs = 0;
649
+ bwa_seq_t *seqs[2];
650
+ bwa_seqio_t *ks[2];
651
+ clock_t t;
652
+ bntseq_t *bns, *ntbns = 0;
653
+ FILE *fp_sa[2];
654
+ gap_opt_t opt, opt0;
655
+ khint_t iter;
656
+ isize_info_t last_ii; // this is for the last batch of reads
657
+ char str[1024];
658
+ bwt_t *bwt[2];
659
+ uint8_t *pac;
660
+
661
+ // initialization
662
+ bwase_initialize(); // initialize g_log_n[] in bwase.c
663
+ pac = 0; bwt[0] = bwt[1] = 0;
664
+ for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
665
+ bns = bns_restore(prefix);
666
+ srand48(bns->seed);
667
+ fp_sa[0] = xopen(fn_sa[0], "r");
668
+ fp_sa[1] = xopen(fn_sa[1], "r");
669
+ g_hash = kh_init(64);
670
+ last_ii.avg = -1.0;
671
+
672
+ fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]);
673
+ ks[0] = bwa_open_reads(opt.mode, fn_fa[0]);
674
+ opt0 = opt;
675
+ fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
676
+ ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
677
+ if (!(opt.mode & BWA_MODE_COMPREAD)) {
678
+ popt->type = BWA_PET_SOLID;
679
+ ntbns = bwa_open_nt(prefix);
680
+ } else { // for Illumina alignment only
681
+ if (popt->is_preload) {
682
+ strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
683
+ strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
684
+ strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
685
+ strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
686
+ pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
687
+ rewind(bns->fp_pac);
688
+ fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
689
+ }
690
+ }
691
+
692
+ // core loop
693
+ bwa_print_sam_SQ(bns);
694
+ bwa_print_sam_PG();
695
+ while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
696
+ int cnt_chg;
697
+ isize_info_t ii;
698
+ ubyte_t *pacseq;
699
+
700
+ seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual);
701
+ tot_seqs += n_seqs;
702
+ t = clock();
703
+
704
+ fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n");
705
+ cnt_chg = bwa_cal_pac_pos_pe(prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
706
+ fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
707
+ fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg);
708
+
709
+ fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n");
710
+ pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii);
711
+ fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
712
+
713
+ fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
714
+ for (j = 0; j < 2; ++j)
715
+ bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns);
716
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
717
+ if (pac == 0) free(pacseq);
718
+
719
+ fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... ");
720
+ for (i = 0; i < n_seqs; ++i) {
721
+ bwa_seq_t *p[2];
722
+ p[0] = seqs[0] + i; p[1] = seqs[1] + i;
723
+ if (p[0]->bc[0] || p[1]->bc[0]) {
724
+ strcat(p[0]->bc, p[1]->bc);
725
+ strcpy(p[1]->bc, p[0]->bc);
726
+ }
727
+ bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2);
728
+ bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2);
729
+ }
730
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
731
+
732
+ for (j = 0; j < 2; ++j)
733
+ bwa_free_read_seq(n_seqs, seqs[j]);
734
+ fprintf(stderr, "[bwa_sai2sam_pe_core] %d sequences have been processed.\n", tot_seqs);
735
+ last_ii = ii;
736
+ }
737
+
738
+ // destroy
739
+ bns_destroy(bns);
740
+ if (ntbns) bns_destroy(ntbns);
741
+ for (i = 0; i < 2; ++i) {
742
+ bwa_seq_close(ks[i]);
743
+ fclose(fp_sa[i]);
744
+ }
745
+ for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter)
746
+ if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a);
747
+ kh_destroy(64, g_hash);
748
+ if (pac) {
749
+ free(pac); bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
750
+ }
751
+ }
752
+
753
+ int bwa_sai2sam_pe(int argc, char *argv[])
754
+ {
755
+ extern char *bwa_rg_line, *bwa_rg_id;
756
+ extern int bwa_set_rg(const char *s);
757
+ int c;
758
+ pe_opt_t *popt;
759
+ popt = bwa_init_pe_opt();
760
+ optind = 1;
761
+ while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
762
+ switch (c) {
763
+ case 'r':
764
+ if (bwa_set_rg(optarg) < 0) {
765
+ fprintf(stderr, "[%s] malformated @RG line\n", __func__);
766
+ return 1;
767
+ }
768
+ break;
769
+ case 'a': popt->max_isize = atoi(optarg); break;
770
+ case 'o': popt->max_occ = atoi(optarg); break;
771
+ case 's': popt->is_sw = 0; break;
772
+ case 'P': popt->is_preload = 1; break;
773
+ case 'n': popt->n_multi = atoi(optarg); break;
774
+ case 'N': popt->N_multi = atoi(optarg); break;
775
+ case 'c': popt->ap_prior = atof(optarg); break;
776
+ case 'f': xreopen(optarg, "w", stdout); break;
777
+ case 'A': popt->force_isize = 1; break;
778
+ default: return 1;
779
+ }
780
+ }
781
+
782
+ if (optind + 5 > argc) {
783
+ fprintf(stderr, "\n");
784
+ fprintf(stderr, "Usage: bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
785
+ fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize);
786
+ fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ);
787
+ fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi);
788
+ fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi);
789
+ fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior);
790
+ fprintf(stderr, " -f FILE sam file to output results to [stdout]\n");
791
+ fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n");
792
+ fprintf(stderr, " -P preload index into memory (for base-space reads only)\n");
793
+ fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n");
794
+ fprintf(stderr, " -A disable insert size estimate (force -s)\n\n");
795
+ fprintf(stderr, "Notes: 1. For SOLiD reads, <in1.fq> corresponds R3 reads and <in2.fq> to F3.\n");
796
+ fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n");
797
+ fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n");
798
+ fprintf(stderr, "\n");
799
+ return 1;
800
+ }
801
+ bwa_sai2sam_pe_core(argv[optind], argv + optind + 1, argv + optind+3, popt);
802
+ free(bwa_rg_line); free(bwa_rg_id);
803
+ free(popt);
804
+ fflush(stdout);
805
+ xreopen("/dev/tty","w",stdout);
806
+ return 0;
807
+ }