bio-bwa 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/bwape.c ADDED
@@ -0,0 +1,807 @@
1
+ #include <unistd.h>
2
+ #include <math.h>
3
+ #include <stdlib.h>
4
+ #include <time.h>
5
+ #include <stdio.h>
6
+ #include <string.h>
7
+ #include "bwtaln.h"
8
+ #include "kvec.h"
9
+ #include "bntseq.h"
10
+ #include "utils.h"
11
+ #include "stdaln.h"
12
+
13
+ typedef struct {
14
+ int n;
15
+ bwtint_t *a;
16
+ } poslist_t;
17
+
18
+ typedef struct {
19
+ double avg, std, ap_prior;
20
+ bwtint_t low, high, high_bayesian;
21
+ } isize_info_t;
22
+
23
+ #include "khash.h"
24
+ KHASH_MAP_INIT_INT64(64, poslist_t)
25
+
26
+ #include "ksort.h"
27
+ KSORT_INIT_GENERIC(uint64_t)
28
+
29
+ typedef struct {
30
+ kvec_t(uint64_t) arr;
31
+ kvec_t(uint64_t) pos[2];
32
+ kvec_t(bwt_aln1_t) aln[2];
33
+ } pe_data_t;
34
+
35
+ #define MIN_HASH_WIDTH 1000
36
+
37
+ extern int g_log_n[256]; // in bwase.c
38
+ static kh_64_t *g_hash;
39
+
40
+ void bwase_initialize();
41
+ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi);
42
+ void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
43
+ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
44
+ int bwa_approx_mapQ(const bwa_seq_t *p, int mm);
45
+ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2);
46
+ bntseq_t *bwa_open_nt(const char *prefix);
47
+ void bwa_print_sam_SQ(const bntseq_t *bns);
48
+ void bwa_print_sam_PG();
49
+
50
+ pe_opt_t *bwa_init_pe_opt()
51
+ {
52
+ pe_opt_t *po;
53
+ po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t));
54
+ po->max_isize = 500;
55
+ po->force_isize = 0;
56
+ po->max_occ = 100000;
57
+ po->n_multi = 3;
58
+ po->N_multi = 10;
59
+ po->type = BWA_PET_STD;
60
+ po->is_sw = 1;
61
+ po->ap_prior = 1e-5;
62
+ return po;
63
+ }
64
+
65
/*
 * Mix the bits of a 64-bit key with a fixed sequence of shift/add/xor
 * rounds and return the scrambled value. All arithmetic is unsigned and
 * wraps modulo 2^64.
 */
static inline uint64_t hash_64(uint64_t key)
{
	uint64_t h = key;

	h = h + ~(h << 32);
	h = h ^ (h >> 22);
	h = h + ~(h << 13);
	h = h ^ (h >> 8);
	h = h + (h << 3);
	h = h ^ (h >> 15);
	h = h + ~(h << 27);
	h = h ^ (h >> 31);
	return h;
}
77
+ /*
78
+ static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
79
+ {
80
+ const double a = 0.140012;
81
+ double b, c;
82
+ b = log(x * (2 - x));
83
+ c = 2./M_PI/a + b / 2.;
84
+ return sqrt(sqrt(c * c - b / a) - c);
85
+ }
86
+ */
87
+
88
+ // for normal distribution, this is about 3std
89
+ #define OUTLIER_BOUND 2.0
90
+
91
+ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L)
92
+ {
93
+ uint64_t x, *isizes, n_ap = 0;
94
+ int n, i, tot, p25, p75, p50, max_len = 1, tmp;
95
+ double skewness = 0.0, kurtosis = 0.0, y;
96
+
97
+ ii->avg = ii->std = -1.0;
98
+ ii->low = ii->high = ii->high_bayesian = 0;
99
+ isizes = (uint64_t*)calloc(n_seqs, 8);
100
+ for (i = 0, tot = 0; i != n_seqs; ++i) {
101
+ bwa_seq_t *p[2];
102
+ p[0] = seqs[0] + i; p[1] = seqs[1] + i;
103
+ if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) {
104
+ x = (p[0]->pos < p[1]->pos)? p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos;
105
+ if (x < 100000) isizes[tot++] = x;
106
+ }
107
+ if (p[0]->len > max_len) max_len = p[0]->len;
108
+ if (p[1]->len > max_len) max_len = p[1]->len;
109
+ }
110
+ if (tot < 20) {
111
+ fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n");
112
+ free(isizes);
113
+ return -1;
114
+ }
115
+ ks_introsort(uint64_t, tot, isizes);
116
+ p25 = isizes[(int)(tot*0.25 + 0.5)];
117
+ p50 = isizes[(int)(tot*0.50 + 0.5)];
118
+ p75 = isizes[(int)(tot*0.75 + 0.5)];
119
+ tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
120
+ ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned
121
+ ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
122
+ for (i = 0, x = n = 0; i < tot; ++i)
123
+ if (isizes[i] >= ii->low && isizes[i] <= ii->high)
124
+ ++n, x += isizes[i];
125
+ ii->avg = (double)x / n;
126
+ for (i = 0; i < tot; ++i) {
127
+ if (isizes[i] >= ii->low && isizes[i] <= ii->high) {
128
+ double tmp = (isizes[i] - ii->avg) * (isizes[i] - ii->avg);
129
+ ii->std += tmp;
130
+ skewness += tmp * (isizes[i] - ii->avg);
131
+ kurtosis += tmp * tmp;
132
+ }
133
+ }
134
+ kurtosis = kurtosis/n / (ii->std / n * ii->std / n) - 3;
135
+ ii->std = sqrt(ii->std / n); // it would be better as n-1, but n is usually very large
136
+ skewness = skewness / n / (ii->std * ii->std * ii->std);
137
+ for (y = 1.0; y < 10.0; y += 0.01)
138
+ if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break;
139
+ ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499);
140
+ for (i = 0; i < tot; ++i)
141
+ if (isizes[i] > ii->high_bayesian) ++n_ap;
142
+ ii->ap_prior = .01 * (n_ap + .01) / tot;
143
+ if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior;
144
+ free(isizes);
145
+ fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75);
146
+ if (isnan(ii->std) || p75 > 100000) {
147
+ ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0;
148
+ fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n");
149
+ return -1;
150
+ }
151
+ for (y = 1.0; y < 10.0; y += 0.01)
152
+ if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break;
153
+ ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499);
154
+ fprintf(stderr, "[infer_isize] low and high boundaries: %d and %d for estimating avg and std\n", ii->low, ii->high);
155
+ fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std);
156
+ fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior);
157
+ fprintf(stderr, "[infer_isize] inferred maximum insert size: %d (%.2lf sigma)\n", ii->high_bayesian, y);
158
+ return 0;
159
+ }
160
+
161
+ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii)
162
+ {
163
+ int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len;
164
+ uint64_t last_pos[2][2], o_pos[2], subo_score, o_score;
165
+ max_len = p[0]->full_len;
166
+ if (max_len < p[1]->full_len) max_len = p[1]->full_len;
167
+ if (low_bound < max_len) low_bound = max_len;
168
+
169
+ // here v>=u. When ii is set, we check insert size with ii; otherwise with opt->max_isize
170
+ #define __pairing_aux(u,v) do { \
171
+ bwtint_t l = ((v)>>32) + p[(v)&1]->len - ((u)>>32); \
172
+ if ((u) != (uint64_t)-1 && (v)>>32 > (u)>>32 && l >= max_len \
173
+ && ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \
174
+ { \
175
+ uint64_t s = d->aln[(v)&1].a[(uint32_t)(v)>>1].score + d->aln[(u)&1].a[(uint32_t)(u)>>1].score; \
176
+ s *= 10; \
177
+ if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \
178
+ s = s<<32 | (uint32_t)hash_64((u)>>32<<32 | (v)>>32); \
179
+ if (s>>32 == o_score>>32) ++o_n; \
180
+ else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \
181
+ else ++subo_n; \
182
+ if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u)&1] = (u), o_pos[(v)&1] = (v); \
183
+ else if (s < subo_score) subo_score = s; \
184
+ } \
185
+ } while (0)
186
+
187
+ #define __pairing_aux2(q, w) do { \
188
+ const bwt_aln1_t *r = d->aln[(w)&1].a + ((uint32_t)(w)>>1); \
189
+ (q)->extra_flag |= SAM_FPP; \
190
+ if ((q)->pos != (w)>>32 || (q)->strand != r->a) { \
191
+ (q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = r->a; \
192
+ (q)->score = r->score; \
193
+ (q)->pos = (w)>>32; \
194
+ if ((q)->mapQ > 0) ++cnt_chg; \
195
+ } \
196
+ } while (0)
197
+
198
+ o_score = subo_score = (uint64_t)-1;
199
+ o_n = subo_n = 0;
200
+ ks_introsort(uint64_t, d->arr.n, d->arr.a);
201
+ for (j = 0; j < 2; ++j) last_pos[j][0] = last_pos[j][1] = (uint64_t)-1;
202
+ if (opt->type == BWA_PET_STD) {
203
+ for (i = 0; i < d->arr.n; ++i) {
204
+ uint64_t x = d->arr.a[i];
205
+ int strand = d->aln[x&1].a[(uint32_t)x>>1].a;
206
+ if (strand == 1) { // reverse strand, then check
207
+ int y = 1 - (x&1);
208
+ __pairing_aux(last_pos[y][1], x);
209
+ __pairing_aux(last_pos[y][0], x);
210
+ } else { // forward strand, then push
211
+ last_pos[x&1][0] = last_pos[x&1][1];
212
+ last_pos[x&1][1] = x;
213
+ }
214
+ }
215
+ } else if (opt->type == BWA_PET_SOLID) {
216
+ for (i = 0; i < d->arr.n; ++i) {
217
+ uint64_t x = d->arr.a[i];
218
+ int strand = d->aln[x&1].a[(uint32_t)x>>1].a;
219
+ if ((strand^x)&1) { // push
220
+ int y = 1 - (x&1);
221
+ __pairing_aux(last_pos[y][1], x);
222
+ __pairing_aux(last_pos[y][0], x);
223
+ } else { // check
224
+ last_pos[x&1][0] = last_pos[x&1][1];
225
+ last_pos[x&1][1] = x;
226
+ }
227
+ }
228
+ } else {
229
+ fprintf(stderr, "[paring] not implemented yet!\n");
230
+ exit(1);
231
+ }
232
+ // set pairing
233
+ //fprintf(stderr, "[%d, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n);
234
+ if (o_score != (uint64_t)-1) {
235
+ int mapQ_p = 0; // this is the maximum mapping quality when one end is moved
236
+ int rr[2];
237
+ //fprintf(stderr, "%d, %d\n", o_n, subo_n);
238
+ if (o_n == 1) {
239
+ if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair
240
+ else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair
241
+ else {
242
+ int n = subo_n > 255? 255 : subo_n;
243
+ mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n];
244
+ if (mapQ_p < 0) mapQ_p = 0;
245
+ }
246
+ }
247
+ rr[0] = d->aln[o_pos[0]&1].a[(uint32_t)o_pos[0]>>1].a;
248
+ rr[1] = d->aln[o_pos[1]&1].a[(uint32_t)o_pos[1]>>1].a;
249
+ if ((p[0]->pos == o_pos[0]>>32 && p[0]->strand == rr[0]) && (p[1]->pos == o_pos[1]>>32 && p[1]->strand == rr[1])) { // both ends not moved
250
+ if (p[0]->mapQ > 0 && p[1]->mapQ > 0) {
251
+ int mapQ = p[0]->mapQ + p[1]->mapQ;
252
+ if (mapQ > 60) mapQ = 60;
253
+ p[0]->mapQ = p[1]->mapQ = mapQ;
254
+ } else {
255
+ if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ;
256
+ if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? mapQ_p + 7 : p[0]->mapQ;
257
+ }
258
+ } else if (p[0]->pos == o_pos[0]>>32 && p[0]->strand == rr[0]) { // [1] moved
259
+ p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ;
260
+ if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p;
261
+ } else if (p[1]->pos == o_pos[1]>>32 && p[1]->strand == rr[1]) { // [0] moved
262
+ p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ;
263
+ if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p;
264
+ } else { // both ends moved
265
+ p[0]->seQ = p[1]->seQ = 0;
266
+ mapQ_p -= 20;
267
+ if (mapQ_p < 0) mapQ_p = 0;
268
+ p[0]->mapQ = p[1]->mapQ = mapQ_p;
269
+ }
270
+ __pairing_aux2(p[0], o_pos[0]);
271
+ __pairing_aux2(p[1], o_pos[1]);
272
+ }
273
+ return cnt_chg;
274
+ }
275
+
276
+ typedef struct {
277
+ kvec_t(bwt_aln1_t) aln;
278
+ } aln_buf_t;
279
+
280
+ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
281
+ const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii)
282
+ {
283
+ int i, j, cnt_chg = 0;
284
+ char str[1024];
285
+ bwt_t *bwt[2];
286
+ pe_data_t *d;
287
+ aln_buf_t *buf[2];
288
+
289
+ d = (pe_data_t*)calloc(1, sizeof(pe_data_t));
290
+ buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
291
+ buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
292
+
293
+ if (_bwt[0] == 0) { // load forward SA
294
+ strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
295
+ strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
296
+ strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
297
+ strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
298
+ } else bwt[0] = _bwt[0], bwt[1] = _bwt[1];
299
+
300
+ // SE
301
+ for (i = 0; i != n_seqs; ++i) {
302
+ bwa_seq_t *p[2];
303
+ for (j = 0; j < 2; ++j) {
304
+ int n_aln;
305
+ p[j] = seqs[j] + i;
306
+ p[j]->n_multi = 0;
307
+ p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2);
308
+ fread(&n_aln, 4, 1, fp_sa[j]);
309
+ if (n_aln > kv_max(d->aln[j]))
310
+ kv_resize(bwt_aln1_t, d->aln[j], n_aln);
311
+ d->aln[j].n = n_aln;
312
+ fread(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]);
313
+ kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j]
314
+ // generate SE alignment and mapping quality
315
+ bwa_aln2seq(n_aln, d->aln[j].a, p[j]);
316
+ if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) {
317
+ int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff;
318
+ p[j]->pos = p[j]->strand? bwt_sa(bwt[0], p[j]->sa)
319
+ : bwt[1]->seq_len - (bwt_sa(bwt[1], p[j]->sa) + p[j]->len);
320
+ p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff);
321
+ }
322
+ }
323
+ }
324
+
325
+ // infer isize
326
+ infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt[0]->seq_len);
327
+ if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii;
328
+ if (opt->force_isize) {
329
+ fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__);
330
+ ii->low = ii->high = 0; ii->avg = ii->std = -1.0;
331
+ }
332
+
333
+ // PE
334
+ for (i = 0; i != n_seqs; ++i) {
335
+ bwa_seq_t *p[2];
336
+ for (j = 0; j < 2; ++j) {
337
+ p[j] = seqs[j] + i;
338
+ kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln);
339
+ }
340
+ if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
341
+ && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
342
+ { // only when both ends mapped
343
+ uint64_t x;
344
+ int j, k, n_occ[2];
345
+ for (j = 0; j < 2; ++j) {
346
+ n_occ[j] = 0;
347
+ for (k = 0; k < d->aln[j].n; ++k)
348
+ n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1;
349
+ }
350
+ if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue;
351
+ d->arr.n = 0;
352
+ for (j = 0; j < 2; ++j) {
353
+ for (k = 0; k < d->aln[j].n; ++k) {
354
+ bwt_aln1_t *r = d->aln[j].a + k;
355
+ bwtint_t l;
356
+ if (r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
357
+ uint64_t key = (uint64_t)r->k<<32 | r->l;
358
+ int ret;
359
+ khint_t iter = kh_put(64, g_hash, key, &ret);
360
+ if (ret) { // not in the hash table; ret must equal 1 as we never remove elements
361
+ poslist_t *z = &kh_val(g_hash, iter);
362
+ z->n = r->l - r->k + 1;
363
+ z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n);
364
+ for (l = r->k; l <= r->l; ++l)
365
+ z->a[l - r->k] = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len);
366
+ }
367
+ for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
368
+ x = kh_val(g_hash, iter).a[l];
369
+ x = x<<32 | k<<1 | j;
370
+ kv_push(uint64_t, d->arr, x);
371
+ }
372
+ } else { // then calculate on the fly
373
+ for (l = r->k; l <= r->l; ++l) {
374
+ x = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len);
375
+ x = x<<32 | k<<1 | j;
376
+ kv_push(uint64_t, d->arr, x);
377
+ }
378
+ }
379
+ }
380
+ }
381
+ cnt_chg += pairing(p, d, opt, gopt->s_mm, ii);
382
+ }
383
+
384
+ if (opt->N_multi || opt->n_multi) {
385
+ for (j = 0; j < 2; ++j) {
386
+ if (p[j]->type != BWA_TYPE_NO_MATCH) {
387
+ int k;
388
+ if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) {
389
+ bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi);
390
+ } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi);
391
+ for (k = 0; k < p[j]->n_multi; ++k) {
392
+ bwt_multi1_t *q = p[j]->multi + k;
393
+ q->pos = q->strand? bwt_sa(bwt[0], q->pos) : bwt[1]->seq_len - (bwt_sa(bwt[1], q->pos) + p[j]->len);
394
+ }
395
+ }
396
+ }
397
+ }
398
+ }
399
+
400
+ // free
401
+ for (i = 0; i < n_seqs; ++i) {
402
+ kv_destroy(buf[0][i].aln);
403
+ kv_destroy(buf[1][i].aln);
404
+ }
405
+ free(buf[0]); free(buf[1]);
406
+ if (_bwt[0] == 0) {
407
+ bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
408
+ }
409
+ kv_destroy(d->arr);
410
+ kv_destroy(d->pos[0]); kv_destroy(d->pos[1]);
411
+ kv_destroy(d->aln[0]); kv_destroy(d->aln[1]);
412
+ free(d);
413
+ return cnt_chg;
414
+ }
415
+
416
+ #define SW_MIN_MATCH_LEN 20
417
+ #define SW_MIN_MAPQ 17
418
+
419
+ // cnt = n_mm<<16 | n_gapo<<8 | n_gape
420
+ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen,
421
+ int *n_cigar, uint32_t *_cnt)
422
+ {
423
+ bwa_cigar_t *cigar = 0;
424
+ ubyte_t *ref_seq;
425
+ bwtint_t k, x, y, l;
426
+ int path_len, ret;
427
+ AlnParam ap = aln_param_bwa;
428
+ path_t *path, *p;
429
+
430
+ // check whether there are too many N's
431
+ if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0;
432
+ for (k = 0, x = 0; k < len; ++k)
433
+ if (seq[k] >= 4) ++x;
434
+ if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0;
435
+
436
+ // get reference subsequence
437
+ ref_seq = (ubyte_t*)calloc(reglen, 1);
438
+ for (k = *beg, l = 0; l < reglen && k < l_pac; ++k)
439
+ ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
440
+ path = (path_t*)calloc(l+len, sizeof(path_t));
441
+
442
+ // do alignment
443
+ ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, 0);
444
+ if (ret < 0) {
445
+ free(path); free(cigar); free(ref_seq); *n_cigar = 0;
446
+ return 0;
447
+ }
448
+ cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
449
+
450
+ // check whether the alignment is good enough
451
+ for (k = 0, x = y = 0; k < *n_cigar; ++k) {
452
+ bwa_cigar_t c = cigar[k];
453
+ if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c);
454
+ else if (__cigar_op(c) == FROM_D) x += __cigar_len(c);
455
+ else y += __cigar_len(c);
456
+ }
457
+ if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough
458
+ free(path); free(cigar); free(ref_seq);
459
+ *n_cigar = 0;
460
+ return 0;
461
+ }
462
+
463
+ { // update cigar and coordinate;
464
+ int start, end;
465
+ p = path + path_len - 1;
466
+ *beg += (p->i? p->i : 1) - 1;
467
+ start = (p->j? p->j : 1) - 1;
468
+ end = path->j;
469
+ cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2));
470
+ if (start) {
471
+ memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar));
472
+ cigar[0] = __cigar_create(3, start);
473
+ ++(*n_cigar);
474
+ }
475
+ if (end < len) {
476
+ /*cigar[*n_cigar] = 3<<14 | (len - end);*/
477
+ cigar[*n_cigar] = __cigar_create(3, (len - end));
478
+ ++(*n_cigar);
479
+ }
480
+ }
481
+
482
+ { // set *cnt
483
+ int n_mm, n_gapo, n_gape;
484
+ n_mm = n_gapo = n_gape = 0;
485
+ p = path + path_len - 1;
486
+ x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0;
487
+ for (k = 0; k < *n_cigar; ++k) {
488
+ bwa_cigar_t c = cigar[k];
489
+ if (__cigar_op(c) == FROM_M) {
490
+ for (l = 0; l < (__cigar_len(c)); ++l)
491
+ if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm;
492
+ x += __cigar_len(c), y += __cigar_len(c);
493
+ } else if (__cigar_op(c) == FROM_D) {
494
+ x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
495
+ } else if (__cigar_op(c) == FROM_I) {
496
+ y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
497
+ }
498
+ }
499
+ *_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape;
500
+ }
501
+
502
+ free(ref_seq); free(path);
503
+ return cigar;
504
+ }
505
+
506
+ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii)
507
+ {
508
+ ubyte_t *pacseq;
509
+ int i;
510
+ uint64_t n_tot[2], n_mapped[2];
511
+
512
+ // load reference sequence
513
+ if (_pacseq == 0) {
514
+ pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
515
+ rewind(bns->fp_pac);
516
+ fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
517
+ } else pacseq = (ubyte_t*)_pacseq;
518
+ if (!popt->is_sw || ii->avg < 0.0) return pacseq;
519
+
520
+ // perform mate alignment
521
+ n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0;
522
+ for (i = 0; i != n_seqs; ++i) {
523
+ bwa_seq_t *p[2];
524
+ p[0] = seqs[0] + i; p[1] = seqs[1] + i;
525
+ if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ
526
+ int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2];
527
+ int64_t beg[2], end[2];
528
+ bwa_cigar_t *cigar[2];
529
+ uint32_t cnt[2];
530
+
531
+ /* In the following, _pref points to the reference read
532
+ * which must be aligned; _pmate points to its mate which is
533
+ * considered to be modified. */
534
+
535
+ #define __set_rght_coor(_a, _b, _pref, _pmate) do { \
536
+ (_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \
537
+ (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \
538
+ if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \
539
+ if ((_b) > bns->l_pac) (_b) = bns->l_pac; \
540
+ } while (0)
541
+
542
+ #define __set_left_coor(_a, _b, _pref, _pmate) do { \
543
+ (_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \
544
+ (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \
545
+ if ((_a) < 0) (_a) = 0; \
546
+ if ((_b) > _pref->pos) (_b) = _pref->pos; \
547
+ } while (0)
548
+
549
+ #define __set_fixed(_pref, _pmate, _beg, _cnt) do { \
550
+ _pmate->type = BWA_TYPE_MATESW; \
551
+ _pmate->pos = _beg; \
552
+ _pmate->seQ = _pref->seQ; \
553
+ _pmate->strand = (popt->type == BWA_PET_STD)? 1 - _pref->strand : _pref->strand; \
554
+ _pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \
555
+ _pmate->extra_flag |= SAM_FPP; \
556
+ _pref->extra_flag |= SAM_FPP; \
557
+ } while (0)
558
+
559
+ mq_adjust[0] = mq_adjust[1] = 255; // not effective
560
+ is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 1 : 0;
561
+
562
+ ++n_tot[is_singleton];
563
+ cigar[0] = cigar[1] = 0;
564
+ n_cigar[0] = n_cigar[1] = 0;
565
+ if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered
566
+ for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
567
+ ubyte_t *seq;
568
+ if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
569
+ if (popt->type == BWA_PET_STD) {
570
+ if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
571
+ __set_rght_coor(beg[k], end[k], p[1-k], p[k]);
572
+ seq = p[k]->rseq;
573
+ } else { // then the mate is on forward stand and has smaller coordinate
574
+ __set_left_coor(beg[k], end[k], p[1-k], p[k]);
575
+ seq = p[k]->seq;
576
+ seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly
577
+ }
578
+ } else { // BWA_PET_SOLID
579
+ if (p[1-k]->strand == 0) { // R3-F3 pairing
580
+ if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
581
+ else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
582
+ seq = p[k]->rseq;
583
+ seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed
584
+ } else { // F3-R3 pairing
585
+ if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
586
+ else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
587
+ seq = p[k]->seq;
588
+ }
589
+ }
590
+ // perform SW alignment
591
+ cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
592
+ if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k]
593
+ int s_old, clip = 0, s_new;
594
+ if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]);
595
+ if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]);
596
+ s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499);
597
+ s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. + .499);
598
+ s_old += -4.343 * log(ii->ap_prior / bns->l_pac);
599
+ s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5) + .499)); // assume the mapped isize is 1.5\sigma
600
+ if (s_old < s_new) { // reject SW alignment
601
+ mq_adjust[k] = s_new - s_old;
602
+ free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0;
603
+ } else mq_adjust[k] = s_old - s_new;
604
+ }
605
+ // now revserse sequence back such that p[*]->seq looks untouched
606
+ if (popt->type == BWA_PET_STD) {
607
+ if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0);
608
+ } else {
609
+ if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0);
610
+ }
611
+ }
612
+ k = -1; // no read to be changed
613
+ if (cigar[0] && cigar[1]) {
614
+ k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed
615
+ mapQ = abs(p[1]->mapQ - p[0]->mapQ);
616
+ } else if (cigar[0]) k = 0, mapQ = p[1]->mapQ;
617
+ else if (cigar[1]) k = 1, mapQ = p[0]->mapQ;
618
+ if (k >= 0 && p[k]->pos != beg[k]) {
619
+ ++n_mapped[is_singleton];
620
+ { // recalculate mapping quality
621
+ int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8;
622
+ if (tmp <= 0) tmp = 1;
623
+ if (mapQ > tmp) mapQ = tmp;
624
+ p[k]->mapQ = p[1-k]->mapQ = mapQ;
625
+ p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? p[1-k]->seQ : mapQ;
626
+ if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k];
627
+ if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k];
628
+ }
629
+ // update CIGAR
630
+ free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0;
631
+ p[k]->n_cigar = n_cigar[k];
632
+ // update the rest of information
633
+ __set_fixed(p[1-k], p[k], beg[k], cnt[k]);
634
+ }
635
+ free(cigar[0]); free(cigar[1]);
636
+ }
637
+ }
638
+ fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n",
639
+ (long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ);
640
+ fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n",
641
+ (long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ);
642
+ return pacseq;
643
+ }
644
+
645
+ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
646
+ {
647
+ extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
648
+ int i, j, n_seqs, tot_seqs = 0;
649
+ bwa_seq_t *seqs[2];
650
+ bwa_seqio_t *ks[2];
651
+ clock_t t;
652
+ bntseq_t *bns, *ntbns = 0;
653
+ FILE *fp_sa[2];
654
+ gap_opt_t opt, opt0;
655
+ khint_t iter;
656
+ isize_info_t last_ii; // this is for the last batch of reads
657
+ char str[1024];
658
+ bwt_t *bwt[2];
659
+ uint8_t *pac;
660
+
661
+ // initialization
662
+ bwase_initialize(); // initialize g_log_n[] in bwase.c
663
+ pac = 0; bwt[0] = bwt[1] = 0;
664
+ for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
665
+ bns = bns_restore(prefix);
666
+ srand48(bns->seed);
667
+ fp_sa[0] = xopen(fn_sa[0], "r");
668
+ fp_sa[1] = xopen(fn_sa[1], "r");
669
+ g_hash = kh_init(64);
670
+ last_ii.avg = -1.0;
671
+
672
+ fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]);
673
+ ks[0] = bwa_open_reads(opt.mode, fn_fa[0]);
674
+ opt0 = opt;
675
+ fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
676
+ ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
677
+ if (!(opt.mode & BWA_MODE_COMPREAD)) {
678
+ popt->type = BWA_PET_SOLID;
679
+ ntbns = bwa_open_nt(prefix);
680
+ } else { // for Illumina alignment only
681
+ if (popt->is_preload) {
682
+ strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
683
+ strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
684
+ strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
685
+ strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
686
+ pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
687
+ rewind(bns->fp_pac);
688
+ fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
689
+ }
690
+ }
691
+
692
+ // core loop
693
+ bwa_print_sam_SQ(bns);
694
+ bwa_print_sam_PG();
695
+ while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
696
+ int cnt_chg;
697
+ isize_info_t ii;
698
+ ubyte_t *pacseq;
699
+
700
+ seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual);
701
+ tot_seqs += n_seqs;
702
+ t = clock();
703
+
704
+ fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n");
705
+ cnt_chg = bwa_cal_pac_pos_pe(prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
706
+ fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
707
+ fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg);
708
+
709
+ fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n");
710
+ pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii);
711
+ fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
712
+
713
+ fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
714
+ for (j = 0; j < 2; ++j)
715
+ bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns);
716
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
717
+ if (pac == 0) free(pacseq);
718
+
719
+ fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... ");
720
+ for (i = 0; i < n_seqs; ++i) {
721
+ bwa_seq_t *p[2];
722
+ p[0] = seqs[0] + i; p[1] = seqs[1] + i;
723
+ if (p[0]->bc[0] || p[1]->bc[0]) {
724
+ strcat(p[0]->bc, p[1]->bc);
725
+ strcpy(p[1]->bc, p[0]->bc);
726
+ }
727
+ bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2);
728
+ bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2);
729
+ }
730
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
731
+
732
+ for (j = 0; j < 2; ++j)
733
+ bwa_free_read_seq(n_seqs, seqs[j]);
734
+ fprintf(stderr, "[bwa_sai2sam_pe_core] %d sequences have been processed.\n", tot_seqs);
735
+ last_ii = ii;
736
+ }
737
+
738
+ // destroy
739
+ bns_destroy(bns);
740
+ if (ntbns) bns_destroy(ntbns);
741
+ for (i = 0; i < 2; ++i) {
742
+ bwa_seq_close(ks[i]);
743
+ fclose(fp_sa[i]);
744
+ }
745
+ for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter)
746
+ if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a);
747
+ kh_destroy(64, g_hash);
748
+ if (pac) {
749
+ free(pac); bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
750
+ }
751
+ }
752
+
753
+ int bwa_sai2sam_pe(int argc, char *argv[])
754
+ {
755
+ extern char *bwa_rg_line, *bwa_rg_id;
756
+ extern int bwa_set_rg(const char *s);
757
+ int c;
758
+ pe_opt_t *popt;
759
+ popt = bwa_init_pe_opt();
760
+ optind = 1;
761
+ while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
762
+ switch (c) {
763
+ case 'r':
764
+ if (bwa_set_rg(optarg) < 0) {
765
+ fprintf(stderr, "[%s] malformated @RG line\n", __func__);
766
+ return 1;
767
+ }
768
+ break;
769
+ case 'a': popt->max_isize = atoi(optarg); break;
770
+ case 'o': popt->max_occ = atoi(optarg); break;
771
+ case 's': popt->is_sw = 0; break;
772
+ case 'P': popt->is_preload = 1; break;
773
+ case 'n': popt->n_multi = atoi(optarg); break;
774
+ case 'N': popt->N_multi = atoi(optarg); break;
775
+ case 'c': popt->ap_prior = atof(optarg); break;
776
+ case 'f': xreopen(optarg, "w", stdout); break;
777
+ case 'A': popt->force_isize = 1; break;
778
+ default: return 1;
779
+ }
780
+ }
781
+
782
+ if (optind + 5 > argc) {
783
+ fprintf(stderr, "\n");
784
+ fprintf(stderr, "Usage: bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
785
+ fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize);
786
+ fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ);
787
+ fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi);
788
+ fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi);
789
+ fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior);
790
+ fprintf(stderr, " -f FILE sam file to output results to [stdout]\n");
791
+ fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n");
792
+ fprintf(stderr, " -P preload index into memory (for base-space reads only)\n");
793
+ fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n");
794
+ fprintf(stderr, " -A disable insert size estimate (force -s)\n\n");
795
+ fprintf(stderr, "Notes: 1. For SOLiD reads, <in1.fq> corresponds R3 reads and <in2.fq> to F3.\n");
796
+ fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n");
797
+ fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n");
798
+ fprintf(stderr, "\n");
799
+ return 1;
800
+ }
801
+ bwa_sai2sam_pe_core(argv[optind], argv + optind + 1, argv + optind+3, popt);
802
+ free(bwa_rg_line); free(bwa_rg_id);
803
+ free(popt);
804
+ fflush(stdout);
805
+ xreopen("/dev/tty","w",stdout);
806
+ return 0;
807
+ }