bio-bwa 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/bwtindex.c ADDED
@@ -0,0 +1,186 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2008 Genome Research Ltd (GRL).
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
+
28
+ #include <stdio.h>
29
+ #include <stdlib.h>
30
+ #include <string.h>
31
+ #include <unistd.h>
32
+ #include <time.h>
33
+ #include <zlib.h>
34
+ #include "bntseq.h"
35
+ #include "bwt.h"
36
+ #include "main.h"
37
+ #include "utils.h"
38
+
39
+ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is);
40
+ void bwa_pac_rev_core(const char *fn, const char *fn_rev);
41
+
42
+ int bwa_index(int argc, char *argv[])
43
+ {
44
+ char *prefix = 0, *str, *str2, *str3;
45
+ int c, algo_type = 3, is_color = 0;
46
+ clock_t t;
47
+ optind = 1;
48
+ while ((c = getopt(argc, argv, "ca:p:")) >= 0) {
49
+ switch (c) {
50
+ case 'a':
51
+ if (strcmp(optarg, "div") == 0) algo_type = 1;
52
+ else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2;
53
+ else if (strcmp(optarg, "is") == 0) algo_type = 3;
54
+ else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
55
+ break;
56
+ case 'p': prefix = strdup(optarg); break;
57
+ case 'c': is_color = 1; break;
58
+ default: return 1;
59
+ }
60
+ }
61
+
62
+ if (optind + 1 > argc) {
63
+ fprintf(stderr, "\n");
64
+ fprintf(stderr, "Usage: bwa index [-a bwtsw|div|is] [-c] <in.fasta>\n\n");
65
+ fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [is]\n");
66
+ fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
67
+ fprintf(stderr, " -c build color-space index\n\n");
68
+ fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
69
+ fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n");
70
+ fprintf(stderr, " according to the length of the genome.\n\n");
71
+ return 1;
72
+ }
73
+ if (prefix == 0) prefix = strdup(argv[optind]);
74
+ str = (char*)calloc(strlen(prefix) + 10, 1);
75
+ str2 = (char*)calloc(strlen(prefix) + 10, 1);
76
+ str3 = (char*)calloc(strlen(prefix) + 10, 1);
77
+
78
+ if (is_color == 0) { // nucleotide indexing
79
+ gzFile fp = xzopen(argv[optind], "r");
80
+ t = clock();
81
+ fprintf(stderr, "[bwa_index] Pack FASTA... ");
82
+ bns_fasta2bntseq(fp, prefix);
83
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
84
+ gzclose(fp);
85
+ } else { // color indexing
86
+ gzFile fp = xzopen(argv[optind], "r");
87
+ strcat(strcpy(str, prefix), ".nt");
88
+ t = clock();
89
+ fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
90
+ bns_fasta2bntseq(fp, str);
91
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
92
+ gzclose(fp);
93
+ {
94
+ char *tmp_argv[3];
95
+ tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix;
96
+ t = clock();
97
+ fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... ");
98
+ bwa_pac2cspac(3, tmp_argv);
99
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
100
+ }
101
+ }
102
+ {
103
+ strcpy(str, prefix); strcat(str, ".pac");
104
+ strcpy(str2, prefix); strcat(str2, ".rpac");
105
+ t = clock();
106
+ fprintf(stderr, "[bwa_index] Reverse the packed sequence... ");
107
+ bwa_pac_rev_core(str, str2);
108
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
109
+ }
110
+ {
111
+ strcpy(str, prefix); strcat(str, ".pac");
112
+ strcpy(str2, prefix); strcat(str2, ".bwt");
113
+ t = clock();
114
+ fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
115
+ if (algo_type == 2) bwt_bwtgen(str, str2);
116
+ else if (algo_type == 1 || algo_type == 3) {
117
+ bwt_t *bwt;
118
+ bwt = bwt_pac2bwt(str, algo_type == 3);
119
+ bwt_dump_bwt(str2, bwt);
120
+ bwt_destroy(bwt);
121
+ }
122
+ fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
123
+ }
124
+ {
125
+ strcpy(str, prefix); strcat(str, ".rpac");
126
+ strcpy(str2, prefix); strcat(str2, ".rbwt");
127
+ t = clock();
128
+ fprintf(stderr, "[bwa_index] Construct BWT for the reverse packed sequence...\n");
129
+ if (algo_type == 2) bwt_bwtgen(str, str2);
130
+ else if (algo_type == 1 || algo_type == 3) {
131
+ bwt_t *bwt;
132
+ bwt = bwt_pac2bwt(str, algo_type == 3);
133
+ bwt_dump_bwt(str2, bwt);
134
+ bwt_destroy(bwt);
135
+ }
136
+ fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
137
+ }
138
+ {
139
+ bwt_t *bwt;
140
+ strcpy(str, prefix); strcat(str, ".bwt");
141
+ t = clock();
142
+ fprintf(stderr, "[bwa_index] Update BWT... ");
143
+ bwt = bwt_restore_bwt(str);
144
+ bwt_bwtupdate_core(bwt);
145
+ bwt_dump_bwt(str, bwt);
146
+ bwt_destroy(bwt);
147
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
148
+ }
149
+ {
150
+ bwt_t *bwt;
151
+ strcpy(str, prefix); strcat(str, ".rbwt");
152
+ t = clock();
153
+ fprintf(stderr, "[bwa_index] Update reverse BWT... ");
154
+ bwt = bwt_restore_bwt(str);
155
+ bwt_bwtupdate_core(bwt);
156
+ bwt_dump_bwt(str, bwt);
157
+ bwt_destroy(bwt);
158
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
159
+ }
160
+ {
161
+ bwt_t *bwt;
162
+ strcpy(str, prefix); strcat(str, ".bwt");
163
+ strcpy(str3, prefix); strcat(str3, ".sa");
164
+ t = clock();
165
+ fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
166
+ bwt = bwt_restore_bwt(str);
167
+ bwt_cal_sa(bwt, 32);
168
+ bwt_dump_sa(str3, bwt);
169
+ bwt_destroy(bwt);
170
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
171
+ }
172
+ {
173
+ bwt_t *bwt;
174
+ strcpy(str, prefix); strcat(str, ".rbwt");
175
+ strcpy(str3, prefix); strcat(str3, ".rsa");
176
+ t = clock();
177
+ fprintf(stderr, "[bwa_index] Construct SA from reverse BWT and Occ... ");
178
+ bwt = bwt_restore_bwt(str);
179
+ bwt_cal_sa(bwt, 32);
180
+ bwt_dump_sa(str3, bwt);
181
+ bwt_destroy(bwt);
182
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
183
+ }
184
+ free(str3); free(str2); free(str); free(prefix);
185
+ return 0;
186
+ }
data/ext/bwtio.c ADDED
@@ -0,0 +1,77 @@
1
+ #include <string.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include "bwt.h"
5
+ #include "utils.h"
6
+
7
+ void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
8
+ {
9
+ FILE *fp;
10
+ fp = xopen(fn, "wb");
11
+ fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
12
+ fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
13
+ fwrite(bwt->bwt, sizeof(bwtint_t), bwt->bwt_size, fp);
14
+ fclose(fp);
15
+ }
16
+
17
+ void bwt_dump_sa(const char *fn, const bwt_t *bwt)
18
+ {
19
+ FILE *fp;
20
+ fp = xopen(fn, "wb");
21
+ fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
22
+ fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
23
+ fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
24
+ fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
25
+ fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
26
+ fclose(fp);
27
+ }
28
+
29
+ void bwt_restore_sa(const char *fn, bwt_t *bwt)
30
+ {
31
+ char skipped[256];
32
+ FILE *fp;
33
+ bwtint_t primary;
34
+
35
+ fp = xopen(fn, "rb");
36
+ fread(&primary, sizeof(bwtint_t), 1, fp);
37
+ xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
38
+ fread(skipped, sizeof(bwtint_t), 4, fp); // skip
39
+ fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
40
+ fread(&primary, sizeof(bwtint_t), 1, fp);
41
+ xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
42
+
43
+ bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
44
+ bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
45
+ bwt->sa[0] = -1;
46
+
47
+ fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
48
+ fclose(fp);
49
+ }
50
+
51
+ bwt_t *bwt_restore_bwt(const char *fn)
52
+ {
53
+ bwt_t *bwt;
54
+ FILE *fp;
55
+
56
+ bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
57
+ fp = xopen(fn, "rb");
58
+ fseek(fp, 0, SEEK_END);
59
+ bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
60
+ bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
61
+ fseek(fp, 0, SEEK_SET);
62
+ fread(&bwt->primary, sizeof(bwtint_t), 1, fp);
63
+ fread(bwt->L2+1, sizeof(bwtint_t), 4, fp);
64
+ fread(bwt->bwt, 4, bwt->bwt_size, fp);
65
+ bwt->seq_len = bwt->L2[4];
66
+ fclose(fp);
67
+ bwt_gen_cnt_table(bwt);
68
+
69
+ return bwt;
70
+ }
71
+
72
+ void bwt_destroy(bwt_t *bwt)
73
+ {
74
+ if (bwt == 0) return;
75
+ free(bwt->sa); free(bwt->bwt);
76
+ free(bwt);
77
+ }
data/ext/bwtmisc.c ADDED
@@ -0,0 +1,269 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2008 Genome Research Ltd (GRL).
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
+
28
+ #include <stdlib.h>
29
+ #include <stdio.h>
30
+ #include <string.h>
31
+ #include <unistd.h>
32
+ #include "bntseq.h"
33
+ #include "utils.h"
34
+ #include "main.h"
35
+ #include "bwt.h"
36
+
37
+ #ifdef _DIVBWT
38
+ #include "divsufsort.h"
39
+ #endif
40
+
41
+ int is_bwt(ubyte_t *T, int n);
42
+
43
+ int64_t bwa_seq_len(const char *fn_pac)
44
+ {
45
+ FILE *fp;
46
+ int64_t pac_len;
47
+ ubyte_t c;
48
+ fp = xopen(fn_pac, "rb");
49
+ fseek(fp, -1, SEEK_END);
50
+ pac_len = ftell(fp);
51
+ fread(&c, 1, 1, fp);
52
+ fclose(fp);
53
+ return (pac_len - 1) * 4 + (int)c;
54
+ }
55
+
56
+ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
57
+ {
58
+ bwt_t *bwt;
59
+ ubyte_t *buf, *buf2;
60
+ int i, pac_size;
61
+ FILE *fp;
62
+
63
+ // initialization
64
+ bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
65
+ bwt->seq_len = bwa_seq_len(fn_pac);
66
+ bwt->bwt_size = (bwt->seq_len + 15) >> 4;
67
+ fp = xopen(fn_pac, "rb");
68
+
69
+ // prepare sequence
70
+ pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
71
+ buf2 = (ubyte_t*)calloc(pac_size, 1);
72
+ fread(buf2, 1, pac_size, fp);
73
+ fclose(fp);
74
+ memset(bwt->L2, 0, 5 * 4);
75
+ buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
76
+ for (i = 0; i < bwt->seq_len; ++i) {
77
+ buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
78
+ ++bwt->L2[1+buf[i]];
79
+ }
80
+ for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
81
+ free(buf2);
82
+
83
+ // Burrows-Wheeler Transform
84
+ if (use_is) {
85
+ bwt->primary = is_bwt(buf, bwt->seq_len);
86
+ } else {
87
+ #ifdef _DIVBWT
88
+ bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
89
+ #else
90
+ err_fatal_simple("libdivsufsort is not compiled in.");
91
+ #endif
92
+ }
93
+ bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
94
+ for (i = 0; i < bwt->seq_len; ++i)
95
+ bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
96
+ free(buf);
97
+ return bwt;
98
+ }
99
+
100
+ int bwa_pac2bwt(int argc, char *argv[])
101
+ {
102
+ bwt_t *bwt;
103
+ int c, use_is = 1;
104
+ optind = 1;
105
+ while ((c = getopt(argc, argv, "d")) >= 0) {
106
+ switch (c) {
107
+ case 'd': use_is = 0; break;
108
+ default: return 1;
109
+ }
110
+ }
111
+ if (optind + 2 > argc) {
112
+ fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
113
+ return 1;
114
+ }
115
+ bwt = bwt_pac2bwt(argv[optind], use_is);
116
+ bwt_dump_bwt(argv[optind+1], bwt);
117
+ bwt_destroy(bwt);
118
+ return 0;
119
+ }
120
+
121
+ #define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
122
+
123
+ void bwt_bwtupdate_core(bwt_t *bwt)
124
+ {
125
+ bwtint_t i, k, c[4], n_occ;
126
+ uint32_t *buf;
127
+
128
+ n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
129
+ bwt->bwt_size += n_occ * 4; // the new size
130
+ buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
131
+ c[0] = c[1] = c[2] = c[3] = 0;
132
+ for (i = k = 0; i < bwt->seq_len; ++i) {
133
+ if (i % OCC_INTERVAL == 0) {
134
+ memcpy(buf + k, c, sizeof(bwtint_t) * 4);
135
+ k += 4;
136
+ }
137
+ if (i % 16 == 0) buf[k++] = bwt->bwt[i/16];
138
+ ++c[bwt_B00(bwt, i)];
139
+ }
140
+ // the last element
141
+ memcpy(buf + k, c, sizeof(bwtint_t) * 4);
142
+ xassert(k + 4 == bwt->bwt_size, "inconsistent bwt_size");
143
+ // update bwt
144
+ free(bwt->bwt); bwt->bwt = buf;
145
+ }
146
+
147
+ int bwa_bwtupdate(int argc, char *argv[])
148
+ {
149
+ bwt_t *bwt;
150
+ if (argc < 2) {
151
+ fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
152
+ return 1;
153
+ }
154
+ bwt = bwt_restore_bwt(argv[1]);
155
+ bwt_bwtupdate_core(bwt);
156
+ bwt_dump_bwt(argv[1], bwt);
157
+ bwt_destroy(bwt);
158
+ return 0;
159
+ }
160
+
161
+ void bwa_pac_rev_core(const char *fn, const char *fn_rev)
162
+ {
163
+ int64_t seq_len, i;
164
+ bwtint_t pac_len, j;
165
+ ubyte_t *bufin, *bufout, ct;
166
+ FILE *fp;
167
+ seq_len = bwa_seq_len(fn);
168
+ pac_len = (seq_len >> 2) + 1;
169
+ bufin = (ubyte_t*)calloc(pac_len, 1);
170
+ bufout = (ubyte_t*)calloc(pac_len, 1);
171
+ fp = xopen(fn, "rb");
172
+ fread(bufin, 1, pac_len, fp);
173
+ fclose(fp);
174
+ for (i = seq_len - 1, j = 0; i >= 0; --i) {
175
+ int c = bufin[i>>2] >> ((~i&3)<<1) & 3;
176
+ bwtint_t j = seq_len - 1 - i;
177
+ bufout[j>>2] |= c << ((~j&3)<<1);
178
+ }
179
+ free(bufin);
180
+ fp = xopen(fn_rev, "wb");
181
+ fwrite(bufout, 1, pac_len, fp);
182
+ ct = seq_len % 4;
183
+ fwrite(&ct, 1, 1, fp);
184
+ fclose(fp);
185
+ free(bufout);
186
+ }
187
+
188
/*
 * Command `bwa pac_rev <in.pac> <out.pac>': thin wrapper that hands both
 * file names to bwa_pac_rev_core().
 *
 * Returns 0 on success and 1 when an argument is missing.
 */
int bwa_pac_rev(int argc, char *argv[])
{
	if (argc >= 3) {
		bwa_pac_rev_core(argv[1], argv[2]);
		return 0;
	}
	fprintf(stderr, "Usage: bwa pac_rev <in.pac> <out.pac>\n");
	return 1;
}
197
+
198
+ const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
199
+
200
+ /* this function is not memory efficient, but this will make life easier
201
+ Ideally we should also change .amb files as one 'N' in the nucleotide
202
+ sequence leads to two ambiguous colors. I may do this later... */
203
+ uint8_t *bwa_pac2cspac_core(const bntseq_t *bns)
204
+ {
205
+ uint8_t *pac, *cspac;
206
+ bwtint_t i;
207
+ int c1, c2;
208
+ pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
209
+ cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
210
+ fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
211
+ rewind(bns->fp_pac);
212
+ c1 = pac[0]>>6; cspac[0] = c1<<6;
213
+ for (i = 1; i < bns->l_pac; ++i) {
214
+ c2 = pac[i>>2] >> (~i&3)*2 & 3;
215
+ cspac[i>>2] |= nst_color_space_table[(1<<c1)|(1<<c2)] << (~i&3)*2;
216
+ c1 = c2;
217
+ }
218
+ free(pac);
219
+ return cspac;
220
+ }
221
+
222
+ int bwa_pac2cspac(int argc, char *argv[])
223
+ {
224
+ bntseq_t *bns;
225
+ uint8_t *cspac, ct;
226
+ char *str;
227
+ FILE *fp;
228
+
229
+ if (argc < 3) {
230
+ fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n");
231
+ return 1;
232
+ }
233
+ bns = bns_restore(argv[1]);
234
+ cspac = bwa_pac2cspac_core(bns);
235
+ bns_dump(bns, argv[2]);
236
+ // now write cspac
237
+ str = (char*)calloc(strlen(argv[2]) + 5, 1);
238
+ strcat(strcpy(str, argv[2]), ".pac");
239
+ fp = xopen(str, "wb");
240
+ fwrite(cspac, 1, bns->l_pac/4 + 1, fp);
241
+ ct = bns->l_pac % 4;
242
+ fwrite(&ct, 1, 1, fp);
243
+ fclose(fp);
244
+ bns_destroy(bns);
245
+ free(cspac);
246
+ return 0;
247
+ }
248
+
249
+ int bwa_bwt2sa(int argc, char *argv[])
250
+ {
251
+ bwt_t *bwt;
252
+ int c, sa_intv = 32;
253
+ optind = 1;
254
+ while ((c = getopt(argc, argv, "i:")) >= 0) {
255
+ switch (c) {
256
+ case 'i': sa_intv = atoi(optarg); break;
257
+ default: return 1;
258
+ }
259
+ }
260
+ if (optind + 2 > argc) {
261
+ fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
262
+ return 1;
263
+ }
264
+ bwt = bwt_restore_bwt(argv[optind]);
265
+ bwt_cal_sa(bwt, sa_intv);
266
+ bwt_dump_sa(argv[optind+1], bwt);
267
+ bwt_destroy(bwt);
268
+ return 0;
269
+ }