bio-bwa 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/bwtindex.c ADDED
@@ -0,0 +1,186 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2008 Genome Research Ltd (GRL).
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
+
28
+ #include <stdio.h>
29
+ #include <stdlib.h>
30
+ #include <string.h>
31
+ #include <unistd.h>
32
+ #include <time.h>
33
+ #include <zlib.h>
34
+ #include "bntseq.h"
35
+ #include "bwt.h"
36
+ #include "main.h"
37
+ #include "utils.h"
38
+
39
+ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is);
40
+ void bwa_pac_rev_core(const char *fn, const char *fn_rev);
41
+
42
+ int bwa_index(int argc, char *argv[])
43
+ {
44
+ char *prefix = 0, *str, *str2, *str3;
45
+ int c, algo_type = 3, is_color = 0;
46
+ clock_t t;
47
+ optind = 1;
48
+ while ((c = getopt(argc, argv, "ca:p:")) >= 0) {
49
+ switch (c) {
50
+ case 'a':
51
+ if (strcmp(optarg, "div") == 0) algo_type = 1;
52
+ else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2;
53
+ else if (strcmp(optarg, "is") == 0) algo_type = 3;
54
+ else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
55
+ break;
56
+ case 'p': prefix = strdup(optarg); break;
57
+ case 'c': is_color = 1; break;
58
+ default: return 1;
59
+ }
60
+ }
61
+
62
+ if (optind + 1 > argc) {
63
+ fprintf(stderr, "\n");
64
+ fprintf(stderr, "Usage: bwa index [-a bwtsw|div|is] [-c] <in.fasta>\n\n");
65
+ fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [is]\n");
66
+ fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
67
+ fprintf(stderr, " -c build color-space index\n\n");
68
+ fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
69
+ fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n");
70
+ fprintf(stderr, " according to the length of the genome.\n\n");
71
+ return 1;
72
+ }
73
+ if (prefix == 0) prefix = strdup(argv[optind]);
74
+ str = (char*)calloc(strlen(prefix) + 10, 1);
75
+ str2 = (char*)calloc(strlen(prefix) + 10, 1);
76
+ str3 = (char*)calloc(strlen(prefix) + 10, 1);
77
+
78
+ if (is_color == 0) { // nucleotide indexing
79
+ gzFile fp = xzopen(argv[optind], "r");
80
+ t = clock();
81
+ fprintf(stderr, "[bwa_index] Pack FASTA... ");
82
+ bns_fasta2bntseq(fp, prefix);
83
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
84
+ gzclose(fp);
85
+ } else { // color indexing
86
+ gzFile fp = xzopen(argv[optind], "r");
87
+ strcat(strcpy(str, prefix), ".nt");
88
+ t = clock();
89
+ fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
90
+ bns_fasta2bntseq(fp, str);
91
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
92
+ gzclose(fp);
93
+ {
94
+ char *tmp_argv[3];
95
+ tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix;
96
+ t = clock();
97
+ fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... ");
98
+ bwa_pac2cspac(3, tmp_argv);
99
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
100
+ }
101
+ }
102
+ {
103
+ strcpy(str, prefix); strcat(str, ".pac");
104
+ strcpy(str2, prefix); strcat(str2, ".rpac");
105
+ t = clock();
106
+ fprintf(stderr, "[bwa_index] Reverse the packed sequence... ");
107
+ bwa_pac_rev_core(str, str2);
108
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
109
+ }
110
+ {
111
+ strcpy(str, prefix); strcat(str, ".pac");
112
+ strcpy(str2, prefix); strcat(str2, ".bwt");
113
+ t = clock();
114
+ fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
115
+ if (algo_type == 2) bwt_bwtgen(str, str2);
116
+ else if (algo_type == 1 || algo_type == 3) {
117
+ bwt_t *bwt;
118
+ bwt = bwt_pac2bwt(str, algo_type == 3);
119
+ bwt_dump_bwt(str2, bwt);
120
+ bwt_destroy(bwt);
121
+ }
122
+ fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
123
+ }
124
+ {
125
+ strcpy(str, prefix); strcat(str, ".rpac");
126
+ strcpy(str2, prefix); strcat(str2, ".rbwt");
127
+ t = clock();
128
+ fprintf(stderr, "[bwa_index] Construct BWT for the reverse packed sequence...\n");
129
+ if (algo_type == 2) bwt_bwtgen(str, str2);
130
+ else if (algo_type == 1 || algo_type == 3) {
131
+ bwt_t *bwt;
132
+ bwt = bwt_pac2bwt(str, algo_type == 3);
133
+ bwt_dump_bwt(str2, bwt);
134
+ bwt_destroy(bwt);
135
+ }
136
+ fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
137
+ }
138
+ {
139
+ bwt_t *bwt;
140
+ strcpy(str, prefix); strcat(str, ".bwt");
141
+ t = clock();
142
+ fprintf(stderr, "[bwa_index] Update BWT... ");
143
+ bwt = bwt_restore_bwt(str);
144
+ bwt_bwtupdate_core(bwt);
145
+ bwt_dump_bwt(str, bwt);
146
+ bwt_destroy(bwt);
147
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
148
+ }
149
+ {
150
+ bwt_t *bwt;
151
+ strcpy(str, prefix); strcat(str, ".rbwt");
152
+ t = clock();
153
+ fprintf(stderr, "[bwa_index] Update reverse BWT... ");
154
+ bwt = bwt_restore_bwt(str);
155
+ bwt_bwtupdate_core(bwt);
156
+ bwt_dump_bwt(str, bwt);
157
+ bwt_destroy(bwt);
158
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
159
+ }
160
+ {
161
+ bwt_t *bwt;
162
+ strcpy(str, prefix); strcat(str, ".bwt");
163
+ strcpy(str3, prefix); strcat(str3, ".sa");
164
+ t = clock();
165
+ fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
166
+ bwt = bwt_restore_bwt(str);
167
+ bwt_cal_sa(bwt, 32);
168
+ bwt_dump_sa(str3, bwt);
169
+ bwt_destroy(bwt);
170
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
171
+ }
172
+ {
173
+ bwt_t *bwt;
174
+ strcpy(str, prefix); strcat(str, ".rbwt");
175
+ strcpy(str3, prefix); strcat(str3, ".rsa");
176
+ t = clock();
177
+ fprintf(stderr, "[bwa_index] Construct SA from reverse BWT and Occ... ");
178
+ bwt = bwt_restore_bwt(str);
179
+ bwt_cal_sa(bwt, 32);
180
+ bwt_dump_sa(str3, bwt);
181
+ bwt_destroy(bwt);
182
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
183
+ }
184
+ free(str3); free(str2); free(str); free(prefix);
185
+ return 0;
186
+ }
data/ext/bwtio.c ADDED
@@ -0,0 +1,77 @@
1
+ #include <string.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include "bwt.h"
5
+ #include "utils.h"
6
+
7
+ void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
8
+ {
9
+ FILE *fp;
10
+ fp = xopen(fn, "wb");
11
+ fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
12
+ fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
13
+ fwrite(bwt->bwt, sizeof(bwtint_t), bwt->bwt_size, fp);
14
+ fclose(fp);
15
+ }
16
+
17
+ void bwt_dump_sa(const char *fn, const bwt_t *bwt)
18
+ {
19
+ FILE *fp;
20
+ fp = xopen(fn, "wb");
21
+ fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
22
+ fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
23
+ fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
24
+ fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
25
+ fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
26
+ fclose(fp);
27
+ }
28
+
29
+ void bwt_restore_sa(const char *fn, bwt_t *bwt)
30
+ {
31
+ char skipped[256];
32
+ FILE *fp;
33
+ bwtint_t primary;
34
+
35
+ fp = xopen(fn, "rb");
36
+ fread(&primary, sizeof(bwtint_t), 1, fp);
37
+ xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
38
+ fread(skipped, sizeof(bwtint_t), 4, fp); // skip
39
+ fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
40
+ fread(&primary, sizeof(bwtint_t), 1, fp);
41
+ xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
42
+
43
+ bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
44
+ bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
45
+ bwt->sa[0] = -1;
46
+
47
+ fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
48
+ fclose(fp);
49
+ }
50
+
51
+ bwt_t *bwt_restore_bwt(const char *fn)
52
+ {
53
+ bwt_t *bwt;
54
+ FILE *fp;
55
+
56
+ bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
57
+ fp = xopen(fn, "rb");
58
+ fseek(fp, 0, SEEK_END);
59
+ bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
60
+ bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
61
+ fseek(fp, 0, SEEK_SET);
62
+ fread(&bwt->primary, sizeof(bwtint_t), 1, fp);
63
+ fread(bwt->L2+1, sizeof(bwtint_t), 4, fp);
64
+ fread(bwt->bwt, 4, bwt->bwt_size, fp);
65
+ bwt->seq_len = bwt->L2[4];
66
+ fclose(fp);
67
+ bwt_gen_cnt_table(bwt);
68
+
69
+ return bwt;
70
+ }
71
+
72
+ void bwt_destroy(bwt_t *bwt)
73
+ {
74
+ if (bwt == 0) return;
75
+ free(bwt->sa); free(bwt->bwt);
76
+ free(bwt);
77
+ }
data/ext/bwtmisc.c ADDED
@@ -0,0 +1,269 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2008 Genome Research Ltd (GRL).
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
+
28
+ #include <stdlib.h>
29
+ #include <stdio.h>
30
+ #include <string.h>
31
+ #include <unistd.h>
32
+ #include "bntseq.h"
33
+ #include "utils.h"
34
+ #include "main.h"
35
+ #include "bwt.h"
36
+
37
+ #ifdef _DIVBWT
38
+ #include "divsufsort.h"
39
+ #endif
40
+
41
+ int is_bwt(ubyte_t *T, int n);
42
+
43
+ int64_t bwa_seq_len(const char *fn_pac)
44
+ {
45
+ FILE *fp;
46
+ int64_t pac_len;
47
+ ubyte_t c;
48
+ fp = xopen(fn_pac, "rb");
49
+ fseek(fp, -1, SEEK_END);
50
+ pac_len = ftell(fp);
51
+ fread(&c, 1, 1, fp);
52
+ fclose(fp);
53
+ return (pac_len - 1) * 4 + (int)c;
54
+ }
55
+
56
+ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
57
+ {
58
+ bwt_t *bwt;
59
+ ubyte_t *buf, *buf2;
60
+ int i, pac_size;
61
+ FILE *fp;
62
+
63
+ // initialization
64
+ bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
65
+ bwt->seq_len = bwa_seq_len(fn_pac);
66
+ bwt->bwt_size = (bwt->seq_len + 15) >> 4;
67
+ fp = xopen(fn_pac, "rb");
68
+
69
+ // prepare sequence
70
+ pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
71
+ buf2 = (ubyte_t*)calloc(pac_size, 1);
72
+ fread(buf2, 1, pac_size, fp);
73
+ fclose(fp);
74
+ memset(bwt->L2, 0, 5 * 4);
75
+ buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
76
+ for (i = 0; i < bwt->seq_len; ++i) {
77
+ buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
78
+ ++bwt->L2[1+buf[i]];
79
+ }
80
+ for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
81
+ free(buf2);
82
+
83
+ // Burrows-Wheeler Transform
84
+ if (use_is) {
85
+ bwt->primary = is_bwt(buf, bwt->seq_len);
86
+ } else {
87
+ #ifdef _DIVBWT
88
+ bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
89
+ #else
90
+ err_fatal_simple("libdivsufsort is not compiled in.");
91
+ #endif
92
+ }
93
+ bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
94
+ for (i = 0; i < bwt->seq_len; ++i)
95
+ bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
96
+ free(buf);
97
+ return bwt;
98
+ }
99
+
100
+ int bwa_pac2bwt(int argc, char *argv[])
101
+ {
102
+ bwt_t *bwt;
103
+ int c, use_is = 1;
104
+ optind = 1;
105
+ while ((c = getopt(argc, argv, "d")) >= 0) {
106
+ switch (c) {
107
+ case 'd': use_is = 0; break;
108
+ default: return 1;
109
+ }
110
+ }
111
+ if (optind + 2 > argc) {
112
+ fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
113
+ return 1;
114
+ }
115
+ bwt = bwt_pac2bwt(argv[optind], use_is);
116
+ bwt_dump_bwt(argv[optind+1], bwt);
117
+ bwt_destroy(bwt);
118
+ return 0;
119
+ }
120
+
121
+ #define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
122
+
123
+ void bwt_bwtupdate_core(bwt_t *bwt)
124
+ {
125
+ bwtint_t i, k, c[4], n_occ;
126
+ uint32_t *buf;
127
+
128
+ n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
129
+ bwt->bwt_size += n_occ * 4; // the new size
130
+ buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
131
+ c[0] = c[1] = c[2] = c[3] = 0;
132
+ for (i = k = 0; i < bwt->seq_len; ++i) {
133
+ if (i % OCC_INTERVAL == 0) {
134
+ memcpy(buf + k, c, sizeof(bwtint_t) * 4);
135
+ k += 4;
136
+ }
137
+ if (i % 16 == 0) buf[k++] = bwt->bwt[i/16];
138
+ ++c[bwt_B00(bwt, i)];
139
+ }
140
+ // the last element
141
+ memcpy(buf + k, c, sizeof(bwtint_t) * 4);
142
+ xassert(k + 4 == bwt->bwt_size, "inconsistent bwt_size");
143
+ // update bwt
144
+ free(bwt->bwt); bwt->bwt = buf;
145
+ }
146
+
147
+ int bwa_bwtupdate(int argc, char *argv[])
148
+ {
149
+ bwt_t *bwt;
150
+ if (argc < 2) {
151
+ fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
152
+ return 1;
153
+ }
154
+ bwt = bwt_restore_bwt(argv[1]);
155
+ bwt_bwtupdate_core(bwt);
156
+ bwt_dump_bwt(argv[1], bwt);
157
+ bwt_destroy(bwt);
158
+ return 0;
159
+ }
160
+
161
+ void bwa_pac_rev_core(const char *fn, const char *fn_rev)
162
+ {
163
+ int64_t seq_len, i;
164
+ bwtint_t pac_len, j;
165
+ ubyte_t *bufin, *bufout, ct;
166
+ FILE *fp;
167
+ seq_len = bwa_seq_len(fn);
168
+ pac_len = (seq_len >> 2) + 1;
169
+ bufin = (ubyte_t*)calloc(pac_len, 1);
170
+ bufout = (ubyte_t*)calloc(pac_len, 1);
171
+ fp = xopen(fn, "rb");
172
+ fread(bufin, 1, pac_len, fp);
173
+ fclose(fp);
174
+ for (i = seq_len - 1, j = 0; i >= 0; --i) {
175
+ int c = bufin[i>>2] >> ((~i&3)<<1) & 3;
176
+ bwtint_t j = seq_len - 1 - i;
177
+ bufout[j>>2] |= c << ((~j&3)<<1);
178
+ }
179
+ free(bufin);
180
+ fp = xopen(fn_rev, "wb");
181
+ fwrite(bufout, 1, pac_len, fp);
182
+ ct = seq_len % 4;
183
+ fwrite(&ct, 1, 1, fp);
184
+ fclose(fp);
185
+ free(bufout);
186
+ }
187
+
188
/* Entry point of `bwa pac_rev <in.pac> <out.pac>`.

   Passes both file names to bwa_pac_rev_core().  Returns 0 on success and 1
   when fewer than two file names are given. */
int bwa_pac_rev(int argc, char *argv[])
{
	if (argc >= 3) {
		bwa_pac_rev_core(argv[1], argv[2]);
		return 0;
	}
	fprintf(stderr, "Usage: bwa pac_rev <in.pac> <out.pac>\n");
	return 1;
}
197
+
198
+ const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
199
+
200
+ /* this function is not memory efficient, but this will make life easier
201
+ Ideally we should also change .amb files as one 'N' in the nucleotide
202
+ sequence leads to two ambiguous colors. I may do this later... */
203
+ uint8_t *bwa_pac2cspac_core(const bntseq_t *bns)
204
+ {
205
+ uint8_t *pac, *cspac;
206
+ bwtint_t i;
207
+ int c1, c2;
208
+ pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
209
+ cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
210
+ fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
211
+ rewind(bns->fp_pac);
212
+ c1 = pac[0]>>6; cspac[0] = c1<<6;
213
+ for (i = 1; i < bns->l_pac; ++i) {
214
+ c2 = pac[i>>2] >> (~i&3)*2 & 3;
215
+ cspac[i>>2] |= nst_color_space_table[(1<<c1)|(1<<c2)] << (~i&3)*2;
216
+ c1 = c2;
217
+ }
218
+ free(pac);
219
+ return cspac;
220
+ }
221
+
222
+ int bwa_pac2cspac(int argc, char *argv[])
223
+ {
224
+ bntseq_t *bns;
225
+ uint8_t *cspac, ct;
226
+ char *str;
227
+ FILE *fp;
228
+
229
+ if (argc < 3) {
230
+ fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n");
231
+ return 1;
232
+ }
233
+ bns = bns_restore(argv[1]);
234
+ cspac = bwa_pac2cspac_core(bns);
235
+ bns_dump(bns, argv[2]);
236
+ // now write cspac
237
+ str = (char*)calloc(strlen(argv[2]) + 5, 1);
238
+ strcat(strcpy(str, argv[2]), ".pac");
239
+ fp = xopen(str, "wb");
240
+ fwrite(cspac, 1, bns->l_pac/4 + 1, fp);
241
+ ct = bns->l_pac % 4;
242
+ fwrite(&ct, 1, 1, fp);
243
+ fclose(fp);
244
+ bns_destroy(bns);
245
+ free(cspac);
246
+ return 0;
247
+ }
248
+
249
+ int bwa_bwt2sa(int argc, char *argv[])
250
+ {
251
+ bwt_t *bwt;
252
+ int c, sa_intv = 32;
253
+ optind = 1;
254
+ while ((c = getopt(argc, argv, "i:")) >= 0) {
255
+ switch (c) {
256
+ case 'i': sa_intv = atoi(optarg); break;
257
+ default: return 1;
258
+ }
259
+ }
260
+ if (optind + 2 > argc) {
261
+ fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
262
+ return 1;
263
+ }
264
+ bwt = bwt_restore_bwt(argv[optind]);
265
+ bwt_cal_sa(bwt, sa_intv);
266
+ bwt_dump_sa(argv[optind+1], bwt);
267
+ bwt_destroy(bwt);
268
+ return 0;
269
+ }