bio-bwa 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwtindex.c
ADDED
@@ -0,0 +1,186 @@
|
|
1
|
+
/* The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008 Genome Research Ltd (GRL).
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
20
|
+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
21
|
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
22
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
+
SOFTWARE.
|
24
|
+
*/
|
25
|
+
|
26
|
+
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
27
|
+
|
28
|
+
#include <stdio.h>
|
29
|
+
#include <stdlib.h>
|
30
|
+
#include <string.h>
|
31
|
+
#include <unistd.h>
|
32
|
+
#include <time.h>
|
33
|
+
#include <zlib.h>
|
34
|
+
#include "bntseq.h"
|
35
|
+
#include "bwt.h"
|
36
|
+
#include "main.h"
|
37
|
+
#include "utils.h"
|
38
|
+
|
39
|
+
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is);
|
40
|
+
void bwa_pac_rev_core(const char *fn, const char *fn_rev);
|
41
|
+
|
42
|
+
int bwa_index(int argc, char *argv[])
|
43
|
+
{
|
44
|
+
char *prefix = 0, *str, *str2, *str3;
|
45
|
+
int c, algo_type = 3, is_color = 0;
|
46
|
+
clock_t t;
|
47
|
+
optind = 1;
|
48
|
+
while ((c = getopt(argc, argv, "ca:p:")) >= 0) {
|
49
|
+
switch (c) {
|
50
|
+
case 'a':
|
51
|
+
if (strcmp(optarg, "div") == 0) algo_type = 1;
|
52
|
+
else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2;
|
53
|
+
else if (strcmp(optarg, "is") == 0) algo_type = 3;
|
54
|
+
else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
|
55
|
+
break;
|
56
|
+
case 'p': prefix = strdup(optarg); break;
|
57
|
+
case 'c': is_color = 1; break;
|
58
|
+
default: return 1;
|
59
|
+
}
|
60
|
+
}
|
61
|
+
|
62
|
+
if (optind + 1 > argc) {
|
63
|
+
fprintf(stderr, "\n");
|
64
|
+
fprintf(stderr, "Usage: bwa index [-a bwtsw|div|is] [-c] <in.fasta>\n\n");
|
65
|
+
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [is]\n");
|
66
|
+
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
|
67
|
+
fprintf(stderr, " -c build color-space index\n\n");
|
68
|
+
fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
|
69
|
+
fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n");
|
70
|
+
fprintf(stderr, " according to the length of the genome.\n\n");
|
71
|
+
return 1;
|
72
|
+
}
|
73
|
+
if (prefix == 0) prefix = strdup(argv[optind]);
|
74
|
+
str = (char*)calloc(strlen(prefix) + 10, 1);
|
75
|
+
str2 = (char*)calloc(strlen(prefix) + 10, 1);
|
76
|
+
str3 = (char*)calloc(strlen(prefix) + 10, 1);
|
77
|
+
|
78
|
+
if (is_color == 0) { // nucleotide indexing
|
79
|
+
gzFile fp = xzopen(argv[optind], "r");
|
80
|
+
t = clock();
|
81
|
+
fprintf(stderr, "[bwa_index] Pack FASTA... ");
|
82
|
+
bns_fasta2bntseq(fp, prefix);
|
83
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
84
|
+
gzclose(fp);
|
85
|
+
} else { // color indexing
|
86
|
+
gzFile fp = xzopen(argv[optind], "r");
|
87
|
+
strcat(strcpy(str, prefix), ".nt");
|
88
|
+
t = clock();
|
89
|
+
fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
|
90
|
+
bns_fasta2bntseq(fp, str);
|
91
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
92
|
+
gzclose(fp);
|
93
|
+
{
|
94
|
+
char *tmp_argv[3];
|
95
|
+
tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix;
|
96
|
+
t = clock();
|
97
|
+
fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... ");
|
98
|
+
bwa_pac2cspac(3, tmp_argv);
|
99
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
100
|
+
}
|
101
|
+
}
|
102
|
+
{
|
103
|
+
strcpy(str, prefix); strcat(str, ".pac");
|
104
|
+
strcpy(str2, prefix); strcat(str2, ".rpac");
|
105
|
+
t = clock();
|
106
|
+
fprintf(stderr, "[bwa_index] Reverse the packed sequence... ");
|
107
|
+
bwa_pac_rev_core(str, str2);
|
108
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
109
|
+
}
|
110
|
+
{
|
111
|
+
strcpy(str, prefix); strcat(str, ".pac");
|
112
|
+
strcpy(str2, prefix); strcat(str2, ".bwt");
|
113
|
+
t = clock();
|
114
|
+
fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
|
115
|
+
if (algo_type == 2) bwt_bwtgen(str, str2);
|
116
|
+
else if (algo_type == 1 || algo_type == 3) {
|
117
|
+
bwt_t *bwt;
|
118
|
+
bwt = bwt_pac2bwt(str, algo_type == 3);
|
119
|
+
bwt_dump_bwt(str2, bwt);
|
120
|
+
bwt_destroy(bwt);
|
121
|
+
}
|
122
|
+
fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
123
|
+
}
|
124
|
+
{
|
125
|
+
strcpy(str, prefix); strcat(str, ".rpac");
|
126
|
+
strcpy(str2, prefix); strcat(str2, ".rbwt");
|
127
|
+
t = clock();
|
128
|
+
fprintf(stderr, "[bwa_index] Construct BWT for the reverse packed sequence...\n");
|
129
|
+
if (algo_type == 2) bwt_bwtgen(str, str2);
|
130
|
+
else if (algo_type == 1 || algo_type == 3) {
|
131
|
+
bwt_t *bwt;
|
132
|
+
bwt = bwt_pac2bwt(str, algo_type == 3);
|
133
|
+
bwt_dump_bwt(str2, bwt);
|
134
|
+
bwt_destroy(bwt);
|
135
|
+
}
|
136
|
+
fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
137
|
+
}
|
138
|
+
{
|
139
|
+
bwt_t *bwt;
|
140
|
+
strcpy(str, prefix); strcat(str, ".bwt");
|
141
|
+
t = clock();
|
142
|
+
fprintf(stderr, "[bwa_index] Update BWT... ");
|
143
|
+
bwt = bwt_restore_bwt(str);
|
144
|
+
bwt_bwtupdate_core(bwt);
|
145
|
+
bwt_dump_bwt(str, bwt);
|
146
|
+
bwt_destroy(bwt);
|
147
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
148
|
+
}
|
149
|
+
{
|
150
|
+
bwt_t *bwt;
|
151
|
+
strcpy(str, prefix); strcat(str, ".rbwt");
|
152
|
+
t = clock();
|
153
|
+
fprintf(stderr, "[bwa_index] Update reverse BWT... ");
|
154
|
+
bwt = bwt_restore_bwt(str);
|
155
|
+
bwt_bwtupdate_core(bwt);
|
156
|
+
bwt_dump_bwt(str, bwt);
|
157
|
+
bwt_destroy(bwt);
|
158
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
159
|
+
}
|
160
|
+
{
|
161
|
+
bwt_t *bwt;
|
162
|
+
strcpy(str, prefix); strcat(str, ".bwt");
|
163
|
+
strcpy(str3, prefix); strcat(str3, ".sa");
|
164
|
+
t = clock();
|
165
|
+
fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
|
166
|
+
bwt = bwt_restore_bwt(str);
|
167
|
+
bwt_cal_sa(bwt, 32);
|
168
|
+
bwt_dump_sa(str3, bwt);
|
169
|
+
bwt_destroy(bwt);
|
170
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
171
|
+
}
|
172
|
+
{
|
173
|
+
bwt_t *bwt;
|
174
|
+
strcpy(str, prefix); strcat(str, ".rbwt");
|
175
|
+
strcpy(str3, prefix); strcat(str3, ".rsa");
|
176
|
+
t = clock();
|
177
|
+
fprintf(stderr, "[bwa_index] Construct SA from reverse BWT and Occ... ");
|
178
|
+
bwt = bwt_restore_bwt(str);
|
179
|
+
bwt_cal_sa(bwt, 32);
|
180
|
+
bwt_dump_sa(str3, bwt);
|
181
|
+
bwt_destroy(bwt);
|
182
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
183
|
+
}
|
184
|
+
free(str3); free(str2); free(str); free(prefix);
|
185
|
+
return 0;
|
186
|
+
}
|
data/ext/bwtio.c
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include "bwt.h"
|
5
|
+
#include "utils.h"
|
6
|
+
|
7
|
+
void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
|
8
|
+
{
|
9
|
+
FILE *fp;
|
10
|
+
fp = xopen(fn, "wb");
|
11
|
+
fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
12
|
+
fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
13
|
+
fwrite(bwt->bwt, sizeof(bwtint_t), bwt->bwt_size, fp);
|
14
|
+
fclose(fp);
|
15
|
+
}
|
16
|
+
|
17
|
+
void bwt_dump_sa(const char *fn, const bwt_t *bwt)
|
18
|
+
{
|
19
|
+
FILE *fp;
|
20
|
+
fp = xopen(fn, "wb");
|
21
|
+
fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
22
|
+
fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
23
|
+
fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
|
24
|
+
fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
|
25
|
+
fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
|
26
|
+
fclose(fp);
|
27
|
+
}
|
28
|
+
|
29
|
+
/*
 * Load the sampled suffix array stored in file `fn` into `bwt`.
 *
 * The header of the file is validated against `bwt`: the primary index and
 * the sequence length must match, and the sampling interval must be
 * positive because it is used as a divisor. On success bwt->sa_intv,
 * bwt->n_sa and bwt->sa are set; sa[0] is set to -1 and the remaining
 * entries are read from the file.
 *
 * Short reads, inconsistencies and allocation failure are reported through
 * xassert().
 */
void bwt_restore_sa(const char *fn, bwt_t *bwt)
{
	bwtint_t skipped[4]; // receives the four L2 words, which are not used here
	bwtint_t primary, seq_len;
	size_t n_read;
	FILE *fp;

	fp = xopen(fn, "rb");
	n_read = fread(&primary, sizeof(bwtint_t), 1, fp);
	xassert(n_read == 1, "fail to read the SA file.");
	xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
	n_read = fread(skipped, sizeof(bwtint_t), 4, fp); // skip
	xassert(n_read == 4, "fail to read the SA file.");
	n_read = fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
	xassert(n_read == 1, "fail to read the SA file.");
	n_read = fread(&seq_len, sizeof(bwtint_t), 1, fp);
	xassert(n_read == 1, "fail to read the SA file.");
	xassert(seq_len == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
	xassert(bwt->sa_intv > 0, "SA-BWT inconsistency: invalid SA interval.");

	bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
	bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
	xassert(bwt->sa != 0, "fail to allocate memory for the suffix array.");
	bwt->sa[0] = -1;

	n_read = fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
	xassert(n_read == bwt->n_sa - 1, "the SA file is truncated.");
	fclose(fp);
}
|
50
|
+
|
51
|
+
bwt_t *bwt_restore_bwt(const char *fn)
|
52
|
+
{
|
53
|
+
bwt_t *bwt;
|
54
|
+
FILE *fp;
|
55
|
+
|
56
|
+
bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
|
57
|
+
fp = xopen(fn, "rb");
|
58
|
+
fseek(fp, 0, SEEK_END);
|
59
|
+
bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
|
60
|
+
bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
|
61
|
+
fseek(fp, 0, SEEK_SET);
|
62
|
+
fread(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
63
|
+
fread(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
64
|
+
fread(bwt->bwt, 4, bwt->bwt_size, fp);
|
65
|
+
bwt->seq_len = bwt->L2[4];
|
66
|
+
fclose(fp);
|
67
|
+
bwt_gen_cnt_table(bwt);
|
68
|
+
|
69
|
+
return bwt;
|
70
|
+
}
|
71
|
+
|
72
|
+
/*
 * Release a bwt_t together with the BWT and suffix-array buffers it owns.
 * Passing a null pointer is allowed and does nothing.
 */
void bwt_destroy(bwt_t *bwt)
{
	if (bwt != 0) {
		free(bwt->bwt);
		free(bwt->sa);
		free(bwt);
	}
}
|
data/ext/bwtmisc.c
ADDED
@@ -0,0 +1,269 @@
|
|
1
|
+
/* The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008 Genome Research Ltd (GRL).
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
20
|
+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
21
|
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
22
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
+
SOFTWARE.
|
24
|
+
*/
|
25
|
+
|
26
|
+
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
27
|
+
|
28
|
+
#include <stdlib.h>
|
29
|
+
#include <stdio.h>
|
30
|
+
#include <string.h>
|
31
|
+
#include <unistd.h>
|
32
|
+
#include "bntseq.h"
|
33
|
+
#include "utils.h"
|
34
|
+
#include "main.h"
|
35
|
+
#include "bwt.h"
|
36
|
+
|
37
|
+
#ifdef _DIVBWT
|
38
|
+
#include "divsufsort.h"
|
39
|
+
#endif
|
40
|
+
|
41
|
+
int is_bwt(ubyte_t *T, int n);
|
42
|
+
|
43
|
+
int64_t bwa_seq_len(const char *fn_pac)
|
44
|
+
{
|
45
|
+
FILE *fp;
|
46
|
+
int64_t pac_len;
|
47
|
+
ubyte_t c;
|
48
|
+
fp = xopen(fn_pac, "rb");
|
49
|
+
fseek(fp, -1, SEEK_END);
|
50
|
+
pac_len = ftell(fp);
|
51
|
+
fread(&c, 1, 1, fp);
|
52
|
+
fclose(fp);
|
53
|
+
return (pac_len - 1) * 4 + (int)c;
|
54
|
+
}
|
55
|
+
|
56
|
+
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
|
57
|
+
{
|
58
|
+
bwt_t *bwt;
|
59
|
+
ubyte_t *buf, *buf2;
|
60
|
+
int i, pac_size;
|
61
|
+
FILE *fp;
|
62
|
+
|
63
|
+
// initialization
|
64
|
+
bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
|
65
|
+
bwt->seq_len = bwa_seq_len(fn_pac);
|
66
|
+
bwt->bwt_size = (bwt->seq_len + 15) >> 4;
|
67
|
+
fp = xopen(fn_pac, "rb");
|
68
|
+
|
69
|
+
// prepare sequence
|
70
|
+
pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
|
71
|
+
buf2 = (ubyte_t*)calloc(pac_size, 1);
|
72
|
+
fread(buf2, 1, pac_size, fp);
|
73
|
+
fclose(fp);
|
74
|
+
memset(bwt->L2, 0, 5 * 4);
|
75
|
+
buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
|
76
|
+
for (i = 0; i < bwt->seq_len; ++i) {
|
77
|
+
buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
|
78
|
+
++bwt->L2[1+buf[i]];
|
79
|
+
}
|
80
|
+
for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
|
81
|
+
free(buf2);
|
82
|
+
|
83
|
+
// Burrows-Wheeler Transform
|
84
|
+
if (use_is) {
|
85
|
+
bwt->primary = is_bwt(buf, bwt->seq_len);
|
86
|
+
} else {
|
87
|
+
#ifdef _DIVBWT
|
88
|
+
bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
|
89
|
+
#else
|
90
|
+
err_fatal_simple("libdivsufsort is not compiled in.");
|
91
|
+
#endif
|
92
|
+
}
|
93
|
+
bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
|
94
|
+
for (i = 0; i < bwt->seq_len; ++i)
|
95
|
+
bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
|
96
|
+
free(buf);
|
97
|
+
return bwt;
|
98
|
+
}
|
99
|
+
|
100
|
+
int bwa_pac2bwt(int argc, char *argv[])
|
101
|
+
{
|
102
|
+
bwt_t *bwt;
|
103
|
+
int c, use_is = 1;
|
104
|
+
optind = 1;
|
105
|
+
while ((c = getopt(argc, argv, "d")) >= 0) {
|
106
|
+
switch (c) {
|
107
|
+
case 'd': use_is = 0; break;
|
108
|
+
default: return 1;
|
109
|
+
}
|
110
|
+
}
|
111
|
+
if (optind + 2 > argc) {
|
112
|
+
fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
|
113
|
+
return 1;
|
114
|
+
}
|
115
|
+
bwt = bwt_pac2bwt(argv[optind], use_is);
|
116
|
+
bwt_dump_bwt(argv[optind+1], bwt);
|
117
|
+
bwt_destroy(bwt);
|
118
|
+
return 0;
|
119
|
+
}
|
120
|
+
|
121
|
+
#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
|
122
|
+
|
123
|
+
void bwt_bwtupdate_core(bwt_t *bwt)
|
124
|
+
{
|
125
|
+
bwtint_t i, k, c[4], n_occ;
|
126
|
+
uint32_t *buf;
|
127
|
+
|
128
|
+
n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
|
129
|
+
bwt->bwt_size += n_occ * 4; // the new size
|
130
|
+
buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
|
131
|
+
c[0] = c[1] = c[2] = c[3] = 0;
|
132
|
+
for (i = k = 0; i < bwt->seq_len; ++i) {
|
133
|
+
if (i % OCC_INTERVAL == 0) {
|
134
|
+
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
135
|
+
k += 4;
|
136
|
+
}
|
137
|
+
if (i % 16 == 0) buf[k++] = bwt->bwt[i/16];
|
138
|
+
++c[bwt_B00(bwt, i)];
|
139
|
+
}
|
140
|
+
// the last element
|
141
|
+
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
142
|
+
xassert(k + 4 == bwt->bwt_size, "inconsistent bwt_size");
|
143
|
+
// update bwt
|
144
|
+
free(bwt->bwt); bwt->bwt = buf;
|
145
|
+
}
|
146
|
+
|
147
|
+
int bwa_bwtupdate(int argc, char *argv[])
|
148
|
+
{
|
149
|
+
bwt_t *bwt;
|
150
|
+
if (argc < 2) {
|
151
|
+
fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
|
152
|
+
return 1;
|
153
|
+
}
|
154
|
+
bwt = bwt_restore_bwt(argv[1]);
|
155
|
+
bwt_bwtupdate_core(bwt);
|
156
|
+
bwt_dump_bwt(argv[1], bwt);
|
157
|
+
bwt_destroy(bwt);
|
158
|
+
return 0;
|
159
|
+
}
|
160
|
+
|
161
|
+
void bwa_pac_rev_core(const char *fn, const char *fn_rev)
|
162
|
+
{
|
163
|
+
int64_t seq_len, i;
|
164
|
+
bwtint_t pac_len, j;
|
165
|
+
ubyte_t *bufin, *bufout, ct;
|
166
|
+
FILE *fp;
|
167
|
+
seq_len = bwa_seq_len(fn);
|
168
|
+
pac_len = (seq_len >> 2) + 1;
|
169
|
+
bufin = (ubyte_t*)calloc(pac_len, 1);
|
170
|
+
bufout = (ubyte_t*)calloc(pac_len, 1);
|
171
|
+
fp = xopen(fn, "rb");
|
172
|
+
fread(bufin, 1, pac_len, fp);
|
173
|
+
fclose(fp);
|
174
|
+
for (i = seq_len - 1, j = 0; i >= 0; --i) {
|
175
|
+
int c = bufin[i>>2] >> ((~i&3)<<1) & 3;
|
176
|
+
bwtint_t j = seq_len - 1 - i;
|
177
|
+
bufout[j>>2] |= c << ((~j&3)<<1);
|
178
|
+
}
|
179
|
+
free(bufin);
|
180
|
+
fp = xopen(fn_rev, "wb");
|
181
|
+
fwrite(bufout, 1, pac_len, fp);
|
182
|
+
ct = seq_len % 4;
|
183
|
+
fwrite(&ct, 1, 1, fp);
|
184
|
+
fclose(fp);
|
185
|
+
free(bufout);
|
186
|
+
}
|
187
|
+
|
188
|
+
/*
 * Command-line wrapper: `bwa pac_rev <in.pac> <out.pac>`.
 *
 * Hands both file names to bwa_pac_rev_core(). Returns 0 on success,
 * 1 when an argument is missing.
 */
int bwa_pac_rev(int argc, char *argv[])
{
	if (argc >= 3) {
		const char *fn_in = argv[1];
		const char *fn_out = argv[2];
		bwa_pac_rev_core(fn_in, fn_out);
		return 0;
	}
	fprintf(stderr, "Usage: bwa pac_rev <in.pac> <out.pac>\n");
	return 1;
}
|
197
|
+
|
198
|
+
const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
|
199
|
+
|
200
|
+
/* this function is not memory efficient, but this will make life easier
|
201
|
+
Ideally we should also change .amb files as one 'N' in the nucleotide
|
202
|
+
sequence leads to two ambiguous colors. I may do this later... */
|
203
|
+
uint8_t *bwa_pac2cspac_core(const bntseq_t *bns)
|
204
|
+
{
|
205
|
+
uint8_t *pac, *cspac;
|
206
|
+
bwtint_t i;
|
207
|
+
int c1, c2;
|
208
|
+
pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
|
209
|
+
cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
|
210
|
+
fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
|
211
|
+
rewind(bns->fp_pac);
|
212
|
+
c1 = pac[0]>>6; cspac[0] = c1<<6;
|
213
|
+
for (i = 1; i < bns->l_pac; ++i) {
|
214
|
+
c2 = pac[i>>2] >> (~i&3)*2 & 3;
|
215
|
+
cspac[i>>2] |= nst_color_space_table[(1<<c1)|(1<<c2)] << (~i&3)*2;
|
216
|
+
c1 = c2;
|
217
|
+
}
|
218
|
+
free(pac);
|
219
|
+
return cspac;
|
220
|
+
}
|
221
|
+
|
222
|
+
int bwa_pac2cspac(int argc, char *argv[])
|
223
|
+
{
|
224
|
+
bntseq_t *bns;
|
225
|
+
uint8_t *cspac, ct;
|
226
|
+
char *str;
|
227
|
+
FILE *fp;
|
228
|
+
|
229
|
+
if (argc < 3) {
|
230
|
+
fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n");
|
231
|
+
return 1;
|
232
|
+
}
|
233
|
+
bns = bns_restore(argv[1]);
|
234
|
+
cspac = bwa_pac2cspac_core(bns);
|
235
|
+
bns_dump(bns, argv[2]);
|
236
|
+
// now write cspac
|
237
|
+
str = (char*)calloc(strlen(argv[2]) + 5, 1);
|
238
|
+
strcat(strcpy(str, argv[2]), ".pac");
|
239
|
+
fp = xopen(str, "wb");
|
240
|
+
fwrite(cspac, 1, bns->l_pac/4 + 1, fp);
|
241
|
+
ct = bns->l_pac % 4;
|
242
|
+
fwrite(&ct, 1, 1, fp);
|
243
|
+
fclose(fp);
|
244
|
+
bns_destroy(bns);
|
245
|
+
free(cspac);
|
246
|
+
return 0;
|
247
|
+
}
|
248
|
+
|
249
|
+
int bwa_bwt2sa(int argc, char *argv[])
|
250
|
+
{
|
251
|
+
bwt_t *bwt;
|
252
|
+
int c, sa_intv = 32;
|
253
|
+
optind = 1;
|
254
|
+
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
255
|
+
switch (c) {
|
256
|
+
case 'i': sa_intv = atoi(optarg); break;
|
257
|
+
default: return 1;
|
258
|
+
}
|
259
|
+
}
|
260
|
+
if (optind + 2 > argc) {
|
261
|
+
fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
262
|
+
return 1;
|
263
|
+
}
|
264
|
+
bwt = bwt_restore_bwt(argv[optind]);
|
265
|
+
bwt_cal_sa(bwt, sa_intv);
|
266
|
+
bwt_dump_sa(argv[optind+1], bwt);
|
267
|
+
bwt_destroy(bwt);
|
268
|
+
return 0;
|
269
|
+
}
|