bio-bwa 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/bwtsw2_main.c ADDED
@@ -0,0 +1,100 @@
1
+ #include <unistd.h>
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+ #include <stdio.h>
5
+ #include <math.h>
6
+ #include "bwt.h"
7
+ #include "bwtsw2.h"
8
+ #include "utils.h"
9
+
10
+ int bwa_bwtsw2(int argc, char *argv[])
11
+ {
12
+ bsw2opt_t *opt;
13
+ bwt_t *target[2];
14
+ char buf[1024];
15
+ bntseq_t *bns;
16
+ int c;
17
+
18
+ opt = bsw2_init_opt();
19
+ srand48(11);
20
+ optind = 1;
21
+ while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:y:s:c:N:Hf:")) >= 0) {
22
+ switch (c) {
23
+ case 'q': opt->q = atoi(optarg); break;
24
+ case 'r': opt->r = atoi(optarg); break;
25
+ case 'a': opt->a = atoi(optarg); break;
26
+ case 'b': opt->b = atoi(optarg); break;
27
+ case 'w': opt->bw = atoi(optarg); break;
28
+ case 'T': opt->t = atoi(optarg); break;
29
+ case 't': opt->n_threads = atoi(optarg); break;
30
+ case 'z': opt->z = atoi(optarg); break;
31
+ case 'y': opt->yita = atof(optarg); break;
32
+ case 's': opt->is = atoi(optarg); break;
33
+ case 'm': opt->mask_level = atof(optarg); break;
34
+ case 'c': opt->coef = atof(optarg); break;
35
+ case 'N': opt->t_seeds = atoi(optarg); break;
36
+ case 'H': opt->hard_clip = 1; break;
37
+ case 'f': xreopen(optarg, "w", stdout); break;
38
+ }
39
+ }
40
+ opt->qr = opt->q + opt->r;
41
+
42
+ if (optind + 2 > argc) {
43
+ fprintf(stderr, "\n");
44
+ fprintf(stderr, "Usage: bwa bwasw [options] <target.prefix> <query.fa>\n\n");
45
+ fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a);
46
+ fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b);
47
+ fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q);
48
+ fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r);
49
+ // fprintf(stderr, " -y FLOAT error recurrence coef. (4..16) [%.1f]\n", opt->yita);
50
+ fprintf(stderr, "\n");
51
+ fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
52
+ fprintf(stderr, " -s INT size of a chunk of reads [%d]\n", opt->chunk_size);
53
+ fprintf(stderr, "\n");
54
+ fprintf(stderr, " -w INT band width [%d]\n", opt->bw);
55
+ fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level);
56
+ fprintf(stderr, "\n");
57
+ fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t);
58
+ fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is);
59
+ fprintf(stderr, " -z INT Z-best [%d]\n", opt->z);
60
+ fprintf(stderr, " -N INT # seeds to trigger reverse alignment [%d]\n", opt->t_seeds);
61
+ fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef);
62
+ fprintf(stderr, " -H in SAM output, use hard clipping rather than soft\n");
63
+ fprintf(stderr, " -f FILE file to output results to instead of stdout\n\n");
64
+ fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n");
65
+ fprintf(stderr, " BACs, the default setting usually works well. For the current PacBio\n");
66
+ fprintf(stderr, " reads (end of 2010), '-b5 -q2 -r1 -z10' is recommended. One may also\n");
67
+ fprintf(stderr, " increase '-z' for better sensitivity.\n");
68
+ fprintf(stderr, "\n");
69
+
70
+ if (0) {
71
+ double c, theta, eps, delta;
72
+ c = opt->a / log(opt->yita);
73
+ theta = exp(-opt->b / c) / opt->yita;
74
+ eps = exp(-opt->q / c);
75
+ delta = exp(-opt->r / c);
76
+ fprintf(stderr, "mismatch: %lf, gap_open: %lf, gap_ext: %lf\n\n",
77
+ theta, eps, delta);
78
+ }
79
+ return 1;
80
+ }
81
+
82
+ // adjust opt for opt->a
83
+ opt->t *= opt->a;
84
+ opt->coef *= opt->a;
85
+
86
+ strcpy(buf, argv[optind]); target[0] = bwt_restore_bwt(strcat(buf, ".bwt"));
87
+ strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".sa"), target[0]);
88
+ strcpy(buf, argv[optind]); target[1] = bwt_restore_bwt(strcat(buf, ".rbwt"));
89
+ strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".rsa"), target[1]);
90
+ bns = bns_restore(argv[optind]);
91
+
92
+ bsw2_aln(opt, bns, target, argv[optind+1]);
93
+
94
+ bns_destroy(bns);
95
+ bwt_destroy(target[0]); bwt_destroy(target[1]);
96
+ free(opt);
97
+ fflush(stdout);
98
+ xreopen("/dev/tty","w",stdout);
99
+ return 0;
100
+ }
data/ext/cs2nt.c ADDED
@@ -0,0 +1,191 @@
1
+ #include <string.h>
2
+ #include <stdint.h>
3
+ #include <stdlib.h>
4
+ #include "bwtaln.h"
5
+ #include "stdaln.h"
6
+
7
+ /*
8
+ Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we
9
+ decode as ATTGAC(RBGOG), there is one color change and one nt change;
10
+ if we decode as ATTAAC(RBRBG), there are two color changes.
11
+
12
+ In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM
13
+ as the penalty; otherwise, we will use color quality as the
14
+ penalty. This means we always prefer two consistent color changes over
15
+ a nt change, but if a color has high quality, we may prefer one nt
16
+ change.
17
+
18
+ In the above example, the penalties of the two types of decoding are
19
+ q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first;
20
+ otherwise the second. Note that no matter what we choose, the fourth
21
+ base will get a low nt quality.
22
+ */
23
+
24
/* Floor of the color-mismatch penalty: color qualities below it are charged this much. */
#define COLOR_MM 19
/* Penalty for a decoded nucleotide that disagrees with the reference nucleotide. */
#define NUCL_MM 25

/*
 * Color of a nucleotide pair, indexed by (1<<nt1 | 1<<nt2) with nt1 and
 * nt2 in 0..3. Indexes that such a pair can produce hold a color 0..3;
 * every other slot holds 4.
 */
static const int nst_ntnt2cs_table[] = {
	4, 0, 0, 1,
	0, 2, 3, 4,
	0, 3, 2, 4,
	1, 4, 4, 4
};
28
+
29
+ /*
30
+ {A,C,G,T,N} -> {0,1,2,3,4}
31
+ nt_ref[0..size]: nucleotide reference: 0/1/2/3/4
32
+ cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
33
+ nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned)
34
+ btarray[0..4*size]: backtrack array (working space)
35
+ */
36
+ void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray)
37
+ {
38
+ int h[8], curr, last;
39
+ int x, y, xmin, hmin, k;
40
+
41
+ // h[0..3] and h[4..7] are the current and last best score array, depending on curr and last
42
+
43
+ // recursion: initial value
44
+ if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2);
45
+ else {
46
+ for (x = 0; x != 4; ++x) h[x] = NUCL_MM;
47
+ h[nt_ref[0]] = 0;
48
+ }
49
+ // recursion: main loop
50
+ curr = 1; last = 0;
51
+ for (k = 1; k <= size; ++k) {
52
+ for (x = 0; x != 4; ++x) {
53
+ int min = 0x7fffffff, ymin = 0;
54
+ for (y = 0; y != 4; ++y) {
55
+ int s = h[last<<2|y];
56
+ if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<<x|1<<y])
57
+ s += ((cs_read[k-1]&0x3f) < COLOR_MM)? COLOR_MM : (cs_read[k-1]&0x3f); // color mismatch
58
+ if (nt_ref[k] < 4 && nt_ref[k] != x) s += NUCL_MM; // nt mismatch
59
+ if (s < min) {
60
+ min = s; ymin = y;
61
+ }
62
+ }
63
+ h[curr<<2|x] = min; btarray[k<<2|x] = ymin;
64
+ }
65
+ last = curr; curr = 1 - curr; // swap
66
+ }
67
+ // back trace
68
+ hmin = 0x7fffffff; xmin = 0;
69
+ for (x = 0; x != 4; ++x) {
70
+ if (h[last<<2|x] < hmin) {
71
+ hmin = h[last<<2|x]; xmin = x;
72
+ }
73
+ }
74
+ nt_read[size] = xmin;
75
+ for (k = size - 1; k >= 0; --k)
76
+ nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]];
77
+ }
78
+ /*
79
+ nt_read[0..size]: nucleotide read sequence: 0/1/2/3
80
+ cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
81
+ tarray[0..size*2-1]: temporary array
82
+ */
83
+ uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray)
84
+ {
85
+ int k, c1, c2;
86
+ uint8_t *t2array = tarray + size;
87
+ // get the color sequence of nt_read
88
+ c1 = nt_read[0];
89
+ for (k = 1; k <= size; ++k) {
90
+ c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case
91
+ tarray[k-1] = (c1 >= 4 || c2 >= 4)? 4 : nst_ntnt2cs_table[1<<c1 | 1<<c2];
92
+ c1 = c2;
93
+ }
94
+ for (k = 1; k != size; ++k) {
95
+ int q = 0;
96
+ if (tarray[k-1] == cs_read[k-1]>>6 && tarray[k] == cs_read[k]>>6) {
97
+ q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10;
98
+ } else if (tarray[k-1] == cs_read[k-1]>>6) {
99
+ q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f);
100
+ } else if (tarray[k] == cs_read[k]>>6) {
101
+ q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f);
102
+ } // else, q = 0
103
+ if (q < 0) q = 0;
104
+ if (q > 60) q = 60;
105
+ t2array[k] = nt_read[k]<<6 | q;
106
+ if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0;
107
+ }
108
+ return t2array + 1; // of size-2
109
+ }
110
+
111
+ // this function will be called when p->seq has been reversed by refine_gapped()
112
+ void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac)
113
+ {
114
+ uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read;
115
+ int i, len;
116
+ uint8_t *seq;
117
+
118
+ // set temporary arrays
119
+ if (p->type == BWA_TYPE_NO_MATCH) return;
120
+ len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space
121
+ ta = (uint8_t*)malloc(len * 7);
122
+ nt_ref = ta;
123
+ cs_read = nt_ref + len;
124
+ nt_read = cs_read + len;
125
+ btarray = nt_read + len;
126
+ tarray = nt_read + len;
127
+
128
+ #define __gen_csbase(_cs, _i, _seq) do { \
129
+ int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \
130
+ if (q > 60) q = 60; \
131
+ if (_seq[_i] > 3) q = 63; \
132
+ (_cs) = _seq[_i]<<6 | q; \
133
+ } while (0)
134
+
135
+ // generate len, nt_ref[] and cs_read
136
+ seq = p->strand? p->rseq : p->seq;
137
+ nt_ref[0] = p->pos? bns_pac(pac, p->pos-1) : 4;
138
+ if (p->cigar == 0) { // no gap or clipping
139
+ len = p->len;
140
+ for (i = 0; i < p->len; ++i) {
141
+ __gen_csbase(cs_read[i], i, seq);
142
+ nt_ref[i+1] = bns_pac(pac, p->pos + i);
143
+ }
144
+ } else {
145
+ int k, z;
146
+ bwtint_t x, y;
147
+ x = p->pos; y = 0;
148
+ for (k = z = 0; k < p->n_cigar; ++k) {
149
+ int l = __cigar_len(p->cigar[k]);
150
+ if (__cigar_op(p->cigar[k]) == FROM_M) {
151
+ for (i = 0; i < l; ++i, ++x, ++y) {
152
+ __gen_csbase(cs_read[z], y, seq);
153
+ nt_ref[z+1] = bns_pac(pac, x);
154
+ ++z;
155
+ }
156
+ } else if (__cigar_op(p->cigar[k]) == FROM_I) {
157
+ for (i = 0; i < l; ++i, ++y) {
158
+ __gen_csbase(cs_read[z], y, seq);
159
+ nt_ref[z+1] = 4;
160
+ ++z;
161
+ }
162
+ } else if (__cigar_op(p->cigar[k]) == FROM_S) y += l;
163
+ else x += l;
164
+ }
165
+ len = z;
166
+ }
167
+
168
+ cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray);
169
+ new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray);
170
+
171
+ // update p
172
+ p->len = p->full_len = len - 1;
173
+ for (i = 0; i < p->len; ++i) {
174
+ if ((new_nt_read[i]&0x3f) == 63) {
175
+ p->qual[i] = 33; seq[i] = 4;
176
+ } else {
177
+ p->qual[i] = (new_nt_read[i]&0x3f) + 33;
178
+ seq[i] = new_nt_read[i]>>6;
179
+ }
180
+ }
181
+ p->qual[p->len] = seq[p->len] = 0;
182
+ if (p->strand) {
183
+ memcpy(p->seq, seq, p->len);
184
+ seq_reverse(p->len, p->seq, 1);
185
+ seq_reverse(p->len, p->qual, 0);
186
+ } else {
187
+ memcpy(p->rseq, seq, p->len);
188
+ seq_reverse(p->len, p->rseq, 1);
189
+ }
190
+ free(ta);
191
+ }
data/ext/is.c ADDED
@@ -0,0 +1,218 @@
1
+ /*
2
+ * sais.c for sais-lite
3
+ * Copyright (c) 2008 Yuta Mori All Rights Reserved.
4
+ *
5
+ * Permission is hereby granted, free of charge, to any person
6
+ * obtaining a copy of this software and associated documentation
7
+ * files (the "Software"), to deal in the Software without
8
+ * restriction, including without limitation the rights to use,
9
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ * copies of the Software, and to permit persons to whom the
11
+ * Software is furnished to do so, subject to the following
12
+ * conditions:
13
+ *
14
+ * The above copyright notice and this permission notice shall be
15
+ * included in all copies or substantial portions of the Software.
16
+ *
17
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24
+ * OTHER DEALINGS IN THE SOFTWARE.
25
+ */
26
+
27
+ #include <stdlib.h>
28
+
29
/* Byte-sized symbol type used in the signatures of is_sa() and is_bwt(). */
typedef unsigned char ubyte_t;
/*
 * chr(i): symbol i of the text. The macro expands against variables
 * named T and cs that must be in scope where it is used: when cs equals
 * sizeof(int), T is read as an array of int, otherwise as an array of
 * unsigned char.
 */
#define chr(i) (cs == sizeof(int) ? ((const int *)T)[i]:((const unsigned char *)T)[i])
31
+
32
+ /* find the start or end of each bucket */
33
/*
 * Count how often each symbol occurs: on return C[s] is the number of
 * positions in T[0..n-1] holding symbol s, for s in 0..k-1.
 * cs selects the symbol width: sizeof(int) means T points at ints,
 * any other value means T points at bytes.
 */
static void getCounts(const unsigned char *T, int *C, int n, int k, int cs)
{
	int pos;

	for (pos = 0; pos < k; ++pos)
		C[pos] = 0;

	if (cs == sizeof(int)) {
		const int *wide = (const int *)T;
		for (pos = 0; pos < n; ++pos)
			C[wide[pos]] += 1;
	} else {
		for (pos = 0; pos < n; ++pos)
			C[T[pos]] += 1;
	}
}
39
/*
 * Turn the symbol counts C[0..k-1] into bucket boundaries B[0..k-1].
 * With end != 0, B[s] is the index one past the last slot of bucket s
 * (running total including C[s]); with end == 0 it is the index of the
 * first slot (running total excluding C[s]).
 * Each C[s] is read before B[s] is written, so B may be the same array as C.
 */
static void getBuckets(const int *C, int *B, int k, int end)
{
	int sym, running = 0;

	for (sym = 0; sym < k; ++sym) {
		const int count = C[sym];
		B[sym] = end ? running + count : running;
		running += count;
	}
}
54
+
55
/*
 * compute SA
 *
 * Completes SA[0..n-1] from the entries already placed in it by the
 * caller, using two scans: a left-to-right scan that fills buckets from
 * their starts, then a right-to-left scan that fills them from their ends.
 *
 *   T, cs:  the text and its symbol width, read through chr()
 *   C:      symbol counts, k entries
 *   B:      bucket boundaries, k entries; may be the same array as C, in
 *           which case the counts are recomputed before each scan
 *   n, k:   text length and alphabet size
 *
 * Entries are stored bit-complemented (~j) as a marker; both scans only
 * follow entries that are positive when they are read.
 */
static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs)
{
	int *b, i, j; /* b: write cursor inside the bucket of symbol c1 */
	int c0, c1;
	/* compute SAl */
	/* when B shares storage with C the counts were overwritten by boundaries: recount */
	if (C == B) getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 0);	/* find starts of buckets */
	/* seed the scan with the last position of the text */
	j = n - 1;
	b = SA + B[c1 = chr(j)];
	*b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
	for (i = 0; i < n; ++i) {
		j = SA[i], SA[i] = ~j; /* flip the marker of every visited slot */
		if (0 < j) {
			--j;
			if ((c0 = chr(j)) != c1) {
				/* switching buckets: save the cursor of the old one, load the new one */
				B[c1] = b - SA;
				b = SA + B[c1 = c0];
			}
			/* complemented when the symbol before j is smaller, so the loop skips it later */
			*b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
		}
	}
	/* compute SAs */
	if (C == B) getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1);	/* find ends of buckets */
	for (i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) {
		if (0 < (j = SA[i])) {
			--j;
			if ((c0 = chr(j)) != c1) {
				B[c1] = b - SA;
				b = SA + B[c1 = c0];
			}
			/* buckets are filled backwards from their ends in this scan */
			*--b = ((j == 0) || (chr(j - 1) > c1)) ? ~j : j;
		} else SA[i] = ~j; /* undo the marker on entries that are not followed */
	}
}
91
+
92
/*
 * find the suffix array SA of T[0..n-1] in {0..k-1}^n use a working
 * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet
 *
 *   T:   the text; read through chr(), so it holds ints when cs ==
 *        sizeof(int) and bytes otherwise
 *   SA:  output, n entries, followed by fs spare ints
 *   fs:  number of spare ints available right after SA[n-1]; when k <= fs
 *        the bucket arrays live there instead of being malloc'ed
 *   n:   length of the text
 *   k:   alphabet size
 *   cs:  symbol width in bytes
 *
 * Returns 0 on success and -2 when a bucket array cannot be allocated or
 * the recursive call fails.
 */
static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs)
{
	int *C, *B, *RA; /* counts, bucket boundaries, reduced string */
	int i, j, c, m, p, q, plen, qlen, name;
	int c0, c1;
	int diff;

	/* stage 1: reduce the problem by at least 1/2 sort all the
	 * S-substrings */
	if (k <= fs) {
		C = SA + n;
		/* B gets its own k slots only if the spare room holds 2*k ints; otherwise it shares C */
		B = (k <= (fs - k)) ? C + k : C;
	} else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2;
	getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1);	/* find ends of buckets */
	for (i = 0; i < n; ++i) SA[i] = 0;
	/* right-to-left scan; c is 1 while the symbol at i+1 starts a rising run.
	 * When the run ends at i, position i+1 is dropped at the end of its bucket. */
	for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
		if ((c0 = chr(i)) < (c1 + c)) c = 1;
		else if (c != 0) SA[--B[c1]] = i + 1, c = 0;
	}
	induceSA(T, SA, C, B, n, k, cs);
	if (fs < k) free(C); /* only the malloc'ed variant is owned here */
	/* compact all the sorted substrings into the first m items of SA
	 * 2*m must be not larger than n (proveable) */
	for (i = 0, m = 0; i < n; ++i) {
		p = SA[i];
		if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) {
			/* skip the run of equal symbols; keep p only if the run is followed by a larger one */
			for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j);
			if ((j < n) && (c0 < c1)) SA[m++] = p;
		}
	}
	for (i = m; i < n; ++i) SA[i] = 0;	/* init the name array buffer */
	/* store the length of all substrings */
	/* the length of the substring starting at position i+1 goes to slot m + (i+1)/2 */
	for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
		if ((c0 = chr(i)) < (c1 + c)) c = 1;
		else if (c != 0) {
			SA[m + ((i + 1) >> 1)] = j - i - 1;
			j = i + 1;
			c = 0;
		}
	}
	/* find the lexicographic names of all substrings */
	/* q/qlen remember the previous substring; a new name is issued whenever the current one differs */
	for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) {
		p = SA[i], plen = SA[m + (p >> 1)], diff = 1;
		if (plen == qlen) {
			for (j = 0; (j < plen) && (chr(p + j) == chr(q + j)); j++);
			if (j == plen) diff = 0;
		}
		if (diff != 0) ++name, q = p, qlen = plen;
		SA[m + (p >> 1)] = name; /* the length slot is overwritten with the name */
	}

	/* stage 2: solve the reduced problem recurse if names are not yet
	 * unique */
	if (name < m) {
		/* the reduced string RA occupies the last m ints of SA plus its spare room */
		RA = SA + n + fs - m;
		for (i = n - 1, j = m - 1; m <= i; --i) {
			if (SA[i] != 0) RA[j--] = SA[i] - 1; /* names become 0-based symbols */
		}
		/* RA is an int text with alphabet size name; what is left before it serves as spare room */
		if (sais_main((unsigned char *) RA, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2;
		/* same scan as in stage 1, now collecting the text positions into RA */
		for (i = n - 2, j = m - 1, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
			if ((c0 = chr(i)) < (c1 + c)) c = 1;
			else if (c != 0) RA[j--] = i + 1, c = 0; /* get p1 */
		}
		for (i = 0; i < m; ++i) SA[i] = RA[SA[i]]; /* get index */
	}
	/* stage 3: induce the result for the original problem */
	if (k <= fs) {
		C = SA + n;
		B = (k <= (fs - k)) ? C + k : C;
	} else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2;
	/* put all left-most S characters into their buckets */
	getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1);	/* find ends of buckets */
	for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
	/* walk the m sorted positions backwards, moving each to the end of its bucket */
	for (i = m - 1; 0 <= i; --i) {
		j = SA[i], SA[i] = 0;
		SA[--B[chr(j)]] = j;
	}
	induceSA(T, SA, C, B, n, k, cs);
	if (fs < k) free(C);
	return 0;
}
179
+
180
+ /**
181
+ * Constructs the suffix array of a given string.
182
+ * @param T[0..n-1] The input string.
183
+ * @param SA[0..n] The output array of suffixes.
184
+ * @param n The length of the given string.
185
+ * @return 0 if no error occurred
186
+ */
187
+ int is_sa(const ubyte_t *T, int *SA, int n)
188
+ {
189
+ if ((T == NULL) || (SA == NULL) || (n < 0)) return -1;
190
+ SA[0] = n;
191
+ if (n <= 1) {
192
+ if (n == 1) SA[1] = 0;
193
+ return 0;
194
+ }
195
+ return sais_main(T, SA+1, 0, n, 256, 1);
196
+ }
197
+
198
+ /**
199
+ * Constructs the burrows-wheeler transformed string of a given string.
200
+ * @param T[0..n-1] The input string.
201
+ * @param n The length of the given string.
202
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
203
+ */
204
+ int is_bwt(ubyte_t *T, int n)
205
+ {
206
+ int *SA, i, primary = 0;
207
+ SA = (int*)calloc(n+1, sizeof(int));
208
+ is_sa(T, SA, n);
209
+
210
+ for (i = 0; i <= n; ++i) {
211
+ if (SA[i] == 0) primary = i;
212
+ else SA[i] = T[SA[i] - 1];
213
+ }
214
+ for (i = 0; i < primary; ++i) T[i] = SA[i];
215
+ for (; i < n; ++i) T[i] = SA[i + 1];
216
+ free(SA);
217
+ return primary;
218
+ }