bio-bwa 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/bwt.h ADDED
@@ -0,0 +1,105 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2008 Genome Research Ltd (GRL).
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
+
28
+ #ifndef BWA_BWT_H
29
+ #define BWA_BWT_H
30
+
31
+ #include <stdint.h>
32
+
33
+ // requirement: (OCC_INTERVAL%16 == 0)
34
+ #define OCC_INTERVAL 0x80
35
+
36
+ #ifndef BWA_UBYTE
37
+ #define BWA_UBYTE
38
+ typedef unsigned char ubyte_t;
39
+ #endif
40
+ typedef uint32_t bwtint_t;
41
+
42
+ typedef struct {
43
+ bwtint_t primary; // S^{-1}(0), or the primary index of BWT
44
+ bwtint_t L2[5]; // C(), cumulative count
45
+ bwtint_t seq_len; // sequence length
46
+ bwtint_t bwt_size; // size of bwt, about seq_len/4
47
+ uint32_t *bwt; // BWT
48
+ // occurrence array, separated to two parts
49
+ uint32_t cnt_table[256];
50
+ // suffix array
51
+ int sa_intv;
52
+ bwtint_t n_sa;
53
+ bwtint_t *sa;
54
+ } bwt_t;
55
+
56
+ #define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL*12 + 4 + (k)%OCC_INTERVAL/16])
57
+
58
+ /* retrieve a character from the $-removed BWT string. Note that
59
+ * bwt_t::bwt is not exactly the BWT string and therefore this macro is
60
+ * called bwt_B0 instead of bwt_B */
61
+ #define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3)
62
+
63
+ #define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL*12)
64
+
65
+ // inverse Psi function
66
+ #define bwt_invPsi(bwt, k) \
67
+ (((k) == (bwt)->primary)? 0 : \
68
+ ((k) < (bwt)->primary)? \
69
+ (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \
70
+ : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1)))
71
+
72
+ #ifdef __cplusplus
73
+ extern "C" {
74
+ #endif
75
+
76
+ void bwt_dump_bwt(const char *fn, const bwt_t *bwt);
77
+ void bwt_dump_sa(const char *fn, const bwt_t *bwt);
78
+
79
+ bwt_t *bwt_restore_bwt(const char *fn);
80
+ void bwt_restore_sa(const char *fn, bwt_t *bwt);
81
+
82
+ void bwt_destroy(bwt_t *bwt);
83
+
84
+ void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
85
+ void bwt_cal_sa(bwt_t *bwt, int intv);
86
+
87
+ void bwt_bwtupdate_core(bwt_t *bwt);
88
+
89
+ inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
90
+ inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
91
+ bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k);
92
+
93
+ // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
94
+ void bwt_gen_cnt_table(bwt_t *bwt);
95
+ inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
96
+ inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
97
+
98
+ int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end);
99
+ int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0);
100
+
101
+ #ifdef __cplusplus
102
+ }
103
+ #endif
104
+
105
+ #endif
@@ -0,0 +1,23 @@
1
# Builds libbwtgen.a, the static library holding the BWT construction code
# (bwt_gen.o and QSufSort.o).
CC=			gcc
CFLAGS=		-g -Wall -O2 -m64 # comment out `-m64' for 32-bit compilation
DFLAGS=		-D_FILE_OFFSET_BITS=64
OBJS=		bwt_gen.o QSufSort.o
INCLUDES=
VERSION=	0.1.0
LIBS=
SUBDIRS=

.SUFFIXES:.c .o

# lib, cleanlocal and clean name actions, not files: declare them phony so a
# stray file called `lib' or `clean' cannot make the target look up to date.
.PHONY: lib cleanlocal clean

.c.o:
		$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@

lib:libbwtgen.a

libbwtgen.a:$(OBJS)
		$(AR) -cru $@ $(OBJS)

# $(PROG) is not set in this makefile; it expands to nothing unless supplied
# on the command line or by the environment.
cleanlocal:
		rm -f gmon.out *.o a.out $(PROG) *~ *.a

clean:cleanlocal
@@ -0,0 +1,496 @@
1
+ /* QSufSort.c
2
+
3
+ Original source from qsufsort.c
4
+
5
+ Copyright 1999, N. Jesper Larsson, all rights reserved.
6
+
7
+ This file contains an implementation of the algorithm presented in "Faster
8
+ Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
9
+ Sadakane (sada@is.s.u-tokyo.ac.jp).
10
+
11
+ This software may be used freely for any purpose. However, when distributed,
12
+ the original source must be clearly stated, and, when the source code is
13
+ distributed, the copyright notice must be retained and any alterations in
14
+ the code must be clearly marked. No warranty is given regarding the quality
15
+ of this software.
16
+
17
+ Modified by Wong Chi-Kwong, 2004
18
+
19
+ Changes summary: - Used long variable and function names
20
+ - Removed global variables
21
+ - Replace pointer references with array references
22
+ - Used insertion sort in place of selection sort and increased insertion sort threshold
23
+ - Reconstructing suffix array from inverse becomes an option
24
+ - Add handling where end-of-text symbol is not necessary < all characters
25
+ - Removed codes for supporting alphabet size > number of characters
26
+
27
+ No warranty is given regarding the quality of the modifications.
28
+
29
+ */
30
+
31
+
32
+ #include <stdio.h>
33
+ #include <stdlib.h>
34
+ #include <limits.h>
35
+ #include "bwt_gen.h"
36
+ #include "QSufSort.h"
37
+
38
+ // Static functions
39
+ static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
40
+ const int highestPos, const int numSortedChar);
41
+ static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos,
42
+ const int highestPos, const int numSortedChar);
43
+ static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
44
+ const int highestPos, const int numSortedChar);
45
+ static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize);
46
+ static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
47
+ const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated);
48
+
49
// from MiscUtilities.c
/* Counts the leading zero bits of `input`, treated as a 32-bit value.
 *
 * The most significant non-zero byte is located with three mask tests and a
 * 256-entry table supplies the leading-zero count inside that byte.
 *
 * Returns a value in 0..32; 32 is returned for input == 0. */
static unsigned int leadingZero(const unsigned int input) {

	/* leadingZero8bit[b] = number of leading zero bits in the byte b (8 for b == 0). */
	static const unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
													  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
													  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
													  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
													  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
													  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
													  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
													  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

	if (input & 0xFFFF0000) {
		/* Highest set bit is in the upper half-word. */
		if (input & 0xFF000000) {
			return leadingZero8bit[input >> 24];
		}
		return 8 + leadingZero8bit[input >> 16];
	}

	/* Upper half-word is empty: look at the lower two bytes. */
	if (input & 0x0000FF00) {
		return 16 + leadingZero8bit[input >> 8];
	}
	return 24 + leadingZero8bit[input];

}
78
+
79
/* Suffix-sorts the text held in V by repeated refinement of groups.

   V and I are both used with indices 0..numChar (numChar+1 entries each).
   On entry V[0...numChar-1] holds symbols in the range
   smallestInputSymbol...largestInputSymbol. The original content of
   V[numChar] is disregarded, that position being regarded as an
   end-of-string symbol smaller than all other symbols.

   Unless skipTransform is non-zero, the text is first re-encoded by
   QSufSortTransform() and bucket-sorted by QSufSortBucketSort(), and the
   initial sorted depth is the number of symbols aggregated by the transform.
   With skipTransform set, V and I are used as passed in and the sorted depth
   starts at 1.
   NOTE(review): in that case the caller presumably has to supply V and I
   already in the group form produced by the bucket sort - confirm against
   the callers.

   Each pass walks I from the front: a negative entry -len marks len
   consecutive sorted positions, which are skipped and merged with
   neighbouring sorted runs; a non-negative entry starts an unsorted group
   whose last position is V[I[i]], and that group is refined by
   QSufSortSortSplit(). The sorted depth doubles after every pass. Sorting
   stops once I[0] < -numChar, i.e. one sorted run covers every position.

   On return V is the inverse of the suffix array. I holds sorted-run markers
   rather than a usable suffix array; QSufSortGenerateSaFromInverse() fills I
   from V when it is needed. */
void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
						const int smallestInputSymbol, const int skipTransform) {

	int i, j;
	int s, negatedSortedGroupLength;
	int numSymbolAggregated;
	int numSortedPos = 1;
	int newAlphabetSize;

	if (!skipTransform) {
		/* bucketing possible*/
		newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol,
											numChar, &numSymbolAggregated);
		QSufSortBucketSort(V, I, numChar, newAlphabetSize);
		I[0] = -1;
		V[numChar] = 0;
		numSortedPos = numSymbolAggregated;
	}

	while (I[0] >= -numChar) {
		i = 0;
		negatedSortedGroupLength = 0;
		do {
			s = I[i];
			if (s < 0) {
				i -= s;						/* skip over sorted group.*/
				negatedSortedGroupLength += s;
			} else {
				if (negatedSortedGroupLength) {
					I[i+negatedSortedGroupLength] = negatedSortedGroupLength;	/* combine preceding sorted groups */
					negatedSortedGroupLength = 0;
				}
				j = V[s] + 1;				/* V[s] is the last position of the group starting at i.*/
				QSufSortSortSplit(V, I, i, j - 1, numSortedPos);
				i = j;
			}
		} while (i <= numChar);
		if (negatedSortedGroupLength) {
			/* array ends with a sorted group.*/
			I[i+negatedSortedGroupLength] = negatedSortedGroupLength;	/* combine sorted groups at end of I.*/
		}
		numSortedPos *= 2;	/* double sorted-depth.*/
	}

}
131
+
132
/* Fills I from the inverse array V.

   For every text position 0..numChar (inclusive) the entry of I selected by
   that position's value in V receives the position plus one, i.e. the stored
   suffix numbers are one-based. */
void QSufSortGenerateSaFromInverse(const int* V, int* __restrict I, const int numChar) {

	int suffix = 0;

	while (suffix <= numChar) {
		const int rank = V[suffix];
		suffix++;
		I[rank] = suffix;	/* one-based suffix number */
	}

}
140
+
141
+ /* Sorting routine called for each unsorted group. Sorts the array of integers
142
+ (suffix numbers) of length n starting at p. The algorithm is a ternary-split
143
+ quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
144
+ Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
145
+ function is based on Program 7.*/
146
+ static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
147
+ const int highestPos, const int numSortedChar) {
148
+
149
+ int a, b, c, d;
150
+ int l, m;
151
+ int f, v, s, t;
152
+ int tmp;
153
+ int numItem;
154
+
155
+ #ifdef DEBUG
156
+ if (lowestPos > highestPos) {
157
+ fprintf(stderr, "QSufSortSortSplit(): lowestPos > highestPos!\n");
158
+ exit(1);
159
+ }
160
+ #endif
161
+
162
+ numItem = highestPos - lowestPos + 1;
163
+
164
+ if (numItem <= INSERT_SORT_NUM_ITEM) {
165
+ QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar);
166
+ return;
167
+ }
168
+
169
+ v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar);
170
+
171
+ a = b = lowestPos;
172
+ c = d = highestPos;
173
+
174
+ while (TRUE) {
175
+ while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) {
176
+ if (f == v) {
177
+ swap(I[a], I[b], tmp);
178
+ a++;
179
+ }
180
+ b++;
181
+ }
182
+ while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) {
183
+ if (f == v) {
184
+ swap(I[c], I[d], tmp);
185
+ d--;
186
+ }
187
+ c--;
188
+ }
189
+ if (b > c) {
190
+ break;
191
+ }
192
+ swap(I[b], I[c], tmp);
193
+ b++;
194
+ c--;
195
+ }
196
+
197
+ s = a - lowestPos;
198
+ t = b - a;
199
+ s = min(s, t);
200
+ for (l = lowestPos, m = b - s; m < b; l++, m++) {
201
+ swap(I[l], I[m], tmp);
202
+ }
203
+
204
+ s = d - c;
205
+ t = highestPos - d;
206
+ s = min(s, t);
207
+ for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) {
208
+ swap(I[l], I[m], tmp);
209
+ }
210
+
211
+ s = b - a;
212
+ t = d - c;
213
+ if (s > 0) {
214
+ QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar);
215
+ }
216
+
217
+ // Update group number for equal portion
218
+ a = lowestPos + s;
219
+ b = highestPos - t;
220
+ if (a == b) {
221
+ // Sorted group
222
+ V[I[a]] = a;
223
+ I[a] = -1;
224
+ } else {
225
+ // Unsorted group
226
+ for (c=a; c<=b; c++) {
227
+ V[I[c]] = b;
228
+ }
229
+ }
230
+
231
+ if (t > 0) {
232
+ QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar);
233
+ }
234
+
235
+ }
236
+
237
/* Picks the pivot key for QSufSortSortSplit() (algorithm by Bentley & McIlroy).

   Nine keys are sampled from I[lowestPos...highestPos] - three near the low
   end, three around the middle and three near the high end, spaced one
   eighth of the group size apart - and the median of the three per-region
   medians is returned. */
static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos,
							   const int highestPos, const int numSortedChar) {

	const int numItem = highestPos - lowestPos + 1;
	int middle, step;
	int key1, key2, key3;
	int medianLow, medianMiddle, medianHigh;

#ifdef DEBUG
	if (lowestPos > highestPos) {
		fprintf(stderr, "QSufSortChoosePivot(): lowestPos > highestPos!\n");
		exit(1);
	}
	if (numItem <= INSERT_SORT_NUM_ITEM) {
		fprintf(stderr, "QSufSortChoosePivot(): number of items <= INSERT_SORT_NUM_ITEM!\n");
		exit(1);
	}
#endif

	middle = lowestPos + numItem / 2;
	step = numItem / 8;

	/* low end */
	key1 = KEY(V, I, lowestPos, numSortedChar);
	key2 = KEY(V, I, lowestPos+step, numSortedChar);
	key3 = KEY(V, I, lowestPos+2*step, numSortedChar);
	medianLow = med3(key1, key2, key3);

	/* middle */
	key1 = KEY(V, I, middle-step, numSortedChar);
	key2 = KEY(V, I, middle, numSortedChar);
	key3 = KEY(V, I, middle+step, numSortedChar);
	medianMiddle = med3(key1, key2, key3);

	/* high end */
	key1 = KEY(V, I, highestPos-2*step, numSortedChar);
	key2 = KEY(V, I, highestPos-step, numSortedChar);
	key3 = KEY(V, I, highestPos, numSortedChar);
	medianHigh = med3(key1, key2, key3);

	return med3(medianLow, medianMiddle, medianHigh);

}
283
+
284
+ /* Quadratic sorting method to use for small subarrays. */
285
+ static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
286
+ const int highestPos, const int numSortedChar) {
287
+
288
+ int i, j;
289
+ int tmpKey, tmpPos;
290
+ int numItem;
291
+ int key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM];
292
+ int negativeSortedLength;
293
+ int groupNum;
294
+
295
+ #ifdef DEBUG
296
+ if (lowestPos > highestPos) {
297
+ fprintf(stderr, "QSufSortInsertSortSplit(): lowestPos > highestPos!\n");
298
+ exit(1);
299
+ }
300
+ #endif
301
+
302
+ numItem = highestPos - lowestPos + 1;
303
+
304
+ #ifdef DEBUG
305
+ if (numItem > INSERT_SORT_NUM_ITEM) {
306
+ fprintf(stderr, "QSufSortInsertSortSplit(): number of items > INSERT_SORT_NUM_ITEM!\n");
307
+ exit(1);
308
+ }
309
+ #endif
310
+
311
+ for (i=0; i<numItem; i++) {
312
+ #ifdef DEBUG
313
+ if (I[lowestPos + i] < 0) {
314
+ fprintf(stderr, "QSufSortInsertSortSplit(): I < 0 in unsorted region!\n");
315
+ exit(1);
316
+ }
317
+ #endif
318
+ pos[i] = I[lowestPos + i];
319
+ key[i] = V[pos[i] + numSortedChar];
320
+ }
321
+
322
+ for (i=1; i<numItem; i++) {
323
+ tmpKey = key[i];
324
+ tmpPos = pos[i];
325
+ for (j=i; j>0 && key[j-1] > tmpKey; j--) {
326
+ key[j] = key[j-1];
327
+ pos[j] = pos[j-1];
328
+ }
329
+ key[j] = tmpKey;
330
+ pos[j] = tmpPos;
331
+ }
332
+
333
+ negativeSortedLength = -1;
334
+
335
+ i = numItem - 1;
336
+ groupNum = highestPos;
337
+ while (i > 0) {
338
+ I[i+lowestPos] = pos[i];
339
+ V[I[i+lowestPos]] = groupNum;
340
+ if (key[i-1] == key[i]) {
341
+ negativeSortedLength = 0;
342
+ } else {
343
+ if (negativeSortedLength < 0) {
344
+ I[i+lowestPos] = negativeSortedLength;
345
+ }
346
+ groupNum = i + lowestPos - 1;
347
+ negativeSortedLength--;
348
+ }
349
+ i--;
350
+ }
351
+
352
+ I[lowestPos] = pos[0];
353
+ V[I[lowestPos]] = groupNum;
354
+ if (negativeSortedLength < 0) {
355
+ I[lowestPos] = negativeSortedLength;
356
+ }
357
+
358
+ }
359
+
360
/* Bucketsort for the first iteration.

   Input: V[0...numChar-1] holds integers in the range 1...alphabetSize-1,
   all of which appear at least once, and V[numChar] is 0 (this is the output
   of the transform). alphabetSize must be at most numChar+1. I is an array
   of numChar+1 entries whose contents are disregarded.

   Output: for every position p, V[p] is the group number of suffix p, namely
   the last index in I of the group of suffixes starting with the same
   symbol. I lists the suffixes of each multi-member group in that group's
   index range; a group with a single member is marked as sorted by -1. */
static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize) {

	int symbol, pos;
	int head, next;
	int slot;
	int group;

	// mark linked list empty
	for (symbol = 0; symbol < alphabetSize; symbol++) {
		I[symbol] = -1;
	}

	// insert to linked list: I[symbol] is the list head, V[pos] the link to
	// the previous position carrying the same symbol
	for (pos = 0; pos <= numChar; pos++) {
		symbol = V[pos];
		V[pos] = I[symbol];
		I[symbol] = pos;
	}

	// unwind the lists from the largest symbol down, filling I from the back
	slot = numChar;
	symbol = alphabetSize;
	while (symbol > 0) {
		symbol--;
		head = I[symbol];
		next = V[head];
		group = slot;
		V[head] = group;
		if (next < 0) {
			// sorted group
			I[slot] = -1;
		} else {
			I[slot] = head;
			do {
				head = next;
				next = V[head];
				V[head] = group;
				slot--;
				I[slot] = head;
			} while (next >= 0);
		}
		slot--;
	}

}
411
+
412
+ /* Transforms the alphabet of x by attempting to aggregate several symbols into
413
+ one, while preserving the suffix order of x. The alphabet may also be
414
+ compacted, so that x on output comprises all integers of the new alphabet
415
+ with no skipped numbers.
416
+
417
+ Input: x is an array of size n+1 whose first n elements are positive
418
+ integers in the range l...k-1. p is array of size n+1, used for temporary
419
+ storage. q controls aggregation and compaction by defining the maximum intue
420
+ for any symbol during transformation: q must be at least k-l; if q<=n,
421
+ compaction is guaranteed; if k-l>n, compaction is never done; if q is
422
+ INT_MAX, the maximum number of symbols are aggregated into one.
423
+
424
+ Output: Returns an integer j in the range 1...q representing the size of the
425
+ new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
426
+ set to the number of old symbols grouped into one. Only x[n] is 0.*/
427
+ static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
428
+ const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated) {
429
+
430
+ int c, i, j;
431
+ int a; // numSymbolAggregated
432
+ int mask;
433
+ int minSymbolInChunk = 0, maxSymbolInChunk = 0;
434
+ int newAlphabetSize;
435
+ int maxNumInputSymbol, maxNumBit, maxSymbol;
436
+
437
+ maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
438
+
439
+ maxNumBit = BITS_IN_WORD - leadingZero(maxNumInputSymbol);
440
+ maxSymbol = INT_MAX >> maxNumBit;
441
+
442
+ c = maxNumInputSymbol;
443
+ for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) {
444
+ minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1);
445
+ maxSymbolInChunk = c;
446
+ c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol;
447
+ }
448
+
449
+ mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/
450
+ V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/
451
+
452
+ #ifdef DEBUG
453
+ // Section of code for maxSymbolInChunk > numChar removed!
454
+ if (maxSymbolInChunk > numChar) {
455
+ fprintf(stderr, "QSufSortTransform(): maxSymbolInChunk > numChar!\n");
456
+ exit(1);
457
+ }
458
+ #endif
459
+
460
+ /* bucketing possible, compact alphabet.*/
461
+ for (i=0; i<=maxSymbolInChunk; i++) {
462
+ I[i] = 0; /* zero transformation table.*/
463
+ }
464
+ c = minSymbolInChunk;
465
+ for (i=a; i<=numChar; i++) {
466
+ I[c] = 1; /* mark used chunk symbol.*/
467
+ c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
468
+ }
469
+ for (i=1; i<a; i++) { /* handle last r-1 positions.*/
470
+ I[c] = 1; /* mark used chunk symbol.*/
471
+ c = (c & mask) << maxNumBit; /* shift in next old symbol in chunk.*/
472
+ }
473
+ newAlphabetSize = 1;
474
+ for (i=0; i<=maxSymbolInChunk; i++) {
475
+ if (I[i]) {
476
+ I[i] = newAlphabetSize;
477
+ newAlphabetSize++;
478
+ }
479
+ }
480
+ c = minSymbolInChunk;
481
+ for (i=0, j=a; j<=numChar; i++, j++) {
482
+ V[i] = I[c]; /* transform to new alphabet.*/
483
+ c = ((c & mask) << maxNumBit) | (V[j] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
484
+ }
485
+ for (; i<numChar; i++) { /* handle last a-1 positions.*/
486
+ V[i] = I[c]; /* transform to new alphabet.*/
487
+ c = (c & mask) << maxNumBit; /* shift right-end zero in chunk.*/
488
+ }
489
+
490
+ V[numChar] = 0; /* end-of-string symbol is zero.*/
491
+
492
+ *numSymbolAggregated = a;
493
+ return newAlphabetSize;
494
+
495
+ }
496
+