bio-bwa 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
@@ -0,0 +1,40 @@
1
+ /* QSufSort.h
2
+
3
+ Header file for QSufSort.c
4
+
5
+ This file contains an implementation of the algorithm presented in "Faster
6
+ Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
7
+ Sadakane (sada@is.s.u-tokyo.ac.jp).
8
+
9
+ This software may be used freely for any purpose. However, when distributed,
10
+ the original source must be clearly stated, and, when the source code is
11
+ distributed, the copyright notice must be retained and any alterations in
12
+ the code must be clearly marked. No warranty is given regarding the quality
13
+ of this software.
14
+
15
+ Modified by Wong Chi-Kwong, 2004
16
+
17
+ Changes summary: - Used long variable and function names
18
+ - Removed global variables
19
+ - Replace pointer references with array references
20
+ - Used insertion sort in place of selection sort and increased insertion sort threshold
21
+ - Reconstructing suffix array from inverse becomes an option
22
+ - Add handling where end-of-text symbol is not necessary < all characters
23
+ - Removed codes for supporting alphabet size > number of characters
24
+
25
+ No warrenty is given regarding the quality of the modifications.
26
+
27
+ */
28
+
29
+ #ifndef __QSUFSORT_H__
30
+ #define __QSUFSORT_H__
31
+
32
+ #define KEY(V, I, p, h) ( V[ I[p] + h ] )
33
+ #define INSERT_SORT_NUM_ITEM 16
34
+
35
+ void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
36
+ const int smallestInputSymbol, const int skipTransform);
37
+ void QSufSortGenerateSaFromInverse(const int *V, int* __restrict I, const int numChar);
38
+
39
+
40
+ #endif
@@ -0,0 +1,1547 @@
1
+ /*
2
+
3
+ BWTConstruct.c BWT-Index Construction
4
+
5
+ This module constructs BWT and auxiliary data structures.
6
+
7
+ Copyright (C) 2004, Wong Chi Kwong.
8
+
9
+ This program is free software; you can redistribute it and/or
10
+ modify it under the terms of the GNU General Public License
11
+ as published by the Free Software Foundation; either version 2
12
+ of the License, or (at your option) any later version.
13
+
14
+ This program is distributed in the hope that it will be useful,
15
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ GNU General Public License for more details.
18
+
19
+ You should have received a copy of the GNU General Public License
20
+ along with this program; if not, write to the Free Software
21
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22
+
23
+ */
24
+
25
+ #include <stdio.h>
26
+ #include <stdlib.h>
27
+ #include <string.h>
28
+ #include "bwt_gen.h"
29
+ #include "QSufSort.h"
30
+
31
+ static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar,
32
+ unsigned int lastByteLength)
33
+ {
34
+ if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) {
35
+ fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n");
36
+ exit(1);
37
+ }
38
+ return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength;
39
+ }
40
+
41
+ static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue)
42
+ {
43
+ unsigned int i;
44
+ for (i=0; i<length; i++) startAddr[i] = initValue;
45
+ }
46
+
47
+ static void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable)
48
+ {
49
+ unsigned int i, j, c, t;
50
+
51
+ for (i=0; i<DNA_OCC_CNT_TABLE_SIZE_IN_WORD; i++) {
52
+ dnaDecodeTable[i] = 0;
53
+ c = i;
54
+ for (j=0; j<8; j++) {
55
+ t = c & 0x00000003;
56
+ dnaDecodeTable[i] += 1 << (t * 8);
57
+ c >>= 2;
58
+ }
59
+ }
60
+
61
+ }
62
+ // for BWTIncCreate()
63
+ static unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar)
64
+ {
65
+ unsigned int numOfOccValue;
66
+ unsigned int numOfOccIntervalPerMajor;
67
+ numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
68
+ numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
69
+ return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE;
70
+ }
71
+ // for BWTIncCreate()
72
+ static unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar)
73
+ {
74
+ unsigned int numOfOccValue;
75
+ numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
76
+ return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE;
77
+ }
78
+ // for BWTIncCreate()
79
+ static unsigned int BWTResidentSizeInWord(const unsigned int numChar) {
80
+
81
+ unsigned int numCharRoundUpToOccInterval;
82
+
83
+ // The $ in BWT at the position of inverseSa0 is not encoded
84
+ numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL;
85
+
86
+ return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
87
+
88
+ }
89
+
90
+ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc)
91
+ {
92
+ unsigned int maxBuildSize;
93
+
94
+ if (bwtInc->bwt->textLength == 0) {
95
+ // initial build
96
+ // Minus 2 because n+1 entries of seq and rank needed for n char
97
+ maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD)
98
+ / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD;
99
+ if (bwtInc->initialMaxBuildSize > 0) {
100
+ bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize);
101
+ } else {
102
+ bwtInc->buildSize = maxBuildSize;
103
+ }
104
+ } else {
105
+ // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char
106
+ // Minus numberOfIterationDone because bwt slightly shift to left in each iteration
107
+ maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3
108
+ - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR)
109
+ / 3;
110
+ if (maxBuildSize < CHAR_PER_WORD) {
111
+ fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
112
+ exit(1);
113
+ }
114
+ if (bwtInc->incMaxBuildSize > 0) {
115
+ bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize);
116
+ } else {
117
+ bwtInc->buildSize = maxBuildSize;
118
+ }
119
+ if (bwtInc->buildSize < CHAR_PER_WORD) {
120
+ bwtInc->buildSize = CHAR_PER_WORD;
121
+ }
122
+ }
123
+
124
+ if (bwtInc->buildSize < CHAR_PER_WORD) {
125
+ fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
126
+ exit(1);
127
+ }
128
+
129
+ bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
130
+
131
+ bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1);
132
+ bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1);
133
+
134
+ }
135
+
136
+ // for ceilLog2()
137
+ unsigned int leadingZero(const unsigned int input)
138
+ {
139
+ unsigned int l;
140
+ const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
141
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
142
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
143
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
144
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
145
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
146
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
147
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
148
+
149
+ if (input & 0xFFFF0000) {
150
+ if (input & 0xFF000000) {
151
+ l = leadingZero8bit[input >> 24];
152
+ } else {
153
+ l = 8 + leadingZero8bit[input >> 16];
154
+ }
155
+ } else {
156
+ if (input & 0x0000FF00) {
157
+ l = 16 + leadingZero8bit[input >> 8];
158
+ } else {
159
+ l = 24 + leadingZero8bit[input];
160
+ }
161
+ }
162
+ return l;
163
+
164
+ }
165
+ // for BitPerBytePackedChar()
166
+ static unsigned int ceilLog2(const unsigned int input)
167
+ {
168
+ if (input <= 1) return 0;
169
+ return BITS_IN_WORD - leadingZero(input - 1);
170
+
171
+ }
172
+ // for ConvertBytePackedToWordPacked()
173
+ static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize)
174
+ {
175
+ unsigned int bitPerChar;
176
+ bitPerChar = ceilLog2(alphabetSize);
177
+ // Return the largest number of bit that does not affect packing efficiency
178
+ if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar)
179
+ bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar);
180
+ return bitPerChar;
181
+ }
182
+ // for ConvertBytePackedToWordPacked()
183
+ static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize)
184
+ {
185
+ return ceilLog2(alphabetSize);
186
+ }
187
+
188
+ static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize,
189
+ const unsigned int textLength)
190
+ {
191
+ unsigned int i, j, k;
192
+ unsigned int c;
193
+ unsigned int bitPerBytePackedChar;
194
+ unsigned int bitPerWordPackedChar;
195
+ unsigned int charPerWord;
196
+ unsigned int charPerByte;
197
+ unsigned int bytePerIteration;
198
+ unsigned int byteProcessed = 0;
199
+ unsigned int wordProcessed = 0;
200
+ unsigned int mask, shift;
201
+
202
+ unsigned int buffer[BITS_IN_WORD];
203
+
204
+ bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize);
205
+ bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize);
206
+ charPerByte = BITS_IN_BYTE / bitPerBytePackedChar;
207
+ charPerWord = BITS_IN_WORD / bitPerWordPackedChar;
208
+
209
+ bytePerIteration = charPerWord / charPerByte;
210
+ mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar);
211
+ shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar;
212
+
213
+ while ((wordProcessed + 1) * charPerWord < textLength) {
214
+
215
+ k = 0;
216
+ for (i=0; i<bytePerIteration; i++) {
217
+ c = (unsigned int)input[byteProcessed] << shift;
218
+ for (j=0; j<charPerByte; j++) {
219
+ buffer[k] = c & mask;
220
+ c <<= bitPerBytePackedChar;
221
+ k++;
222
+ }
223
+ byteProcessed++;
224
+ }
225
+
226
+ c = 0;
227
+ for (i=0; i<charPerWord; i++) {
228
+ c |= buffer[i] >> bitPerWordPackedChar * i;
229
+ }
230
+ output[wordProcessed] = c;
231
+ wordProcessed++;
232
+
233
+ }
234
+
235
+ k = 0;
236
+ for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) {
237
+ c = (unsigned int)input[byteProcessed] << shift;
238
+ for (j=0; j<charPerByte; j++) {
239
+ buffer[k] = c & mask;
240
+ c <<= bitPerBytePackedChar;
241
+ k++;
242
+ }
243
+ byteProcessed++;
244
+ }
245
+
246
+ c = 0;
247
+ for (i=0; i<textLength - wordProcessed * charPerWord; i++) {
248
+ c |= buffer[i] >> bitPerWordPackedChar * i;
249
+ }
250
+ output[wordProcessed] = c;
251
+ }
252
+
253
+ BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable)
254
+ {
255
+ BWT *bwt;
256
+
257
+ bwt = (BWT*)calloc(1, sizeof(BWT));
258
+
259
+ bwt->textLength = 0;
260
+ bwt->inverseSa = 0;
261
+
262
+ bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*));
263
+ initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0);
264
+
265
+ bwt->bwtSizeInWord = 0;
266
+ bwt->saValueOnBoundary = NULL;
267
+
268
+ // Generate decode tables
269
+ if (decodeTable == NULL) {
270
+ bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int));
271
+ GenerateDNAOccCountTable(bwt->decodeTable);
272
+ } else {
273
+ bwt->decodeTable = decodeTable;
274
+ }
275
+
276
+ bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength);
277
+ bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int));
278
+
279
+ bwt->occSizeInWord = 0;
280
+ bwt->occValue = NULL;
281
+
282
+ bwt->saInterval = ALL_ONE_MASK;
283
+ bwt->saValueSize = 0;
284
+ bwt->saValue = NULL;
285
+
286
+ bwt->inverseSaInterval = ALL_ONE_MASK;
287
+ bwt->inverseSaSize = 0;
288
+ bwt->inverseSa = NULL;
289
+
290
+ return bwt;
291
+ }
292
+
293
+ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit,
294
+ const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
295
+ {
296
+ BWTInc *bwtInc;
297
+ unsigned int i;
298
+
299
+ if (targetNBit == 0) {
300
+ fprintf(stderr, "BWTIncCreate() : targetNBit = 0!\n");
301
+ exit(1);
302
+ }
303
+
304
+ bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc));
305
+ bwtInc->numberOfIterationDone = 0;
306
+ bwtInc->bwt = BWTCreate(textLength, NULL);
307
+ bwtInc->initialMaxBuildSize = initialMaxBuildSize;
308
+ bwtInc->incMaxBuildSize = incMaxBuildSize;
309
+ bwtInc->targetNBit = targetNBit;
310
+ bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int));
311
+ initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
312
+
313
+ // Build frequently accessed data
314
+ bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int));
315
+ for (i=0; i<CHAR_PER_WORD; i++) {
316
+ bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR;
317
+ }
318
+
319
+ bwtInc->targetTextLength = textLength;
320
+ bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit);
321
+ if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) {
322
+ fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n");
323
+ exit(1);
324
+ }
325
+ bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD);
326
+
327
+ return bwtInc;
328
+
329
+ }
330
+ // for BWTIncConstruct()
331
+ static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank,
332
+ unsigned int* __restrict cumulativeCount, const unsigned int numChar)
333
+ {
334
+ unsigned int i, j;
335
+ unsigned int c, t;
336
+ unsigned int packedMask;
337
+ unsigned int rankIndex;
338
+ unsigned int lastWord, numCharInLastWord;
339
+
340
+ lastWord = (numChar - 1) / CHAR_PER_WORD;
341
+ numCharInLastWord = numChar - lastWord * CHAR_PER_WORD;
342
+
343
+ packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR);
344
+ rankIndex = numChar - 1;
345
+
346
+ t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR);
347
+ for (i=0; i<numCharInLastWord; i++) {
348
+ c = t & packedMask;
349
+ cumulativeCount[c+1]++;
350
+ rank[rankIndex] = c;
351
+ rankIndex--;
352
+ t >>= BIT_PER_CHAR;
353
+ }
354
+
355
+ for (i=lastWord; i--;) { // loop from lastWord - 1 to 0
356
+ t = packedText[i];
357
+ for (j=0; j<CHAR_PER_WORD; j++) {
358
+ c = t & packedMask;
359
+ cumulativeCount[c+1]++;
360
+ rank[rankIndex] = c;
361
+ rankIndex--;
362
+ t >>= BIT_PER_CHAR;
363
+ }
364
+ }
365
+
366
+ // Convert occurrence to cumulativeCount
367
+ cumulativeCount[2] += cumulativeCount[1];
368
+ cumulativeCount[3] += cumulativeCount[2];
369
+ cumulativeCount[4] += cumulativeCount[3];
370
+ }
371
+
372
+
373
+ static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index,
374
+ unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable)
375
+ {
376
+ static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
377
+ 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
378
+ 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
379
+ 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
380
+
381
+ unsigned int iteration, wordToCount, charToCount;
382
+ unsigned int i, j, c;
383
+ unsigned int sum;
384
+
385
+ occCount[0] = 0;
386
+ occCount[1] = 0;
387
+ occCount[2] = 0;
388
+ occCount[3] = 0;
389
+
390
+ iteration = index / 256;
391
+ wordToCount = (index - iteration * 256) / 16;
392
+ charToCount = index - iteration * 256 - wordToCount * 16;
393
+
394
+ for (i=0; i<iteration; i++) {
395
+
396
+ sum = 0;
397
+ for (j=0; j<16; j++) {
398
+ sum += dnaDecodeTable[*dna >> 16];
399
+ sum += dnaDecodeTable[*dna & 0x0000FFFF];
400
+ dna++;
401
+ }
402
+ if (!DNA_OCC_SUM_EXCEPTION(sum)) {
403
+ occCount[0] += sum & 0x000000FF; sum >>= 8;
404
+ occCount[1] += sum & 0x000000FF; sum >>= 8;
405
+ occCount[2] += sum & 0x000000FF; sum >>= 8;
406
+ occCount[3] += sum;
407
+ } else {
408
+ // only some or all of the 3 bits are on
409
+ // in reality, only one of the four cases are possible
410
+ if (sum == 0x00000100) {
411
+ occCount[0] += 256;
412
+ } else if (sum == 0x00010000) {
413
+ occCount[1] += 256;
414
+ } else if (sum == 0x01000000) {
415
+ occCount[2] += 256;
416
+ } else if (sum == 0x00000000) {
417
+ occCount[3] += 256;
418
+ } else {
419
+ fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n");
420
+ exit(1);
421
+ }
422
+ }
423
+
424
+ }
425
+
426
+ sum = 0;
427
+ for (j=0; j<wordToCount; j++) {
428
+ sum += dnaDecodeTable[*dna >> 16];
429
+ sum += dnaDecodeTable[*dna & 0x0000FFFF];
430
+ dna++;
431
+ }
432
+
433
+ if (charToCount > 0) {
434
+ c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c;
435
+ sum += dnaDecodeTable[c >> 16];
436
+ sum += dnaDecodeTable[c & 0xFFFF];
437
+ sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess
438
+ }
439
+
440
+ occCount[0] += sum & 0x000000FF; sum >>= 8;
441
+ occCount[1] += sum & 0x000000FF; sum >>= 8;
442
+ occCount[2] += sum & 0x000000FF; sum >>= 8;
443
+ occCount[3] += sum;
444
+ }
445
+
446
+ static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar,
447
+ const unsigned int *cumulativeCount, const unsigned int *packedShift) {
448
+
449
+ unsigned int i, c, r;
450
+ unsigned int previousRank, currentRank;
451
+ unsigned int wordIndex, charIndex;
452
+ unsigned int inverseSa0;
453
+
454
+ inverseSa0 = previousRank = relativeRank[0];
455
+
456
+ for (i=1; i<=numChar; i++) {
457
+ currentRank = relativeRank[i];
458
+ // previousRank > cumulativeCount[c] because $ is one of the char
459
+ c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2])
460
+ + (previousRank > cumulativeCount[3]);
461
+ // set bwt for currentRank
462
+ if (c > 0) {
463
+ // c <> 'a'
464
+ r = currentRank;
465
+ if (r > inverseSa0) {
466
+ // - 1 because $ at inverseSa0 is not encoded
467
+ r--;
468
+ }
469
+ wordIndex = r / CHAR_PER_WORD;
470
+ charIndex = r - wordIndex * CHAR_PER_WORD;
471
+ bwt[wordIndex] |= c << packedShift[charIndex];
472
+ }
473
+ previousRank = currentRank;
474
+ }
475
+ }
476
+
477
+ static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit,
478
+ const unsigned int character)
479
+ {
480
+ unsigned int occIndexMajor;
481
+
482
+ occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR;
483
+
484
+ if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) {
485
+ return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
486
+ (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16);
487
+
488
+ } else {
489
+ return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
490
+ (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF);
491
+ }
492
+ }
493
+
494
+
495
+ static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character,
496
+ const unsigned int* dnaDecodeTable)
497
+ {
498
+ static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
499
+ 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
500
+ 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
501
+ 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
502
+
503
+ unsigned int wordToCount, charToCount;
504
+ unsigned int i, c;
505
+ unsigned int sum = 0;
506
+
507
+ wordToCount = index / 16;
508
+ charToCount = index - wordToCount * 16;
509
+
510
+ for (i=0; i<wordToCount; i++) {
511
+ sum += dnaDecodeTable[dna[i] >> 16];
512
+ sum += dnaDecodeTable[dna[i] & 0x0000FFFF];
513
+ }
514
+
515
+ if (charToCount > 0) {
516
+ c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c;
517
+ sum += dnaDecodeTable[c >> 16];
518
+ sum += dnaDecodeTable[c & 0xFFFF];
519
+ sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess
520
+ }
521
+
522
+ return (sum >> (character * 8)) & 0x000000FF;
523
+
524
+ }
525
+
526
+ static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character,
527
+ const unsigned int* dnaDecodeTable)
528
+ {
529
+ static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F,
530
+ 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF,
531
+ 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF,
532
+ 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF };
533
+
534
+ unsigned int wordToCount, charToCount;
535
+ unsigned int i, c;
536
+ unsigned int sum = 0;
537
+
538
+ wordToCount = index / 16;
539
+ charToCount = index - wordToCount * 16;
540
+
541
+ dna -= wordToCount + 1;
542
+
543
+ if (charToCount > 0) {
544
+ c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c;
545
+ sum += dnaDecodeTable[c >> 16];
546
+ sum += dnaDecodeTable[c & 0xFFFF];
547
+ sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess
548
+ }
549
+
550
+ for (i=0; i<wordToCount; i++) {
551
+ dna++;
552
+ sum += dnaDecodeTable[*dna >> 16];
553
+ sum += dnaDecodeTable[*dna & 0x0000FFFF];
554
+ }
555
+
556
+ return (sum >> (character * 8)) & 0x000000FF;
557
+
558
+ }
559
+
560
+ unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) {
561
+
562
+ unsigned int occValue;
563
+ unsigned int occExplicitIndex, occIndex;
564
+
565
+ // $ is supposed to be positioned at inverseSa0 but it is not encoded
566
+ // therefore index is subtracted by 1 for adjustment
567
+ if (index > bwt->inverseSa0) {
568
+ index--;
569
+ }
570
+
571
+ occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding
572
+ occIndex = occExplicitIndex * OCC_INTERVAL;
573
+ occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character);
574
+
575
+ if (occIndex == index) {
576
+ return occValue;
577
+ }
578
+
579
+ if (occIndex < index) {
580
+ return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable);
581
+ } else {
582
+ return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable);
583
+ }
584
+
585
+ }
586
+
587
+ static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq,
588
+ const unsigned int *packedText, const unsigned int numChar,
589
+ const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration)
590
+ {
591
+ unsigned int saIndex;
592
+ unsigned int lastWord;
593
+ unsigned int packedMask;
594
+ unsigned int i, j;
595
+ unsigned int c, t;
596
+ unsigned int rankIndex;
597
+ unsigned int shift;
598
+ unsigned int seqIndexFromStart[ALPHABET_SIZE];
599
+ unsigned int seqIndexFromEnd[ALPHABET_SIZE];
600
+
601
+ for (i=0; i<ALPHABET_SIZE; i++) {
602
+ seqIndexFromStart[i] = cumulativeCount[i];
603
+ seqIndexFromEnd[i] = cumulativeCount[i+1] - 1;
604
+ }
605
+
606
+ shift = BITS_IN_WORD - BIT_PER_CHAR;
607
+ packedMask = ALL_ONE_MASK >> shift;
608
+ saIndex = bwt->inverseSa0;
609
+ rankIndex = numChar - 1;
610
+
611
+ lastWord = numChar / CHAR_PER_WORD;
612
+ for (i=lastWord; i--;) { // loop from lastWord - 1 to 0
613
+ t = packedText[i];
614
+ for (j=0; j<CHAR_PER_WORD; j++) {
615
+ c = t & packedMask;
616
+ saIndex = bwt->cumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1;
617
+ // A counting sort using the first character of suffix is done here
618
+ // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0
619
+ if (saIndex > bwt->inverseSa0) {
620
+ seq[seqIndexFromEnd[c]] = rankIndex;
621
+ absoluteRank[seqIndexFromEnd[c]] = saIndex;
622
+ seqIndexFromEnd[c]--;
623
+ } else {
624
+ seq[seqIndexFromStart[c]] = rankIndex;
625
+ absoluteRank[seqIndexFromStart[c]] = saIndex;
626
+ seqIndexFromStart[c]++;
627
+ }
628
+ rankIndex--;
629
+ t >>= BIT_PER_CHAR;
630
+ }
631
+ }
632
+
633
+ absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters
634
+ seq[seqIndexFromStart[firstCharInLastIteration]] = numChar;
635
+
636
+ return seqIndexFromStart[firstCharInLastIteration];
637
+ }
638
+
639
+ static void BWTIncSortKey(unsigned int* __restrict key, unsigned int* __restrict seq, const unsigned int numItem)
640
+ {
641
+ #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD
642
+
643
+ int lowIndex, highIndex, midIndex;
644
+ int lowPartitionIndex, highPartitionIndex;
645
+ int lowStack[32], highStack[32];
646
+ int stackDepth;
647
+ int i, j;
648
+ unsigned int tempSeq, tempKey;
649
+ int numberOfEqualKey;
650
+
651
+ if (numItem < 2) return;
652
+
653
+ stackDepth = 0;
654
+
655
+ lowIndex = 0;
656
+ highIndex = numItem - 1;
657
+
658
+ for (;;) {
659
+
660
+ for (;;) {
661
+
662
+ // Sort small array of data
663
+ if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays
664
+ for (i=lowIndex+1; i<=highIndex; i++) {
665
+ tempSeq = seq[i];
666
+ tempKey = key[i];
667
+ for (j = i; j > lowIndex && key[j-1] > tempKey; j--) {
668
+ seq[j] = seq[j-1];
669
+ key[j] = key[j-1];
670
+ }
671
+ if (j != i) {
672
+ seq[j] = tempSeq;
673
+ key[j] = tempKey;
674
+ }
675
+ }
676
+ break;
677
+ }
678
+
679
+ // Choose pivot as median of the lowest, middle, and highest data; sort the three data
680
+
681
+ midIndex = average(lowIndex, highIndex);
682
+ if (key[lowIndex] > key[midIndex]) {
683
+ tempSeq = seq[lowIndex];
684
+ tempKey = key[lowIndex];
685
+ seq[lowIndex] = seq[midIndex];
686
+ key[lowIndex] = key[midIndex];
687
+ seq[midIndex] = tempSeq;
688
+ key[midIndex] = tempKey;
689
+ }
690
+ if (key[lowIndex] > key[highIndex]) {
691
+ tempSeq = seq[lowIndex];
692
+ tempKey = key[lowIndex];
693
+ seq[lowIndex] = seq[highIndex];
694
+ key[lowIndex] = key[highIndex];
695
+ seq[highIndex] = tempSeq;
696
+ key[highIndex] = tempKey;
697
+ }
698
+ if (key[midIndex] > key[highIndex]) {
699
+ tempSeq = seq[midIndex];
700
+ tempKey = key[midIndex];
701
+ seq[midIndex] = seq[highIndex];
702
+ key[midIndex] = key[highIndex];
703
+ seq[highIndex] = tempSeq;
704
+ key[highIndex] = tempKey;
705
+ }
706
+
707
+ // Partition data
708
+
709
+ numberOfEqualKey = 0;
710
+
711
+ lowPartitionIndex = lowIndex + 1;
712
+ highPartitionIndex = highIndex - 1;
713
+
714
+ for (;;) {
715
+ while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) {
716
+ numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]);
717
+ lowPartitionIndex++;
718
+ }
719
+ while (lowPartitionIndex < highPartitionIndex) {
720
+ if (key[midIndex] >= key[highPartitionIndex]) {
721
+ numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]);
722
+ break;
723
+ }
724
+ highPartitionIndex--;
725
+ }
726
+ if (lowPartitionIndex >= highPartitionIndex) {
727
+ break;
728
+ }
729
+ tempSeq = seq[lowPartitionIndex];
730
+ tempKey = key[lowPartitionIndex];
731
+ seq[lowPartitionIndex] = seq[highPartitionIndex];
732
+ key[lowPartitionIndex] = key[highPartitionIndex];
733
+ seq[highPartitionIndex] = tempSeq;
734
+ key[highPartitionIndex] = tempKey;
735
+ if (highPartitionIndex == midIndex) {
736
+ // partition key has been moved
737
+ midIndex = lowPartitionIndex;
738
+ }
739
+ lowPartitionIndex++;
740
+ highPartitionIndex--;
741
+ }
742
+
743
+ // Adjust the partition index
744
+ highPartitionIndex = lowPartitionIndex;
745
+ lowPartitionIndex--;
746
+
747
+ // move the partition key to end of low partition
748
+ tempSeq = seq[midIndex];
749
+ tempKey = key[midIndex];
750
+ seq[midIndex] = seq[lowPartitionIndex];
751
+ key[midIndex] = key[lowPartitionIndex];
752
+ seq[lowPartitionIndex] = tempSeq;
753
+ key[lowPartitionIndex] = tempKey;
754
+
755
+ if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) {
756
+
757
+ // Many keys = partition key; separate the equal key data from the lower partition
758
+
759
+ midIndex = lowIndex;
760
+
761
+ for (;;) {
762
+ while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) {
763
+ midIndex++;
764
+ }
765
+ while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) {
766
+ lowPartitionIndex--;
767
+ }
768
+ if (midIndex >= lowPartitionIndex) {
769
+ break;
770
+ }
771
+ tempSeq = seq[midIndex];
772
+ tempKey = key[midIndex];
773
+ seq[midIndex] = seq[lowPartitionIndex - 1];
774
+ key[midIndex] = key[lowPartitionIndex - 1];
775
+ seq[lowPartitionIndex - 1] = tempSeq;
776
+ key[lowPartitionIndex - 1] = tempKey;
777
+ midIndex++;
778
+ lowPartitionIndex--;
779
+ }
780
+
781
+ }
782
+
783
+ if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) {
784
+ // put the larger partition to stack
785
+ lowStack[stackDepth] = lowIndex;
786
+ highStack[stackDepth] = lowPartitionIndex - 1;
787
+ stackDepth++;
788
+ // sort the smaller partition first
789
+ lowIndex = highPartitionIndex;
790
+ } else {
791
+ // put the larger partition to stack
792
+ lowStack[stackDepth] = highPartitionIndex;
793
+ highStack[stackDepth] = highIndex;
794
+ stackDepth++;
795
+ // sort the smaller partition first
796
+ if (lowPartitionIndex > lowIndex) {
797
+ highIndex = lowPartitionIndex - 1;
798
+ } else {
799
+ // all keys in the partition equals to the partition key
800
+ break;
801
+ }
802
+ }
803
+ continue;
804
+ }
805
+
806
+ // Pop a range from stack
807
+ if (stackDepth > 0) {
808
+ stackDepth--;
809
+ lowIndex = lowStack[stackDepth];
810
+ highIndex = highStack[stackDepth];
811
+ continue;
812
+ } else return;
813
+ }
814
+ }
815
+
816
+
817
+ static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigned int* __restrict seq,
818
+ unsigned int* __restrict relativeRank, const unsigned int numItem,
819
+ unsigned int oldInverseSa0, const unsigned int *cumulativeCount)
820
+ {
821
+ unsigned int i, c;
822
+ unsigned int s, r;
823
+ unsigned int lastRank, lastIndex;
824
+ unsigned int oldInverseSa0RelativeRank = 0;
825
+ unsigned int freq;
826
+
827
+ lastIndex = numItem;
828
+ lastRank = sortedRank[numItem];
829
+ if (lastRank > oldInverseSa0) {
830
+ sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt
831
+ }
832
+ s = seq[numItem];
833
+ relativeRank[s] = numItem;
834
+ if (lastRank == oldInverseSa0) {
835
+ oldInverseSa0RelativeRank = numItem;
836
+ oldInverseSa0++; // so that this segment of code is not run again
837
+ lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
838
+ }
839
+
840
+ c = ALPHABET_SIZE - 1;
841
+ freq = cumulativeCount[c];
842
+
843
+ for (i=numItem; i--;) { // from numItem - 1 to 0
844
+ r = sortedRank[i];
845
+ if (r > oldInverseSa0) {
846
+ sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt
847
+ }
848
+ s = seq[i];
849
+ if (i < freq) {
850
+ if (lastIndex >= freq) {
851
+ lastRank++; // to trigger the group across alphabet boundary to be split
852
+ }
853
+ c--;
854
+ freq = cumulativeCount[c];
855
+ }
856
+ if (r == lastRank) {
857
+ relativeRank[s] = lastIndex;
858
+ } else {
859
+ if (i == lastIndex - 1) {
860
+ if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) {
861
+ seq[lastIndex] = seq[lastIndex + 1] - 1;
862
+ } else {
863
+ seq[lastIndex] = (unsigned int)-1;
864
+ }
865
+ }
866
+ lastIndex = i;
867
+ lastRank = r;
868
+ relativeRank[s] = i;
869
+ if (r == oldInverseSa0) {
870
+ oldInverseSa0RelativeRank = i;
871
+ oldInverseSa0++; // so that this segment of code is not run again
872
+ lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
873
+ }
874
+ }
875
+ }
876
+
877
+ }
878
+
879
+ static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, const unsigned int numChar,
880
+ const unsigned int *cumulativeCount)
881
+ {
882
+ unsigned int i, c;
883
+ unsigned int previousRank, currentRank;
884
+
885
+ previousRank = relativeRank[0];
886
+
887
+ for (i=1; i<=numChar; i++) {
888
+ currentRank = relativeRank[i];
889
+ c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2])
890
+ + (previousRank >= cumulativeCount[3]);
891
+ seq[currentRank] = c;
892
+ previousRank = currentRank;
893
+ }
894
+ }
895
+
896
+ static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt,
897
+ unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt)
898
+ {
899
+ unsigned int bitsInWordMinusBitPerChar;
900
+ unsigned int leftShift, rightShift;
901
+ unsigned int o;
902
+ unsigned int oIndex, iIndex, mIndex;
903
+ unsigned int mWord, mChar, oWord, oChar;
904
+ unsigned int numInsert;
905
+
906
+ bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;
907
+
908
+ oIndex = 0;
909
+ iIndex = 0;
910
+ mIndex = 0;
911
+
912
+ mWord = 0;
913
+ mChar = 0;
914
+
915
+ mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration
916
+
917
+ while (oIndex < numOldBwt) {
918
+
919
+ // copy from insertBwt
920
+ while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) {
921
+ if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0
922
+ mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
923
+ mIndex++;
924
+ mChar++;
925
+ if (mChar == CHAR_PER_WORD) {
926
+ mChar = 0;
927
+ mWord++;
928
+ mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary
929
+ }
930
+ }
931
+ iIndex++;
932
+ }
933
+
934
+ // Copy from oldBwt to mergedBwt
935
+ if (iIndex <= numInsertBwt) {
936
+ o = sortedRank[iIndex];
937
+ } else {
938
+ o = numOldBwt;
939
+ }
940
+ numInsert = o - oIndex;
941
+
942
+ oWord = oIndex / CHAR_PER_WORD;
943
+ oChar = oIndex - oWord * CHAR_PER_WORD;
944
+ if (oChar > mChar) {
945
+ leftShift = (oChar - mChar) * BIT_PER_CHAR;
946
+ rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR;
947
+ mergedBwt[mWord] = mergedBwt[mWord]
948
+ | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR))
949
+ | (oldBwt[oWord+1] >> rightShift);
950
+ oIndex += min(numInsert, CHAR_PER_WORD - mChar);
951
+ while (o > oIndex) {
952
+ oWord++;
953
+ mWord++;
954
+ mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift);
955
+ oIndex += CHAR_PER_WORD;
956
+ }
957
+ } else if (oChar < mChar) {
958
+ rightShift = (mChar - oChar) * BIT_PER_CHAR;
959
+ leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR;
960
+ mergedBwt[mWord] = mergedBwt[mWord]
961
+ | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR));
962
+ oIndex += min(numInsert, CHAR_PER_WORD - mChar);
963
+ while (o > oIndex) {
964
+ oWord++;
965
+ mWord++;
966
+ mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift);
967
+ oIndex += CHAR_PER_WORD;
968
+ }
969
+ } else { // oChar == mChar
970
+ mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR);
971
+ oIndex += min(numInsert, CHAR_PER_WORD - mChar);
972
+ while (o > oIndex) {
973
+ oWord++;
974
+ mWord++;
975
+ mergedBwt[mWord] = oldBwt[oWord];
976
+ oIndex += CHAR_PER_WORD;
977
+ }
978
+ }
979
+ oIndex = o;
980
+ mIndex += numInsert;
981
+
982
+ // Clear the trailing garbage in mergedBwt
983
+ mWord = mIndex / CHAR_PER_WORD;
984
+ mChar = mIndex - mWord * CHAR_PER_WORD;
985
+ if (mChar == 0) {
986
+ mergedBwt[mWord] = 0;
987
+ } else {
988
+ mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR));
989
+ }
990
+
991
+ }
992
+
993
+ // copy from insertBwt
994
+ while (iIndex <= numInsertBwt) {
995
+ if (sortedRank[iIndex] != 0) {
996
+ mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
997
+ mIndex++;
998
+ mChar++;
999
+ if (mChar == CHAR_PER_WORD) {
1000
+ mChar = 0;
1001
+ mWord++;
1002
+ mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary
1003
+ }
1004
+ }
1005
+ iIndex++;
1006
+ }
1007
+ }
1008
+
1009
+ void BWTClearTrailingBwtCode(BWT *bwt)
1010
+ {
1011
+ unsigned int bwtResidentSizeInWord;
1012
+ unsigned int wordIndex, offset;
1013
+ unsigned int i;
1014
+
1015
+ bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength);
1016
+
1017
+ wordIndex = bwt->textLength / CHAR_PER_WORD;
1018
+ offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR;
1019
+ if (offset > 0) {
1020
+ bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset);
1021
+ } else {
1022
+ if (wordIndex < bwtResidentSizeInWord) {
1023
+ bwt->bwtCode[wordIndex] = 0;
1024
+ }
1025
+ }
1026
+
1027
+ for (i=wordIndex+1; i<bwtResidentSizeInWord; i++) {
1028
+ bwt->bwtCode[i] = 0;
1029
+ }
1030
+ }
1031
+
1032
+
1033
+ void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue,
1034
+ unsigned int* __restrict occValueMajor,
1035
+ const unsigned int textLength, const unsigned int* decodeTable)
1036
+ {
1037
+ unsigned int numberOfOccValueMajor, numberOfOccValue;
1038
+ unsigned int wordBetweenOccValue;
1039
+ unsigned int numberOfOccIntervalPerMajor;
1040
+ unsigned int c;
1041
+ unsigned int i, j;
1042
+ unsigned int occMajorIndex;
1043
+ unsigned int occIndex, bwtIndex;
1044
+ unsigned int sum;
1045
+ unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE];
1046
+
1047
+ wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD;
1048
+
1049
+ // Calculate occValue
1050
+ // [lh3] by default: OCC_INTERVAL_MAJOR=65536, OCC_INTERVAL=256
1051
+ numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
1052
+ numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
1053
+ numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor;
1054
+
1055
+ tempOccValue0[0] = 0;
1056
+ tempOccValue0[1] = 0;
1057
+ tempOccValue0[2] = 0;
1058
+ tempOccValue0[3] = 0;
1059
+ occValueMajor[0] = 0;
1060
+ occValueMajor[1] = 0;
1061
+ occValueMajor[2] = 0;
1062
+ occValueMajor[3] = 0;
1063
+
1064
+ occIndex = 0;
1065
+ bwtIndex = 0;
1066
+ for (occMajorIndex=1; occMajorIndex<numberOfOccValueMajor; occMajorIndex++) {
1067
+
1068
+ for (i=0; i<numberOfOccIntervalPerMajor/2; i++) {
1069
+
1070
+ sum = 0;
1071
+ tempOccValue1[0] = tempOccValue0[0];
1072
+ tempOccValue1[1] = tempOccValue0[1];
1073
+ tempOccValue1[2] = tempOccValue0[2];
1074
+ tempOccValue1[3] = tempOccValue0[3];
1075
+
1076
+ for (j=0; j<wordBetweenOccValue; j++) {
1077
+ c = bwt[bwtIndex];
1078
+ sum += decodeTable[c >> 16];
1079
+ sum += decodeTable[c & 0x0000FFFF];
1080
+ bwtIndex++;
1081
+ }
1082
+ if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1083
+ tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
1084
+ tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
1085
+ tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
1086
+ tempOccValue1[3] += sum;
1087
+ } else {
1088
+ if (sum == 0x00000100) {
1089
+ tempOccValue1[0] += 256;
1090
+ } else if (sum == 0x00010000) {
1091
+ tempOccValue1[1] += 256;
1092
+ } else if (sum == 0x01000000) {
1093
+ tempOccValue1[2] += 256;
1094
+ } else {
1095
+ tempOccValue1[3] += 256;
1096
+ }
1097
+ }
1098
+ occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
1099
+ occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
1100
+ occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
1101
+ occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
1102
+ tempOccValue0[0] = tempOccValue1[0];
1103
+ tempOccValue0[1] = tempOccValue1[1];
1104
+ tempOccValue0[2] = tempOccValue1[2];
1105
+ tempOccValue0[3] = tempOccValue1[3];
1106
+ sum = 0;
1107
+
1108
+ occIndex++;
1109
+
1110
+ for (j=0; j<wordBetweenOccValue; j++) {
1111
+ c = bwt[bwtIndex];
1112
+ sum += decodeTable[c >> 16];
1113
+ sum += decodeTable[c & 0x0000FFFF];
1114
+ bwtIndex++;
1115
+ }
1116
+ if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1117
+ tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8;
1118
+ tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8;
1119
+ tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8;
1120
+ tempOccValue0[3] += sum;
1121
+ } else {
1122
+ if (sum == 0x00000100) {
1123
+ tempOccValue0[0] += 256;
1124
+ } else if (sum == 0x00010000) {
1125
+ tempOccValue0[1] += 256;
1126
+ } else if (sum == 0x01000000) {
1127
+ tempOccValue0[2] += 256;
1128
+ } else {
1129
+ tempOccValue0[3] += 256;
1130
+ }
1131
+ }
1132
+ }
1133
+
1134
+ occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0];
1135
+ occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1];
1136
+ occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2];
1137
+ occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3];
1138
+ tempOccValue0[0] = 0;
1139
+ tempOccValue0[1] = 0;
1140
+ tempOccValue0[2] = 0;
1141
+ tempOccValue0[3] = 0;
1142
+
1143
+ }
1144
+
1145
+ while (occIndex < (numberOfOccValue-1)/2) {
1146
+ sum = 0;
1147
+ tempOccValue1[0] = tempOccValue0[0];
1148
+ tempOccValue1[1] = tempOccValue0[1];
1149
+ tempOccValue1[2] = tempOccValue0[2];
1150
+ tempOccValue1[3] = tempOccValue0[3];
1151
+ for (j=0; j<wordBetweenOccValue; j++) {
1152
+ c = bwt[bwtIndex];
1153
+ sum += decodeTable[c >> 16];
1154
+ sum += decodeTable[c & 0x0000FFFF];
1155
+ bwtIndex++;
1156
+ }
1157
+ if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1158
+ tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
1159
+ tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
1160
+ tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
1161
+ tempOccValue1[3] += sum;
1162
+ } else {
1163
+ if (sum == 0x00000100) {
1164
+ tempOccValue1[0] += 256;
1165
+ } else if (sum == 0x00010000) {
1166
+ tempOccValue1[1] += 256;
1167
+ } else if (sum == 0x01000000) {
1168
+ tempOccValue1[2] += 256;
1169
+ } else {
1170
+ tempOccValue1[3] += 256;
1171
+ }
1172
+ }
1173
+ occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
1174
+ occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
1175
+ occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
1176
+ occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
1177
+ tempOccValue0[0] = tempOccValue1[0];
1178
+ tempOccValue0[1] = tempOccValue1[1];
1179
+ tempOccValue0[2] = tempOccValue1[2];
1180
+ tempOccValue0[3] = tempOccValue1[3];
1181
+ sum = 0;
1182
+ occIndex++;
1183
+
1184
+ for (j=0; j<wordBetweenOccValue; j++) {
1185
+ c = bwt[bwtIndex];
1186
+ sum += decodeTable[c >> 16];
1187
+ sum += decodeTable[c & 0x0000FFFF];
1188
+ bwtIndex++;
1189
+ }
1190
+ if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1191
+ tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8;
1192
+ tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8;
1193
+ tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8;
1194
+ tempOccValue0[3] += sum;
1195
+ } else {
1196
+ if (sum == 0x00000100) {
1197
+ tempOccValue0[0] += 256;
1198
+ } else if (sum == 0x00010000) {
1199
+ tempOccValue0[1] += 256;
1200
+ } else if (sum == 0x01000000) {
1201
+ tempOccValue0[2] += 256;
1202
+ } else {
1203
+ tempOccValue0[3] += 256;
1204
+ }
1205
+ }
1206
+ }
1207
+
1208
+ sum = 0;
1209
+ tempOccValue1[0] = tempOccValue0[0];
1210
+ tempOccValue1[1] = tempOccValue0[1];
1211
+ tempOccValue1[2] = tempOccValue0[2];
1212
+ tempOccValue1[3] = tempOccValue0[3];
1213
+
1214
+ if (occIndex * 2 < numberOfOccValue - 1) {
1215
+ for (j=0; j<wordBetweenOccValue; j++) {
1216
+ c = bwt[bwtIndex];
1217
+ sum += decodeTable[c >> 16];
1218
+ sum += decodeTable[c & 0x0000FFFF];
1219
+ bwtIndex++;
1220
+ }
1221
+ if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1222
+ tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
1223
+ tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
1224
+ tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
1225
+ tempOccValue1[3] += sum;
1226
+ } else {
1227
+ if (sum == 0x00000100) {
1228
+ tempOccValue1[0] += 256;
1229
+ } else if (sum == 0x00010000) {
1230
+ tempOccValue1[1] += 256;
1231
+ } else if (sum == 0x01000000) {
1232
+ tempOccValue1[2] += 256;
1233
+ } else {
1234
+ tempOccValue1[3] += 256;
1235
+ }
1236
+ }
1237
+ }
1238
+
1239
+ occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
1240
+ occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
1241
+ occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
1242
+ occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
1243
+
1244
+ }
1245
+
1246
+ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
1247
+ {
1248
+ unsigned int i;
1249
+ unsigned int mergedBwtSizeInWord, mergedOccSizeInWord;
1250
+ unsigned int firstCharInThisIteration;
1251
+
1252
+ unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt;
1253
+ unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0;
1254
+
1255
+ #ifdef DEBUG
1256
+ if (numChar > bwtInc->buildSize) {
1257
+ fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n");
1258
+ exit(1);
1259
+ }
1260
+ #endif
1261
+
1262
+ mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar);
1263
+ mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar);
1264
+
1265
+ initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
1266
+
1267
+ if (bwtInc->bwt->textLength == 0) { // Initial build
1268
+
1269
+ // Set address
1270
+ seq = bwtInc->workingMemory;
1271
+ relativeRank = seq + bwtInc->buildSize + 1;
1272
+ mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place
1273
+
1274
+ BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar);
1275
+
1276
+ firstCharInThisIteration = relativeRank[0];
1277
+ relativeRank[numChar] = 0;
1278
+
1279
+ // Sort suffix
1280
+ QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE);
1281
+ newInverseSa0 = relativeRank[0];
1282
+
1283
+ // Clear BWT area
1284
+ initializeVAL(insertBwt, mergedBwtSizeInWord, 0);
1285
+
1286
+ // Build BWT
1287
+ BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift);
1288
+
1289
+ // so that the cumulativeCount is not deducted
1290
+ bwtInc->firstCharInLastIteration = ALPHABET_SIZE;
1291
+
1292
+ } else { // Incremental build
1293
+ // Set address
1294
+ sortedRank = bwtInc->workingMemory;
1295
+ seq = sortedRank + bwtInc->buildSize + 1;
1296
+ insertBwt = seq;
1297
+ relativeRank = seq + bwtInc->buildSize + 1;
1298
+
1299
+ // Store the first character of this iteration
1300
+ firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR);
1301
+
1302
+ // Count occurrence of input text
1303
+ ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable);
1304
+ // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration
1305
+ bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++;
1306
+ bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1];
1307
+ bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2];
1308
+ bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3];
1309
+
1310
+ // Get rank of new suffix among processed suffix
1311
+ // The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group
1312
+ oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText,
1313
+ numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration);
1314
+
1315
+ // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group)
1316
+ for (i=0; i<ALPHABET_SIZE; i++) {
1317
+ if (bwtInc->cumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank ||
1318
+ bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) {
1319
+ BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]);
1320
+ } else {
1321
+ if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) {
1322
+ BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]);
1323
+ }
1324
+ if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) {
1325
+ BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1);
1326
+ }
1327
+ }
1328
+ }
1329
+
1330
+ // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt
1331
+ // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups)
1332
+ BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild);
1333
+ #ifdef DEBUG
1334
+ if (relativeRank[numChar] != oldInverseSa0RelativeRank) {
1335
+ fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n");
1336
+ exit(1);
1337
+ }
1338
+ #endif
1339
+
1340
+ // Sort suffix
1341
+ QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE);
1342
+
1343
+ newInverseSa0RelativeRank = relativeRank[0];
1344
+ newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank;
1345
+
1346
+ sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt
1347
+
1348
+ // Build BWT
1349
+ BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild);
1350
+
1351
+ // Merge BWT
1352
+ mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord
1353
+ - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR;
1354
+ // minus numberOfIteration * occInterval to create a buffer for merging
1355
+ BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar);
1356
+
1357
+ }
1358
+
1359
+ // Build auxiliary structure and update info and pointers in BWT
1360
+ bwtInc->bwt->textLength += numChar;
1361
+ bwtInc->bwt->bwtCode = mergedBwt;
1362
+ bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord;
1363
+ bwtInc->bwt->occSizeInWord = mergedOccSizeInWord;
1364
+ if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) {
1365
+ fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n");
1366
+ exit(1);
1367
+ }
1368
+
1369
+ bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord;
1370
+
1371
+ BWTClearTrailingBwtCode(bwtInc->bwt);
1372
+ BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor,
1373
+ bwtInc->bwt->textLength, bwtInc->bwt->decodeTable);
1374
+
1375
+ bwtInc->bwt->inverseSa0 = newInverseSa0;
1376
+
1377
+ bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0);
1378
+ bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1);
1379
+ bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2);
1380
+ bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3);
1381
+
1382
+ bwtInc->firstCharInLastIteration = firstCharInThisIteration;
1383
+
1384
+ // Set build size and text address for the next build
1385
+ BWTIncSetBuildSizeAndTextAddr(bwtInc);
1386
+ bwtInc->numberOfIterationDone++;
1387
+
1388
+ }
1389
+
1390
+ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit,
1391
+ const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
1392
+ {
1393
+
1394
+ FILE *packedFile;
1395
+ unsigned int packedFileLen;
1396
+ unsigned int totalTextLength;
1397
+ unsigned int textToLoad, textSizeInByte;
1398
+ unsigned int processedTextLength;
1399
+ unsigned char lastByteLength;
1400
+
1401
+ BWTInc *bwtInc;
1402
+
1403
+ packedFile = (FILE*)fopen(inputFileName, "rb");
1404
+
1405
+ if (packedFile == NULL) {
1406
+ fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n");
1407
+ exit(1);
1408
+ }
1409
+
1410
+ fseek(packedFile, -1, SEEK_END);
1411
+ packedFileLen = ftell(packedFile);
1412
+ if ((int)packedFileLen < 0) {
1413
+ fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n");
1414
+ exit(1);
1415
+ }
1416
+ fread(&lastByteLength, sizeof(unsigned char), 1, packedFile);
1417
+ totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength);
1418
+
1419
+ bwtInc = BWTIncCreate(totalTextLength, targetNBit, initialMaxBuildSize, incMaxBuildSize);
1420
+
1421
+ BWTIncSetBuildSizeAndTextAddr(bwtInc);
1422
+
1423
+ if (bwtInc->buildSize > totalTextLength) {
1424
+ textToLoad = totalTextLength;
1425
+ } else {
1426
+ textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD);
1427
+ }
1428
+ textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte
1429
+
1430
+ fseek(packedFile, -2, SEEK_CUR);
1431
+ fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
1432
+ fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile);
1433
+ fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR);
1434
+
1435
+ ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
1436
+ BWTIncConstruct(bwtInc, textToLoad);
1437
+
1438
+ processedTextLength = textToLoad;
1439
+
1440
+ while (processedTextLength < totalTextLength) {
1441
+ textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
1442
+ if (textToLoad > totalTextLength - processedTextLength) {
1443
+ textToLoad = totalTextLength - processedTextLength;
1444
+ }
1445
+ textSizeInByte = textToLoad / CHAR_PER_BYTE;
1446
+ fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
1447
+ fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile);
1448
+ fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
1449
+ ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
1450
+ BWTIncConstruct(bwtInc, textToLoad);
1451
+ processedTextLength += textToLoad;
1452
+ if (bwtInc->numberOfIterationDone % 10 == 0) {
1453
+ printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n",
1454
+ bwtInc->numberOfIterationDone, processedTextLength);
1455
+ }
1456
+ }
1457
+ return bwtInc;
1458
+ }
1459
+
1460
+ void BWTFree(BWT *bwt)
1461
+ {
1462
+ if (bwt == 0) return;
1463
+ free(bwt->cumulativeFreq);
1464
+ free(bwt->bwtCode);
1465
+ free(bwt->occValue);
1466
+ free(bwt->occValueMajor);
1467
+ free(bwt->saValue);
1468
+ free(bwt->inverseSa);
1469
+ free(bwt->decodeTable);
1470
+ free(bwt->saIndexRange);
1471
+ free(bwt->saValueOnBoundary);
1472
+ free(bwt);
1473
+ }
1474
+
1475
+ void BWTIncFree(BWTInc *bwtInc)
1476
+ {
1477
+ if (bwtInc == 0) return;
1478
+ free(bwtInc->bwt);
1479
+ free(bwtInc->workingMemory);
1480
+ free(bwtInc);
1481
+ }
1482
+
1483
+ static unsigned int BWTFileSizeInWord(const unsigned int numChar)
1484
+ {
1485
+ // The $ in BWT at the position of inverseSa0 is not encoded
1486
+ return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
1487
+ }
1488
+
1489
+ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName)
1490
+ {
1491
+ FILE *bwtFile;
1492
+ /* FILE *occValueFile; */
1493
+ unsigned int bwtLength;
1494
+
1495
+ bwtFile = (FILE*)fopen(bwtFileName, "wb");
1496
+ if (bwtFile == NULL) {
1497
+ fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n");
1498
+ exit(1);
1499
+ }
1500
+
1501
+ fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile);
1502
+ fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile);
1503
+ bwtLength = BWTFileSizeInWord(bwt->textLength);
1504
+ fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile);
1505
+ fclose(bwtFile);
1506
+ /*
1507
+ occValueFile = (FILE*)fopen(occValueFileName, "wb");
1508
+ if (occValueFile == NULL) {
1509
+ fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open occ value file!\n");
1510
+ exit(1);
1511
+ }
1512
+
1513
+ fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, occValueFile);
1514
+ fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, occValueFile);
1515
+ fwrite(bwt->occValue, sizeof(unsigned int), bwt->occSizeInWord, occValueFile);
1516
+ fwrite(bwt->occValueMajor, sizeof(unsigned int), bwt->occMajorSizeInWord, occValueFile);
1517
+ fclose(occValueFile);
1518
+ */
1519
+ }
1520
+
1521
+ void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
1522
+ {
1523
+ BWTInc *bwtInc;
1524
+ bwtInc = BWTIncConstructFromPacked(fn_pac, 2.5, 10000000, 10000000);
1525
+ printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
1526
+ BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
1527
+ BWTIncFree(bwtInc);
1528
+ }
1529
+
1530
+ int bwt_bwtgen_main(int argc, char *argv[])
1531
+ {
1532
+ if (argc < 3) {
1533
+ fprintf(stderr, "Usage: bwtgen <in.pac> <out.bwt>\n");
1534
+ return 1;
1535
+ }
1536
+ bwt_bwtgen(argv[1], argv[2]);
1537
+ return 0;
1538
+ }
1539
+
1540
+ #ifdef MAIN_BWT_GEN
1541
+
1542
+ int main(int argc, char *argv[])
1543
+ {
1544
+ return bwt_bwtgen_main(argc, argv);
1545
+ }
1546
+
1547
+ #endif