bio-bwa 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
@@ -0,0 +1,40 @@
1
+ /* QSufSort.h
2
+
3
+ Header file for QSufSort.c
4
+
5
+ This file contains an implementation of the algorithm presented in "Faster
6
+ Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
7
+ Sadakane (sada@is.s.u-tokyo.ac.jp).
8
+
9
+ This software may be used freely for any purpose. However, when distributed,
10
+ the original source must be clearly stated, and, when the source code is
11
+ distributed, the copyright notice must be retained and any alterations in
12
+ the code must be clearly marked. No warranty is given regarding the quality
13
+ of this software.
14
+
15
+ Modified by Wong Chi-Kwong, 2004
16
+
17
+ Changes summary: - Used long variable and function names
18
+ - Removed global variables
19
+ - Replace pointer references with array references
20
+ - Used insertion sort in place of selection sort and increased insertion sort threshold
21
+ - Reconstructing suffix array from inverse becomes an option
22
+ - Add handling where end-of-text symbol is not necessarily < all characters
23
+ - Removed codes for supporting alphabet size > number of characters
24
+
25
+ No warranty is given regarding the quality of the modifications.
26
+
27
+ */
28
+
29
#ifndef __QSUFSORT_H__
#define __QSUFSORT_H__

/* Element of V found h positions past the index stored in I[p].
   Every argument and the whole expansion are parenthesized so that
   expression arguments (for example a conditional offset) expand with
   the intended precedence. */
#define KEY(V, I, p, h)                 ( (V)[ (I)[(p)] + (h) ] )

/* NOTE(review): presumably the sub-array size below which insertion sort
   is used -- confirm against QSufSort.c. */
#define INSERT_SORT_NUM_ITEM    16

/* Suffix sort over the arrays V and I for a text of numChar symbols.
   The implementation lives outside this header. */
void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
                        const int smallestInputSymbol, const int skipTransform);

/* Fills I from V for numChar symbols; per the header notes above,
   reconstructing the suffix array from its inverse is optional. */
void QSufSortGenerateSaFromInverse(const int *V, int* __restrict I, const int numChar);


#endif
@@ -0,0 +1,1547 @@
1
+ /*
2
+
3
+ BWTConstruct.c BWT-Index Construction
4
+
5
+ This module constructs BWT and auxiliary data structures.
6
+
7
+ Copyright (C) 2004, Wong Chi Kwong.
8
+
9
+ This program is free software; you can redistribute it and/or
10
+ modify it under the terms of the GNU General Public License
11
+ as published by the Free Software Foundation; either version 2
12
+ of the License, or (at your option) any later version.
13
+
14
+ This program is distributed in the hope that it will be useful,
15
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ GNU General Public License for more details.
18
+
19
+ You should have received a copy of the GNU General Public License
20
+ along with this program; if not, write to the Free Software
21
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22
+
23
+ */
24
+
25
+ #include <stdio.h>
26
+ #include <stdlib.h>
27
+ #include <string.h>
28
+ #include "bwt_gen.h"
29
+ #include "QSufSort.h"
30
+
31
+ static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar,
32
+ unsigned int lastByteLength)
33
+ {
34
+ if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) {
35
+ fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n");
36
+ exit(1);
37
+ }
38
+ return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength;
39
+ }
40
+
41
/* Store initValue into each of the first `length` entries of startAddr. */
static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue)
{
    unsigned int *cursor = startAddr;
    unsigned int remaining = length;

    while (remaining > 0) {
        *cursor = initValue;
        cursor++;
        remaining--;
    }
}
46
+
47
+ static void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable)
48
+ {
49
+ unsigned int i, j, c, t;
50
+
51
+ for (i=0; i<DNA_OCC_CNT_TABLE_SIZE_IN_WORD; i++) {
52
+ dnaDecodeTable[i] = 0;
53
+ c = i;
54
+ for (j=0; j<8; j++) {
55
+ t = c & 0x00000003;
56
+ dnaDecodeTable[i] += 1 << (t * 8);
57
+ c >>= 2;
58
+ }
59
+ }
60
+
61
+ }
62
+ // for BWTIncCreate()
63
+ static unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar)
64
+ {
65
+ unsigned int numOfOccValue;
66
+ unsigned int numOfOccIntervalPerMajor;
67
+ numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
68
+ numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
69
+ return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE;
70
+ }
71
+ // for BWTIncCreate()
72
+ static unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar)
73
+ {
74
+ unsigned int numOfOccValue;
75
+ numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
76
+ return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE;
77
+ }
78
+ // for BWTIncCreate()
79
+ static unsigned int BWTResidentSizeInWord(const unsigned int numChar) {
80
+
81
+ unsigned int numCharRoundUpToOccInterval;
82
+
83
+ // The $ in BWT at the position of inverseSa0 is not encoded
84
+ numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL;
85
+
86
+ return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
87
+
88
+ }
89
+
90
+ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc)
91
+ {
92
+ unsigned int maxBuildSize;
93
+
94
+ if (bwtInc->bwt->textLength == 0) {
95
+ // initial build
96
+ // Minus 2 because n+1 entries of seq and rank needed for n char
97
+ maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD)
98
+ / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD;
99
+ if (bwtInc->initialMaxBuildSize > 0) {
100
+ bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize);
101
+ } else {
102
+ bwtInc->buildSize = maxBuildSize;
103
+ }
104
+ } else {
105
+ // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char
106
+ // Minus numberOfIterationDone because bwt slightly shift to left in each iteration
107
+ maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3
108
+ - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR)
109
+ / 3;
110
+ if (maxBuildSize < CHAR_PER_WORD) {
111
+ fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
112
+ exit(1);
113
+ }
114
+ if (bwtInc->incMaxBuildSize > 0) {
115
+ bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize);
116
+ } else {
117
+ bwtInc->buildSize = maxBuildSize;
118
+ }
119
+ if (bwtInc->buildSize < CHAR_PER_WORD) {
120
+ bwtInc->buildSize = CHAR_PER_WORD;
121
+ }
122
+ }
123
+
124
+ if (bwtInc->buildSize < CHAR_PER_WORD) {
125
+ fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
126
+ exit(1);
127
+ }
128
+
129
+ bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
130
+
131
+ bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1);
132
+ bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1);
133
+
134
+ }
135
+
136
// for ceilLog2()
/*
 * Count the zero bits above the most significant set bit of a 32-bit value.
 * Returns 32 when input is 0.
 *
 * The count is found by halving the search window: whenever the upper part
 * of the window is empty, its width is added to the count and the value is
 * shifted up so the next test looks at the following bits.
 */
unsigned int leadingZero(const unsigned int input)
{
    unsigned int value = input;
    unsigned int zeroCount = 0;

    if (value == 0) {
        return 32;
    }
    if ((value & 0xFFFF0000) == 0) {
        zeroCount += 16;
        value <<= 16;
    }
    if ((value & 0xFF000000) == 0) {
        zeroCount += 8;
        value <<= 8;
    }
    if ((value & 0xF0000000) == 0) {
        zeroCount += 4;
        value <<= 4;
    }
    if ((value & 0xC0000000) == 0) {
        zeroCount += 2;
        value <<= 2;
    }
    if ((value & 0x80000000) == 0) {
        zeroCount += 1;
    }
    return zeroCount;

}
165
+ // for BitPerBytePackedChar()
166
+ static unsigned int ceilLog2(const unsigned int input)
167
+ {
168
+ if (input <= 1) return 0;
169
+ return BITS_IN_WORD - leadingZero(input - 1);
170
+
171
+ }
172
+ // for ConvertBytePackedToWordPacked()
173
+ static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize)
174
+ {
175
+ unsigned int bitPerChar;
176
+ bitPerChar = ceilLog2(alphabetSize);
177
+ // Return the largest number of bit that does not affect packing efficiency
178
+ if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar)
179
+ bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar);
180
+ return bitPerChar;
181
+ }
182
// for ConvertBytePackedToWordPacked()
/* Bits used per character when alphabetSize symbols are packed into words:
   simply ceilLog2 of the alphabet size. */
static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize)
{
    const unsigned int bitsNeeded = ceilLog2(alphabetSize);

    return bitsNeeded;
}
187
+
188
+ static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize,
189
+ const unsigned int textLength)
190
+ {
191
+ unsigned int i, j, k;
192
+ unsigned int c;
193
+ unsigned int bitPerBytePackedChar;
194
+ unsigned int bitPerWordPackedChar;
195
+ unsigned int charPerWord;
196
+ unsigned int charPerByte;
197
+ unsigned int bytePerIteration;
198
+ unsigned int byteProcessed = 0;
199
+ unsigned int wordProcessed = 0;
200
+ unsigned int mask, shift;
201
+
202
+ unsigned int buffer[BITS_IN_WORD];
203
+
204
+ bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize);
205
+ bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize);
206
+ charPerByte = BITS_IN_BYTE / bitPerBytePackedChar;
207
+ charPerWord = BITS_IN_WORD / bitPerWordPackedChar;
208
+
209
+ bytePerIteration = charPerWord / charPerByte;
210
+ mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar);
211
+ shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar;
212
+
213
+ while ((wordProcessed + 1) * charPerWord < textLength) {
214
+
215
+ k = 0;
216
+ for (i=0; i<bytePerIteration; i++) {
217
+ c = (unsigned int)input[byteProcessed] << shift;
218
+ for (j=0; j<charPerByte; j++) {
219
+ buffer[k] = c & mask;
220
+ c <<= bitPerBytePackedChar;
221
+ k++;
222
+ }
223
+ byteProcessed++;
224
+ }
225
+
226
+ c = 0;
227
+ for (i=0; i<charPerWord; i++) {
228
+ c |= buffer[i] >> bitPerWordPackedChar * i;
229
+ }
230
+ output[wordProcessed] = c;
231
+ wordProcessed++;
232
+
233
+ }
234
+
235
+ k = 0;
236
+ for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) {
237
+ c = (unsigned int)input[byteProcessed] << shift;
238
+ for (j=0; j<charPerByte; j++) {
239
+ buffer[k] = c & mask;
240
+ c <<= bitPerBytePackedChar;
241
+ k++;
242
+ }
243
+ byteProcessed++;
244
+ }
245
+
246
+ c = 0;
247
+ for (i=0; i<textLength - wordProcessed * charPerWord; i++) {
248
+ c |= buffer[i] >> bitPerWordPackedChar * i;
249
+ }
250
+ output[wordProcessed] = c;
251
+ }
252
+
253
+ BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable)
254
+ {
255
+ BWT *bwt;
256
+
257
+ bwt = (BWT*)calloc(1, sizeof(BWT));
258
+
259
+ bwt->textLength = 0;
260
+ bwt->inverseSa = 0;
261
+
262
+ bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*));
263
+ initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0);
264
+
265
+ bwt->bwtSizeInWord = 0;
266
+ bwt->saValueOnBoundary = NULL;
267
+
268
+ // Generate decode tables
269
+ if (decodeTable == NULL) {
270
+ bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int));
271
+ GenerateDNAOccCountTable(bwt->decodeTable);
272
+ } else {
273
+ bwt->decodeTable = decodeTable;
274
+ }
275
+
276
+ bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength);
277
+ bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int));
278
+
279
+ bwt->occSizeInWord = 0;
280
+ bwt->occValue = NULL;
281
+
282
+ bwt->saInterval = ALL_ONE_MASK;
283
+ bwt->saValueSize = 0;
284
+ bwt->saValue = NULL;
285
+
286
+ bwt->inverseSaInterval = ALL_ONE_MASK;
287
+ bwt->inverseSaSize = 0;
288
+ bwt->inverseSa = NULL;
289
+
290
+ return bwt;
291
+ }
292
+
293
+ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit,
294
+ const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
295
+ {
296
+ BWTInc *bwtInc;
297
+ unsigned int i;
298
+
299
+ if (targetNBit == 0) {
300
+ fprintf(stderr, "BWTIncCreate() : targetNBit = 0!\n");
301
+ exit(1);
302
+ }
303
+
304
+ bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc));
305
+ bwtInc->numberOfIterationDone = 0;
306
+ bwtInc->bwt = BWTCreate(textLength, NULL);
307
+ bwtInc->initialMaxBuildSize = initialMaxBuildSize;
308
+ bwtInc->incMaxBuildSize = incMaxBuildSize;
309
+ bwtInc->targetNBit = targetNBit;
310
+ bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int));
311
+ initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
312
+
313
+ // Build frequently accessed data
314
+ bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int));
315
+ for (i=0; i<CHAR_PER_WORD; i++) {
316
+ bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR;
317
+ }
318
+
319
+ bwtInc->targetTextLength = textLength;
320
+ bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit);
321
+ if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) {
322
+ fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n");
323
+ exit(1);
324
+ }
325
+ bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD);
326
+
327
+ return bwtInc;
328
+
329
+ }
330
+ // for BWTIncConstruct()
331
+ static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank,
332
+ unsigned int* __restrict cumulativeCount, const unsigned int numChar)
333
+ {
334
+ unsigned int i, j;
335
+ unsigned int c, t;
336
+ unsigned int packedMask;
337
+ unsigned int rankIndex;
338
+ unsigned int lastWord, numCharInLastWord;
339
+
340
+ lastWord = (numChar - 1) / CHAR_PER_WORD;
341
+ numCharInLastWord = numChar - lastWord * CHAR_PER_WORD;
342
+
343
+ packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR);
344
+ rankIndex = numChar - 1;
345
+
346
+ t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR);
347
+ for (i=0; i<numCharInLastWord; i++) {
348
+ c = t & packedMask;
349
+ cumulativeCount[c+1]++;
350
+ rank[rankIndex] = c;
351
+ rankIndex--;
352
+ t >>= BIT_PER_CHAR;
353
+ }
354
+
355
+ for (i=lastWord; i--;) { // loop from lastWord - 1 to 0
356
+ t = packedText[i];
357
+ for (j=0; j<CHAR_PER_WORD; j++) {
358
+ c = t & packedMask;
359
+ cumulativeCount[c+1]++;
360
+ rank[rankIndex] = c;
361
+ rankIndex--;
362
+ t >>= BIT_PER_CHAR;
363
+ }
364
+ }
365
+
366
+ // Convert occurrence to cumulativeCount
367
+ cumulativeCount[2] += cumulativeCount[1];
368
+ cumulativeCount[3] += cumulativeCount[2];
369
+ cumulativeCount[4] += cumulativeCount[3];
370
+ }
371
+
372
+
373
/*
 * Count, for each of the four characters, its occurrences among the first
 * `index` characters of the packed text `dna`; results go to occCount[0..3].
 *
 * The text is processed in blocks of 256 characters (16 words) so that the
 * byte-wide counters taken from dnaDecodeTable can be summed in one word;
 * a block made of a single character is handled through
 * DNA_OCC_SUM_EXCEPTION. The remaining whole words and the final partial
 * word are counted afterwards.
 */
static void ForwardDNAAllOccCountNoLimit(const unsigned int*  dna, const unsigned int index,
                                         unsigned int* __restrict occCount, const unsigned int*  dnaDecodeTable)
{
    static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
                                                        0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
                                                        0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
                                                        0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };

    const unsigned int fullBlockCount = index / 256;
    const unsigned int tailCharCount = index - fullBlockCount * 256;
    const unsigned int tailWordCount = tailCharCount / 16;
    const unsigned int partialCharCount = tailCharCount - tailWordCount * 16;
    const unsigned int *word = dna;
    unsigned int block, w, kept;
    unsigned int packedSum;

    for (w = 0; w < 4; w++) {
        occCount[w] = 0;
    }

    for (block = 0; block < fullBlockCount; block++) {

        packedSum = 0;
        for (w = 0; w < 16; w++, word++) {
            packedSum += dnaDecodeTable[*word >> 16];
            packedSum += dnaDecodeTable[*word & 0x0000FFFF];
        }

        if (DNA_OCC_SUM_EXCEPTION(packedSum)) {
            // only some or all of the 3 bits are on
            // in reality, only one of the four cases are possible
            switch (packedSum) {
            case 0x00000100:
                occCount[0] += 256;
                break;
            case 0x00010000:
                occCount[1] += 256;
                break;
            case 0x01000000:
                occCount[2] += 256;
                break;
            case 0x00000000:
                occCount[3] += 256;
                break;
            default:
                fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n");
                exit(1);
            }
        } else {
            // split the four byte-wide counters back out
            occCount[0] += packedSum & 0x000000FF;
            occCount[1] += (packedSum >> 8) & 0x000000FF;
            occCount[2] += (packedSum >> 16) & 0x000000FF;
            occCount[3] += packedSum >> 24;
        }

    }

    packedSum = 0;
    for (w = 0; w < tailWordCount; w++, word++) {
        packedSum += dnaDecodeTable[*word >> 16];
        packedSum += dnaDecodeTable[*word & 0x0000FFFF];
    }

    if (partialCharCount > 0) {
        // masking zeroes the unwanted characters, which then count as 'a'
        kept = *word & truncateRightMask[partialCharCount];
        packedSum += dnaDecodeTable[kept >> 16];
        packedSum += dnaDecodeTable[kept & 0xFFFF];
        packedSum += partialCharCount - 16;    // decrease count of 'a' by 16 - positionToProcess
    }

    occCount[0] += packedSum & 0x000000FF;
    occCount[1] += (packedSum >> 8) & 0x000000FF;
    occCount[2] += (packedSum >> 16) & 0x000000FF;
    occCount[3] += packedSum >> 24;
}
445
+
446
+ static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar,
447
+ const unsigned int *cumulativeCount, const unsigned int *packedShift) {
448
+
449
+ unsigned int i, c, r;
450
+ unsigned int previousRank, currentRank;
451
+ unsigned int wordIndex, charIndex;
452
+ unsigned int inverseSa0;
453
+
454
+ inverseSa0 = previousRank = relativeRank[0];
455
+
456
+ for (i=1; i<=numChar; i++) {
457
+ currentRank = relativeRank[i];
458
+ // previousRank > cumulativeCount[c] because $ is one of the char
459
+ c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2])
460
+ + (previousRank > cumulativeCount[3]);
461
+ // set bwt for currentRank
462
+ if (c > 0) {
463
+ // c <> 'a'
464
+ r = currentRank;
465
+ if (r > inverseSa0) {
466
+ // - 1 because $ at inverseSa0 is not encoded
467
+ r--;
468
+ }
469
+ wordIndex = r / CHAR_PER_WORD;
470
+ charIndex = r - wordIndex * CHAR_PER_WORD;
471
+ bwt[wordIndex] |= c << packedShift[charIndex];
472
+ }
473
+ previousRank = currentRank;
474
+ }
475
+ }
476
+
477
+ static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit,
478
+ const unsigned int character)
479
+ {
480
+ unsigned int occIndexMajor;
481
+
482
+ occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR;
483
+
484
+ if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) {
485
+ return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
486
+ (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16);
487
+
488
+ } else {
489
+ return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
490
+ (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF);
491
+ }
492
+ }
493
+
494
+
495
/*
 * Count occurrences of `character` among the first `index` characters of
 * the packed text starting at `dna` (16 characters per word).
 *
 * dnaDecodeTable maps a 16-bit half word to four byte-wide counters. The
 * result is read from one such byte, so it is reduced to 8 bits.
 */
static unsigned int ForwardDNAOccCount(const unsigned int*  dna, const unsigned int index, const unsigned int character,
                                       const unsigned int*  dnaDecodeTable)
{
    static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
                                                        0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
                                                        0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
                                                        0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };

    const unsigned int fullWordCount = index / 16;
    const unsigned int partialCharCount = index % 16;
    unsigned int packedSum = 0;
    unsigned int w;

    for (w = 0; w < fullWordCount; w++) {
        const unsigned int word = dna[w];

        packedSum += dnaDecodeTable[word >> 16];
        packedSum += dnaDecodeTable[word & 0x0000FFFF];
    }

    if (partialCharCount != 0) {
        // masking zeroes the unwanted characters, which then count as 'a'
        const unsigned int kept = dna[fullWordCount] & truncateRightMask[partialCharCount];

        packedSum += dnaDecodeTable[kept >> 16];
        packedSum += dnaDecodeTable[kept & 0xFFFF];
        packedSum += partialCharCount - 16;    // decrease count of 'a' by 16 - positionToProcess
    }

    return (packedSum >> (character * 8)) & 0x000000FF;

}
525
+
526
/*
 * Count occurrences of `character` among the `index` characters that
 * immediately precede the word pointed to by `dna` (16 characters per word).
 *
 * dnaDecodeTable maps a 16-bit half word to four byte-wide counters. The
 * result is read from one such byte, so it is reduced to 8 bits.
 */
static unsigned int BackwardDNAOccCount(const unsigned int*  dna, const unsigned int index, const unsigned int character,
                                        const unsigned int*  dnaDecodeTable)
{
    static const unsigned int truncateLeftMask[16] =  { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F,
                                                        0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF,
                                                        0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF,
                                                        0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF };

    const unsigned int fullWordCount = index / 16;
    const unsigned int partialCharCount = index % 16;
    // word holding the partial run; the full words follow it up to dna[-1]
    const unsigned int *firstWord = dna - (fullWordCount + 1);
    unsigned int packedSum = 0;
    unsigned int step;

    if (partialCharCount != 0) {
        // masking zeroes the unwanted characters, which then count as 'a'
        const unsigned int kept = *firstWord & truncateLeftMask[partialCharCount];

        packedSum += dnaDecodeTable[kept >> 16];
        packedSum += dnaDecodeTable[kept & 0xFFFF];
        packedSum += partialCharCount - 16;    // decrease count of 'a' by 16 - positionToProcess
    }

    for (step = 1; step <= fullWordCount; step++) {
        const unsigned int word = firstWord[step];

        packedSum += dnaDecodeTable[word >> 16];
        packedSum += dnaDecodeTable[word & 0x0000FFFF];
    }

    return (packedSum >> (character * 8)) & 0x000000FF;

}
559
+
560
+ unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) {
561
+
562
+ unsigned int occValue;
563
+ unsigned int occExplicitIndex, occIndex;
564
+
565
+ // $ is supposed to be positioned at inverseSa0 but it is not encoded
566
+ // therefore index is subtracted by 1 for adjustment
567
+ if (index > bwt->inverseSa0) {
568
+ index--;
569
+ }
570
+
571
+ occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding
572
+ occIndex = occExplicitIndex * OCC_INTERVAL;
573
+ occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character);
574
+
575
+ if (occIndex == index) {
576
+ return occValue;
577
+ }
578
+
579
+ if (occIndex < index) {
580
+ return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable);
581
+ } else {
582
+ return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable);
583
+ }
584
+
585
+ }
586
+
587
+ static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq,
588
+ const unsigned int *packedText, const unsigned int numChar,
589
+ const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration)
590
+ {
591
+ unsigned int saIndex;
592
+ unsigned int lastWord;
593
+ unsigned int packedMask;
594
+ unsigned int i, j;
595
+ unsigned int c, t;
596
+ unsigned int rankIndex;
597
+ unsigned int shift;
598
+ unsigned int seqIndexFromStart[ALPHABET_SIZE];
599
+ unsigned int seqIndexFromEnd[ALPHABET_SIZE];
600
+
601
+ for (i=0; i<ALPHABET_SIZE; i++) {
602
+ seqIndexFromStart[i] = cumulativeCount[i];
603
+ seqIndexFromEnd[i] = cumulativeCount[i+1] - 1;
604
+ }
605
+
606
+ shift = BITS_IN_WORD - BIT_PER_CHAR;
607
+ packedMask = ALL_ONE_MASK >> shift;
608
+ saIndex = bwt->inverseSa0;
609
+ rankIndex = numChar - 1;
610
+
611
+ lastWord = numChar / CHAR_PER_WORD;
612
+ for (i=lastWord; i--;) { // loop from lastWord - 1 to 0
613
+ t = packedText[i];
614
+ for (j=0; j<CHAR_PER_WORD; j++) {
615
+ c = t & packedMask;
616
+ saIndex = bwt->cumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1;
617
+ // A counting sort using the first character of suffix is done here
618
+ // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0
619
+ if (saIndex > bwt->inverseSa0) {
620
+ seq[seqIndexFromEnd[c]] = rankIndex;
621
+ absoluteRank[seqIndexFromEnd[c]] = saIndex;
622
+ seqIndexFromEnd[c]--;
623
+ } else {
624
+ seq[seqIndexFromStart[c]] = rankIndex;
625
+ absoluteRank[seqIndexFromStart[c]] = saIndex;
626
+ seqIndexFromStart[c]++;
627
+ }
628
+ rankIndex--;
629
+ t >>= BIT_PER_CHAR;
630
+ }
631
+ }
632
+
633
+ absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters
634
+ seq[seqIndexFromStart[firstCharInLastIteration]] = numChar;
635
+
636
+ return seqIndexFromStart[firstCharInLastIteration];
637
+ }
638
+
639
+ static void BWTIncSortKey(unsigned int* __restrict key, unsigned int* __restrict seq, const unsigned int numItem)
640
+ {
641
+ #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD
642
+
643
+ int lowIndex, highIndex, midIndex;
644
+ int lowPartitionIndex, highPartitionIndex;
645
+ int lowStack[32], highStack[32];
646
+ int stackDepth;
647
+ int i, j;
648
+ unsigned int tempSeq, tempKey;
649
+ int numberOfEqualKey;
650
+
651
+ if (numItem < 2) return;
652
+
653
+ stackDepth = 0;
654
+
655
+ lowIndex = 0;
656
+ highIndex = numItem - 1;
657
+
658
+ for (;;) {
659
+
660
+ for (;;) {
661
+
662
+ // Sort small array of data
663
+ if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays
664
+ for (i=lowIndex+1; i<=highIndex; i++) {
665
+ tempSeq = seq[i];
666
+ tempKey = key[i];
667
+ for (j = i; j > lowIndex && key[j-1] > tempKey; j--) {
668
+ seq[j] = seq[j-1];
669
+ key[j] = key[j-1];
670
+ }
671
+ if (j != i) {
672
+ seq[j] = tempSeq;
673
+ key[j] = tempKey;
674
+ }
675
+ }
676
+ break;
677
+ }
678
+
679
+ // Choose pivot as median of the lowest, middle, and highest data; sort the three data
680
+
681
+ midIndex = average(lowIndex, highIndex);
682
+ if (key[lowIndex] > key[midIndex]) {
683
+ tempSeq = seq[lowIndex];
684
+ tempKey = key[lowIndex];
685
+ seq[lowIndex] = seq[midIndex];
686
+ key[lowIndex] = key[midIndex];
687
+ seq[midIndex] = tempSeq;
688
+ key[midIndex] = tempKey;
689
+ }
690
+ if (key[lowIndex] > key[highIndex]) {
691
+ tempSeq = seq[lowIndex];
692
+ tempKey = key[lowIndex];
693
+ seq[lowIndex] = seq[highIndex];
694
+ key[lowIndex] = key[highIndex];
695
+ seq[highIndex] = tempSeq;
696
+ key[highIndex] = tempKey;
697
+ }
698
+ if (key[midIndex] > key[highIndex]) {
699
+ tempSeq = seq[midIndex];
700
+ tempKey = key[midIndex];
701
+ seq[midIndex] = seq[highIndex];
702
+ key[midIndex] = key[highIndex];
703
+ seq[highIndex] = tempSeq;
704
+ key[highIndex] = tempKey;
705
+ }
706
+
707
+ // Partition data
708
+
709
+ numberOfEqualKey = 0;
710
+
711
+ lowPartitionIndex = lowIndex + 1;
712
+ highPartitionIndex = highIndex - 1;
713
+
714
+ for (;;) {
715
+ while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) {
716
+ numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]);
717
+ lowPartitionIndex++;
718
+ }
719
+ while (lowPartitionIndex < highPartitionIndex) {
720
+ if (key[midIndex] >= key[highPartitionIndex]) {
721
+ numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]);
722
+ break;
723
+ }
724
+ highPartitionIndex--;
725
+ }
726
+ if (lowPartitionIndex >= highPartitionIndex) {
727
+ break;
728
+ }
729
+ tempSeq = seq[lowPartitionIndex];
730
+ tempKey = key[lowPartitionIndex];
731
+ seq[lowPartitionIndex] = seq[highPartitionIndex];
732
+ key[lowPartitionIndex] = key[highPartitionIndex];
733
+ seq[highPartitionIndex] = tempSeq;
734
+ key[highPartitionIndex] = tempKey;
735
+ if (highPartitionIndex == midIndex) {
736
+ // partition key has been moved
737
+ midIndex = lowPartitionIndex;
738
+ }
739
+ lowPartitionIndex++;
740
+ highPartitionIndex--;
741
+ }
742
+
743
+ // Adjust the partition index
744
+ highPartitionIndex = lowPartitionIndex;
745
+ lowPartitionIndex--;
746
+
747
+ // move the partition key to end of low partition
748
+ tempSeq = seq[midIndex];
749
+ tempKey = key[midIndex];
750
+ seq[midIndex] = seq[lowPartitionIndex];
751
+ key[midIndex] = key[lowPartitionIndex];
752
+ seq[lowPartitionIndex] = tempSeq;
753
+ key[lowPartitionIndex] = tempKey;
754
+
755
+ if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) {
756
+
757
+ // Many keys = partition key; separate the equal key data from the lower partition
758
+
759
+ midIndex = lowIndex;
760
+
761
+ for (;;) {
762
+ while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) {
763
+ midIndex++;
764
+ }
765
+ while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) {
766
+ lowPartitionIndex--;
767
+ }
768
+ if (midIndex >= lowPartitionIndex) {
769
+ break;
770
+ }
771
+ tempSeq = seq[midIndex];
772
+ tempKey = key[midIndex];
773
+ seq[midIndex] = seq[lowPartitionIndex - 1];
774
+ key[midIndex] = key[lowPartitionIndex - 1];
775
+ seq[lowPartitionIndex - 1] = tempSeq;
776
+ key[lowPartitionIndex - 1] = tempKey;
777
+ midIndex++;
778
+ lowPartitionIndex--;
779
+ }
780
+
781
+ }
782
+
783
+ if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) {
784
+ // put the larger partition to stack
785
+ lowStack[stackDepth] = lowIndex;
786
+ highStack[stackDepth] = lowPartitionIndex - 1;
787
+ stackDepth++;
788
+ // sort the smaller partition first
789
+ lowIndex = highPartitionIndex;
790
+ } else {
791
+ // put the larger partition to stack
792
+ lowStack[stackDepth] = highPartitionIndex;
793
+ highStack[stackDepth] = highIndex;
794
+ stackDepth++;
795
+ // sort the smaller partition first
796
+ if (lowPartitionIndex > lowIndex) {
797
+ highIndex = lowPartitionIndex - 1;
798
+ } else {
799
+ // all keys in the partition equals to the partition key
800
+ break;
801
+ }
802
+ }
803
+ continue;
804
+ }
805
+
806
+ // Pop a range from stack
807
+ if (stackDepth > 0) {
808
+ stackDepth--;
809
+ lowIndex = lowStack[stackDepth];
810
+ highIndex = highStack[stackDepth];
811
+ continue;
812
+ } else return;
813
+ }
814
+ }
815
+
816
+
817
+ static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigned int* __restrict seq,
818
+ unsigned int* __restrict relativeRank, const unsigned int numItem,
819
+ unsigned int oldInverseSa0, const unsigned int *cumulativeCount)
820
+ {
821
+ unsigned int i, c;
822
+ unsigned int s, r;
823
+ unsigned int lastRank, lastIndex;
824
+ unsigned int oldInverseSa0RelativeRank = 0;
825
+ unsigned int freq;
826
+
827
+ lastIndex = numItem;
828
+ lastRank = sortedRank[numItem];
829
+ if (lastRank > oldInverseSa0) {
830
+ sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt
831
+ }
832
+ s = seq[numItem];
833
+ relativeRank[s] = numItem;
834
+ if (lastRank == oldInverseSa0) {
835
+ oldInverseSa0RelativeRank = numItem;
836
+ oldInverseSa0++; // so that this segment of code is not run again
837
+ lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
838
+ }
839
+
840
+ c = ALPHABET_SIZE - 1;
841
+ freq = cumulativeCount[c];
842
+
843
+ for (i=numItem; i--;) { // from numItem - 1 to 0
844
+ r = sortedRank[i];
845
+ if (r > oldInverseSa0) {
846
+ sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt
847
+ }
848
+ s = seq[i];
849
+ if (i < freq) {
850
+ if (lastIndex >= freq) {
851
+ lastRank++; // to trigger the group across alphabet boundary to be split
852
+ }
853
+ c--;
854
+ freq = cumulativeCount[c];
855
+ }
856
+ if (r == lastRank) {
857
+ relativeRank[s] = lastIndex;
858
+ } else {
859
+ if (i == lastIndex - 1) {
860
+ if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) {
861
+ seq[lastIndex] = seq[lastIndex + 1] - 1;
862
+ } else {
863
+ seq[lastIndex] = (unsigned int)-1;
864
+ }
865
+ }
866
+ lastIndex = i;
867
+ lastRank = r;
868
+ relativeRank[s] = i;
869
+ if (r == oldInverseSa0) {
870
+ oldInverseSa0RelativeRank = i;
871
+ oldInverseSa0++; // so that this segment of code is not run again
872
+ lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
873
+ }
874
+ }
875
+ }
876
+
877
+ }
878
+
879
/*
 * Write the BWT character of every inserted suffix into seq.
 *
 * For each text position pos in 1..numChar, the rank of the preceding
 * position (relativeRank[pos - 1]) is mapped to a character code 0..3 by
 * counting how many of cumulativeCount[1..3] it has reached, and that code
 * is stored at seq[relativeRank[pos]].  seq slots whose index never occurs
 * as relativeRank[1..numChar] are left untouched.
 */
static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, const unsigned int numChar,
						   const unsigned int *cumulativeCount)
{
	unsigned int pos;
	unsigned int rankOfPredecessor = relativeRank[0];

	for (pos = 1; pos <= numChar; pos++) {
		const unsigned int rankHere = relativeRank[pos];
		const unsigned int reached1 = (rankOfPredecessor >= cumulativeCount[1]);
		const unsigned int reached2 = (rankOfPredecessor >= cumulativeCount[2]);
		const unsigned int reached3 = (rankOfPredecessor >= cumulativeCount[3]);

		seq[rankHere] = reached1 + reached2 + reached3;
		rankOfPredecessor = rankHere;
	}
}
895
+
896
+ static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt,
897
+ unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt)
898
+ {
899
+ unsigned int bitsInWordMinusBitPerChar;
900
+ unsigned int leftShift, rightShift;
901
+ unsigned int o;
902
+ unsigned int oIndex, iIndex, mIndex;
903
+ unsigned int mWord, mChar, oWord, oChar;
904
+ unsigned int numInsert;
905
+
906
+ bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;
907
+
908
+ oIndex = 0;
909
+ iIndex = 0;
910
+ mIndex = 0;
911
+
912
+ mWord = 0;
913
+ mChar = 0;
914
+
915
+ mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration
916
+
917
+ while (oIndex < numOldBwt) {
918
+
919
+ // copy from insertBwt
920
+ while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) {
921
+ if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0
922
+ mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
923
+ mIndex++;
924
+ mChar++;
925
+ if (mChar == CHAR_PER_WORD) {
926
+ mChar = 0;
927
+ mWord++;
928
+ mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary
929
+ }
930
+ }
931
+ iIndex++;
932
+ }
933
+
934
+ // Copy from oldBwt to mergedBwt
935
+ if (iIndex <= numInsertBwt) {
936
+ o = sortedRank[iIndex];
937
+ } else {
938
+ o = numOldBwt;
939
+ }
940
+ numInsert = o - oIndex;
941
+
942
+ oWord = oIndex / CHAR_PER_WORD;
943
+ oChar = oIndex - oWord * CHAR_PER_WORD;
944
+ if (oChar > mChar) {
945
+ leftShift = (oChar - mChar) * BIT_PER_CHAR;
946
+ rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR;
947
+ mergedBwt[mWord] = mergedBwt[mWord]
948
+ | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR))
949
+ | (oldBwt[oWord+1] >> rightShift);
950
+ oIndex += min(numInsert, CHAR_PER_WORD - mChar);
951
+ while (o > oIndex) {
952
+ oWord++;
953
+ mWord++;
954
+ mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift);
955
+ oIndex += CHAR_PER_WORD;
956
+ }
957
+ } else if (oChar < mChar) {
958
+ rightShift = (mChar - oChar) * BIT_PER_CHAR;
959
+ leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR;
960
+ mergedBwt[mWord] = mergedBwt[mWord]
961
+ | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR));
962
+ oIndex += min(numInsert, CHAR_PER_WORD - mChar);
963
+ while (o > oIndex) {
964
+ oWord++;
965
+ mWord++;
966
+ mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift);
967
+ oIndex += CHAR_PER_WORD;
968
+ }
969
+ } else { // oChar == mChar
970
+ mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR);
971
+ oIndex += min(numInsert, CHAR_PER_WORD - mChar);
972
+ while (o > oIndex) {
973
+ oWord++;
974
+ mWord++;
975
+ mergedBwt[mWord] = oldBwt[oWord];
976
+ oIndex += CHAR_PER_WORD;
977
+ }
978
+ }
979
+ oIndex = o;
980
+ mIndex += numInsert;
981
+
982
+ // Clear the trailing garbage in mergedBwt
983
+ mWord = mIndex / CHAR_PER_WORD;
984
+ mChar = mIndex - mWord * CHAR_PER_WORD;
985
+ if (mChar == 0) {
986
+ mergedBwt[mWord] = 0;
987
+ } else {
988
+ mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR));
989
+ }
990
+
991
+ }
992
+
993
+ // copy from insertBwt
994
+ while (iIndex <= numInsertBwt) {
995
+ if (sortedRank[iIndex] != 0) {
996
+ mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
997
+ mIndex++;
998
+ mChar++;
999
+ if (mChar == CHAR_PER_WORD) {
1000
+ mChar = 0;
1001
+ mWord++;
1002
+ mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary
1003
+ }
1004
+ }
1005
+ iIndex++;
1006
+ }
1007
+ }
1008
+
1009
+ void BWTClearTrailingBwtCode(BWT *bwt)
1010
+ {
1011
+ unsigned int bwtResidentSizeInWord;
1012
+ unsigned int wordIndex, offset;
1013
+ unsigned int i;
1014
+
1015
+ bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength);
1016
+
1017
+ wordIndex = bwt->textLength / CHAR_PER_WORD;
1018
+ offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR;
1019
+ if (offset > 0) {
1020
+ bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset);
1021
+ } else {
1022
+ if (wordIndex < bwtResidentSizeInWord) {
1023
+ bwt->bwtCode[wordIndex] = 0;
1024
+ }
1025
+ }
1026
+
1027
+ for (i=wordIndex+1; i<bwtResidentSizeInWord; i++) {
1028
+ bwt->bwtCode[i] = 0;
1029
+ }
1030
+ }
1031
+
1032
+
1033
+ void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue,
1034
+ unsigned int* __restrict occValueMajor,
1035
+ const unsigned int textLength, const unsigned int* decodeTable)
1036
+ {
1037
+ unsigned int numberOfOccValueMajor, numberOfOccValue;
1038
+ unsigned int wordBetweenOccValue;
1039
+ unsigned int numberOfOccIntervalPerMajor;
1040
+ unsigned int c;
1041
+ unsigned int i, j;
1042
+ unsigned int occMajorIndex;
1043
+ unsigned int occIndex, bwtIndex;
1044
+ unsigned int sum;
1045
+ unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE];
1046
+
1047
+ wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD;
1048
+
1049
+ // Calculate occValue
1050
+ // [lh3] by default: OCC_INTERVAL_MAJOR=65536, OCC_INTERVAL=256
1051
+ numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
1052
+ numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
1053
+ numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor;
1054
+
1055
+ tempOccValue0[0] = 0;
1056
+ tempOccValue0[1] = 0;
1057
+ tempOccValue0[2] = 0;
1058
+ tempOccValue0[3] = 0;
1059
+ occValueMajor[0] = 0;
1060
+ occValueMajor[1] = 0;
1061
+ occValueMajor[2] = 0;
1062
+ occValueMajor[3] = 0;
1063
+
1064
+ occIndex = 0;
1065
+ bwtIndex = 0;
1066
+ for (occMajorIndex=1; occMajorIndex<numberOfOccValueMajor; occMajorIndex++) {
1067
+
1068
+ for (i=0; i<numberOfOccIntervalPerMajor/2; i++) {
1069
+
1070
+ sum = 0;
1071
+ tempOccValue1[0] = tempOccValue0[0];
1072
+ tempOccValue1[1] = tempOccValue0[1];
1073
+ tempOccValue1[2] = tempOccValue0[2];
1074
+ tempOccValue1[3] = tempOccValue0[3];
1075
+
1076
+ for (j=0; j<wordBetweenOccValue; j++) {
1077
+ c = bwt[bwtIndex];
1078
+ sum += decodeTable[c >> 16];
1079
+ sum += decodeTable[c & 0x0000FFFF];
1080
+ bwtIndex++;
1081
+ }
1082
+ if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1083
+ tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
1084
+ tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
1085
+ tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
1086
+ tempOccValue1[3] += sum;
1087
+ } else {
1088
+ if (sum == 0x00000100) {
1089
+ tempOccValue1[0] += 256;
1090
+ } else if (sum == 0x00010000) {
1091
+ tempOccValue1[1] += 256;
1092
+ } else if (sum == 0x01000000) {
1093
+ tempOccValue1[2] += 256;
1094
+ } else {
1095
+ tempOccValue1[3] += 256;
1096
+ }
1097
+ }
1098
+ occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
1099
+ occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
1100
+ occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
1101
+ occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
1102
+ tempOccValue0[0] = tempOccValue1[0];
1103
+ tempOccValue0[1] = tempOccValue1[1];
1104
+ tempOccValue0[2] = tempOccValue1[2];
1105
+ tempOccValue0[3] = tempOccValue1[3];
1106
+ sum = 0;
1107
+
1108
+ occIndex++;
1109
+
1110
+ for (j=0; j<wordBetweenOccValue; j++) {
1111
+ c = bwt[bwtIndex];
1112
+ sum += decodeTable[c >> 16];
1113
+ sum += decodeTable[c & 0x0000FFFF];
1114
+ bwtIndex++;
1115
+ }
1116
+ if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1117
+ tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8;
1118
+ tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8;
1119
+ tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8;
1120
+ tempOccValue0[3] += sum;
1121
+ } else {
1122
+ if (sum == 0x00000100) {
1123
+ tempOccValue0[0] += 256;
1124
+ } else if (sum == 0x00010000) {
1125
+ tempOccValue0[1] += 256;
1126
+ } else if (sum == 0x01000000) {
1127
+ tempOccValue0[2] += 256;
1128
+ } else {
1129
+ tempOccValue0[3] += 256;
1130
+ }
1131
+ }
1132
+ }
1133
+
1134
+ occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0];
1135
+ occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1];
1136
+ occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2];
1137
+ occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3];
1138
+ tempOccValue0[0] = 0;
1139
+ tempOccValue0[1] = 0;
1140
+ tempOccValue0[2] = 0;
1141
+ tempOccValue0[3] = 0;
1142
+
1143
+ }
1144
+
1145
+ while (occIndex < (numberOfOccValue-1)/2) {
1146
+ sum = 0;
1147
+ tempOccValue1[0] = tempOccValue0[0];
1148
+ tempOccValue1[1] = tempOccValue0[1];
1149
+ tempOccValue1[2] = tempOccValue0[2];
1150
+ tempOccValue1[3] = tempOccValue0[3];
1151
+ for (j=0; j<wordBetweenOccValue; j++) {
1152
+ c = bwt[bwtIndex];
1153
+ sum += decodeTable[c >> 16];
1154
+ sum += decodeTable[c & 0x0000FFFF];
1155
+ bwtIndex++;
1156
+ }
1157
+ if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1158
+ tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
1159
+ tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
1160
+ tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
1161
+ tempOccValue1[3] += sum;
1162
+ } else {
1163
+ if (sum == 0x00000100) {
1164
+ tempOccValue1[0] += 256;
1165
+ } else if (sum == 0x00010000) {
1166
+ tempOccValue1[1] += 256;
1167
+ } else if (sum == 0x01000000) {
1168
+ tempOccValue1[2] += 256;
1169
+ } else {
1170
+ tempOccValue1[3] += 256;
1171
+ }
1172
+ }
1173
+ occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
1174
+ occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
1175
+ occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
1176
+ occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
1177
+ tempOccValue0[0] = tempOccValue1[0];
1178
+ tempOccValue0[1] = tempOccValue1[1];
1179
+ tempOccValue0[2] = tempOccValue1[2];
1180
+ tempOccValue0[3] = tempOccValue1[3];
1181
+ sum = 0;
1182
+ occIndex++;
1183
+
1184
+ for (j=0; j<wordBetweenOccValue; j++) {
1185
+ c = bwt[bwtIndex];
1186
+ sum += decodeTable[c >> 16];
1187
+ sum += decodeTable[c & 0x0000FFFF];
1188
+ bwtIndex++;
1189
+ }
1190
+ if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1191
+ tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8;
1192
+ tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8;
1193
+ tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8;
1194
+ tempOccValue0[3] += sum;
1195
+ } else {
1196
+ if (sum == 0x00000100) {
1197
+ tempOccValue0[0] += 256;
1198
+ } else if (sum == 0x00010000) {
1199
+ tempOccValue0[1] += 256;
1200
+ } else if (sum == 0x01000000) {
1201
+ tempOccValue0[2] += 256;
1202
+ } else {
1203
+ tempOccValue0[3] += 256;
1204
+ }
1205
+ }
1206
+ }
1207
+
1208
+ sum = 0;
1209
+ tempOccValue1[0] = tempOccValue0[0];
1210
+ tempOccValue1[1] = tempOccValue0[1];
1211
+ tempOccValue1[2] = tempOccValue0[2];
1212
+ tempOccValue1[3] = tempOccValue0[3];
1213
+
1214
+ if (occIndex * 2 < numberOfOccValue - 1) {
1215
+ for (j=0; j<wordBetweenOccValue; j++) {
1216
+ c = bwt[bwtIndex];
1217
+ sum += decodeTable[c >> 16];
1218
+ sum += decodeTable[c & 0x0000FFFF];
1219
+ bwtIndex++;
1220
+ }
1221
+ if (!DNA_OCC_SUM_EXCEPTION(sum)) {
1222
+ tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
1223
+ tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
1224
+ tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
1225
+ tempOccValue1[3] += sum;
1226
+ } else {
1227
+ if (sum == 0x00000100) {
1228
+ tempOccValue1[0] += 256;
1229
+ } else if (sum == 0x00010000) {
1230
+ tempOccValue1[1] += 256;
1231
+ } else if (sum == 0x01000000) {
1232
+ tempOccValue1[2] += 256;
1233
+ } else {
1234
+ tempOccValue1[3] += 256;
1235
+ }
1236
+ }
1237
+ }
1238
+
1239
+ occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
1240
+ occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
1241
+ occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
1242
+ occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
1243
+
1244
+ }
1245
+
1246
+ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
1247
+ {
1248
+ unsigned int i;
1249
+ unsigned int mergedBwtSizeInWord, mergedOccSizeInWord;
1250
+ unsigned int firstCharInThisIteration;
1251
+
1252
+ unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt;
1253
+ unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0;
1254
+
1255
+ #ifdef DEBUG
1256
+ if (numChar > bwtInc->buildSize) {
1257
+ fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n");
1258
+ exit(1);
1259
+ }
1260
+ #endif
1261
+
1262
+ mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar);
1263
+ mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar);
1264
+
1265
+ initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
1266
+
1267
+ if (bwtInc->bwt->textLength == 0) { // Initial build
1268
+
1269
+ // Set address
1270
+ seq = bwtInc->workingMemory;
1271
+ relativeRank = seq + bwtInc->buildSize + 1;
1272
+ mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place
1273
+
1274
+ BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar);
1275
+
1276
+ firstCharInThisIteration = relativeRank[0];
1277
+ relativeRank[numChar] = 0;
1278
+
1279
+ // Sort suffix
1280
+ QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE);
1281
+ newInverseSa0 = relativeRank[0];
1282
+
1283
+ // Clear BWT area
1284
+ initializeVAL(insertBwt, mergedBwtSizeInWord, 0);
1285
+
1286
+ // Build BWT
1287
+ BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift);
1288
+
1289
+ // so that the cumulativeCount is not deducted
1290
+ bwtInc->firstCharInLastIteration = ALPHABET_SIZE;
1291
+
1292
+ } else { // Incremental build
1293
+ // Set address
1294
+ sortedRank = bwtInc->workingMemory;
1295
+ seq = sortedRank + bwtInc->buildSize + 1;
1296
+ insertBwt = seq;
1297
+ relativeRank = seq + bwtInc->buildSize + 1;
1298
+
1299
+ // Store the first character of this iteration
1300
+ firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR);
1301
+
1302
+ // Count occurrence of input text
1303
+ ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable);
1304
+ // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration
1305
+ bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++;
1306
+ bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1];
1307
+ bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2];
1308
+ bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3];
1309
+
1310
+ // Get rank of new suffix among processed suffix
1311
+ // The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group
1312
+ oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText,
1313
+ numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration);
1314
+
1315
+ // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group)
1316
+ for (i=0; i<ALPHABET_SIZE; i++) {
1317
+ if (bwtInc->cumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank ||
1318
+ bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) {
1319
+ BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]);
1320
+ } else {
1321
+ if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) {
1322
+ BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]);
1323
+ }
1324
+ if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) {
1325
+ BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1);
1326
+ }
1327
+ }
1328
+ }
1329
+
1330
+ // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt
1331
+ // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups)
1332
+ BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild);
1333
+ #ifdef DEBUG
1334
+ if (relativeRank[numChar] != oldInverseSa0RelativeRank) {
1335
+ fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n");
1336
+ exit(1);
1337
+ }
1338
+ #endif
1339
+
1340
+ // Sort suffix
1341
+ QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE);
1342
+
1343
+ newInverseSa0RelativeRank = relativeRank[0];
1344
+ newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank;
1345
+
1346
+ sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt
1347
+
1348
+ // Build BWT
1349
+ BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild);
1350
+
1351
+ // Merge BWT
1352
+ mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord
1353
+ - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR;
1354
+ // minus numberOfIteration * occInterval to create a buffer for merging
1355
+ BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar);
1356
+
1357
+ }
1358
+
1359
+ // Build auxiliary structure and update info and pointers in BWT
1360
+ bwtInc->bwt->textLength += numChar;
1361
+ bwtInc->bwt->bwtCode = mergedBwt;
1362
+ bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord;
1363
+ bwtInc->bwt->occSizeInWord = mergedOccSizeInWord;
1364
+ if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) {
1365
+ fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n");
1366
+ exit(1);
1367
+ }
1368
+
1369
+ bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord;
1370
+
1371
+ BWTClearTrailingBwtCode(bwtInc->bwt);
1372
+ BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor,
1373
+ bwtInc->bwt->textLength, bwtInc->bwt->decodeTable);
1374
+
1375
+ bwtInc->bwt->inverseSa0 = newInverseSa0;
1376
+
1377
+ bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0);
1378
+ bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1);
1379
+ bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2);
1380
+ bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3);
1381
+
1382
+ bwtInc->firstCharInLastIteration = firstCharInThisIteration;
1383
+
1384
+ // Set build size and text address for the next build
1385
+ BWTIncSetBuildSizeAndTextAddr(bwtInc);
1386
+ bwtInc->numberOfIterationDone++;
1387
+
1388
+ }
1389
+
1390
+ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit,
1391
+ const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
1392
+ {
1393
+
1394
+ FILE *packedFile;
1395
+ unsigned int packedFileLen;
1396
+ unsigned int totalTextLength;
1397
+ unsigned int textToLoad, textSizeInByte;
1398
+ unsigned int processedTextLength;
1399
+ unsigned char lastByteLength;
1400
+
1401
+ BWTInc *bwtInc;
1402
+
1403
+ packedFile = (FILE*)fopen(inputFileName, "rb");
1404
+
1405
+ if (packedFile == NULL) {
1406
+ fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n");
1407
+ exit(1);
1408
+ }
1409
+
1410
+ fseek(packedFile, -1, SEEK_END);
1411
+ packedFileLen = ftell(packedFile);
1412
+ if ((int)packedFileLen < 0) {
1413
+ fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n");
1414
+ exit(1);
1415
+ }
1416
+ fread(&lastByteLength, sizeof(unsigned char), 1, packedFile);
1417
+ totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength);
1418
+
1419
+ bwtInc = BWTIncCreate(totalTextLength, targetNBit, initialMaxBuildSize, incMaxBuildSize);
1420
+
1421
+ BWTIncSetBuildSizeAndTextAddr(bwtInc);
1422
+
1423
+ if (bwtInc->buildSize > totalTextLength) {
1424
+ textToLoad = totalTextLength;
1425
+ } else {
1426
+ textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD);
1427
+ }
1428
+ textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte
1429
+
1430
+ fseek(packedFile, -2, SEEK_CUR);
1431
+ fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
1432
+ fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile);
1433
+ fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR);
1434
+
1435
+ ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
1436
+ BWTIncConstruct(bwtInc, textToLoad);
1437
+
1438
+ processedTextLength = textToLoad;
1439
+
1440
+ while (processedTextLength < totalTextLength) {
1441
+ textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
1442
+ if (textToLoad > totalTextLength - processedTextLength) {
1443
+ textToLoad = totalTextLength - processedTextLength;
1444
+ }
1445
+ textSizeInByte = textToLoad / CHAR_PER_BYTE;
1446
+ fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
1447
+ fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile);
1448
+ fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
1449
+ ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
1450
+ BWTIncConstruct(bwtInc, textToLoad);
1451
+ processedTextLength += textToLoad;
1452
+ if (bwtInc->numberOfIterationDone % 10 == 0) {
1453
+ printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n",
1454
+ bwtInc->numberOfIterationDone, processedTextLength);
1455
+ }
1456
+ }
1457
+ return bwtInc;
1458
+ }
1459
+
1460
+ void BWTFree(BWT *bwt)
1461
+ {
1462
+ if (bwt == 0) return;
1463
+ free(bwt->cumulativeFreq);
1464
+ free(bwt->bwtCode);
1465
+ free(bwt->occValue);
1466
+ free(bwt->occValueMajor);
1467
+ free(bwt->saValue);
1468
+ free(bwt->inverseSa);
1469
+ free(bwt->decodeTable);
1470
+ free(bwt->saIndexRange);
1471
+ free(bwt->saValueOnBoundary);
1472
+ free(bwt);
1473
+ }
1474
+
1475
+ void BWTIncFree(BWTInc *bwtInc)
1476
+ {
1477
+ if (bwtInc == 0) return;
1478
+ free(bwtInc->bwt);
1479
+ free(bwtInc->workingMemory);
1480
+ free(bwtInc);
1481
+ }
1482
+
1483
+ static unsigned int BWTFileSizeInWord(const unsigned int numChar)
1484
+ {
1485
+ // The $ in BWT at the position of inverseSa0 is not encoded
1486
+ return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
1487
+ }
1488
+
1489
+ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName)
1490
+ {
1491
+ FILE *bwtFile;
1492
+ /* FILE *occValueFile; */
1493
+ unsigned int bwtLength;
1494
+
1495
+ bwtFile = (FILE*)fopen(bwtFileName, "wb");
1496
+ if (bwtFile == NULL) {
1497
+ fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n");
1498
+ exit(1);
1499
+ }
1500
+
1501
+ fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile);
1502
+ fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile);
1503
+ bwtLength = BWTFileSizeInWord(bwt->textLength);
1504
+ fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile);
1505
+ fclose(bwtFile);
1506
+ /*
1507
+ occValueFile = (FILE*)fopen(occValueFileName, "wb");
1508
+ if (occValueFile == NULL) {
1509
+ fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open occ value file!\n");
1510
+ exit(1);
1511
+ }
1512
+
1513
+ fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, occValueFile);
1514
+ fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, occValueFile);
1515
+ fwrite(bwt->occValue, sizeof(unsigned int), bwt->occSizeInWord, occValueFile);
1516
+ fwrite(bwt->occValueMajor, sizeof(unsigned int), bwt->occMajorSizeInWord, occValueFile);
1517
+ fclose(occValueFile);
1518
+ */
1519
+ }
1520
+
1521
+ void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
1522
+ {
1523
+ BWTInc *bwtInc;
1524
+ bwtInc = BWTIncConstructFromPacked(fn_pac, 2.5, 10000000, 10000000);
1525
+ printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
1526
+ BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
1527
+ BWTIncFree(bwtInc);
1528
+ }
1529
+
1530
/*
 * Command-line front end: argv[1] is the input .pac file, argv[2] the
 * output .bwt file.  Returns 0 after building the BWT, or 1 after printing
 * the usage line when fewer than two file arguments are given.
 */
int bwt_bwtgen_main(int argc, char *argv[])
{
	if (argc >= 3) {
		bwt_bwtgen(argv[1], argv[2]);
		return 0;
	}

	fprintf(stderr, "Usage: bwtgen <in.pac> <out.bwt>\n");
	return 1;
}
1539
+
1540
#ifdef MAIN_BWT_GEN

/* Stand-alone entry point, compiled only when MAIN_BWT_GEN is defined. */
int main(int argc, char *argv[])
{
	const int exitStatus = bwt_bwtgen_main(argc, argv);

	return exitStatus;
}

#endif