ferret 0.11.6 → 0.11.8.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (185) hide show
  1. data/README +10 -22
  2. data/RELEASE_CHANGES +137 -0
  3. data/RELEASE_NOTES +60 -0
  4. data/Rakefile +379 -274
  5. data/TODO +100 -8
  6. data/bin/ferret-browser +0 -0
  7. data/ext/BZLIB_blocksort.c +1094 -0
  8. data/ext/BZLIB_bzlib.c +1578 -0
  9. data/ext/BZLIB_compress.c +672 -0
  10. data/ext/BZLIB_crctable.c +104 -0
  11. data/ext/BZLIB_decompress.c +626 -0
  12. data/ext/BZLIB_huffman.c +205 -0
  13. data/ext/BZLIB_randtable.c +84 -0
  14. data/ext/{api.c → STEMMER_api.c} +7 -10
  15. data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
  16. data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
  17. data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
  18. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  19. data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
  20. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  21. data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
  22. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  25. data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
  26. data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
  27. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  29. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  30. data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
  31. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  32. data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
  33. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  34. data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
  35. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  36. data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
  37. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  38. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  39. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  40. data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
  41. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  42. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  43. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  44. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  45. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  46. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  47. data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
  48. data/ext/analysis.c +276 -121
  49. data/ext/analysis.h +190 -143
  50. data/ext/api.h +3 -4
  51. data/ext/array.c +5 -3
  52. data/ext/array.h +52 -43
  53. data/ext/bitvector.c +38 -482
  54. data/ext/bitvector.h +446 -124
  55. data/ext/bzlib.h +282 -0
  56. data/ext/bzlib_private.h +503 -0
  57. data/ext/compound_io.c +23 -22
  58. data/ext/config.h +21 -11
  59. data/ext/document.c +43 -40
  60. data/ext/document.h +31 -21
  61. data/ext/except.c +20 -38
  62. data/ext/except.h +89 -76
  63. data/ext/extconf.rb +3 -2
  64. data/ext/ferret.c +49 -35
  65. data/ext/ferret.h +14 -11
  66. data/ext/field_index.c +262 -0
  67. data/ext/field_index.h +52 -0
  68. data/ext/filter.c +11 -10
  69. data/ext/fs_store.c +65 -47
  70. data/ext/global.c +245 -165
  71. data/ext/global.h +252 -54
  72. data/ext/hash.c +200 -243
  73. data/ext/hash.h +205 -163
  74. data/ext/hashset.c +118 -96
  75. data/ext/hashset.h +110 -82
  76. data/ext/header.h +19 -19
  77. data/ext/helper.c +11 -10
  78. data/ext/helper.h +14 -6
  79. data/ext/index.c +745 -366
  80. data/ext/index.h +503 -529
  81. data/ext/internal.h +1020 -0
  82. data/ext/lang.c +10 -0
  83. data/ext/lang.h +35 -15
  84. data/ext/mempool.c +5 -4
  85. data/ext/mempool.h +30 -22
  86. data/ext/modules.h +35 -7
  87. data/ext/multimapper.c +43 -2
  88. data/ext/multimapper.h +32 -23
  89. data/ext/posh.c +0 -0
  90. data/ext/posh.h +4 -38
  91. data/ext/priorityqueue.c +10 -12
  92. data/ext/priorityqueue.h +33 -21
  93. data/ext/q_boolean.c +22 -9
  94. data/ext/q_const_score.c +3 -2
  95. data/ext/q_filtered_query.c +15 -12
  96. data/ext/q_fuzzy.c +147 -135
  97. data/ext/q_match_all.c +3 -2
  98. data/ext/q_multi_term.c +28 -32
  99. data/ext/q_parser.c +451 -173
  100. data/ext/q_phrase.c +158 -79
  101. data/ext/q_prefix.c +16 -18
  102. data/ext/q_range.c +363 -31
  103. data/ext/q_span.c +130 -141
  104. data/ext/q_term.c +21 -21
  105. data/ext/q_wildcard.c +19 -23
  106. data/ext/r_analysis.c +369 -242
  107. data/ext/r_index.c +421 -434
  108. data/ext/r_qparser.c +142 -92
  109. data/ext/r_search.c +790 -407
  110. data/ext/r_store.c +44 -44
  111. data/ext/r_utils.c +264 -96
  112. data/ext/ram_store.c +29 -23
  113. data/ext/scanner.c +895 -0
  114. data/ext/scanner.h +36 -0
  115. data/ext/scanner_mb.c +6701 -0
  116. data/ext/scanner_utf8.c +4415 -0
  117. data/ext/search.c +210 -87
  118. data/ext/search.h +556 -488
  119. data/ext/similarity.c +17 -16
  120. data/ext/similarity.h +51 -44
  121. data/ext/sort.c +157 -354
  122. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  123. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  124. data/ext/stem_UTF_8_hungarian.h +16 -0
  125. data/ext/stem_UTF_8_romanian.h +16 -0
  126. data/ext/stem_UTF_8_turkish.h +16 -0
  127. data/ext/stopwords.c +287 -278
  128. data/ext/store.c +57 -51
  129. data/ext/store.h +308 -286
  130. data/ext/symbol.c +10 -0
  131. data/ext/symbol.h +23 -0
  132. data/ext/term_vectors.c +14 -293
  133. data/ext/threading.h +22 -22
  134. data/ext/win32.h +12 -4
  135. data/lib/ferret.rb +2 -1
  136. data/lib/ferret/browser.rb +1 -1
  137. data/lib/ferret/field_symbol.rb +94 -0
  138. data/lib/ferret/index.rb +221 -34
  139. data/lib/ferret/number_tools.rb +6 -6
  140. data/lib/ferret/version.rb +3 -0
  141. data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
  142. data/test/test_helper.rb +7 -2
  143. data/test/test_installed.rb +1 -0
  144. data/test/threading/thread_safety_index_test.rb +10 -1
  145. data/test/threading/thread_safety_read_write_test.rb +4 -7
  146. data/test/threading/thread_safety_test.rb +0 -0
  147. data/test/unit/analysis/tc_analyzer.rb +29 -27
  148. data/test/unit/analysis/tc_token_stream.rb +23 -16
  149. data/test/unit/index/tc_index.rb +116 -11
  150. data/test/unit/index/tc_index_reader.rb +27 -27
  151. data/test/unit/index/tc_index_writer.rb +10 -0
  152. data/test/unit/index/th_doc.rb +38 -21
  153. data/test/unit/search/tc_filter.rb +31 -10
  154. data/test/unit/search/tc_index_searcher.rb +6 -0
  155. data/test/unit/search/tm_searcher.rb +53 -1
  156. data/test/unit/store/tc_fs_store.rb +40 -2
  157. data/test/unit/store/tc_ram_store.rb +0 -0
  158. data/test/unit/store/tm_store.rb +0 -0
  159. data/test/unit/store/tm_store_lock.rb +7 -6
  160. data/test/unit/tc_field_symbol.rb +26 -0
  161. data/test/unit/ts_analysis.rb +0 -0
  162. data/test/unit/ts_index.rb +0 -0
  163. data/test/unit/ts_store.rb +0 -0
  164. data/test/unit/ts_utils.rb +0 -0
  165. data/test/unit/utils/tc_number_tools.rb +0 -0
  166. data/test/utils/content_generator.rb +226 -0
  167. metadata +262 -221
  168. data/ext/inc/lang.h +0 -48
  169. data/ext/inc/threading.h +0 -31
  170. data/ext/stem_ISO_8859_1_english.c +0 -1156
  171. data/ext/stem_ISO_8859_1_french.c +0 -1276
  172. data/ext/stem_ISO_8859_1_italian.c +0 -1091
  173. data/ext/stem_ISO_8859_1_norwegian.c +0 -296
  174. data/ext/stem_ISO_8859_1_spanish.c +0 -1119
  175. data/ext/stem_ISO_8859_1_swedish.c +0 -307
  176. data/ext/stem_UTF_8_danish.c +0 -344
  177. data/ext/stem_UTF_8_english.c +0 -1176
  178. data/ext/stem_UTF_8_french.c +0 -1296
  179. data/ext/stem_UTF_8_italian.c +0 -1113
  180. data/ext/stem_UTF_8_norwegian.c +0 -302
  181. data/ext/stem_UTF_8_portuguese.c +0 -1055
  182. data/ext/stem_UTF_8_russian.c +0 -709
  183. data/ext/stem_UTF_8_spanish.c +0 -1137
  184. data/ext/stem_UTF_8_swedish.c +0 -313
  185. data/lib/ferret_version.rb +0 -3
@@ -0,0 +1,672 @@
1
+
2
+ /*-------------------------------------------------------------*/
3
+ /*--- Compression machinery (not incl block sorting) ---*/
4
+ /*--- compress.c ---*/
5
+ /*-------------------------------------------------------------*/
6
+
7
+ /* ------------------------------------------------------------------
8
+ This file is part of bzip2/libbzip2, a program and library for
9
+ lossless, block-sorting data compression.
10
+
11
+ bzip2/libbzip2 version 1.0.4 of 20 December 2006
12
+ Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
13
+
14
+ Please read the WARNING, DISCLAIMER and PATENTS sections in the
15
+ README file.
16
+
17
+ This program is released under the terms of the license contained
18
+ in the file LICENSE.
19
+ ------------------------------------------------------------------ */
20
+
21
+
22
+ /* CHANGES
23
+ 0.9.0 -- original version.
24
+ 0.9.0a/b -- no changes in this file.
25
+ 0.9.0c -- changed setting of nGroups in sendMTFValues()
26
+ so as to do a bit better on small files
27
+ */
28
+
29
+ #include "bzlib_private.h"
30
+
31
+
32
+ /*---------------------------------------------------*/
33
+ /*--- Bit stream I/O ---*/
34
+ /*---------------------------------------------------*/
35
+
36
+ /*---------------------------------------------------*/
37
+ void BZ2_bsInitWrite ( EState* s )
38
+ {
39
+ s->bsLive = 0;
40
+ s->bsBuff = 0;
41
+ }
42
+
43
+
44
+ /*---------------------------------------------------*/
45
+ static
46
+ void bsFinishWrite ( EState* s )
47
+ {
48
+ while (s->bsLive > 0) {
49
+ s->zbits[s->numZ] = (UChar)(s->bsBuff >> 24);
50
+ s->numZ++;
51
+ s->bsBuff <<= 8;
52
+ s->bsLive -= 8;
53
+ }
54
+ }
55
+
56
+
57
+ /*---------------------------------------------------*/
58
+ #define bsNEEDW(nz) \
59
+ { \
60
+ while (s->bsLive >= 8) { \
61
+ s->zbits[s->numZ] \
62
+ = (UChar)(s->bsBuff >> 24); \
63
+ s->numZ++; \
64
+ s->bsBuff <<= 8; \
65
+ s->bsLive -= 8; \
66
+ } \
67
+ }
68
+
69
+
70
+ /*---------------------------------------------------*/
71
+ static
72
+ __inline__
73
+ void bsW ( EState* s, Int32 n, UInt32 v )
74
+ {
75
+ bsNEEDW ( n );
76
+ s->bsBuff |= (v << (32 - s->bsLive - n));
77
+ s->bsLive += n;
78
+ }
79
+
80
+
81
+ /*---------------------------------------------------*/
82
+ static
83
+ void bsPutUInt32 ( EState* s, UInt32 u )
84
+ {
85
+ bsW ( s, 8, (u >> 24) & 0xffL );
86
+ bsW ( s, 8, (u >> 16) & 0xffL );
87
+ bsW ( s, 8, (u >> 8) & 0xffL );
88
+ bsW ( s, 8, u & 0xffL );
89
+ }
90
+
91
+
92
+ /*---------------------------------------------------*/
93
+ static
94
+ void bsPutUChar ( EState* s, UChar c )
95
+ {
96
+ bsW( s, 8, (UInt32)c );
97
+ }
98
+
99
+
100
+ /*---------------------------------------------------*/
101
+ /*--- The back end proper ---*/
102
+ /*---------------------------------------------------*/
103
+
104
+ /*---------------------------------------------------*/
105
+ static
106
+ void makeMaps_e ( EState* s )
107
+ {
108
+ Int32 i;
109
+ s->nInUse = 0;
110
+ for (i = 0; i < 256; i++)
111
+ if (s->inUse[i]) {
112
+ s->unseqToSeq[i] = s->nInUse;
113
+ s->nInUse++;
114
+ }
115
+ }
116
+
117
+
118
+ /*---------------------------------------------------*/
119
+ static
120
+ void generateMTFValues ( EState* s )
121
+ {
122
+ UChar yy[256];
123
+ Int32 i, j;
124
+ Int32 zPend;
125
+ Int32 wr;
126
+ Int32 EOB;
127
+
128
+ /*
129
+ After sorting (eg, here),
130
+ s->arr1 [ 0 .. s->nblock-1 ] holds sorted order,
131
+ and
132
+ ((UChar*)s->arr2) [ 0 .. s->nblock-1 ]
133
+ holds the original block data.
134
+
135
+ The first thing to do is generate the MTF values,
136
+ and put them in
137
+ ((UInt16*)s->arr1) [ 0 .. s->nblock-1 ].
138
+ Because there are strictly fewer or equal MTF values
139
+ than block values, ptr values in this area are overwritten
140
+ with MTF values only when they are no longer needed.
141
+
142
+ The final compressed bitstream is generated into the
143
+ area starting at
144
+ (UChar*) (&((UChar*)s->arr2)[s->nblock])
145
+
146
+ These storage aliases are set up in bzCompressInit(),
147
+ except for the last one, which is arranged in
148
+ compressBlock().
149
+ */
150
+ UInt32* ptr = s->ptr;
151
+ UChar* block = s->block;
152
+ UInt16* mtfv = s->mtfv;
153
+
154
+ makeMaps_e ( s );
155
+ EOB = s->nInUse+1;
156
+
157
+ for (i = 0; i <= EOB; i++) s->mtfFreq[i] = 0;
158
+
159
+ wr = 0;
160
+ zPend = 0;
161
+ for (i = 0; i < s->nInUse; i++) yy[i] = (UChar) i;
162
+
163
+ for (i = 0; i < s->nblock; i++) {
164
+ UChar ll_i;
165
+ AssertD ( wr <= i, "generateMTFValues(1)" );
166
+ j = ptr[i]-1; if (j < 0) j += s->nblock;
167
+ ll_i = s->unseqToSeq[block[j]];
168
+ AssertD ( ll_i < s->nInUse, "generateMTFValues(2a)" );
169
+
170
+ if (yy[0] == ll_i) {
171
+ zPend++;
172
+ } else {
173
+
174
+ if (zPend > 0) {
175
+ zPend--;
176
+ while (True) {
177
+ if (zPend & 1) {
178
+ mtfv[wr] = BZ_RUNB; wr++;
179
+ s->mtfFreq[BZ_RUNB]++;
180
+ } else {
181
+ mtfv[wr] = BZ_RUNA; wr++;
182
+ s->mtfFreq[BZ_RUNA]++;
183
+ }
184
+ if (zPend < 2) break;
185
+ zPend = (zPend - 2) / 2;
186
+ };
187
+ zPend = 0;
188
+ }
189
+ {
190
+ register UChar rtmp;
191
+ register UChar* ryy_j;
192
+ register UChar rll_i;
193
+ rtmp = yy[1];
194
+ yy[1] = yy[0];
195
+ ryy_j = &(yy[1]);
196
+ rll_i = ll_i;
197
+ while ( rll_i != rtmp ) {
198
+ register UChar rtmp2;
199
+ ryy_j++;
200
+ rtmp2 = rtmp;
201
+ rtmp = *ryy_j;
202
+ *ryy_j = rtmp2;
203
+ };
204
+ yy[0] = rtmp;
205
+ j = ryy_j - &(yy[0]);
206
+ mtfv[wr] = j+1; wr++; s->mtfFreq[j+1]++;
207
+ }
208
+
209
+ }
210
+ }
211
+
212
+ if (zPend > 0) {
213
+ zPend--;
214
+ while (True) {
215
+ if (zPend & 1) {
216
+ mtfv[wr] = BZ_RUNB; wr++;
217
+ s->mtfFreq[BZ_RUNB]++;
218
+ } else {
219
+ mtfv[wr] = BZ_RUNA; wr++;
220
+ s->mtfFreq[BZ_RUNA]++;
221
+ }
222
+ if (zPend < 2) break;
223
+ zPend = (zPend - 2) / 2;
224
+ };
225
+ zPend = 0;
226
+ }
227
+
228
+ mtfv[wr] = EOB; wr++; s->mtfFreq[EOB]++;
229
+
230
+ s->nMTF = wr;
231
+ }
232
+
233
+
234
+ /*---------------------------------------------------*/
235
+ #define BZ_LESSER_ICOST 0
236
+ #define BZ_GREATER_ICOST 15
237
+
238
+ static
239
+ void sendMTFValues ( EState* s )
240
+ {
241
+ Int32 v, t, i, j, gs, ge, totc, bt, bc, iter;
242
+ Int32 nSelectors, alphaSize, minLen, maxLen, selCtr;
243
+ Int32 nGroups, nBytes;
244
+
245
+ /*--
246
+ UChar len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
247
+ is a global since the decoder also needs it.
248
+
249
+ Int32 code[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
250
+ Int32 rfreq[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
251
+ are also globals only used in this proc.
252
+ Made global to keep stack frame size small.
253
+ --*/
254
+
255
+
256
+ UInt16 cost[BZ_N_GROUPS];
257
+ Int32 fave[BZ_N_GROUPS];
258
+
259
+ UInt16* mtfv = s->mtfv;
260
+
261
+ if (s->verbosity >= 3)
262
+ VPrintf3( " %d in block, %d after MTF & 1-2 coding, "
263
+ "%d+2 syms in use\n",
264
+ s->nblock, s->nMTF, s->nInUse );
265
+
266
+ alphaSize = s->nInUse+2;
267
+ for (t = 0; t < BZ_N_GROUPS; t++)
268
+ for (v = 0; v < alphaSize; v++)
269
+ s->len[t][v] = BZ_GREATER_ICOST;
270
+
271
+ /*--- Decide how many coding tables to use ---*/
272
+ AssertH ( s->nMTF > 0, 3001 );
273
+ if (s->nMTF < 200) nGroups = 2; else
274
+ if (s->nMTF < 600) nGroups = 3; else
275
+ if (s->nMTF < 1200) nGroups = 4; else
276
+ if (s->nMTF < 2400) nGroups = 5; else
277
+ nGroups = 6;
278
+
279
+ /*--- Generate an initial set of coding tables ---*/
280
+ {
281
+ Int32 nPart, remF, tFreq, aFreq;
282
+
283
+ nPart = nGroups;
284
+ remF = s->nMTF;
285
+ gs = 0;
286
+ while (nPart > 0) {
287
+ tFreq = remF / nPart;
288
+ ge = gs-1;
289
+ aFreq = 0;
290
+ while (aFreq < tFreq && ge < alphaSize-1) {
291
+ ge++;
292
+ aFreq += s->mtfFreq[ge];
293
+ }
294
+
295
+ if (ge > gs
296
+ && nPart != nGroups && nPart != 1
297
+ && ((nGroups-nPart) % 2 == 1)) {
298
+ aFreq -= s->mtfFreq[ge];
299
+ ge--;
300
+ }
301
+
302
+ if (s->verbosity >= 3)
303
+ VPrintf5( " initial group %d, [%d .. %d], "
304
+ "has %d syms (%4.1f%%)\n",
305
+ nPart, gs, ge, aFreq,
306
+ (100.0 * (float)aFreq) / (float)(s->nMTF) );
307
+
308
+ for (v = 0; v < alphaSize; v++)
309
+ if (v >= gs && v <= ge)
310
+ s->len[nPart-1][v] = BZ_LESSER_ICOST; else
311
+ s->len[nPart-1][v] = BZ_GREATER_ICOST;
312
+
313
+ nPart--;
314
+ gs = ge+1;
315
+ remF -= aFreq;
316
+ }
317
+ }
318
+
319
+ /*---
320
+ Iterate up to BZ_N_ITERS times to improve the tables.
321
+ ---*/
322
+ for (iter = 0; iter < BZ_N_ITERS; iter++) {
323
+
324
+ for (t = 0; t < nGroups; t++) fave[t] = 0;
325
+
326
+ for (t = 0; t < nGroups; t++)
327
+ for (v = 0; v < alphaSize; v++)
328
+ s->rfreq[t][v] = 0;
329
+
330
+ /*---
331
+ Set up an auxiliary length table which is used to fast-track
332
+ the common case (nGroups == 6).
333
+ ---*/
334
+ if (nGroups == 6) {
335
+ for (v = 0; v < alphaSize; v++) {
336
+ s->len_pack[v][0] = (s->len[1][v] << 16) | s->len[0][v];
337
+ s->len_pack[v][1] = (s->len[3][v] << 16) | s->len[2][v];
338
+ s->len_pack[v][2] = (s->len[5][v] << 16) | s->len[4][v];
339
+ }
340
+ }
341
+
342
+ nSelectors = 0;
343
+ totc = 0;
344
+ gs = 0;
345
+ while (True) {
346
+
347
+ /*--- Set group start & end marks. --*/
348
+ if (gs >= s->nMTF) break;
349
+ ge = gs + BZ_G_SIZE - 1;
350
+ if (ge >= s->nMTF) ge = s->nMTF-1;
351
+
352
+ /*--
353
+ Calculate the cost of this group as coded
354
+ by each of the coding tables.
355
+ --*/
356
+ for (t = 0; t < nGroups; t++) cost[t] = 0;
357
+
358
+ if (nGroups == 6 && 50 == ge-gs+1) {
359
+ /*--- fast track the common case ---*/
360
+ register UInt32 cost01, cost23, cost45;
361
+ register UInt16 icv;
362
+ cost01 = cost23 = cost45 = 0;
363
+
364
+ # define BZ_ITER(nn) \
365
+ icv = mtfv[gs+(nn)]; \
366
+ cost01 += s->len_pack[icv][0]; \
367
+ cost23 += s->len_pack[icv][1]; \
368
+ cost45 += s->len_pack[icv][2]; \
369
+
370
+ BZ_ITER(0); BZ_ITER(1); BZ_ITER(2); BZ_ITER(3); BZ_ITER(4);
371
+ BZ_ITER(5); BZ_ITER(6); BZ_ITER(7); BZ_ITER(8); BZ_ITER(9);
372
+ BZ_ITER(10); BZ_ITER(11); BZ_ITER(12); BZ_ITER(13); BZ_ITER(14);
373
+ BZ_ITER(15); BZ_ITER(16); BZ_ITER(17); BZ_ITER(18); BZ_ITER(19);
374
+ BZ_ITER(20); BZ_ITER(21); BZ_ITER(22); BZ_ITER(23); BZ_ITER(24);
375
+ BZ_ITER(25); BZ_ITER(26); BZ_ITER(27); BZ_ITER(28); BZ_ITER(29);
376
+ BZ_ITER(30); BZ_ITER(31); BZ_ITER(32); BZ_ITER(33); BZ_ITER(34);
377
+ BZ_ITER(35); BZ_ITER(36); BZ_ITER(37); BZ_ITER(38); BZ_ITER(39);
378
+ BZ_ITER(40); BZ_ITER(41); BZ_ITER(42); BZ_ITER(43); BZ_ITER(44);
379
+ BZ_ITER(45); BZ_ITER(46); BZ_ITER(47); BZ_ITER(48); BZ_ITER(49);
380
+
381
+ # undef BZ_ITER
382
+
383
+ cost[0] = cost01 & 0xffff; cost[1] = cost01 >> 16;
384
+ cost[2] = cost23 & 0xffff; cost[3] = cost23 >> 16;
385
+ cost[4] = cost45 & 0xffff; cost[5] = cost45 >> 16;
386
+
387
+ } else {
388
+ /*--- slow version which correctly handles all situations ---*/
389
+ for (i = gs; i <= ge; i++) {
390
+ UInt16 icv = mtfv[i];
391
+ for (t = 0; t < nGroups; t++) cost[t] += s->len[t][icv];
392
+ }
393
+ }
394
+
395
+ /*--
396
+ Find the coding table which is best for this group,
397
+ and record its identity in the selector table.
398
+ --*/
399
+ bc = 999999999; bt = -1;
400
+ for (t = 0; t < nGroups; t++)
401
+ if (cost[t] < bc) { bc = cost[t]; bt = t; };
402
+ totc += bc;
403
+ fave[bt]++;
404
+ s->selector[nSelectors] = bt;
405
+ nSelectors++;
406
+
407
+ /*--
408
+ Increment the symbol frequencies for the selected table.
409
+ --*/
410
+ if (nGroups == 6 && 50 == ge-gs+1) {
411
+ /*--- fast track the common case ---*/
412
+
413
+ # define BZ_ITUR(nn) s->rfreq[bt][ mtfv[gs+(nn)] ]++
414
+
415
+ BZ_ITUR(0); BZ_ITUR(1); BZ_ITUR(2); BZ_ITUR(3); BZ_ITUR(4);
416
+ BZ_ITUR(5); BZ_ITUR(6); BZ_ITUR(7); BZ_ITUR(8); BZ_ITUR(9);
417
+ BZ_ITUR(10); BZ_ITUR(11); BZ_ITUR(12); BZ_ITUR(13); BZ_ITUR(14);
418
+ BZ_ITUR(15); BZ_ITUR(16); BZ_ITUR(17); BZ_ITUR(18); BZ_ITUR(19);
419
+ BZ_ITUR(20); BZ_ITUR(21); BZ_ITUR(22); BZ_ITUR(23); BZ_ITUR(24);
420
+ BZ_ITUR(25); BZ_ITUR(26); BZ_ITUR(27); BZ_ITUR(28); BZ_ITUR(29);
421
+ BZ_ITUR(30); BZ_ITUR(31); BZ_ITUR(32); BZ_ITUR(33); BZ_ITUR(34);
422
+ BZ_ITUR(35); BZ_ITUR(36); BZ_ITUR(37); BZ_ITUR(38); BZ_ITUR(39);
423
+ BZ_ITUR(40); BZ_ITUR(41); BZ_ITUR(42); BZ_ITUR(43); BZ_ITUR(44);
424
+ BZ_ITUR(45); BZ_ITUR(46); BZ_ITUR(47); BZ_ITUR(48); BZ_ITUR(49);
425
+
426
+ # undef BZ_ITUR
427
+
428
+ } else {
429
+ /*--- slow version which correctly handles all situations ---*/
430
+ for (i = gs; i <= ge; i++)
431
+ s->rfreq[bt][ mtfv[i] ]++;
432
+ }
433
+
434
+ gs = ge+1;
435
+ }
436
+ if (s->verbosity >= 3) {
437
+ VPrintf2 ( " pass %d: size is %d, grp uses are ",
438
+ iter+1, totc/8 );
439
+ for (t = 0; t < nGroups; t++)
440
+ VPrintf1 ( "%d ", fave[t] );
441
+ VPrintf0 ( "\n" );
442
+ }
443
+
444
+ /*--
445
+ Recompute the tables based on the accumulated frequencies.
446
+ --*/
447
+ /* maxLen was changed from 20 to 17 in bzip2-1.0.3. See
448
+ comment in huffman.c for details. */
449
+ for (t = 0; t < nGroups; t++)
450
+ BZ2_hbMakeCodeLengths ( &(s->len[t][0]), &(s->rfreq[t][0]),
451
+ alphaSize, 17 /*20*/ );
452
+ }
453
+
454
+
455
+ AssertH( nGroups < 8, 3002 );
456
+ AssertH( nSelectors < 32768 &&
457
+ nSelectors <= (2 + (900000 / BZ_G_SIZE)),
458
+ 3003 );
459
+
460
+
461
+ /*--- Compute MTF values for the selectors. ---*/
462
+ {
463
+ UChar pos[BZ_N_GROUPS], ll_i, tmp2, tmp;
464
+ for (i = 0; i < nGroups; i++) pos[i] = i;
465
+ for (i = 0; i < nSelectors; i++) {
466
+ ll_i = s->selector[i];
467
+ j = 0;
468
+ tmp = pos[j];
469
+ while ( ll_i != tmp ) {
470
+ j++;
471
+ tmp2 = tmp;
472
+ tmp = pos[j];
473
+ pos[j] = tmp2;
474
+ };
475
+ pos[0] = tmp;
476
+ s->selectorMtf[i] = j;
477
+ }
478
+ };
479
+
480
+ /*--- Assign actual codes for the tables. --*/
481
+ for (t = 0; t < nGroups; t++) {
482
+ minLen = 32;
483
+ maxLen = 0;
484
+ for (i = 0; i < alphaSize; i++) {
485
+ if (s->len[t][i] > maxLen) maxLen = s->len[t][i];
486
+ if (s->len[t][i] < minLen) minLen = s->len[t][i];
487
+ }
488
+ AssertH ( !(maxLen > 17 /*20*/ ), 3004 );
489
+ AssertH ( !(minLen < 1), 3005 );
490
+ BZ2_hbAssignCodes ( &(s->code[t][0]), &(s->len[t][0]),
491
+ minLen, maxLen, alphaSize );
492
+ }
493
+
494
+ /*--- Transmit the mapping table. ---*/
495
+ {
496
+ Bool inUse16[16];
497
+ for (i = 0; i < 16; i++) {
498
+ inUse16[i] = False;
499
+ for (j = 0; j < 16; j++)
500
+ if (s->inUse[i * 16 + j]) inUse16[i] = True;
501
+ }
502
+
503
+ nBytes = s->numZ;
504
+ for (i = 0; i < 16; i++)
505
+ if (inUse16[i]) bsW(s,1,1); else bsW(s,1,0);
506
+
507
+ for (i = 0; i < 16; i++)
508
+ if (inUse16[i])
509
+ for (j = 0; j < 16; j++) {
510
+ if (s->inUse[i * 16 + j]) bsW(s,1,1); else bsW(s,1,0);
511
+ }
512
+
513
+ if (s->verbosity >= 3)
514
+ VPrintf1( " bytes: mapping %d, ", s->numZ-nBytes );
515
+ }
516
+
517
+ /*--- Now the selectors. ---*/
518
+ nBytes = s->numZ;
519
+ bsW ( s, 3, nGroups );
520
+ bsW ( s, 15, nSelectors );
521
+ for (i = 0; i < nSelectors; i++) {
522
+ for (j = 0; j < s->selectorMtf[i]; j++) bsW(s,1,1);
523
+ bsW(s,1,0);
524
+ }
525
+ if (s->verbosity >= 3)
526
+ VPrintf1( "selectors %d, ", s->numZ-nBytes );
527
+
528
+ /*--- Now the coding tables. ---*/
529
+ nBytes = s->numZ;
530
+
531
+ for (t = 0; t < nGroups; t++) {
532
+ Int32 curr = s->len[t][0];
533
+ bsW ( s, 5, curr );
534
+ for (i = 0; i < alphaSize; i++) {
535
+ while (curr < s->len[t][i]) { bsW(s,2,2); curr++; /* 10 */ };
536
+ while (curr > s->len[t][i]) { bsW(s,2,3); curr--; /* 11 */ };
537
+ bsW ( s, 1, 0 );
538
+ }
539
+ }
540
+
541
+ if (s->verbosity >= 3)
542
+ VPrintf1 ( "code lengths %d, ", s->numZ-nBytes );
543
+
544
+ /*--- And finally, the block data proper ---*/
545
+ nBytes = s->numZ;
546
+ selCtr = 0;
547
+ gs = 0;
548
+ while (True) {
549
+ if (gs >= s->nMTF) break;
550
+ ge = gs + BZ_G_SIZE - 1;
551
+ if (ge >= s->nMTF) ge = s->nMTF-1;
552
+ AssertH ( s->selector[selCtr] < nGroups, 3006 );
553
+
554
+ if (nGroups == 6 && 50 == ge-gs+1) {
555
+ /*--- fast track the common case ---*/
556
+ UInt16 mtfv_i;
557
+ UChar* s_len_sel_selCtr
558
+ = &(s->len[s->selector[selCtr]][0]);
559
+ Int32* s_code_sel_selCtr
560
+ = &(s->code[s->selector[selCtr]][0]);
561
+
562
+ # define BZ_ITAH(nn) \
563
+ mtfv_i = mtfv[gs+(nn)]; \
564
+ bsW ( s, \
565
+ s_len_sel_selCtr[mtfv_i], \
566
+ s_code_sel_selCtr[mtfv_i] )
567
+
568
+ BZ_ITAH(0); BZ_ITAH(1); BZ_ITAH(2); BZ_ITAH(3); BZ_ITAH(4);
569
+ BZ_ITAH(5); BZ_ITAH(6); BZ_ITAH(7); BZ_ITAH(8); BZ_ITAH(9);
570
+ BZ_ITAH(10); BZ_ITAH(11); BZ_ITAH(12); BZ_ITAH(13); BZ_ITAH(14);
571
+ BZ_ITAH(15); BZ_ITAH(16); BZ_ITAH(17); BZ_ITAH(18); BZ_ITAH(19);
572
+ BZ_ITAH(20); BZ_ITAH(21); BZ_ITAH(22); BZ_ITAH(23); BZ_ITAH(24);
573
+ BZ_ITAH(25); BZ_ITAH(26); BZ_ITAH(27); BZ_ITAH(28); BZ_ITAH(29);
574
+ BZ_ITAH(30); BZ_ITAH(31); BZ_ITAH(32); BZ_ITAH(33); BZ_ITAH(34);
575
+ BZ_ITAH(35); BZ_ITAH(36); BZ_ITAH(37); BZ_ITAH(38); BZ_ITAH(39);
576
+ BZ_ITAH(40); BZ_ITAH(41); BZ_ITAH(42); BZ_ITAH(43); BZ_ITAH(44);
577
+ BZ_ITAH(45); BZ_ITAH(46); BZ_ITAH(47); BZ_ITAH(48); BZ_ITAH(49);
578
+
579
+ # undef BZ_ITAH
580
+
581
+ } else {
582
+ /*--- slow version which correctly handles all situations ---*/
583
+ for (i = gs; i <= ge; i++) {
584
+ bsW ( s,
585
+ s->len [s->selector[selCtr]] [mtfv[i]],
586
+ s->code [s->selector[selCtr]] [mtfv[i]] );
587
+ }
588
+ }
589
+
590
+
591
+ gs = ge+1;
592
+ selCtr++;
593
+ }
594
+ AssertH( selCtr == nSelectors, 3007 );
595
+
596
+ if (s->verbosity >= 3)
597
+ VPrintf1( "codes %d\n", s->numZ-nBytes );
598
+ }
599
+
600
+
601
+ /*---------------------------------------------------*/
602
+ void BZ2_compressBlock ( EState* s, Bool is_last_block )
603
+ {
604
+ if (s->nblock > 0) {
605
+
606
+ BZ_FINALISE_CRC ( s->blockCRC );
607
+ s->combinedCRC = (s->combinedCRC << 1) | (s->combinedCRC >> 31);
608
+ s->combinedCRC ^= s->blockCRC;
609
+ if (s->blockNo > 1) s->numZ = 0;
610
+
611
+ if (s->verbosity >= 2)
612
+ VPrintf4( " block %d: crc = 0x%08x, "
613
+ "combined CRC = 0x%08x, size = %d\n",
614
+ s->blockNo, s->blockCRC, s->combinedCRC, s->nblock );
615
+
616
+ BZ2_blockSort ( s );
617
+ }
618
+
619
+ s->zbits = (UChar*) (&((UChar*)s->arr2)[s->nblock]);
620
+
621
+ /*-- If this is the first block, create the stream header. --*/
622
+ if (s->blockNo == 1) {
623
+ BZ2_bsInitWrite ( s );
624
+ bsPutUChar ( s, BZ_HDR_B );
625
+ bsPutUChar ( s, BZ_HDR_Z );
626
+ bsPutUChar ( s, BZ_HDR_h );
627
+ bsPutUChar ( s, (UChar)(BZ_HDR_0 + s->blockSize100k) );
628
+ }
629
+
630
+ if (s->nblock > 0) {
631
+
632
+ bsPutUChar ( s, 0x31 ); bsPutUChar ( s, 0x41 );
633
+ bsPutUChar ( s, 0x59 ); bsPutUChar ( s, 0x26 );
634
+ bsPutUChar ( s, 0x53 ); bsPutUChar ( s, 0x59 );
635
+
636
+ /*-- Now the block's CRC, so it is in a known place. --*/
637
+ bsPutUInt32 ( s, s->blockCRC );
638
+
639
+ /*--
640
+ Now a single bit indicating (non-)randomisation.
641
+ As of version 0.9.5, we use a better sorting algorithm
642
+ which makes randomisation unnecessary. So always set
643
+ the randomised bit to 'no'. Of course, the decoder
644
+ still needs to be able to handle randomised blocks
645
+ so as to maintain backwards compatibility with
646
+ older versions of bzip2.
647
+ --*/
648
+ bsW(s,1,0);
649
+
650
+ bsW ( s, 24, s->origPtr );
651
+ generateMTFValues ( s );
652
+ sendMTFValues ( s );
653
+ }
654
+
655
+
656
+ /*-- If this is the last block, add the stream trailer. --*/
657
+ if (is_last_block) {
658
+
659
+ bsPutUChar ( s, 0x17 ); bsPutUChar ( s, 0x72 );
660
+ bsPutUChar ( s, 0x45 ); bsPutUChar ( s, 0x38 );
661
+ bsPutUChar ( s, 0x50 ); bsPutUChar ( s, 0x90 );
662
+ bsPutUInt32 ( s, s->combinedCRC );
663
+ if (s->verbosity >= 2)
664
+ VPrintf1( " final combined CRC = 0x%08x\n ", s->combinedCRC );
665
+ bsFinishWrite ( s );
666
+ }
667
+ }
668
+
669
+
670
+ /*-------------------------------------------------------------*/
671
+ /*--- end compress.c ---*/
672
+ /*-------------------------------------------------------------*/