isomorfeus-ferret 0.12.7 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (164) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +101 -19
  3. data/README.md +54 -1
  4. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
  5. data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
  6. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
  7. data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
  8. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +1 -1
  9. data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +1 -1
  10. data/ext/isomorfeus_ferret_ext/bzip_blocksort.c +1094 -0
  11. data/ext/isomorfeus_ferret_ext/bzip_huffman.c +205 -0
  12. data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
  13. data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
  14. data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
  15. data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
  16. data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
  17. data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
  18. data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
  19. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
  20. data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
  21. data/ext/isomorfeus_ferret_ext/frb_index.c +492 -474
  22. data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
  23. data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
  24. data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
  25. data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
  26. data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
  27. data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
  28. data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
  29. data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
  30. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
  31. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
  32. data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
  33. data/ext/isomorfeus_ferret_ext/frt_document.h +10 -10
  34. data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +105 -63
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +7 -3
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
  42. data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
  43. data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
  44. data/ext/isomorfeus_ferret_ext/frt_index.c +580 -399
  45. data/ext/isomorfeus_ferret_ext/frt_index.h +272 -291
  46. data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
  47. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
  48. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +67 -91
  49. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
  50. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
  51. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
  52. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
  53. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
  54. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
  55. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
  56. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
  57. data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
  58. data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
  59. data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
  60. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
  61. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +45 -84
  62. data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
  63. data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
  64. data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
  65. data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
  66. data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
  67. data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
  68. data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
  69. data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
  70. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +21 -109
  71. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
  72. data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
  73. data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
  74. data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
  75. data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
  76. data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
  77. data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
  78. data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
  79. data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
  80. data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
  81. data/ext/isomorfeus_ferret_ext/test.c +1 -2
  82. data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
  83. data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
  84. data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
  85. data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
  86. data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
  87. data/ext/isomorfeus_ferret_ext/test_fields.c +59 -60
  88. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
  89. data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
  90. data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
  91. data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
  92. data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
  93. data/ext/isomorfeus_ferret_ext/test_index.c +372 -365
  94. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
  95. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
  96. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
  97. data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
  98. data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
  99. data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
  100. data/ext/isomorfeus_ferret_ext/test_search.c +60 -62
  101. data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
  102. data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
  103. data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
  104. data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
  105. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
  106. data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
  107. data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
  108. data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
  109. data/lib/isomorfeus/ferret/version.rb +1 -1
  110. metadata +27 -57
  111. data/ext/isomorfeus_ferret_ext/email.rl +0 -21
  112. data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
  113. data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
  114. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
  115. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
  116. data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
  117. data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
  118. data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
  119. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
  120. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
  128. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
  129. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
  130. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
  131. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
  132. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
  133. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
  134. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
  135. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
  136. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
  137. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
  138. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
  139. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
  140. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
  141. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
  142. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
  143. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
  144. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
  145. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
  146. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
  147. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
  148. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
  149. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
  150. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
  151. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
  152. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
  153. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
  154. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
  155. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
  156. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
  157. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
  158. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
  159. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
  160. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
  161. data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
  162. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
  163. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
  164. data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -0,0 +1,672 @@
1
+
2
+ /*-------------------------------------------------------------*/
3
+ /*--- Compression machinery (not incl block sorting) ---*/
4
+ /*--- compress.c ---*/
5
+ /*-------------------------------------------------------------*/
6
+
7
+ /* ------------------------------------------------------------------
8
+ This file is part of bzip2/libbzip2, a program and library for
9
+ lossless, block-sorting data compression.
10
+
11
+ bzip2/libbzip2 version 1.0.8 of 13 July 2019
12
+ Copyright (C) 1996-2019 Julian Seward <jseward@acm.org>
13
+
14
+ Please read the WARNING, DISCLAIMER and PATENTS sections in the
15
+ README file.
16
+
17
+ This program is released under the terms of the license contained
18
+ in the file LICENSE.
19
+ ------------------------------------------------------------------ */
20
+
21
+
22
+ /* CHANGES
23
+ 0.9.0 -- original version.
24
+ 0.9.0a/b -- no changes in this file.
25
+ 0.9.0c -- changed setting of nGroups in sendMTFValues()
26
+ so as to do a bit better on small files
27
+ */
28
+
29
+ #include "bzlib_private.h"
30
+
31
+
32
+ /*---------------------------------------------------*/
33
+ /*--- Bit stream I/O ---*/
34
+ /*---------------------------------------------------*/
35
+
36
+ /*---------------------------------------------------*/
37
+ void BZ2_bsInitWrite ( EState* s )
38
+ {
39
+ s->bsLive = 0;
40
+ s->bsBuff = 0;
41
+ }
42
+
43
+
44
+ /*---------------------------------------------------*/
45
+ static
46
+ void bsFinishWrite ( EState* s )
47
+ {
48
+ while (s->bsLive > 0) {
49
+ s->zbits[s->numZ] = (UChar)(s->bsBuff >> 24);
50
+ s->numZ++;
51
+ s->bsBuff <<= 8;
52
+ s->bsLive -= 8;
53
+ }
54
+ }
55
+
56
+
57
+ /*---------------------------------------------------*/
58
+ #define bsNEEDW(nz) \
59
+ { \
60
+ while (s->bsLive >= 8) { \
61
+ s->zbits[s->numZ] \
62
+ = (UChar)(s->bsBuff >> 24); \
63
+ s->numZ++; \
64
+ s->bsBuff <<= 8; \
65
+ s->bsLive -= 8; \
66
+ } \
67
+ }
68
+
69
+
70
+ /*---------------------------------------------------*/
71
+ static
72
+ __inline__
73
+ void bsW ( EState* s, Int32 n, UInt32 v )
74
+ {
75
+ bsNEEDW ( n );
76
+ s->bsBuff |= (v << (32 - s->bsLive - n));
77
+ s->bsLive += n;
78
+ }
79
+
80
+
81
+ /*---------------------------------------------------*/
82
+ static
83
+ void bsPutUInt32 ( EState* s, UInt32 u )
84
+ {
85
+ bsW ( s, 8, (u >> 24) & 0xffL );
86
+ bsW ( s, 8, (u >> 16) & 0xffL );
87
+ bsW ( s, 8, (u >> 8) & 0xffL );
88
+ bsW ( s, 8, u & 0xffL );
89
+ }
90
+
91
+
92
+ /*---------------------------------------------------*/
93
+ static
94
+ void bsPutUChar ( EState* s, UChar c )
95
+ {
96
+ bsW( s, 8, (UInt32)c );
97
+ }
98
+
99
+
100
+ /*---------------------------------------------------*/
101
+ /*--- The back end proper ---*/
102
+ /*---------------------------------------------------*/
103
+
104
+ /*---------------------------------------------------*/
105
+ static
106
+ void makeMaps_e ( EState* s )
107
+ {
108
+ Int32 i;
109
+ s->nInUse = 0;
110
+ for (i = 0; i < 256; i++)
111
+ if (s->inUse[i]) {
112
+ s->unseqToSeq[i] = s->nInUse;
113
+ s->nInUse++;
114
+ }
115
+ }
116
+
117
+
118
+ /*---------------------------------------------------*/
119
+ static
120
+ void generateMTFValues ( EState* s )
121
+ {
122
+ UChar yy[256];
123
+ Int32 i, j;
124
+ Int32 zPend;
125
+ Int32 wr;
126
+ Int32 EOB;
127
+
128
+ /*
129
+ After sorting (eg, here),
130
+ s->arr1 [ 0 .. s->nblock-1 ] holds sorted order,
131
+ and
132
+ ((UChar*)s->arr2) [ 0 .. s->nblock-1 ]
133
+ holds the original block data.
134
+
135
+ The first thing to do is generate the MTF values,
136
+ and put them in
137
+ ((UInt16*)s->arr1) [ 0 .. s->nblock-1 ].
138
+ Because there are strictly fewer or equal MTF values
139
+ than block values, ptr values in this area are overwritten
140
+ with MTF values only when they are no longer needed.
141
+
142
+ The final compressed bitstream is generated into the
143
+ area starting at
144
+ (UChar*) (&((UChar*)s->arr2)[s->nblock])
145
+
146
+ These storage aliases are set up in bzCompressInit(),
147
+ except for the last one, which is arranged in
148
+ compressBlock().
149
+ */
150
+ UInt32* ptr = s->ptr;
151
+ UChar* block = s->block;
152
+ UInt16* mtfv = s->mtfv;
153
+
154
+ makeMaps_e ( s );
155
+ EOB = s->nInUse+1;
156
+
157
+ for (i = 0; i <= EOB; i++) s->mtfFreq[i] = 0;
158
+
159
+ wr = 0;
160
+ zPend = 0;
161
+ for (i = 0; i < s->nInUse; i++) yy[i] = (UChar) i;
162
+
163
+ for (i = 0; i < s->nblock; i++) {
164
+ UChar ll_i;
165
+ AssertD ( wr <= i, "generateMTFValues(1)" );
166
+ j = ptr[i]-1; if (j < 0) j += s->nblock;
167
+ ll_i = s->unseqToSeq[block[j]];
168
+ AssertD ( ll_i < s->nInUse, "generateMTFValues(2a)" );
169
+
170
+ if (yy[0] == ll_i) {
171
+ zPend++;
172
+ } else {
173
+
174
+ if (zPend > 0) {
175
+ zPend--;
176
+ while (True) {
177
+ if (zPend & 1) {
178
+ mtfv[wr] = BZ_RUNB; wr++;
179
+ s->mtfFreq[BZ_RUNB]++;
180
+ } else {
181
+ mtfv[wr] = BZ_RUNA; wr++;
182
+ s->mtfFreq[BZ_RUNA]++;
183
+ }
184
+ if (zPend < 2) break;
185
+ zPend = (zPend - 2) / 2;
186
+ };
187
+ zPend = 0;
188
+ }
189
+ {
190
+ register UChar rtmp;
191
+ register UChar* ryy_j;
192
+ register UChar rll_i;
193
+ rtmp = yy[1];
194
+ yy[1] = yy[0];
195
+ ryy_j = &(yy[1]);
196
+ rll_i = ll_i;
197
+ while ( rll_i != rtmp ) {
198
+ register UChar rtmp2;
199
+ ryy_j++;
200
+ rtmp2 = rtmp;
201
+ rtmp = *ryy_j;
202
+ *ryy_j = rtmp2;
203
+ };
204
+ yy[0] = rtmp;
205
+ j = ryy_j - &(yy[0]);
206
+ mtfv[wr] = j+1; wr++; s->mtfFreq[j+1]++;
207
+ }
208
+
209
+ }
210
+ }
211
+
212
+ if (zPend > 0) {
213
+ zPend--;
214
+ while (True) {
215
+ if (zPend & 1) {
216
+ mtfv[wr] = BZ_RUNB; wr++;
217
+ s->mtfFreq[BZ_RUNB]++;
218
+ } else {
219
+ mtfv[wr] = BZ_RUNA; wr++;
220
+ s->mtfFreq[BZ_RUNA]++;
221
+ }
222
+ if (zPend < 2) break;
223
+ zPend = (zPend - 2) / 2;
224
+ };
225
+ zPend = 0;
226
+ }
227
+
228
+ mtfv[wr] = EOB; wr++; s->mtfFreq[EOB]++;
229
+
230
+ s->nMTF = wr;
231
+ }
232
+
233
+
234
+ /*---------------------------------------------------*/
235
+ #define BZ_LESSER_ICOST 0
236
+ #define BZ_GREATER_ICOST 15
237
+
238
+ static
239
+ void sendMTFValues ( EState* s )
240
+ {
241
+ Int32 v, t, i, j, gs, ge, totc, bt, bc, iter;
242
+ Int32 nSelectors, alphaSize, minLen, maxLen, selCtr;
243
+ Int32 nGroups, nBytes;
244
+
245
+ /*--
246
+ UChar len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
247
+ is a global since the decoder also needs it.
248
+
249
+ Int32 code[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
250
+ Int32 rfreq[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
251
+ are also globals only used in this proc.
252
+ Made global to keep stack frame size small.
253
+ --*/
254
+
255
+
256
+ UInt16 cost[BZ_N_GROUPS];
257
+ Int32 fave[BZ_N_GROUPS];
258
+
259
+ UInt16* mtfv = s->mtfv;
260
+
261
+ if (s->verbosity >= 3)
262
+ VPrintf3( " %d in block, %d after MTF & 1-2 coding, "
263
+ "%d+2 syms in use\n",
264
+ s->nblock, s->nMTF, s->nInUse );
265
+
266
+ alphaSize = s->nInUse+2;
267
+ for (t = 0; t < BZ_N_GROUPS; t++)
268
+ for (v = 0; v < alphaSize; v++)
269
+ s->len[t][v] = BZ_GREATER_ICOST;
270
+
271
+ /*--- Decide how many coding tables to use ---*/
272
+ AssertH ( s->nMTF > 0, 3001 );
273
+ if (s->nMTF < 200) nGroups = 2; else
274
+ if (s->nMTF < 600) nGroups = 3; else
275
+ if (s->nMTF < 1200) nGroups = 4; else
276
+ if (s->nMTF < 2400) nGroups = 5; else
277
+ nGroups = 6;
278
+
279
+ /*--- Generate an initial set of coding tables ---*/
280
+ {
281
+ Int32 nPart, remF, tFreq, aFreq;
282
+
283
+ nPart = nGroups;
284
+ remF = s->nMTF;
285
+ gs = 0;
286
+ while (nPart > 0) {
287
+ tFreq = remF / nPart;
288
+ ge = gs-1;
289
+ aFreq = 0;
290
+ while (aFreq < tFreq && ge < alphaSize-1) {
291
+ ge++;
292
+ aFreq += s->mtfFreq[ge];
293
+ }
294
+
295
+ if (ge > gs
296
+ && nPart != nGroups && nPart != 1
297
+ && ((nGroups-nPart) % 2 == 1)) {
298
+ aFreq -= s->mtfFreq[ge];
299
+ ge--;
300
+ }
301
+
302
+ if (s->verbosity >= 3)
303
+ VPrintf5( " initial group %d, [%d .. %d], "
304
+ "has %d syms (%4.1f%%)\n",
305
+ nPart, gs, ge, aFreq,
306
+ (100.0 * (float)aFreq) / (float)(s->nMTF) );
307
+
308
+ for (v = 0; v < alphaSize; v++)
309
+ if (v >= gs && v <= ge)
310
+ s->len[nPart-1][v] = BZ_LESSER_ICOST; else
311
+ s->len[nPart-1][v] = BZ_GREATER_ICOST;
312
+
313
+ nPart--;
314
+ gs = ge+1;
315
+ remF -= aFreq;
316
+ }
317
+ }
318
+
319
+ /*---
320
+ Iterate up to BZ_N_ITERS times to improve the tables.
321
+ ---*/
322
+ for (iter = 0; iter < BZ_N_ITERS; iter++) {
323
+
324
+ for (t = 0; t < nGroups; t++) fave[t] = 0;
325
+
326
+ for (t = 0; t < nGroups; t++)
327
+ for (v = 0; v < alphaSize; v++)
328
+ s->rfreq[t][v] = 0;
329
+
330
+ /*---
331
+ Set up an auxiliary length table which is used to fast-track
332
+ the common case (nGroups == 6).
333
+ ---*/
334
+ if (nGroups == 6) {
335
+ for (v = 0; v < alphaSize; v++) {
336
+ s->len_pack[v][0] = (s->len[1][v] << 16) | s->len[0][v];
337
+ s->len_pack[v][1] = (s->len[3][v] << 16) | s->len[2][v];
338
+ s->len_pack[v][2] = (s->len[5][v] << 16) | s->len[4][v];
339
+ }
340
+ }
341
+
342
+ nSelectors = 0;
343
+ totc = 0;
344
+ gs = 0;
345
+ while (True) {
346
+
347
+ /*--- Set group start & end marks. --*/
348
+ if (gs >= s->nMTF) break;
349
+ ge = gs + BZ_G_SIZE - 1;
350
+ if (ge >= s->nMTF) ge = s->nMTF-1;
351
+
352
+ /*--
353
+ Calculate the cost of this group as coded
354
+ by each of the coding tables.
355
+ --*/
356
+ for (t = 0; t < nGroups; t++) cost[t] = 0;
357
+
358
+ if (nGroups == 6 && 50 == ge-gs+1) {
359
+ /*--- fast track the common case ---*/
360
+ register UInt32 cost01, cost23, cost45;
361
+ register UInt16 icv;
362
+ cost01 = cost23 = cost45 = 0;
363
+
364
+ # define BZ_ITER(nn) \
365
+ icv = mtfv[gs+(nn)]; \
366
+ cost01 += s->len_pack[icv][0]; \
367
+ cost23 += s->len_pack[icv][1]; \
368
+ cost45 += s->len_pack[icv][2]; \
369
+
370
+ BZ_ITER(0); BZ_ITER(1); BZ_ITER(2); BZ_ITER(3); BZ_ITER(4);
371
+ BZ_ITER(5); BZ_ITER(6); BZ_ITER(7); BZ_ITER(8); BZ_ITER(9);
372
+ BZ_ITER(10); BZ_ITER(11); BZ_ITER(12); BZ_ITER(13); BZ_ITER(14);
373
+ BZ_ITER(15); BZ_ITER(16); BZ_ITER(17); BZ_ITER(18); BZ_ITER(19);
374
+ BZ_ITER(20); BZ_ITER(21); BZ_ITER(22); BZ_ITER(23); BZ_ITER(24);
375
+ BZ_ITER(25); BZ_ITER(26); BZ_ITER(27); BZ_ITER(28); BZ_ITER(29);
376
+ BZ_ITER(30); BZ_ITER(31); BZ_ITER(32); BZ_ITER(33); BZ_ITER(34);
377
+ BZ_ITER(35); BZ_ITER(36); BZ_ITER(37); BZ_ITER(38); BZ_ITER(39);
378
+ BZ_ITER(40); BZ_ITER(41); BZ_ITER(42); BZ_ITER(43); BZ_ITER(44);
379
+ BZ_ITER(45); BZ_ITER(46); BZ_ITER(47); BZ_ITER(48); BZ_ITER(49);
380
+
381
+ # undef BZ_ITER
382
+
383
+ cost[0] = cost01 & 0xffff; cost[1] = cost01 >> 16;
384
+ cost[2] = cost23 & 0xffff; cost[3] = cost23 >> 16;
385
+ cost[4] = cost45 & 0xffff; cost[5] = cost45 >> 16;
386
+
387
+ } else {
388
+ /*--- slow version which correctly handles all situations ---*/
389
+ for (i = gs; i <= ge; i++) {
390
+ UInt16 icv = mtfv[i];
391
+ for (t = 0; t < nGroups; t++) cost[t] += s->len[t][icv];
392
+ }
393
+ }
394
+
395
+ /*--
396
+ Find the coding table which is best for this group,
397
+ and record its identity in the selector table.
398
+ --*/
399
+ bc = 999999999; bt = -1;
400
+ for (t = 0; t < nGroups; t++)
401
+ if (cost[t] < bc) { bc = cost[t]; bt = t; };
402
+ totc += bc;
403
+ fave[bt]++;
404
+ s->selector[nSelectors] = bt;
405
+ nSelectors++;
406
+
407
+ /*--
408
+ Increment the symbol frequencies for the selected table.
409
+ --*/
410
+ if (nGroups == 6 && 50 == ge-gs+1) {
411
+ /*--- fast track the common case ---*/
412
+
413
+ # define BZ_ITUR(nn) s->rfreq[bt][ mtfv[gs+(nn)] ]++
414
+
415
+ BZ_ITUR(0); BZ_ITUR(1); BZ_ITUR(2); BZ_ITUR(3); BZ_ITUR(4);
416
+ BZ_ITUR(5); BZ_ITUR(6); BZ_ITUR(7); BZ_ITUR(8); BZ_ITUR(9);
417
+ BZ_ITUR(10); BZ_ITUR(11); BZ_ITUR(12); BZ_ITUR(13); BZ_ITUR(14);
418
+ BZ_ITUR(15); BZ_ITUR(16); BZ_ITUR(17); BZ_ITUR(18); BZ_ITUR(19);
419
+ BZ_ITUR(20); BZ_ITUR(21); BZ_ITUR(22); BZ_ITUR(23); BZ_ITUR(24);
420
+ BZ_ITUR(25); BZ_ITUR(26); BZ_ITUR(27); BZ_ITUR(28); BZ_ITUR(29);
421
+ BZ_ITUR(30); BZ_ITUR(31); BZ_ITUR(32); BZ_ITUR(33); BZ_ITUR(34);
422
+ BZ_ITUR(35); BZ_ITUR(36); BZ_ITUR(37); BZ_ITUR(38); BZ_ITUR(39);
423
+ BZ_ITUR(40); BZ_ITUR(41); BZ_ITUR(42); BZ_ITUR(43); BZ_ITUR(44);
424
+ BZ_ITUR(45); BZ_ITUR(46); BZ_ITUR(47); BZ_ITUR(48); BZ_ITUR(49);
425
+
426
+ # undef BZ_ITUR
427
+
428
+ } else {
429
+ /*--- slow version which correctly handles all situations ---*/
430
+ for (i = gs; i <= ge; i++)
431
+ s->rfreq[bt][ mtfv[i] ]++;
432
+ }
433
+
434
+ gs = ge+1;
435
+ }
436
+ if (s->verbosity >= 3) {
437
+ VPrintf2 ( " pass %d: size is %d, grp uses are ",
438
+ iter+1, totc/8 );
439
+ for (t = 0; t < nGroups; t++)
440
+ VPrintf1 ( "%d ", fave[t] );
441
+ VPrintf0 ( "\n" );
442
+ }
443
+
444
+ /*--
445
+ Recompute the tables based on the accumulated frequencies.
446
+ --*/
447
+ /* maxLen was changed from 20 to 17 in bzip2-1.0.3. See
448
+ comment in huffman.c for details. */
449
+ for (t = 0; t < nGroups; t++)
450
+ BZ2_hbMakeCodeLengths ( &(s->len[t][0]), &(s->rfreq[t][0]),
451
+ alphaSize, 17 /*20*/ );
452
+ }
453
+
454
+
455
+ AssertH( nGroups < 8, 3002 );
456
+ AssertH( nSelectors < 32768 &&
457
+ nSelectors <= BZ_MAX_SELECTORS,
458
+ 3003 );
459
+
460
+
461
+ /*--- Compute MTF values for the selectors. ---*/
462
+ {
463
+ UChar pos[BZ_N_GROUPS], ll_i, tmp2, tmp;
464
+ for (i = 0; i < nGroups; i++) pos[i] = i;
465
+ for (i = 0; i < nSelectors; i++) {
466
+ ll_i = s->selector[i];
467
+ j = 0;
468
+ tmp = pos[j];
469
+ while ( ll_i != tmp ) {
470
+ j++;
471
+ tmp2 = tmp;
472
+ tmp = pos[j];
473
+ pos[j] = tmp2;
474
+ };
475
+ pos[0] = tmp;
476
+ s->selectorMtf[i] = j;
477
+ }
478
+ };
479
+
480
+ /*--- Assign actual codes for the tables. --*/
481
+ for (t = 0; t < nGroups; t++) {
482
+ minLen = 32;
483
+ maxLen = 0;
484
+ for (i = 0; i < alphaSize; i++) {
485
+ if (s->len[t][i] > maxLen) maxLen = s->len[t][i];
486
+ if (s->len[t][i] < minLen) minLen = s->len[t][i];
487
+ }
488
+ AssertH ( !(maxLen > 17 /*20*/ ), 3004 );
489
+ AssertH ( !(minLen < 1), 3005 );
490
+ BZ2_hbAssignCodes ( &(s->code[t][0]), &(s->len[t][0]),
491
+ minLen, maxLen, alphaSize );
492
+ }
493
+
494
+ /*--- Transmit the mapping table. ---*/
495
+ {
496
+ Bool inUse16[16];
497
+ for (i = 0; i < 16; i++) {
498
+ inUse16[i] = False;
499
+ for (j = 0; j < 16; j++)
500
+ if (s->inUse[i * 16 + j]) inUse16[i] = True;
501
+ }
502
+
503
+ nBytes = s->numZ;
504
+ for (i = 0; i < 16; i++)
505
+ if (inUse16[i]) bsW(s,1,1); else bsW(s,1,0);
506
+
507
+ for (i = 0; i < 16; i++)
508
+ if (inUse16[i])
509
+ for (j = 0; j < 16; j++) {
510
+ if (s->inUse[i * 16 + j]) bsW(s,1,1); else bsW(s,1,0);
511
+ }
512
+
513
+ if (s->verbosity >= 3)
514
+ VPrintf1( " bytes: mapping %d, ", s->numZ-nBytes );
515
+ }
516
+
517
+ /*--- Now the selectors. ---*/
518
+ nBytes = s->numZ;
519
+ bsW ( s, 3, nGroups );
520
+ bsW ( s, 15, nSelectors );
521
+ for (i = 0; i < nSelectors; i++) {
522
+ for (j = 0; j < s->selectorMtf[i]; j++) bsW(s,1,1);
523
+ bsW(s,1,0);
524
+ }
525
+ if (s->verbosity >= 3)
526
+ VPrintf1( "selectors %d, ", s->numZ-nBytes );
527
+
528
+ /*--- Now the coding tables. ---*/
529
+ nBytes = s->numZ;
530
+
531
+ for (t = 0; t < nGroups; t++) {
532
+ Int32 curr = s->len[t][0];
533
+ bsW ( s, 5, curr );
534
+ for (i = 0; i < alphaSize; i++) {
535
+ while (curr < s->len[t][i]) { bsW(s,2,2); curr++; /* 10 */ };
536
+ while (curr > s->len[t][i]) { bsW(s,2,3); curr--; /* 11 */ };
537
+ bsW ( s, 1, 0 );
538
+ }
539
+ }
540
+
541
+ if (s->verbosity >= 3)
542
+ VPrintf1 ( "code lengths %d, ", s->numZ-nBytes );
543
+
544
+ /*--- And finally, the block data proper ---*/
545
+ nBytes = s->numZ;
546
+ selCtr = 0;
547
+ gs = 0;
548
+ while (True) {
549
+ if (gs >= s->nMTF) break;
550
+ ge = gs + BZ_G_SIZE - 1;
551
+ if (ge >= s->nMTF) ge = s->nMTF-1;
552
+ AssertH ( s->selector[selCtr] < nGroups, 3006 );
553
+
554
+ if (nGroups == 6 && 50 == ge-gs+1) {
555
+ /*--- fast track the common case ---*/
556
+ UInt16 mtfv_i;
557
+ UChar* s_len_sel_selCtr
558
+ = &(s->len[s->selector[selCtr]][0]);
559
+ Int32* s_code_sel_selCtr
560
+ = &(s->code[s->selector[selCtr]][0]);
561
+
562
+ # define BZ_ITAH(nn) \
563
+ mtfv_i = mtfv[gs+(nn)]; \
564
+ bsW ( s, \
565
+ s_len_sel_selCtr[mtfv_i], \
566
+ s_code_sel_selCtr[mtfv_i] )
567
+
568
+ BZ_ITAH(0); BZ_ITAH(1); BZ_ITAH(2); BZ_ITAH(3); BZ_ITAH(4);
569
+ BZ_ITAH(5); BZ_ITAH(6); BZ_ITAH(7); BZ_ITAH(8); BZ_ITAH(9);
570
+ BZ_ITAH(10); BZ_ITAH(11); BZ_ITAH(12); BZ_ITAH(13); BZ_ITAH(14);
571
+ BZ_ITAH(15); BZ_ITAH(16); BZ_ITAH(17); BZ_ITAH(18); BZ_ITAH(19);
572
+ BZ_ITAH(20); BZ_ITAH(21); BZ_ITAH(22); BZ_ITAH(23); BZ_ITAH(24);
573
+ BZ_ITAH(25); BZ_ITAH(26); BZ_ITAH(27); BZ_ITAH(28); BZ_ITAH(29);
574
+ BZ_ITAH(30); BZ_ITAH(31); BZ_ITAH(32); BZ_ITAH(33); BZ_ITAH(34);
575
+ BZ_ITAH(35); BZ_ITAH(36); BZ_ITAH(37); BZ_ITAH(38); BZ_ITAH(39);
576
+ BZ_ITAH(40); BZ_ITAH(41); BZ_ITAH(42); BZ_ITAH(43); BZ_ITAH(44);
577
+ BZ_ITAH(45); BZ_ITAH(46); BZ_ITAH(47); BZ_ITAH(48); BZ_ITAH(49);
578
+
579
+ # undef BZ_ITAH
580
+
581
+ } else {
582
+ /*--- slow version which correctly handles all situations ---*/
583
+ for (i = gs; i <= ge; i++) {
584
+ bsW ( s,
585
+ s->len [s->selector[selCtr]] [mtfv[i]],
586
+ s->code [s->selector[selCtr]] [mtfv[i]] );
587
+ }
588
+ }
589
+
590
+
591
+ gs = ge+1;
592
+ selCtr++;
593
+ }
594
+ AssertH( selCtr == nSelectors, 3007 );
595
+
596
+ if (s->verbosity >= 3)
597
+ VPrintf1( "codes %d\n", s->numZ-nBytes );
598
+ }
599
+
600
+
601
+ /*---------------------------------------------------*/
602
+ void BZ2_compressBlock ( EState* s, Bool is_last_block )
603
+ {
604
+ if (s->nblock > 0) {
605
+
606
+ BZ_FINALISE_CRC ( s->blockCRC );
607
+ s->combinedCRC = (s->combinedCRC << 1) | (s->combinedCRC >> 31);
608
+ s->combinedCRC ^= s->blockCRC;
609
+ if (s->blockNo > 1) s->numZ = 0;
610
+
611
+ if (s->verbosity >= 2)
612
+ VPrintf4( " block %d: crc = 0x%08x, "
613
+ "combined CRC = 0x%08x, size = %d\n",
614
+ s->blockNo, s->blockCRC, s->combinedCRC, s->nblock );
615
+
616
+ BZ2_blockSort ( s );
617
+ }
618
+
619
+ s->zbits = (UChar*) (&((UChar*)s->arr2)[s->nblock]);
620
+
621
+ /*-- If this is the first block, create the stream header. --*/
622
+ if (s->blockNo == 1) {
623
+ BZ2_bsInitWrite ( s );
624
+ bsPutUChar ( s, BZ_HDR_B );
625
+ bsPutUChar ( s, BZ_HDR_Z );
626
+ bsPutUChar ( s, BZ_HDR_h );
627
+ bsPutUChar ( s, (UChar)(BZ_HDR_0 + s->blockSize100k) );
628
+ }
629
+
630
+ if (s->nblock > 0) {
631
+
632
+ bsPutUChar ( s, 0x31 ); bsPutUChar ( s, 0x41 );
633
+ bsPutUChar ( s, 0x59 ); bsPutUChar ( s, 0x26 );
634
+ bsPutUChar ( s, 0x53 ); bsPutUChar ( s, 0x59 );
635
+
636
+ /*-- Now the block's CRC, so it is in a known place. --*/
637
+ bsPutUInt32 ( s, s->blockCRC );
638
+
639
+ /*--
640
+ Now a single bit indicating (non-)randomisation.
641
+ As of version 0.9.5, we use a better sorting algorithm
642
+ which makes randomisation unnecessary. So always set
643
+ the randomised bit to 'no'. Of course, the decoder
644
+ still needs to be able to handle randomised blocks
645
+ so as to maintain backwards compatibility with
646
+ older versions of bzip2.
647
+ --*/
648
+ bsW(s,1,0);
649
+
650
+ bsW ( s, 24, s->origPtr );
651
+ generateMTFValues ( s );
652
+ sendMTFValues ( s );
653
+ }
654
+
655
+
656
+ /*-- If this is the last block, add the stream trailer. --*/
657
+ if (is_last_block) {
658
+
659
+ bsPutUChar ( s, 0x17 ); bsPutUChar ( s, 0x72 );
660
+ bsPutUChar ( s, 0x45 ); bsPutUChar ( s, 0x38 );
661
+ bsPutUChar ( s, 0x50 ); bsPutUChar ( s, 0x90 );
662
+ bsPutUInt32 ( s, s->combinedCRC );
663
+ if (s->verbosity >= 2)
664
+ VPrintf1( " final combined CRC = 0x%08x\n ", s->combinedCRC );
665
+ bsFinishWrite ( s );
666
+ }
667
+ }
668
+
669
+
670
+ /*-------------------------------------------------------------*/
671
+ /*--- end compress.c ---*/
672
+ /*-------------------------------------------------------------*/