brotli 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.travis.yml +11 -3
  4. data/Gemfile +2 -0
  5. data/ext/brotli/brotli.c +279 -0
  6. data/ext/brotli/brotli.h +2 -0
  7. data/ext/brotli/buffer.c +95 -0
  8. data/ext/brotli/buffer.h +19 -0
  9. data/ext/brotli/extconf.rb +21 -81
  10. data/lib/brotli/version.rb +1 -1
  11. data/vendor/brotli/dec/bit_reader.c +5 -5
  12. data/vendor/brotli/dec/bit_reader.h +15 -15
  13. data/vendor/brotli/dec/context.h +1 -1
  14. data/vendor/brotli/dec/decode.c +433 -348
  15. data/vendor/brotli/dec/decode.h +74 -48
  16. data/vendor/brotli/dec/huffman.c +5 -4
  17. data/vendor/brotli/dec/huffman.h +4 -4
  18. data/vendor/brotli/dec/port.h +2 -95
  19. data/vendor/brotli/dec/prefix.h +5 -3
  20. data/vendor/brotli/dec/state.c +15 -27
  21. data/vendor/brotli/dec/state.h +21 -17
  22. data/vendor/brotli/dec/transform.h +1 -1
  23. data/vendor/brotli/enc/backward_references.c +892 -0
  24. data/vendor/brotli/enc/backward_references.h +85 -102
  25. data/vendor/brotli/enc/backward_references_inc.h +147 -0
  26. data/vendor/brotli/enc/bit_cost.c +35 -0
  27. data/vendor/brotli/enc/bit_cost.h +23 -121
  28. data/vendor/brotli/enc/bit_cost_inc.h +127 -0
  29. data/vendor/brotli/enc/block_encoder_inc.h +33 -0
  30. data/vendor/brotli/enc/block_splitter.c +197 -0
  31. data/vendor/brotli/enc/block_splitter.h +40 -50
  32. data/vendor/brotli/enc/block_splitter_inc.h +432 -0
  33. data/vendor/brotli/enc/brotli_bit_stream.c +1334 -0
  34. data/vendor/brotli/enc/brotli_bit_stream.h +95 -167
  35. data/vendor/brotli/enc/cluster.c +56 -0
  36. data/vendor/brotli/enc/cluster.h +23 -305
  37. data/vendor/brotli/enc/cluster_inc.h +315 -0
  38. data/vendor/brotli/enc/command.h +83 -76
  39. data/vendor/brotli/enc/compress_fragment.c +747 -0
  40. data/vendor/brotli/enc/compress_fragment.h +48 -37
  41. data/vendor/brotli/enc/compress_fragment_two_pass.c +557 -0
  42. data/vendor/brotli/enc/compress_fragment_two_pass.h +37 -26
  43. data/vendor/brotli/enc/compressor.cc +139 -0
  44. data/vendor/brotli/enc/compressor.h +146 -0
  45. data/vendor/brotli/enc/context.h +102 -96
  46. data/vendor/brotli/enc/dictionary_hash.h +9 -5
  47. data/vendor/brotli/enc/encode.c +1562 -0
  48. data/vendor/brotli/enc/encode.h +211 -199
  49. data/vendor/brotli/enc/encode_parallel.cc +161 -151
  50. data/vendor/brotli/enc/encode_parallel.h +7 -8
  51. data/vendor/brotli/enc/entropy_encode.c +501 -0
  52. data/vendor/brotli/enc/entropy_encode.h +107 -89
  53. data/vendor/brotli/enc/entropy_encode_static.h +29 -62
  54. data/vendor/brotli/enc/fast_log.h +26 -20
  55. data/vendor/brotli/enc/find_match_length.h +23 -20
  56. data/vendor/brotli/enc/hash.h +614 -871
  57. data/vendor/brotli/enc/hash_forgetful_chain_inc.h +249 -0
  58. data/vendor/brotli/enc/hash_longest_match_inc.h +241 -0
  59. data/vendor/brotli/enc/hash_longest_match_quickly_inc.h +230 -0
  60. data/vendor/brotli/enc/histogram.c +95 -0
  61. data/vendor/brotli/enc/histogram.h +49 -83
  62. data/vendor/brotli/enc/histogram_inc.h +51 -0
  63. data/vendor/brotli/enc/literal_cost.c +178 -0
  64. data/vendor/brotli/enc/literal_cost.h +16 -10
  65. data/vendor/brotli/enc/memory.c +181 -0
  66. data/vendor/brotli/enc/memory.h +62 -0
  67. data/vendor/brotli/enc/metablock.c +515 -0
  68. data/vendor/brotli/enc/metablock.h +87 -57
  69. data/vendor/brotli/enc/metablock_inc.h +183 -0
  70. data/vendor/brotli/enc/port.h +73 -47
  71. data/vendor/brotli/enc/prefix.h +34 -61
  72. data/vendor/brotli/enc/quality.h +130 -0
  73. data/vendor/brotli/enc/ringbuffer.h +137 -122
  74. data/vendor/brotli/enc/{static_dict.cc → static_dict.c} +162 -139
  75. data/vendor/brotli/enc/static_dict.h +23 -18
  76. data/vendor/brotli/enc/static_dict_lut.h +11223 -12037
  77. data/vendor/brotli/enc/streams.cc +7 -7
  78. data/vendor/brotli/enc/streams.h +32 -32
  79. data/vendor/brotli/enc/{utf8_util.cc → utf8_util.c} +22 -20
  80. data/vendor/brotli/enc/utf8_util.h +16 -9
  81. data/vendor/brotli/enc/write_bits.h +49 -43
  82. metadata +34 -25
  83. data/ext/brotli/brotli.cc +0 -181
  84. data/vendor/brotli/dec/Makefile +0 -12
  85. data/vendor/brotli/dec/dictionary.c +0 -9466
  86. data/vendor/brotli/dec/dictionary.h +0 -38
  87. data/vendor/brotli/dec/types.h +0 -38
  88. data/vendor/brotli/enc/Makefile +0 -14
  89. data/vendor/brotli/enc/backward_references.cc +0 -858
  90. data/vendor/brotli/enc/block_splitter.cc +0 -505
  91. data/vendor/brotli/enc/brotli_bit_stream.cc +0 -1181
  92. data/vendor/brotli/enc/compress_fragment.cc +0 -701
  93. data/vendor/brotli/enc/compress_fragment_two_pass.cc +0 -524
  94. data/vendor/brotli/enc/dictionary.cc +0 -9466
  95. data/vendor/brotli/enc/dictionary.h +0 -41
  96. data/vendor/brotli/enc/encode.cc +0 -1180
  97. data/vendor/brotli/enc/entropy_encode.cc +0 -480
  98. data/vendor/brotli/enc/histogram.cc +0 -67
  99. data/vendor/brotli/enc/literal_cost.cc +0 -165
  100. data/vendor/brotli/enc/metablock.cc +0 -539
  101. data/vendor/brotli/enc/transform.h +0 -248
  102. data/vendor/brotli/enc/types.h +0 -29
@@ -1,480 +0,0 @@
1
- /* Copyright 2010 Google Inc. All Rights Reserved.
2
-
3
- Distributed under MIT license.
4
- See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5
- */
6
-
7
- // Entropy encoding (Huffman) utilities.
8
-
9
- #include "./entropy_encode.h"
10
-
11
- #include <algorithm>
12
- #include <limits>
13
- #include <cstdlib>
14
-
15
- #include "./histogram.h"
16
- #include "./port.h"
17
- #include "./types.h"
18
-
19
- namespace brotli {
20
-
21
- void SetDepth(const HuffmanTree &p,
22
- HuffmanTree *pool,
23
- uint8_t *depth,
24
- uint8_t level) {
25
- if (p.index_left_ >= 0) {
26
- ++level;
27
- SetDepth(pool[p.index_left_], pool, depth, level);
28
- SetDepth(pool[p.index_right_or_value_], pool, depth, level);
29
- } else {
30
- depth[p.index_right_or_value_] = level;
31
- }
32
- }
33
-
34
- // Sort the root nodes, least popular first.
35
- static inline bool SortHuffmanTree(const HuffmanTree& v0,
36
- const HuffmanTree& v1) {
37
- if (v0.total_count_ != v1.total_count_) {
38
- return v0.total_count_ < v1.total_count_;
39
- }
40
- return v0.index_right_or_value_ > v1.index_right_or_value_;
41
- }
42
-
43
- // This function will create a Huffman tree.
44
- //
45
- // The catch here is that the tree cannot be arbitrarily deep.
46
- // Brotli specifies a maximum depth of 15 bits for "code trees"
47
- // and 7 bits for "code length code trees."
48
- //
49
- // count_limit is the value that is to be faked as the minimum value
50
- // and this minimum value is raised until the tree matches the
51
- // maximum length requirement.
52
- //
53
- // This algorithm is not of excellent performance for very long data blocks,
54
- // especially when population counts are longer than 2**tree_limit, but
55
- // we are not planning to use this with extremely long blocks.
56
- //
57
- // See http://en.wikipedia.org/wiki/Huffman_coding
58
- void CreateHuffmanTree(const uint32_t *data,
59
- const size_t length,
60
- const int tree_limit,
61
- HuffmanTree* tree,
62
- uint8_t *depth) {
63
- // For block sizes below 64 kB, we never need to do a second iteration
64
- // of this loop. Probably all of our block sizes will be smaller than
65
- // that, so this loop is mostly of academic interest. If we actually
66
- // would need this, we would be better off with the Katajainen algorithm.
67
- for (uint32_t count_limit = 1; ; count_limit *= 2) {
68
- size_t n = 0;
69
- for (size_t i = length; i != 0;) {
70
- --i;
71
- if (data[i]) {
72
- const uint32_t count = std::max(data[i], count_limit);
73
- tree[n++] = HuffmanTree(count, -1, static_cast<int16_t>(i));
74
- }
75
- }
76
-
77
- if (n == 1) {
78
- depth[tree[0].index_right_or_value_] = 1; // Only one element.
79
- break;
80
- }
81
-
82
- std::sort(tree, tree + n, SortHuffmanTree);
83
-
84
- // The nodes are:
85
- // [0, n): the sorted leaf nodes that we start with.
86
- // [n]: we add a sentinel here.
87
- // [n + 1, 2n): new parent nodes are added here, starting from
88
- // (n+1). These are naturally in ascending order.
89
- // [2n]: we add a sentinel at the end as well.
90
- // There will be (2n+1) elements at the end.
91
- const HuffmanTree sentinel(std::numeric_limits<uint32_t>::max(), -1, -1);
92
- tree[n] = sentinel;
93
- tree[n + 1] = sentinel;
94
-
95
- size_t i = 0; // Points to the next leaf node.
96
- size_t j = n + 1; // Points to the next non-leaf node.
97
- for (size_t k = n - 1; k != 0; --k) {
98
- size_t left, right;
99
- if (tree[i].total_count_ <= tree[j].total_count_) {
100
- left = i;
101
- ++i;
102
- } else {
103
- left = j;
104
- ++j;
105
- }
106
- if (tree[i].total_count_ <= tree[j].total_count_) {
107
- right = i;
108
- ++i;
109
- } else {
110
- right = j;
111
- ++j;
112
- }
113
-
114
- // The sentinel node becomes the parent node.
115
- size_t j_end = 2 * n - k;
116
- tree[j_end].total_count_ =
117
- tree[left].total_count_ + tree[right].total_count_;
118
- tree[j_end].index_left_ = static_cast<int16_t>(left);
119
- tree[j_end].index_right_or_value_ = static_cast<int16_t>(right);
120
-
121
- // Add back the last sentinel node.
122
- tree[j_end + 1] = sentinel;
123
- }
124
- SetDepth(tree[2 * n - 1], &tree[0], depth, 0);
125
-
126
- // We need to pack the Huffman tree in tree_limit bits.
127
- // If this was not successful, add fake entities to the lowest values
128
- // and retry.
129
- if (*std::max_element(&depth[0], &depth[length]) <= tree_limit) {
130
- break;
131
- }
132
- }
133
- }
134
-
135
// Reverses the elements of |v| in the half-open index range [start, end).
//
// An empty or inverted range (start >= end) is a no-op. This includes the
// range [0, 0), for which a manual "--end" would wrap around to SIZE_MAX and
// swap elements far outside the buffer.
static void Reverse(uint8_t* v, size_t start, size_t end) {
  if (start >= end) {
    return;
  }
  std::reverse(v + start, v + end);
}
145
-
146
- static void WriteHuffmanTreeRepetitions(
147
- const uint8_t previous_value,
148
- const uint8_t value,
149
- size_t repetitions,
150
- size_t* tree_size,
151
- uint8_t* tree,
152
- uint8_t* extra_bits_data) {
153
- assert(repetitions > 0);
154
- if (previous_value != value) {
155
- tree[*tree_size] = value;
156
- extra_bits_data[*tree_size] = 0;
157
- ++(*tree_size);
158
- --repetitions;
159
- }
160
- if (repetitions == 7) {
161
- tree[*tree_size] = value;
162
- extra_bits_data[*tree_size] = 0;
163
- ++(*tree_size);
164
- --repetitions;
165
- }
166
- if (repetitions < 3) {
167
- for (size_t i = 0; i < repetitions; ++i) {
168
- tree[*tree_size] = value;
169
- extra_bits_data[*tree_size] = 0;
170
- ++(*tree_size);
171
- }
172
- } else {
173
- repetitions -= 3;
174
- size_t start = *tree_size;
175
- while (true) {
176
- tree[*tree_size] = 16;
177
- extra_bits_data[*tree_size] = repetitions & 0x3;
178
- ++(*tree_size);
179
- repetitions >>= 2;
180
- if (repetitions == 0) {
181
- break;
182
- }
183
- --repetitions;
184
- }
185
- Reverse(tree, start, *tree_size);
186
- Reverse(extra_bits_data, start, *tree_size);
187
- }
188
- }
189
-
190
- static void WriteHuffmanTreeRepetitionsZeros(
191
- size_t repetitions,
192
- size_t* tree_size,
193
- uint8_t* tree,
194
- uint8_t* extra_bits_data) {
195
- if (repetitions == 11) {
196
- tree[*tree_size] = 0;
197
- extra_bits_data[*tree_size] = 0;
198
- ++(*tree_size);
199
- --repetitions;
200
- }
201
- if (repetitions < 3) {
202
- for (size_t i = 0; i < repetitions; ++i) {
203
- tree[*tree_size] = 0;
204
- extra_bits_data[*tree_size] = 0;
205
- ++(*tree_size);
206
- }
207
- } else {
208
- repetitions -= 3;
209
- size_t start = *tree_size;
210
- while (true) {
211
- tree[*tree_size] = 17;
212
- extra_bits_data[*tree_size] = repetitions & 0x7;
213
- ++(*tree_size);
214
- repetitions >>= 3;
215
- if (repetitions == 0) {
216
- break;
217
- }
218
- --repetitions;
219
- }
220
- Reverse(tree, start, *tree_size);
221
- Reverse(extra_bits_data, start, *tree_size);
222
- }
223
- }
224
-
225
// Rewrites the histogram |counts| in place so that the code lengths derived
// from it are more compatible with run-length encoding.
//
//   length        number of entries in |counts|.
//   counts        population counts; modified in place.
//   good_for_rle  scratch buffer with room for at least |length| bytes; it is
//                 only written once the early exits below have been passed.
//
// The histogram is left untouched when fewer than 16 symbols are used. After
// the optional single-gap filling step, nothing more is done when fewer than
// 28 symbols are used.
void OptimizeHuffmanCountsForRle(size_t length, uint32_t* counts,
                                 uint8_t* good_for_rle) {
  const size_t streak_limit = 1240;
  size_t i;

  // Let's make the Huffman code more compatible with rle encoding.
  size_t nonzero_count = 0;
  for (i = 0; i < length; i++) {
    nonzero_count += (counts[i] != 0) ? 1 : 0;
  }
  if (nonzero_count < 16) {
    return;
  }

  // Strip trailing zeros; from here on |length| is the trimmed length.
  while (length != 0 && counts[length - 1] == 0) {
    --length;
  }
  if (length == 0) {
    return;  // All zeros.
  }

  // 1) Fill isolated single-zero gaps when the histogram is dense and holds
  //    very small counts.
  {
    size_t used = 0;
    uint32_t smallest_used = 1 << 30;
    for (i = 0; i < length; ++i) {
      const uint32_t c = counts[i];
      if (c == 0) {
        continue;
      }
      ++used;
      if (c < smallest_used) {
        smallest_used = c;
      }
    }
    if (used < 5) {
      // Small histogram will model it well.
      return;
    }
    const size_t unused = length - used;
    if (smallest_used < 4 && unused < 6) {
      for (i = 1; i < length - 1; ++i) {
        if (counts[i] == 0 && counts[i - 1] != 0 && counts[i + 1] != 0) {
          counts[i] = 1;
        }
      }
    }
    if (used < 28) {
      return;
    }
  }

  // 2) Mark the population counts that already form runs long enough to be
  //    left alone: at least 5 equal zeros, or at least 7 equal non-zeros.
  memset(good_for_rle, 0, length);
  {
    uint32_t run_value = counts[0];
    size_t run_length = 0;
    for (i = 0; i <= length; ++i) {
      if (i != length && counts[i] == run_value) {
        ++run_length;
        continue;
      }
      // The run [i - run_length, i) has just ended.
      const size_t required = (run_value == 0) ? 5 : 7;
      if (run_length >= required) {
        memset(good_for_rle + (i - run_length), 1, run_length);
      }
      run_length = 1;
      if (i != length) {
        run_value = counts[i];
      }
    }
  }

  // 3) Replace stretches of similar counts by their rounded average.
  //    Math here is in 24.8 fixed point representation.
  size_t stride = 0;
  size_t limit = 256 * (counts[0] + counts[1] + counts[2]) / 3 + 420;
  size_t sum = 0;
  for (i = 0; i <= length; ++i) {
    const bool stride_ends =
        i == length || good_for_rle[i] ||
        (i != 0 && good_for_rle[i - 1]) ||
        (256 * counts[i] - limit + streak_limit) >= 2 * streak_limit;
    if (stride_ends) {
      if (stride >= 4 || (stride >= 3 && sum == 0)) {
        // The stride must end; collapse what we have, if we have enough.
        size_t count;
        if (sum == 0) {
          // Don't make an all zeros stride to be upgraded to ones.
          count = 0;
        } else {
          count = (sum + stride / 2) / stride;
          if (count == 0) {
            count = 1;
          }
        }
        // counts[i] already belongs to the next stride, so only the
        // |stride| entries before it are overwritten.
        const uint32_t flattened = static_cast<uint32_t>(count);
        for (size_t k = i - stride; k < i; ++k) {
          counts[k] = flattened;
        }
      }
      stride = 0;
      sum = 0;
      if (i < length - 2) {
        // All interesting strides have a count of at least 4,
        // at least when non-zeros.
        limit = 256 * (counts[i] + counts[i + 1] + counts[i + 2]) / 3 + 420;
      } else if (i < length) {
        limit = 256 * counts[i];
      } else {
        limit = 0;
      }
    }
    ++stride;
    if (i != length) {
      sum += counts[i];
      if (stride >= 4) {
        limit = (256 * sum + stride / 2) / stride;
      }
      if (stride == 4) {
        limit += 120;
      }
    }
  }
}
355
-
356
// Scans the code lengths in |depth| and decides, separately for zero and
// non-zero values, whether run-length coding should be used.
//
// Runs of at least 3 zeros and runs of at least 4 equal non-zero values are
// tallied. Each flag is set when the total length of the tallied runs exceeds
// twice (number of tallied runs + 1).
static void DecideOverRleUse(const uint8_t* depth, const size_t length,
                             bool *use_rle_for_non_zero,
                             bool *use_rle_for_zero) {
  size_t zero_total = 0;
  size_t zero_runs = 1;
  size_t non_zero_total = 0;
  size_t non_zero_runs = 1;

  size_t run_begin = 0;
  while (run_begin < length) {
    const uint8_t symbol = depth[run_begin];
    // Find the end of the run of identical values starting at run_begin.
    size_t run_end = run_begin + 1;
    while (run_end < length && depth[run_end] == symbol) {
      ++run_end;
    }
    const size_t run_length = run_end - run_begin;
    if (symbol == 0) {
      if (run_length >= 3) {
        zero_total += run_length;
        ++zero_runs;
      }
    } else if (run_length >= 4) {
      non_zero_total += run_length;
      ++non_zero_runs;
    }
    run_begin = run_end;
  }

  *use_rle_for_non_zero = non_zero_total > non_zero_runs * 2;
  *use_rle_for_zero = zero_total > zero_runs * 2;
}
382
-
383
- void WriteHuffmanTree(const uint8_t* depth,
384
- size_t length,
385
- size_t* tree_size,
386
- uint8_t* tree,
387
- uint8_t* extra_bits_data) {
388
- uint8_t previous_value = 8;
389
-
390
- // Throw away trailing zeros.
391
- size_t new_length = length;
392
- for (size_t i = 0; i < length; ++i) {
393
- if (depth[length - i - 1] == 0) {
394
- --new_length;
395
- } else {
396
- break;
397
- }
398
- }
399
-
400
- // First gather statistics on if it is a good idea to do rle.
401
- bool use_rle_for_non_zero = false;
402
- bool use_rle_for_zero = false;
403
- if (length > 50) {
404
- // Find rle coding for longer codes.
405
- // Shorter codes seem not to benefit from rle.
406
- DecideOverRleUse(depth, new_length,
407
- &use_rle_for_non_zero, &use_rle_for_zero);
408
- }
409
-
410
- // Actual rle coding.
411
- for (size_t i = 0; i < new_length;) {
412
- const uint8_t value = depth[i];
413
- size_t reps = 1;
414
- if ((value != 0 && use_rle_for_non_zero) ||
415
- (value == 0 && use_rle_for_zero)) {
416
- for (size_t k = i + 1; k < new_length && depth[k] == value; ++k) {
417
- ++reps;
418
- }
419
- }
420
- if (value == 0) {
421
- WriteHuffmanTreeRepetitionsZeros(reps, tree_size, tree, extra_bits_data);
422
- } else {
423
- WriteHuffmanTreeRepetitions(previous_value,
424
- value, reps, tree_size,
425
- tree, extra_bits_data);
426
- previous_value = value;
427
- }
428
- i += reps;
429
- }
430
- }
431
-
432
namespace {

// Returns the lowest |num_bits| bits of |bits| in reversed order.
//
// The value is reversed one nibble at a time through a lookup table. Whole
// nibbles are always processed, so the result is finally shifted right by the
// number of surplus bit positions, (-num_bits & 3).
uint16_t ReverseBits(int num_bits, uint16_t bits) {
  static const size_t kLut[16] = {  // Pre-reversed 4-bit values.
    0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
    0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
  };
  size_t reversed = kLut[bits & 0xf];
  int bits_done = 4;
  while (bits_done < num_bits) {
    bits = static_cast<uint16_t>(bits >> 4);
    reversed = (reversed << 4) | kLut[bits & 0xf];
    bits_done += 4;
  }
  reversed >>= (-num_bits & 0x3);
  return static_cast<uint16_t>(reversed);
}

}  // namespace
450
-
451
- void ConvertBitDepthsToSymbols(const uint8_t *depth,
452
- size_t len,
453
- uint16_t *bits) {
454
- // In Brotli, all bit depths are [1..15]
455
- // 0 bit depth means that the symbol does not exist.
456
- const int kMaxBits = 16; // 0..15 are values for bits
457
- uint16_t bl_count[kMaxBits] = { 0 };
458
- {
459
- for (size_t i = 0; i < len; ++i) {
460
- ++bl_count[depth[i]];
461
- }
462
- bl_count[0] = 0;
463
- }
464
- uint16_t next_code[kMaxBits];
465
- next_code[0] = 0;
466
- {
467
- int code = 0;
468
- for (int bits = 1; bits < kMaxBits; ++bits) {
469
- code = (code + bl_count[bits - 1]) << 1;
470
- next_code[bits] = static_cast<uint16_t>(code);
471
- }
472
- }
473
- for (size_t i = 0; i < len; ++i) {
474
- if (depth[i]) {
475
- bits[i] = ReverseBits(depth[i], next_code[depth[i]]++);
476
- }
477
- }
478
- }
479
-
480
- } // namespace brotli
@@ -1,67 +0,0 @@
1
- /* Copyright 2013 Google Inc. All Rights Reserved.
2
-
3
- Distributed under MIT license.
4
- See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5
- */
6
-
7
- // Build per-context histograms of literals, commands and distance codes.
8
-
9
- #include "./histogram.h"
10
-
11
- #include <cmath>
12
-
13
- #include "./block_splitter.h"
14
- #include "./command.h"
15
- #include "./context.h"
16
- #include "./prefix.h"
17
-
18
- namespace brotli {
19
-
20
- void BuildHistograms(
21
- const Command* cmds,
22
- const size_t num_commands,
23
- const BlockSplit& literal_split,
24
- const BlockSplit& insert_and_copy_split,
25
- const BlockSplit& dist_split,
26
- const uint8_t* ringbuffer,
27
- size_t start_pos,
28
- size_t mask,
29
- uint8_t prev_byte,
30
- uint8_t prev_byte2,
31
- const std::vector<ContextType>& context_modes,
32
- std::vector<HistogramLiteral>* literal_histograms,
33
- std::vector<HistogramCommand>* insert_and_copy_histograms,
34
- std::vector<HistogramDistance>* copy_dist_histograms) {
35
- size_t pos = start_pos;
36
- BlockSplitIterator literal_it(literal_split);
37
- BlockSplitIterator insert_and_copy_it(insert_and_copy_split);
38
- BlockSplitIterator dist_it(dist_split);
39
- for (size_t i = 0; i < num_commands; ++i) {
40
- const Command &cmd = cmds[i];
41
- insert_and_copy_it.Next();
42
- (*insert_and_copy_histograms)[insert_and_copy_it.type_].Add(
43
- cmd.cmd_prefix_);
44
- for (size_t j = cmd.insert_len_; j != 0; --j) {
45
- literal_it.Next();
46
- size_t context = (literal_it.type_ << kLiteralContextBits) +
47
- Context(prev_byte, prev_byte2, context_modes[literal_it.type_]);
48
- (*literal_histograms)[context].Add(ringbuffer[pos & mask]);
49
- prev_byte2 = prev_byte;
50
- prev_byte = ringbuffer[pos & mask];
51
- ++pos;
52
- }
53
- pos += cmd.copy_len();
54
- if (cmd.copy_len()) {
55
- prev_byte2 = ringbuffer[(pos - 2) & mask];
56
- prev_byte = ringbuffer[(pos - 1) & mask];
57
- if (cmd.cmd_prefix_ >= 128) {
58
- dist_it.Next();
59
- size_t context = (dist_it.type_ << kDistanceContextBits) +
60
- cmd.DistanceContext();
61
- (*copy_dist_histograms)[context].Add(cmd.dist_prefix_);
62
- }
63
- }
64
- }
65
- }
66
-
67
- } // namespace brotli
@@ -1,165 +0,0 @@
1
- /* Copyright 2013 Google Inc. All Rights Reserved.
2
-
3
- Distributed under MIT license.
4
- See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5
- */
6
-
7
- // Literal cost model to allow backward reference replacement to be efficient.
8
-
9
- #include "./literal_cost.h"
10
-
11
- #include <math.h>
12
- #include <algorithm>
13
-
14
- #include "./fast_log.h"
15
- #include "./types.h"
16
- #include "./utf8_util.h"
17
-
18
- namespace brotli {
19
-
20
// Classifies which position of a UTF-8 sequence the byte following |c| is
// expected to hold, given |c| and the byte |last| that preceded it.
//
// Returns 0 for "first byte", otherwise 1 or 2 capped to |clamp|.
static size_t UTF8Position(size_t last, size_t c, size_t clamp) {
  if (c < 128) {
    return 0;  // Next one is the 'Byte 1' again.
  }
  if (c >= 192) {
    // Next one is the 'Byte 2' of utf-8 encoding.
    return std::min<size_t>(1, clamp);
  }
  // |c| is in [128, 192): let the previous byte decide whether the sequence
  // ends here (two or three byte coding completed) or a 'Byte 3' follows.
  return (last < 0xe0) ? 0 : std::min<size_t>(2, clamp);
}
34
-
35
- static size_t DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
36
- const uint8_t *data) {
37
- size_t counts[3] = { 0 };
38
- size_t max_utf8 = 1; // should be 2, but 1 compresses better.
39
- size_t last_c = 0;
40
- size_t utf8_pos = 0;
41
- for (size_t i = 0; i < len; ++i) {
42
- size_t c = data[(pos + i) & mask];
43
- utf8_pos = UTF8Position(last_c, c, 2);
44
- ++counts[utf8_pos];
45
- last_c = c;
46
- }
47
- if (counts[2] < 500) {
48
- max_utf8 = 1;
49
- }
50
- if (counts[1] + counts[2] < 25) {
51
- max_utf8 = 0;
52
- }
53
- return max_utf8;
54
- }
55
-
56
- static void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
57
- const uint8_t *data, float *cost) {
58
-
59
- // max_utf8 is 0 (normal ascii single byte modeling),
60
- // 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling).
61
- const size_t max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data);
62
- size_t histogram[3][256] = { { 0 } };
63
- size_t window_half = 495;
64
- size_t in_window = std::min(window_half, len);
65
- size_t in_window_utf8[3] = { 0 };
66
-
67
- // Bootstrap histograms.
68
- size_t last_c = 0;
69
- size_t utf8_pos = 0;
70
- for (size_t i = 0; i < in_window; ++i) {
71
- size_t c = data[(pos + i) & mask];
72
- ++histogram[utf8_pos][c];
73
- ++in_window_utf8[utf8_pos];
74
- utf8_pos = UTF8Position(last_c, c, max_utf8);
75
- last_c = c;
76
- }
77
-
78
- // Compute bit costs with sliding window.
79
- for (size_t i = 0; i < len; ++i) {
80
- if (i >= window_half) {
81
- // Remove a byte in the past.
82
- size_t c = i < window_half + 1 ?
83
- 0 : data[(pos + i - window_half - 1) & mask];
84
- size_t last_c = i < window_half + 2 ?
85
- 0 : data[(pos + i - window_half - 2) & mask];
86
- size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
87
- --histogram[utf8_pos2][data[(pos + i - window_half) & mask]];
88
- --in_window_utf8[utf8_pos2];
89
- }
90
- if (i + window_half < len) {
91
- // Add a byte in the future.
92
- size_t c = data[(pos + i + window_half - 1) & mask];
93
- size_t last_c = data[(pos + i + window_half - 2) & mask];
94
- size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
95
- ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]];
96
- ++in_window_utf8[utf8_pos2];
97
- }
98
- size_t c = i < 1 ? 0 : data[(pos + i - 1) & mask];
99
- size_t last_c = i < 2 ? 0 : data[(pos + i - 2) & mask];
100
- size_t utf8_pos = UTF8Position(last_c, c, max_utf8);
101
- size_t masked_pos = (pos + i) & mask;
102
- size_t histo = histogram[utf8_pos][data[masked_pos]];
103
- if (histo == 0) {
104
- histo = 1;
105
- }
106
- double lit_cost = FastLog2(in_window_utf8[utf8_pos]) - FastLog2(histo);
107
- lit_cost += 0.02905;
108
- if (lit_cost < 1.0) {
109
- lit_cost *= 0.5;
110
- lit_cost += 0.5;
111
- }
112
- // Make the first bytes more expensive -- seems to help, not sure why.
113
- // Perhaps because the entropy source is changing its properties
114
- // rapidly in the beginning of the file, perhaps because the beginning
115
- // of the data is a statistical "anomaly".
116
- if (i < 2000) {
117
- lit_cost += 0.7 - (static_cast<double>(2000 - i) / 2000.0 * 0.35);
118
- }
119
- cost[i] = static_cast<float>(lit_cost);
120
- }
121
- }
122
-
123
- void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
124
- const uint8_t *data, float *cost) {
125
- if (IsMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio)) {
126
- EstimateBitCostsForLiteralsUTF8(pos, len, mask, data, cost);
127
- return;
128
- }
129
- size_t histogram[256] = { 0 };
130
- size_t window_half = 2000;
131
- size_t in_window = std::min(window_half, len);
132
-
133
- // Bootstrap histogram.
134
- for (size_t i = 0; i < in_window; ++i) {
135
- ++histogram[data[(pos + i) & mask]];
136
- }
137
-
138
- // Compute bit costs with sliding window.
139
- for (size_t i = 0; i < len; ++i) {
140
- if (i >= window_half) {
141
- // Remove a byte in the past.
142
- --histogram[data[(pos + i - window_half) & mask]];
143
- --in_window;
144
- }
145
- if (i + window_half < len) {
146
- // Add a byte in the future.
147
- ++histogram[data[(pos + i + window_half) & mask]];
148
- ++in_window;
149
- }
150
- size_t histo = histogram[data[(pos + i) & mask]];
151
- if (histo == 0) {
152
- histo = 1;
153
- }
154
- double lit_cost = FastLog2(in_window) - FastLog2(histo);
155
- lit_cost += 0.029;
156
- if (lit_cost < 1.0) {
157
- lit_cost *= 0.5;
158
- lit_cost += 0.5;
159
- }
160
- cost[i] = static_cast<float>(lit_cost);
161
- }
162
- }
163
-
164
-
165
- } // namespace brotli