brotli 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.travis.yml +11 -3
  4. data/Gemfile +2 -0
  5. data/ext/brotli/brotli.c +279 -0
  6. data/ext/brotli/brotli.h +2 -0
  7. data/ext/brotli/buffer.c +95 -0
  8. data/ext/brotli/buffer.h +19 -0
  9. data/ext/brotli/extconf.rb +21 -81
  10. data/lib/brotli/version.rb +1 -1
  11. data/vendor/brotli/dec/bit_reader.c +5 -5
  12. data/vendor/brotli/dec/bit_reader.h +15 -15
  13. data/vendor/brotli/dec/context.h +1 -1
  14. data/vendor/brotli/dec/decode.c +433 -348
  15. data/vendor/brotli/dec/decode.h +74 -48
  16. data/vendor/brotli/dec/huffman.c +5 -4
  17. data/vendor/brotli/dec/huffman.h +4 -4
  18. data/vendor/brotli/dec/port.h +2 -95
  19. data/vendor/brotli/dec/prefix.h +5 -3
  20. data/vendor/brotli/dec/state.c +15 -27
  21. data/vendor/brotli/dec/state.h +21 -17
  22. data/vendor/brotli/dec/transform.h +1 -1
  23. data/vendor/brotli/enc/backward_references.c +892 -0
  24. data/vendor/brotli/enc/backward_references.h +85 -102
  25. data/vendor/brotli/enc/backward_references_inc.h +147 -0
  26. data/vendor/brotli/enc/bit_cost.c +35 -0
  27. data/vendor/brotli/enc/bit_cost.h +23 -121
  28. data/vendor/brotli/enc/bit_cost_inc.h +127 -0
  29. data/vendor/brotli/enc/block_encoder_inc.h +33 -0
  30. data/vendor/brotli/enc/block_splitter.c +197 -0
  31. data/vendor/brotli/enc/block_splitter.h +40 -50
  32. data/vendor/brotli/enc/block_splitter_inc.h +432 -0
  33. data/vendor/brotli/enc/brotli_bit_stream.c +1334 -0
  34. data/vendor/brotli/enc/brotli_bit_stream.h +95 -167
  35. data/vendor/brotli/enc/cluster.c +56 -0
  36. data/vendor/brotli/enc/cluster.h +23 -305
  37. data/vendor/brotli/enc/cluster_inc.h +315 -0
  38. data/vendor/brotli/enc/command.h +83 -76
  39. data/vendor/brotli/enc/compress_fragment.c +747 -0
  40. data/vendor/brotli/enc/compress_fragment.h +48 -37
  41. data/vendor/brotli/enc/compress_fragment_two_pass.c +557 -0
  42. data/vendor/brotli/enc/compress_fragment_two_pass.h +37 -26
  43. data/vendor/brotli/enc/compressor.cc +139 -0
  44. data/vendor/brotli/enc/compressor.h +146 -0
  45. data/vendor/brotli/enc/context.h +102 -96
  46. data/vendor/brotli/enc/dictionary_hash.h +9 -5
  47. data/vendor/brotli/enc/encode.c +1562 -0
  48. data/vendor/brotli/enc/encode.h +211 -199
  49. data/vendor/brotli/enc/encode_parallel.cc +161 -151
  50. data/vendor/brotli/enc/encode_parallel.h +7 -8
  51. data/vendor/brotli/enc/entropy_encode.c +501 -0
  52. data/vendor/brotli/enc/entropy_encode.h +107 -89
  53. data/vendor/brotli/enc/entropy_encode_static.h +29 -62
  54. data/vendor/brotli/enc/fast_log.h +26 -20
  55. data/vendor/brotli/enc/find_match_length.h +23 -20
  56. data/vendor/brotli/enc/hash.h +614 -871
  57. data/vendor/brotli/enc/hash_forgetful_chain_inc.h +249 -0
  58. data/vendor/brotli/enc/hash_longest_match_inc.h +241 -0
  59. data/vendor/brotli/enc/hash_longest_match_quickly_inc.h +230 -0
  60. data/vendor/brotli/enc/histogram.c +95 -0
  61. data/vendor/brotli/enc/histogram.h +49 -83
  62. data/vendor/brotli/enc/histogram_inc.h +51 -0
  63. data/vendor/brotli/enc/literal_cost.c +178 -0
  64. data/vendor/brotli/enc/literal_cost.h +16 -10
  65. data/vendor/brotli/enc/memory.c +181 -0
  66. data/vendor/brotli/enc/memory.h +62 -0
  67. data/vendor/brotli/enc/metablock.c +515 -0
  68. data/vendor/brotli/enc/metablock.h +87 -57
  69. data/vendor/brotli/enc/metablock_inc.h +183 -0
  70. data/vendor/brotli/enc/port.h +73 -47
  71. data/vendor/brotli/enc/prefix.h +34 -61
  72. data/vendor/brotli/enc/quality.h +130 -0
  73. data/vendor/brotli/enc/ringbuffer.h +137 -122
  74. data/vendor/brotli/enc/{static_dict.cc → static_dict.c} +162 -139
  75. data/vendor/brotli/enc/static_dict.h +23 -18
  76. data/vendor/brotli/enc/static_dict_lut.h +11223 -12037
  77. data/vendor/brotli/enc/streams.cc +7 -7
  78. data/vendor/brotli/enc/streams.h +32 -32
  79. data/vendor/brotli/enc/{utf8_util.cc → utf8_util.c} +22 -20
  80. data/vendor/brotli/enc/utf8_util.h +16 -9
  81. data/vendor/brotli/enc/write_bits.h +49 -43
  82. metadata +34 -25
  83. data/ext/brotli/brotli.cc +0 -181
  84. data/vendor/brotli/dec/Makefile +0 -12
  85. data/vendor/brotli/dec/dictionary.c +0 -9466
  86. data/vendor/brotli/dec/dictionary.h +0 -38
  87. data/vendor/brotli/dec/types.h +0 -38
  88. data/vendor/brotli/enc/Makefile +0 -14
  89. data/vendor/brotli/enc/backward_references.cc +0 -858
  90. data/vendor/brotli/enc/block_splitter.cc +0 -505
  91. data/vendor/brotli/enc/brotli_bit_stream.cc +0 -1181
  92. data/vendor/brotli/enc/compress_fragment.cc +0 -701
  93. data/vendor/brotli/enc/compress_fragment_two_pass.cc +0 -524
  94. data/vendor/brotli/enc/dictionary.cc +0 -9466
  95. data/vendor/brotli/enc/dictionary.h +0 -41
  96. data/vendor/brotli/enc/encode.cc +0 -1180
  97. data/vendor/brotli/enc/entropy_encode.cc +0 -480
  98. data/vendor/brotli/enc/histogram.cc +0 -67
  99. data/vendor/brotli/enc/literal_cost.cc +0 -165
  100. data/vendor/brotli/enc/metablock.cc +0 -539
  101. data/vendor/brotli/enc/transform.h +0 -248
  102. data/vendor/brotli/enc/types.h +0 -29
@@ -4,327 +4,45 @@
4
4
  See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5
5
  */
6
6
 
7
- // Functions for clustering similar histograms together.
7
+ /* Functions for clustering similar histograms together. */
8
8
 
9
9
  #ifndef BROTLI_ENC_CLUSTER_H_
10
10
  #define BROTLI_ENC_CLUSTER_H_
11
11
 
12
- #include <math.h>
13
- #include <algorithm>
14
- #include <utility>
15
- #include <vector>
16
-
17
- #include "./bit_cost.h"
18
- #include "./entropy_encode.h"
19
- #include "./fast_log.h"
12
+ #include "../common/types.h"
20
13
  #include "./histogram.h"
14
+ #include "./memory.h"
21
15
  #include "./port.h"
22
- #include "./types.h"
23
16
 
24
- namespace brotli {
17
+ #if defined(__cplusplus) || defined(c_plusplus)
18
+ extern "C" {
19
+ #endif
25
20
 
26
- struct HistogramPair {
21
+ typedef struct HistogramPair {
27
22
  uint32_t idx1;
28
23
  uint32_t idx2;
29
24
  double cost_combo;
30
25
  double cost_diff;
31
- };
32
-
33
- inline bool operator<(const HistogramPair& p1, const HistogramPair& p2) {
34
- if (p1.cost_diff != p2.cost_diff) {
35
- return p1.cost_diff > p2.cost_diff;
36
- }
37
- return (p1.idx2 - p1.idx1) > (p2.idx2 - p2.idx1);
38
- }
39
-
40
- // Returns entropy reduction of the context map when we combine two clusters.
41
- inline double ClusterCostDiff(size_t size_a, size_t size_b) {
42
- size_t size_c = size_a + size_b;
43
- return static_cast<double>(size_a) * FastLog2(size_a) +
44
- static_cast<double>(size_b) * FastLog2(size_b) -
45
- static_cast<double>(size_c) * FastLog2(size_c);
46
- }
47
-
48
- // Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
49
- // it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue.
50
- template<typename HistogramType>
51
- void CompareAndPushToQueue(const HistogramType* out,
52
- const uint32_t* cluster_size,
53
- uint32_t idx1, uint32_t idx2,
54
- size_t max_num_pairs,
55
- HistogramPair* pairs,
56
- size_t* num_pairs) {
57
- if (idx1 == idx2) {
58
- return;
59
- }
60
- if (idx2 < idx1) {
61
- uint32_t t = idx2;
62
- idx2 = idx1;
63
- idx1 = t;
64
- }
65
- bool store_pair = false;
66
- HistogramPair p;
67
- p.idx1 = idx1;
68
- p.idx2 = idx2;
69
- p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
70
- p.cost_diff -= out[idx1].bit_cost_;
71
- p.cost_diff -= out[idx2].bit_cost_;
72
-
73
- if (out[idx1].total_count_ == 0) {
74
- p.cost_combo = out[idx2].bit_cost_;
75
- store_pair = true;
76
- } else if (out[idx2].total_count_ == 0) {
77
- p.cost_combo = out[idx1].bit_cost_;
78
- store_pair = true;
79
- } else {
80
- double threshold = *num_pairs == 0 ? 1e99 :
81
- std::max(0.0, pairs[0].cost_diff);
82
- HistogramType combo = out[idx1];
83
- combo.AddHistogram(out[idx2]);
84
- double cost_combo = PopulationCost(combo);
85
- if (cost_combo < threshold - p.cost_diff) {
86
- p.cost_combo = cost_combo;
87
- store_pair = true;
88
- }
89
- }
90
- if (store_pair) {
91
- p.cost_diff += p.cost_combo;
92
- if (*num_pairs > 0 && pairs[0] < p) {
93
- // Replace the top of the queue if needed.
94
- if (*num_pairs < max_num_pairs) {
95
- pairs[*num_pairs] = pairs[0];
96
- ++(*num_pairs);
97
- }
98
- pairs[0] = p;
99
- } else if (*num_pairs < max_num_pairs) {
100
- pairs[*num_pairs] = p;
101
- ++(*num_pairs);
102
- }
103
- }
104
- }
105
-
106
- template<typename HistogramType>
107
- size_t HistogramCombine(HistogramType* out,
108
- uint32_t* cluster_size,
109
- uint32_t* symbols,
110
- uint32_t* clusters,
111
- HistogramPair* pairs,
112
- size_t num_clusters,
113
- size_t symbols_size,
114
- size_t max_clusters,
115
- size_t max_num_pairs) {
116
- double cost_diff_threshold = 0.0;
117
- size_t min_cluster_size = 1;
118
-
119
- // We maintain a vector of histogram pairs, with the property that the pair
120
- // with the maximum bit cost reduction is the first.
121
- size_t num_pairs = 0;
122
- for (size_t idx1 = 0; idx1 < num_clusters; ++idx1) {
123
- for (size_t idx2 = idx1 + 1; idx2 < num_clusters; ++idx2) {
124
- CompareAndPushToQueue(out, cluster_size, clusters[idx1], clusters[idx2],
125
- max_num_pairs, &pairs[0], &num_pairs);
126
- }
127
- }
128
-
129
- while (num_clusters > min_cluster_size) {
130
- if (pairs[0].cost_diff >= cost_diff_threshold) {
131
- cost_diff_threshold = 1e99;
132
- min_cluster_size = max_clusters;
133
- continue;
134
- }
135
- // Take the best pair from the top of heap.
136
- uint32_t best_idx1 = pairs[0].idx1;
137
- uint32_t best_idx2 = pairs[0].idx2;
138
- out[best_idx1].AddHistogram(out[best_idx2]);
139
- out[best_idx1].bit_cost_ = pairs[0].cost_combo;
140
- cluster_size[best_idx1] += cluster_size[best_idx2];
141
- for (size_t i = 0; i < symbols_size; ++i) {
142
- if (symbols[i] == best_idx2) {
143
- symbols[i] = best_idx1;
144
- }
145
- }
146
- for (size_t i = 0; i < num_clusters; ++i) {
147
- if (clusters[i] == best_idx2) {
148
- memmove(&clusters[i], &clusters[i + 1],
149
- (num_clusters - i - 1) * sizeof(clusters[0]));
150
- break;
151
- }
152
- }
153
- --num_clusters;
154
- // Remove pairs intersecting the just combined best pair.
155
- size_t copy_to_idx = 0;
156
- for (size_t i = 0; i < num_pairs; ++i) {
157
- HistogramPair& p = pairs[i];
158
- if (p.idx1 == best_idx1 || p.idx2 == best_idx1 ||
159
- p.idx1 == best_idx2 || p.idx2 == best_idx2) {
160
- // Remove invalid pair from the queue.
161
- continue;
162
- }
163
- if (pairs[0] < p) {
164
- // Replace the top of the queue if needed.
165
- HistogramPair front = pairs[0];
166
- pairs[0] = p;
167
- pairs[copy_to_idx] = front;
168
- } else {
169
- pairs[copy_to_idx] = p;
170
- }
171
- ++copy_to_idx;
172
- }
173
- num_pairs = copy_to_idx;
174
-
175
- // Push new pairs formed with the combined histogram to the heap.
176
- for (size_t i = 0; i < num_clusters; ++i) {
177
- CompareAndPushToQueue(out, cluster_size, best_idx1, clusters[i],
178
- max_num_pairs, &pairs[0], &num_pairs);
179
- }
180
- }
181
- return num_clusters;
182
- }
183
-
184
- // -----------------------------------------------------------------------------
185
- // Histogram refinement
186
-
187
- // What is the bit cost of moving histogram from cur_symbol to candidate.
188
- template<typename HistogramType>
189
- double HistogramBitCostDistance(const HistogramType& histogram,
190
- const HistogramType& candidate) {
191
- if (histogram.total_count_ == 0) {
192
- return 0.0;
193
- }
194
- HistogramType tmp = histogram;
195
- tmp.AddHistogram(candidate);
196
- return PopulationCost(tmp) - candidate.bit_cost_;
197
- }
198
-
199
- // Find the best 'out' histogram for each of the 'in' histograms.
200
- // When called, clusters[0..num_clusters) contains the unique values from
201
- // symbols[0..in_size), but this property is not preserved in this function.
202
- // Note: we assume that out[]->bit_cost_ is already up-to-date.
203
- template<typename HistogramType>
204
- void HistogramRemap(const HistogramType* in, size_t in_size,
205
- const uint32_t* clusters, size_t num_clusters,
206
- HistogramType* out, uint32_t* symbols) {
207
- for (size_t i = 0; i < in_size; ++i) {
208
- uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1];
209
- double best_bits = HistogramBitCostDistance(in[i], out[best_out]);
210
- for (size_t j = 0; j < num_clusters; ++j) {
211
- const double cur_bits = HistogramBitCostDistance(in[i], out[clusters[j]]);
212
- if (cur_bits < best_bits) {
213
- best_bits = cur_bits;
214
- best_out = clusters[j];
215
- }
216
- }
217
- symbols[i] = best_out;
218
- }
219
-
220
- // Recompute each out based on raw and symbols.
221
- for (size_t j = 0; j < num_clusters; ++j) {
222
- out[clusters[j]].Clear();
223
- }
224
- for (size_t i = 0; i < in_size; ++i) {
225
- out[symbols[i]].AddHistogram(in[i]);
226
- }
227
- }
228
-
229
- // Reorders elements of the out[0..length) array and changes values in
230
- // symbols[0..length) array in the following way:
231
- // * when called, symbols[] contains indexes into out[], and has N unique
232
- // values (possibly N < length)
233
- // * on return, symbols'[i] = f(symbols[i]) and
234
- // out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
235
- // where f is a bijection between the range of symbols[] and [0..N), and
236
- // the first occurrences of values in symbols'[i] come in consecutive
237
- // increasing order.
238
- // Returns N, the number of unique values in symbols[].
239
- template<typename HistogramType>
240
- size_t HistogramReindex(HistogramType* out, uint32_t* symbols, size_t length) {
241
- static const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
242
- std::vector<uint32_t> new_index(length, kInvalidIndex);
243
- uint32_t next_index = 0;
244
- for (size_t i = 0; i < length; ++i) {
245
- if (new_index[symbols[i]] == kInvalidIndex) {
246
- new_index[symbols[i]] = next_index;
247
- ++next_index;
248
- }
249
- }
250
- std::vector<HistogramType> tmp(next_index);
251
- next_index = 0;
252
- for (size_t i = 0; i < length; ++i) {
253
- if (new_index[symbols[i]] == next_index) {
254
- tmp[next_index] = out[symbols[i]];
255
- ++next_index;
256
- }
257
- symbols[i] = new_index[symbols[i]];
258
- }
259
- for (size_t i = 0; i < next_index; ++i) {
260
- out[i] = tmp[i];
261
- }
262
- return next_index;
263
- }
264
-
265
- // Clusters similar histograms in 'in' together, the selected histograms are
266
- // placed in 'out', and for each index in 'in', *histogram_symbols will
267
- // indicate which of the 'out' histograms is the best approximation.
268
- template<typename HistogramType>
269
- void ClusterHistograms(const std::vector<HistogramType>& in,
270
- size_t num_contexts, size_t num_blocks,
271
- size_t max_histograms,
272
- std::vector<HistogramType>* out,
273
- std::vector<uint32_t>* histogram_symbols) {
274
- const size_t in_size = num_contexts * num_blocks;
275
- assert(in_size == in.size());
276
- std::vector<uint32_t> cluster_size(in_size, 1);
277
- std::vector<uint32_t> clusters(in_size);
278
- size_t num_clusters = 0;
279
- out->resize(in_size);
280
- histogram_symbols->resize(in_size);
281
- for (size_t i = 0; i < in_size; ++i) {
282
- (*out)[i] = in[i];
283
- (*out)[i].bit_cost_ = PopulationCost(in[i]);
284
- (*histogram_symbols)[i] = static_cast<uint32_t>(i);
285
- }
286
-
287
- const size_t max_input_histograms = 64;
288
- // For the first pass of clustering, we allow all pairs.
289
- size_t max_num_pairs = max_input_histograms * max_input_histograms / 2;
290
- std::vector<HistogramPair> pairs(max_num_pairs + 1);
26
+ } HistogramPair;
291
27
 
292
- for (size_t i = 0; i < in_size; i += max_input_histograms) {
293
- size_t num_to_combine = std::min(in_size - i, max_input_histograms);
294
- for (size_t j = 0; j < num_to_combine; ++j) {
295
- clusters[num_clusters + j] = static_cast<uint32_t>(i + j);
296
- }
297
- size_t num_new_clusters =
298
- HistogramCombine(&(*out)[0], &cluster_size[0],
299
- &(*histogram_symbols)[i],
300
- &clusters[num_clusters], &pairs[0],
301
- num_to_combine, num_to_combine,
302
- max_histograms, max_num_pairs);
303
- num_clusters += num_new_clusters;
304
- }
28
+ #define CODE(X) /* Declaration */;
305
29
 
306
- // For the second pass, we limit the total number of histogram pairs.
307
- // After this limit is reached, we only keep searching for the best pair.
308
- max_num_pairs =
309
- std::min(64 * num_clusters, (num_clusters / 2) * num_clusters);
310
- pairs.resize(max_num_pairs + 1);
30
+ #define FN(X) X ## Literal
31
+ #include "./cluster_inc.h" /* NOLINT(build/include) */
32
+ #undef FN
311
33
 
312
- // Collapse similar histograms.
313
- num_clusters = HistogramCombine(&(*out)[0], &cluster_size[0],
314
- &(*histogram_symbols)[0], &clusters[0],
315
- &pairs[0], num_clusters, in_size,
316
- max_histograms, max_num_pairs);
34
+ #define FN(X) X ## Command
35
+ #include "./cluster_inc.h" /* NOLINT(build/include) */
36
+ #undef FN
317
37
 
318
- // Find the optimal map from original histograms to the final ones.
319
- HistogramRemap(&in[0], in_size, &clusters[0], num_clusters,
320
- &(*out)[0], &(*histogram_symbols)[0]);
38
+ #define FN(X) X ## Distance
39
+ #include "./cluster_inc.h" /* NOLINT(build/include) */
40
+ #undef FN
321
41
 
322
- // Convert the context map to a canonical form.
323
- size_t num_histograms =
324
- HistogramReindex(&(*out)[0], &(*histogram_symbols)[0], in_size);
325
- out->resize(num_histograms);
326
- }
42
+ #undef CODE
327
43
 
328
- } // namespace brotli
44
+ #if defined(__cplusplus) || defined(c_plusplus)
45
+ } /* extern "C" */
46
+ #endif
329
47
 
330
- #endif // BROTLI_ENC_CLUSTER_H_
48
+ #endif /* BROTLI_ENC_CLUSTER_H_ */
@@ -0,0 +1,315 @@
1
+ /* NOLINT(build/header_guard) */
2
+ /* Copyright 2013 Google Inc. All Rights Reserved.
3
+
4
+ Distributed under MIT license.
5
+ See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
6
+ */
7
+
8
+ /* template parameters: FN, CODE */
9
+
10
+ #define HistogramType FN(Histogram)
11
+
12
+ /* Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
13
+ it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. Does nothing when idx1 == idx2. A stored pair always has idx1 < idx2. The queue holds at most max_num_pairs entries, *num_pairs is its current length, and only pairs[0] is kept ordered: it is swapped out whenever HistogramPairIsLess(&pairs[0], &p) holds for the new pair p. */
14
+ BROTLI_INTERNAL void FN(BrotliCompareAndPushToQueue)(
15
+ const HistogramType* out, const uint32_t* cluster_size, uint32_t idx1,
16
+ uint32_t idx2, size_t max_num_pairs, HistogramPair* pairs,
17
+ size_t* num_pairs) CODE({
18
+ BROTLI_BOOL is_good_pair = BROTLI_FALSE;
19
+ HistogramPair p;
20
+ if (idx1 == idx2) {
21
+ return; /* A histogram is never paired with itself. */
22
+ }
23
+ if (idx2 < idx1) { /* Normalize the order so that idx1 < idx2. */
24
+ uint32_t t = idx2;
25
+ idx2 = idx1;
26
+ idx1 = t;
27
+ }
28
+ p.idx1 = idx1;
29
+ p.idx2 = idx2;
30
+ p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
31
+ p.cost_diff -= out[idx1].bit_cost_;
32
+ p.cost_diff -= out[idx2].bit_cost_;
33
+
34
+ if (out[idx1].total_count_ == 0) { /* out[idx1] is empty: the combined cost is taken from out[idx2]. */
35
+ p.cost_combo = out[idx2].bit_cost_;
36
+ is_good_pair = BROTLI_TRUE;
37
+ } else if (out[idx2].total_count_ == 0) { /* out[idx2] is empty: the combined cost is taken from out[idx1]. */
38
+ p.cost_combo = out[idx1].bit_cost_;
39
+ is_good_pair = BROTLI_TRUE;
40
+ } else {
41
+ double threshold = *num_pairs == 0 ? 1e99 : /* Empty queue: 1e99 makes the limit effectively unbounded. */
42
+ BROTLI_MAX(double, 0.0, pairs[0].cost_diff);
43
+ HistogramType combo = out[idx1]; /* Work on a copy; out[] itself is not modified here. */
44
+ double cost_combo;
45
+ FN(HistogramAddHistogram)(&combo, &out[idx2]);
46
+ cost_combo = FN(BrotliPopulationCost)(&combo);
47
+ if (cost_combo < threshold - p.cost_diff) { /* Same as: final cost_diff (after the += below) < threshold. */
48
+ p.cost_combo = cost_combo;
49
+ is_good_pair = BROTLI_TRUE;
50
+ }
51
+ }
52
+ if (is_good_pair) {
53
+ p.cost_diff += p.cost_combo; /* cost_diff is now: combined cost - both separate costs + 0.5 * ClusterCostDiff. */
54
+ if (*num_pairs > 0 && HistogramPairIsLess(&pairs[0], &p)) {
55
+ /* The new pair becomes the front of the queue; the old front moves to the end if there is room, otherwise it is dropped. */
56
+ if (*num_pairs < max_num_pairs) {
57
+ pairs[*num_pairs] = pairs[0];
58
+ ++(*num_pairs);
59
+ }
60
+ pairs[0] = p;
61
+ } else if (*num_pairs < max_num_pairs) { /* Otherwise append p, or drop it when the queue is full. */
62
+ pairs[*num_pairs] = p;
63
+ ++(*num_pairs);
64
+ }
65
+ }
66
+ })
67
+ /* Repeatedly merges the pair of clusters stored at pairs[0]. Merging continues first while pairs[0].cost_diff is below 0.0, and after that while more than max_clusters clusters remain. For each merge, out[], cluster_size[], symbols[0..symbols_size) and clusters[] are updated in place. Returns the number of entries left in clusters[]. */
68
+ BROTLI_INTERNAL size_t FN(BrotliHistogramCombine)(HistogramType* out,
69
+ uint32_t* cluster_size,
70
+ uint32_t* symbols,
71
+ uint32_t* clusters,
72
+ HistogramPair* pairs,
73
+ size_t num_clusters,
74
+ size_t symbols_size,
75
+ size_t max_clusters,
76
+ size_t max_num_pairs) CODE({
77
+ double cost_diff_threshold = 0.0;
78
+ size_t min_cluster_size = 1;
79
+ size_t num_pairs = 0;
80
+
81
+ {
82
+ /* We maintain a vector of histogram pairs, with the property that the pair
83
+ with the maximum bit cost reduction is the first. Only pairs[0] is kept ordered; the remaining entries are in no particular order. */
84
+ size_t idx1;
85
+ for (idx1 = 0; idx1 < num_clusters; ++idx1) {
86
+ size_t idx2;
87
+ for (idx2 = idx1 + 1; idx2 < num_clusters; ++idx2) {
88
+ FN(BrotliCompareAndPushToQueue)(out, cluster_size, clusters[idx1],
89
+ clusters[idx2], max_num_pairs, &pairs[0], &num_pairs);
90
+ }
91
+ }
92
+ }
93
+
94
+ while (num_clusters > min_cluster_size) {
95
+ uint32_t best_idx1;
96
+ uint32_t best_idx2;
97
+ size_t i;
98
+ if (pairs[0].cost_diff >= cost_diff_threshold) { /* The best pair no longer passes the current threshold. */
99
+ cost_diff_threshold = 1e99; /* From now on accept effectively any pair ... */
100
+ min_cluster_size = max_clusters; /* ... but stop once at most max_clusters clusters remain. */
101
+ continue; /* Re-evaluate the loop condition with the new limits. */
102
+ }
103
+ /* Take the best pair from the front of the queue (pairs[0]). */
104
+ best_idx1 = pairs[0].idx1;
105
+ best_idx2 = pairs[0].idx2;
106
+ FN(HistogramAddHistogram)(&out[best_idx1], &out[best_idx2]);
107
+ out[best_idx1].bit_cost_ = pairs[0].cost_combo;
108
+ cluster_size[best_idx1] += cluster_size[best_idx2];
109
+ for (i = 0; i < symbols_size; ++i) { /* Relabel every symbol that pointed at the absorbed cluster. */
110
+ if (symbols[i] == best_idx2) {
111
+ symbols[i] = best_idx1;
112
+ }
113
+ }
114
+ for (i = 0; i < num_clusters; ++i) { /* Delete best_idx2 from clusters[] by shifting the tail left by one. */
115
+ if (clusters[i] == best_idx2) {
116
+ memmove(&clusters[i], &clusters[i + 1],
117
+ (num_clusters - i - 1) * sizeof(clusters[0]));
118
+ break;
119
+ }
120
+ }
121
+ --num_clusters;
122
+ {
123
+ /* Remove pairs intersecting the just combined best pair, compacting the survivors in place. */
124
+ size_t copy_to_idx = 0;
125
+ for (i = 0; i < num_pairs; ++i) {
126
+ HistogramPair* p = &pairs[i];
127
+ if (p->idx1 == best_idx1 || p->idx2 == best_idx1 ||
128
+ p->idx1 == best_idx2 || p->idx2 == best_idx2) {
129
+ /* Remove invalid pair from the queue. */
130
+ continue;
131
+ }
132
+ if (HistogramPairIsLess(&pairs[0], p)) {
133
+ /* Replace the front of the queue if needed; the old front takes the slot being filled. */
134
+ HistogramPair front = pairs[0];
135
+ pairs[0] = *p;
136
+ pairs[copy_to_idx] = front;
137
+ } else {
138
+ pairs[copy_to_idx] = *p;
139
+ }
140
+ ++copy_to_idx;
141
+ }
142
+ num_pairs = copy_to_idx;
143
+ }
144
+
145
+ /* Push new pairs formed with the combined histogram to the queue. The call ignores the entry where clusters[i] == best_idx1. */
146
+ for (i = 0; i < num_clusters; ++i) {
147
+ FN(BrotliCompareAndPushToQueue)(out, cluster_size, best_idx1, clusters[i],
148
+ max_num_pairs, &pairs[0], &num_pairs);
149
+ }
150
+ }
151
+ return num_clusters;
152
+ })
153
+
154
+ /* Returns the bit cost of moving histogram to candidate: 0.0 when histogram is empty, otherwise the population cost of (histogram + candidate) minus candidate->bit_cost_. Neither argument is modified. */
155
+ BROTLI_INTERNAL double FN(BrotliHistogramBitCostDistance)(
156
+ const HistogramType* histogram, const HistogramType* candidate) CODE({
157
+ if (histogram->total_count_ == 0) {
158
+ return 0.0;
159
+ } else {
160
+ HistogramType tmp = *histogram; /* Sum into a local copy so the inputs stay untouched. */
161
+ FN(HistogramAddHistogram)(&tmp, candidate);
162
+ return FN(BrotliPopulationCost)(&tmp) - candidate->bit_cost_; /* Relies on candidate->bit_cost_ being up to date. */
163
+ }
164
+ })
165
+
166
+ /* Find the best 'out' histogram for each of the 'in' histograms.
167
+ When called, clusters[0..num_clusters) contains the unique values from
168
+ symbols[0..in_size), but this property is not preserved in this function.
169
+ Note: we assume that out[]->bit_cost_ is already up-to-date. On return, symbols[i] holds the chosen index into out[] for in[i], and each out[clusters[j]] has been rebuilt as the sum of the in[] histograms assigned to it. */
170
+ BROTLI_INTERNAL void FN(BrotliHistogramRemap)(const HistogramType* in,
171
+ size_t in_size, const uint32_t* clusters, size_t num_clusters,
172
+ HistogramType* out, uint32_t* symbols) CODE({
173
+ size_t i;
174
+ for (i = 0; i < in_size; ++i) {
175
+ uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1]; /* Initial candidate: the cluster just chosen for the previous input, or symbols[0] for the first one. */
176
+ double best_bits =
177
+ FN(BrotliHistogramBitCostDistance)(&in[i], &out[best_out]);
178
+ size_t j;
179
+ for (j = 0; j < num_clusters; ++j) {
180
+ const double cur_bits =
181
+ FN(BrotliHistogramBitCostDistance)(&in[i], &out[clusters[j]]);
182
+ if (cur_bits < best_bits) { /* Strict comparison: on a tie the earlier candidate is kept. */
183
+ best_bits = cur_bits;
184
+ best_out = clusters[j];
185
+ }
186
+ }
187
+ symbols[i] = best_out;
188
+ }
189
+
190
+ /* Recompute each out[clusters[i]] from the in[] histograms and the new symbols[]. */
191
+ for (i = 0; i < num_clusters; ++i) {
192
+ FN(HistogramClear)(&out[clusters[i]]);
193
+ }
194
+ for (i = 0; i < in_size; ++i) {
195
+ FN(HistogramAddHistogram)(&out[symbols[i]], &in[i]);
196
+ }
197
+ })
198
+
199
+ /* Reorders elements of the out[0..length) array and changes values in
200
+ symbols[0..length) array in the following way:
201
+ * when called, symbols[] contains indexes into out[], and has N unique
202
+ values (possibly N < length)
203
+ * on return, symbols'[i] = f(symbols[i]) and
204
+ out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
205
+ where f is a bijection between the range of symbols[] and [0..N), and
206
+ the first occurrences of values in symbols'[i] come in consecutive
207
+ increasing order.
208
+ Returns N, the number of unique values in symbols[]. Returns 0 instead if BROTLI_IS_OOM(m) is set after either allocation. Every symbols[i] must be less than length, because it is used as an index into a table of length entries. */
209
+ BROTLI_INTERNAL size_t FN(BrotliHistogramReindex)(MemoryManager* m,
210
+ HistogramType* out, uint32_t* symbols, size_t length) CODE({
211
+ static const uint32_t kInvalidIndex = BROTLI_UINT32_MAX; /* Marks "no new index assigned yet". */
212
+ uint32_t* new_index = BROTLI_ALLOC(m, uint32_t, length); /* new_index[old] is the remapped index f(old). */
213
+ uint32_t next_index;
214
+ HistogramType* tmp;
215
+ size_t i;
216
+ if (BROTLI_IS_OOM(m)) return 0;
217
+ for (i = 0; i < length; ++i) {
218
+ new_index[i] = kInvalidIndex;
219
+ }
220
+ next_index = 0;
221
+ for (i = 0; i < length; ++i) { /* First pass: number the distinct values in order of first occurrence. */
222
+ if (new_index[symbols[i]] == kInvalidIndex) {
223
+ new_index[symbols[i]] = next_index;
224
+ ++next_index;
225
+ }
226
+ }
227
+ /* TODO: using the "cycle-sort" idea would avoid the allocation of
228
+ tmp and halve the number of histogram copies. */
229
+ tmp = BROTLI_ALLOC(m, HistogramType, next_index);
230
+ if (BROTLI_IS_OOM(m)) return 0; /* NOTE(review): new_index is not freed on this path; presumably the MemoryManager reclaims it - confirm. */
231
+ next_index = 0;
232
+ for (i = 0; i < length; ++i) { /* Second pass: copy each histogram once, at the first occurrence of its value, then rewrite the symbol. */
233
+ if (new_index[symbols[i]] == next_index) {
234
+ tmp[next_index] = out[symbols[i]];
235
+ ++next_index;
236
+ }
237
+ symbols[i] = new_index[symbols[i]];
238
+ }
239
+ BROTLI_FREE(m, new_index);
240
+ for (i = 0; i < next_index; ++i) { /* Copy the reordered histograms back into the front of out[]. */
241
+ out[i] = tmp[i];
242
+ }
243
+ BROTLI_FREE(m, tmp);
244
+ return next_index;
245
+ })
246
+ /* Clusters similar histograms in in[0..in_size) together. The selected histograms are placed in out[], *out_size receives their count, and histogram_symbols[i] is the index in out[] chosen for in[i]. Both out[] and histogram_symbols[] are written for all in_size indexes. Returns early, possibly without setting *out_size, when BROTLI_IS_OOM(m) is set. */
247
+ BROTLI_INTERNAL void FN(BrotliClusterHistograms)(
248
+ MemoryManager* m, const HistogramType* in, const size_t in_size,
249
+ size_t max_histograms, HistogramType* out, size_t* out_size,
250
+ uint32_t* histogram_symbols) CODE({
251
+ uint32_t* cluster_size = BROTLI_ALLOC(m, uint32_t, in_size);
252
+ uint32_t* clusters = BROTLI_ALLOC(m, uint32_t, in_size);
253
+ size_t num_clusters = 0;
254
+ const size_t max_input_histograms = 64; /* Number of consecutive inputs combined together in the first pass. */
255
+ size_t pairs_capacity = max_input_histograms * max_input_histograms / 2;
256
+ /* For the first pass of clustering, we allow all pairs: 64 * 64 / 2 slots cover every pair among up to 64 histograms. */
257
+ HistogramPair* pairs = BROTLI_ALLOC(m, HistogramPair, pairs_capacity + 1);
258
+ size_t i;
259
+
260
+ if (BROTLI_IS_OOM(m)) return; /* Checked once, after all three allocations above. */
261
+
262
+ for (i = 0; i < in_size; ++i) {
263
+ cluster_size[i] = 1;
264
+ }
265
+
266
+ for (i = 0; i < in_size; ++i) { /* Start with every input histogram as its own cluster. */
267
+ out[i] = in[i];
268
+ out[i].bit_cost_ = FN(BrotliPopulationCost)(&in[i]);
269
+ histogram_symbols[i] = (uint32_t)i;
270
+ }
271
+
272
+ for (i = 0; i < in_size; i += max_input_histograms) { /* First pass: combine each run of up to 64 consecutive inputs independently. */
273
+ size_t num_to_combine =
274
+ BROTLI_MIN(size_t, in_size - i, max_input_histograms);
275
+ size_t num_new_clusters;
276
+ size_t j;
277
+ for (j = 0; j < num_to_combine; ++j) {
278
+ clusters[num_clusters + j] = (uint32_t)(i + j);
279
+ }
280
+ num_new_clusters =
281
+ FN(BrotliHistogramCombine)(out, cluster_size,
282
+ &histogram_symbols[i],
283
+ &clusters[num_clusters], pairs,
284
+ num_to_combine, num_to_combine,
285
+ max_histograms, pairs_capacity);
286
+ num_clusters += num_new_clusters; /* Surviving cluster ids stay packed at the front of clusters[]. */
287
+ }
288
+
289
+ {
290
+ /* For the second pass, we limit the total number of histogram pairs.
291
+ After this limit is reached, we only keep searching for the best pair. */
292
+ size_t max_num_pairs = BROTLI_MIN(size_t,
293
+ 64 * num_clusters, (num_clusters / 2) * num_clusters);
294
+ BROTLI_ENSURE_CAPACITY(
295
+ m, HistogramPair, pairs, pairs_capacity, max_num_pairs + 1);
296
+ if (BROTLI_IS_OOM(m)) return;
297
+
298
+ /* Collapse similar histograms across all of the first-pass clusters. */
299
+ num_clusters = FN(BrotliHistogramCombine)(out, cluster_size,
300
+ histogram_symbols, clusters,
301
+ pairs, num_clusters, in_size,
302
+ max_histograms, max_num_pairs);
303
+ }
304
+ BROTLI_FREE(m, pairs);
305
+ BROTLI_FREE(m, cluster_size);
306
+ /* Find the optimal map from original histograms to the final ones. */
307
+ FN(BrotliHistogramRemap)(in, in_size, clusters, num_clusters,
308
+ out, histogram_symbols);
309
+ BROTLI_FREE(m, clusters);
310
+ /* Convert the context map to a canonical form. */
311
+ *out_size = FN(BrotliHistogramReindex)(m, out, histogram_symbols, in_size);
312
+ if (BROTLI_IS_OOM(m)) return;
313
+ })
314
+
315
+ #undef HistogramType