brotli 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.travis.yml +11 -3
- data/Gemfile +2 -0
- data/ext/brotli/brotli.c +279 -0
- data/ext/brotli/brotli.h +2 -0
- data/ext/brotli/buffer.c +95 -0
- data/ext/brotli/buffer.h +19 -0
- data/ext/brotli/extconf.rb +21 -81
- data/lib/brotli/version.rb +1 -1
- data/vendor/brotli/dec/bit_reader.c +5 -5
- data/vendor/brotli/dec/bit_reader.h +15 -15
- data/vendor/brotli/dec/context.h +1 -1
- data/vendor/brotli/dec/decode.c +433 -348
- data/vendor/brotli/dec/decode.h +74 -48
- data/vendor/brotli/dec/huffman.c +5 -4
- data/vendor/brotli/dec/huffman.h +4 -4
- data/vendor/brotli/dec/port.h +2 -95
- data/vendor/brotli/dec/prefix.h +5 -3
- data/vendor/brotli/dec/state.c +15 -27
- data/vendor/brotli/dec/state.h +21 -17
- data/vendor/brotli/dec/transform.h +1 -1
- data/vendor/brotli/enc/backward_references.c +892 -0
- data/vendor/brotli/enc/backward_references.h +85 -102
- data/vendor/brotli/enc/backward_references_inc.h +147 -0
- data/vendor/brotli/enc/bit_cost.c +35 -0
- data/vendor/brotli/enc/bit_cost.h +23 -121
- data/vendor/brotli/enc/bit_cost_inc.h +127 -0
- data/vendor/brotli/enc/block_encoder_inc.h +33 -0
- data/vendor/brotli/enc/block_splitter.c +197 -0
- data/vendor/brotli/enc/block_splitter.h +40 -50
- data/vendor/brotli/enc/block_splitter_inc.h +432 -0
- data/vendor/brotli/enc/brotli_bit_stream.c +1334 -0
- data/vendor/brotli/enc/brotli_bit_stream.h +95 -167
- data/vendor/brotli/enc/cluster.c +56 -0
- data/vendor/brotli/enc/cluster.h +23 -305
- data/vendor/brotli/enc/cluster_inc.h +315 -0
- data/vendor/brotli/enc/command.h +83 -76
- data/vendor/brotli/enc/compress_fragment.c +747 -0
- data/vendor/brotli/enc/compress_fragment.h +48 -37
- data/vendor/brotli/enc/compress_fragment_two_pass.c +557 -0
- data/vendor/brotli/enc/compress_fragment_two_pass.h +37 -26
- data/vendor/brotli/enc/compressor.cc +139 -0
- data/vendor/brotli/enc/compressor.h +146 -0
- data/vendor/brotli/enc/context.h +102 -96
- data/vendor/brotli/enc/dictionary_hash.h +9 -5
- data/vendor/brotli/enc/encode.c +1562 -0
- data/vendor/brotli/enc/encode.h +211 -199
- data/vendor/brotli/enc/encode_parallel.cc +161 -151
- data/vendor/brotli/enc/encode_parallel.h +7 -8
- data/vendor/brotli/enc/entropy_encode.c +501 -0
- data/vendor/brotli/enc/entropy_encode.h +107 -89
- data/vendor/brotli/enc/entropy_encode_static.h +29 -62
- data/vendor/brotli/enc/fast_log.h +26 -20
- data/vendor/brotli/enc/find_match_length.h +23 -20
- data/vendor/brotli/enc/hash.h +614 -871
- data/vendor/brotli/enc/hash_forgetful_chain_inc.h +249 -0
- data/vendor/brotli/enc/hash_longest_match_inc.h +241 -0
- data/vendor/brotli/enc/hash_longest_match_quickly_inc.h +230 -0
- data/vendor/brotli/enc/histogram.c +95 -0
- data/vendor/brotli/enc/histogram.h +49 -83
- data/vendor/brotli/enc/histogram_inc.h +51 -0
- data/vendor/brotli/enc/literal_cost.c +178 -0
- data/vendor/brotli/enc/literal_cost.h +16 -10
- data/vendor/brotli/enc/memory.c +181 -0
- data/vendor/brotli/enc/memory.h +62 -0
- data/vendor/brotli/enc/metablock.c +515 -0
- data/vendor/brotli/enc/metablock.h +87 -57
- data/vendor/brotli/enc/metablock_inc.h +183 -0
- data/vendor/brotli/enc/port.h +73 -47
- data/vendor/brotli/enc/prefix.h +34 -61
- data/vendor/brotli/enc/quality.h +130 -0
- data/vendor/brotli/enc/ringbuffer.h +137 -122
- data/vendor/brotli/enc/{static_dict.cc → static_dict.c} +162 -139
- data/vendor/brotli/enc/static_dict.h +23 -18
- data/vendor/brotli/enc/static_dict_lut.h +11223 -12037
- data/vendor/brotli/enc/streams.cc +7 -7
- data/vendor/brotli/enc/streams.h +32 -32
- data/vendor/brotli/enc/{utf8_util.cc → utf8_util.c} +22 -20
- data/vendor/brotli/enc/utf8_util.h +16 -9
- data/vendor/brotli/enc/write_bits.h +49 -43
- metadata +34 -25
- data/ext/brotli/brotli.cc +0 -181
- data/vendor/brotli/dec/Makefile +0 -12
- data/vendor/brotli/dec/dictionary.c +0 -9466
- data/vendor/brotli/dec/dictionary.h +0 -38
- data/vendor/brotli/dec/types.h +0 -38
- data/vendor/brotli/enc/Makefile +0 -14
- data/vendor/brotli/enc/backward_references.cc +0 -858
- data/vendor/brotli/enc/block_splitter.cc +0 -505
- data/vendor/brotli/enc/brotli_bit_stream.cc +0 -1181
- data/vendor/brotli/enc/compress_fragment.cc +0 -701
- data/vendor/brotli/enc/compress_fragment_two_pass.cc +0 -524
- data/vendor/brotli/enc/dictionary.cc +0 -9466
- data/vendor/brotli/enc/dictionary.h +0 -41
- data/vendor/brotli/enc/encode.cc +0 -1180
- data/vendor/brotli/enc/entropy_encode.cc +0 -480
- data/vendor/brotli/enc/histogram.cc +0 -67
- data/vendor/brotli/enc/literal_cost.cc +0 -165
- data/vendor/brotli/enc/metablock.cc +0 -539
- data/vendor/brotli/enc/transform.h +0 -248
- data/vendor/brotli/enc/types.h +0 -29
data/vendor/brotli/enc/cluster.h
CHANGED
@@ -4,327 +4,45 @@
|
|
4
4
|
See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
|
5
5
|
*/
|
6
6
|
|
7
|
-
|
7
|
+
/* Functions for clustering similar histograms together. */
|
8
8
|
|
9
9
|
#ifndef BROTLI_ENC_CLUSTER_H_
|
10
10
|
#define BROTLI_ENC_CLUSTER_H_
|
11
11
|
|
12
|
-
#include
|
13
|
-
#include <algorithm>
|
14
|
-
#include <utility>
|
15
|
-
#include <vector>
|
16
|
-
|
17
|
-
#include "./bit_cost.h"
|
18
|
-
#include "./entropy_encode.h"
|
19
|
-
#include "./fast_log.h"
|
12
|
+
#include "../common/types.h"
|
20
13
|
#include "./histogram.h"
|
14
|
+
#include "./memory.h"
|
21
15
|
#include "./port.h"
|
22
|
-
#include "./types.h"
|
23
16
|
|
24
|
-
|
17
|
+
#if defined(__cplusplus) || defined(c_plusplus)
|
18
|
+
extern "C" {
|
19
|
+
#endif
|
25
20
|
|
26
|
-
struct HistogramPair {
|
21
|
+
typedef struct HistogramPair {
|
27
22
|
uint32_t idx1;
|
28
23
|
uint32_t idx2;
|
29
24
|
double cost_combo;
|
30
25
|
double cost_diff;
|
31
|
-
};
|
32
|
-
|
33
|
-
inline bool operator<(const HistogramPair& p1, const HistogramPair& p2) {
|
34
|
-
if (p1.cost_diff != p2.cost_diff) {
|
35
|
-
return p1.cost_diff > p2.cost_diff;
|
36
|
-
}
|
37
|
-
return (p1.idx2 - p1.idx1) > (p2.idx2 - p2.idx1);
|
38
|
-
}
|
39
|
-
|
40
|
-
// Returns entropy reduction of the context map when we combine two clusters.
|
41
|
-
inline double ClusterCostDiff(size_t size_a, size_t size_b) {
|
42
|
-
size_t size_c = size_a + size_b;
|
43
|
-
return static_cast<double>(size_a) * FastLog2(size_a) +
|
44
|
-
static_cast<double>(size_b) * FastLog2(size_b) -
|
45
|
-
static_cast<double>(size_c) * FastLog2(size_c);
|
46
|
-
}
|
47
|
-
|
48
|
-
// Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
|
49
|
-
// it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue.
|
50
|
-
template<typename HistogramType>
|
51
|
-
void CompareAndPushToQueue(const HistogramType* out,
|
52
|
-
const uint32_t* cluster_size,
|
53
|
-
uint32_t idx1, uint32_t idx2,
|
54
|
-
size_t max_num_pairs,
|
55
|
-
HistogramPair* pairs,
|
56
|
-
size_t* num_pairs) {
|
57
|
-
if (idx1 == idx2) {
|
58
|
-
return;
|
59
|
-
}
|
60
|
-
if (idx2 < idx1) {
|
61
|
-
uint32_t t = idx2;
|
62
|
-
idx2 = idx1;
|
63
|
-
idx1 = t;
|
64
|
-
}
|
65
|
-
bool store_pair = false;
|
66
|
-
HistogramPair p;
|
67
|
-
p.idx1 = idx1;
|
68
|
-
p.idx2 = idx2;
|
69
|
-
p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
|
70
|
-
p.cost_diff -= out[idx1].bit_cost_;
|
71
|
-
p.cost_diff -= out[idx2].bit_cost_;
|
72
|
-
|
73
|
-
if (out[idx1].total_count_ == 0) {
|
74
|
-
p.cost_combo = out[idx2].bit_cost_;
|
75
|
-
store_pair = true;
|
76
|
-
} else if (out[idx2].total_count_ == 0) {
|
77
|
-
p.cost_combo = out[idx1].bit_cost_;
|
78
|
-
store_pair = true;
|
79
|
-
} else {
|
80
|
-
double threshold = *num_pairs == 0 ? 1e99 :
|
81
|
-
std::max(0.0, pairs[0].cost_diff);
|
82
|
-
HistogramType combo = out[idx1];
|
83
|
-
combo.AddHistogram(out[idx2]);
|
84
|
-
double cost_combo = PopulationCost(combo);
|
85
|
-
if (cost_combo < threshold - p.cost_diff) {
|
86
|
-
p.cost_combo = cost_combo;
|
87
|
-
store_pair = true;
|
88
|
-
}
|
89
|
-
}
|
90
|
-
if (store_pair) {
|
91
|
-
p.cost_diff += p.cost_combo;
|
92
|
-
if (*num_pairs > 0 && pairs[0] < p) {
|
93
|
-
// Replace the top of the queue if needed.
|
94
|
-
if (*num_pairs < max_num_pairs) {
|
95
|
-
pairs[*num_pairs] = pairs[0];
|
96
|
-
++(*num_pairs);
|
97
|
-
}
|
98
|
-
pairs[0] = p;
|
99
|
-
} else if (*num_pairs < max_num_pairs) {
|
100
|
-
pairs[*num_pairs] = p;
|
101
|
-
++(*num_pairs);
|
102
|
-
}
|
103
|
-
}
|
104
|
-
}
|
105
|
-
|
106
|
-
template<typename HistogramType>
|
107
|
-
size_t HistogramCombine(HistogramType* out,
|
108
|
-
uint32_t* cluster_size,
|
109
|
-
uint32_t* symbols,
|
110
|
-
uint32_t* clusters,
|
111
|
-
HistogramPair* pairs,
|
112
|
-
size_t num_clusters,
|
113
|
-
size_t symbols_size,
|
114
|
-
size_t max_clusters,
|
115
|
-
size_t max_num_pairs) {
|
116
|
-
double cost_diff_threshold = 0.0;
|
117
|
-
size_t min_cluster_size = 1;
|
118
|
-
|
119
|
-
// We maintain a vector of histogram pairs, with the property that the pair
|
120
|
-
// with the maximum bit cost reduction is the first.
|
121
|
-
size_t num_pairs = 0;
|
122
|
-
for (size_t idx1 = 0; idx1 < num_clusters; ++idx1) {
|
123
|
-
for (size_t idx2 = idx1 + 1; idx2 < num_clusters; ++idx2) {
|
124
|
-
CompareAndPushToQueue(out, cluster_size, clusters[idx1], clusters[idx2],
|
125
|
-
max_num_pairs, &pairs[0], &num_pairs);
|
126
|
-
}
|
127
|
-
}
|
128
|
-
|
129
|
-
while (num_clusters > min_cluster_size) {
|
130
|
-
if (pairs[0].cost_diff >= cost_diff_threshold) {
|
131
|
-
cost_diff_threshold = 1e99;
|
132
|
-
min_cluster_size = max_clusters;
|
133
|
-
continue;
|
134
|
-
}
|
135
|
-
// Take the best pair from the top of heap.
|
136
|
-
uint32_t best_idx1 = pairs[0].idx1;
|
137
|
-
uint32_t best_idx2 = pairs[0].idx2;
|
138
|
-
out[best_idx1].AddHistogram(out[best_idx2]);
|
139
|
-
out[best_idx1].bit_cost_ = pairs[0].cost_combo;
|
140
|
-
cluster_size[best_idx1] += cluster_size[best_idx2];
|
141
|
-
for (size_t i = 0; i < symbols_size; ++i) {
|
142
|
-
if (symbols[i] == best_idx2) {
|
143
|
-
symbols[i] = best_idx1;
|
144
|
-
}
|
145
|
-
}
|
146
|
-
for (size_t i = 0; i < num_clusters; ++i) {
|
147
|
-
if (clusters[i] == best_idx2) {
|
148
|
-
memmove(&clusters[i], &clusters[i + 1],
|
149
|
-
(num_clusters - i - 1) * sizeof(clusters[0]));
|
150
|
-
break;
|
151
|
-
}
|
152
|
-
}
|
153
|
-
--num_clusters;
|
154
|
-
// Remove pairs intersecting the just combined best pair.
|
155
|
-
size_t copy_to_idx = 0;
|
156
|
-
for (size_t i = 0; i < num_pairs; ++i) {
|
157
|
-
HistogramPair& p = pairs[i];
|
158
|
-
if (p.idx1 == best_idx1 || p.idx2 == best_idx1 ||
|
159
|
-
p.idx1 == best_idx2 || p.idx2 == best_idx2) {
|
160
|
-
// Remove invalid pair from the queue.
|
161
|
-
continue;
|
162
|
-
}
|
163
|
-
if (pairs[0] < p) {
|
164
|
-
// Replace the top of the queue if needed.
|
165
|
-
HistogramPair front = pairs[0];
|
166
|
-
pairs[0] = p;
|
167
|
-
pairs[copy_to_idx] = front;
|
168
|
-
} else {
|
169
|
-
pairs[copy_to_idx] = p;
|
170
|
-
}
|
171
|
-
++copy_to_idx;
|
172
|
-
}
|
173
|
-
num_pairs = copy_to_idx;
|
174
|
-
|
175
|
-
// Push new pairs formed with the combined histogram to the heap.
|
176
|
-
for (size_t i = 0; i < num_clusters; ++i) {
|
177
|
-
CompareAndPushToQueue(out, cluster_size, best_idx1, clusters[i],
|
178
|
-
max_num_pairs, &pairs[0], &num_pairs);
|
179
|
-
}
|
180
|
-
}
|
181
|
-
return num_clusters;
|
182
|
-
}
|
183
|
-
|
184
|
-
// -----------------------------------------------------------------------------
|
185
|
-
// Histogram refinement
|
186
|
-
|
187
|
-
// What is the bit cost of moving histogram from cur_symbol to candidate.
|
188
|
-
template<typename HistogramType>
|
189
|
-
double HistogramBitCostDistance(const HistogramType& histogram,
|
190
|
-
const HistogramType& candidate) {
|
191
|
-
if (histogram.total_count_ == 0) {
|
192
|
-
return 0.0;
|
193
|
-
}
|
194
|
-
HistogramType tmp = histogram;
|
195
|
-
tmp.AddHistogram(candidate);
|
196
|
-
return PopulationCost(tmp) - candidate.bit_cost_;
|
197
|
-
}
|
198
|
-
|
199
|
-
// Find the best 'out' histogram for each of the 'in' histograms.
|
200
|
-
// When called, clusters[0..num_clusters) contains the unique values from
|
201
|
-
// symbols[0..in_size), but this property is not preserved in this function.
|
202
|
-
// Note: we assume that out[]->bit_cost_ is already up-to-date.
|
203
|
-
template<typename HistogramType>
|
204
|
-
void HistogramRemap(const HistogramType* in, size_t in_size,
|
205
|
-
const uint32_t* clusters, size_t num_clusters,
|
206
|
-
HistogramType* out, uint32_t* symbols) {
|
207
|
-
for (size_t i = 0; i < in_size; ++i) {
|
208
|
-
uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1];
|
209
|
-
double best_bits = HistogramBitCostDistance(in[i], out[best_out]);
|
210
|
-
for (size_t j = 0; j < num_clusters; ++j) {
|
211
|
-
const double cur_bits = HistogramBitCostDistance(in[i], out[clusters[j]]);
|
212
|
-
if (cur_bits < best_bits) {
|
213
|
-
best_bits = cur_bits;
|
214
|
-
best_out = clusters[j];
|
215
|
-
}
|
216
|
-
}
|
217
|
-
symbols[i] = best_out;
|
218
|
-
}
|
219
|
-
|
220
|
-
// Recompute each out based on raw and symbols.
|
221
|
-
for (size_t j = 0; j < num_clusters; ++j) {
|
222
|
-
out[clusters[j]].Clear();
|
223
|
-
}
|
224
|
-
for (size_t i = 0; i < in_size; ++i) {
|
225
|
-
out[symbols[i]].AddHistogram(in[i]);
|
226
|
-
}
|
227
|
-
}
|
228
|
-
|
229
|
-
// Reorders elements of the out[0..length) array and changes values in
|
230
|
-
// symbols[0..length) array in the following way:
|
231
|
-
// * when called, symbols[] contains indexes into out[], and has N unique
|
232
|
-
// values (possibly N < length)
|
233
|
-
// * on return, symbols'[i] = f(symbols[i]) and
|
234
|
-
// out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
|
235
|
-
// where f is a bijection between the range of symbols[] and [0..N), and
|
236
|
-
// the first occurrences of values in symbols'[i] come in consecutive
|
237
|
-
// increasing order.
|
238
|
-
// Returns N, the number of unique values in symbols[].
|
239
|
-
template<typename HistogramType>
|
240
|
-
size_t HistogramReindex(HistogramType* out, uint32_t* symbols, size_t length) {
|
241
|
-
static const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
|
242
|
-
std::vector<uint32_t> new_index(length, kInvalidIndex);
|
243
|
-
uint32_t next_index = 0;
|
244
|
-
for (size_t i = 0; i < length; ++i) {
|
245
|
-
if (new_index[symbols[i]] == kInvalidIndex) {
|
246
|
-
new_index[symbols[i]] = next_index;
|
247
|
-
++next_index;
|
248
|
-
}
|
249
|
-
}
|
250
|
-
std::vector<HistogramType> tmp(next_index);
|
251
|
-
next_index = 0;
|
252
|
-
for (size_t i = 0; i < length; ++i) {
|
253
|
-
if (new_index[symbols[i]] == next_index) {
|
254
|
-
tmp[next_index] = out[symbols[i]];
|
255
|
-
++next_index;
|
256
|
-
}
|
257
|
-
symbols[i] = new_index[symbols[i]];
|
258
|
-
}
|
259
|
-
for (size_t i = 0; i < next_index; ++i) {
|
260
|
-
out[i] = tmp[i];
|
261
|
-
}
|
262
|
-
return next_index;
|
263
|
-
}
|
264
|
-
|
265
|
-
// Clusters similar histograms in 'in' together, the selected histograms are
|
266
|
-
// placed in 'out', and for each index in 'in', *histogram_symbols will
|
267
|
-
// indicate which of the 'out' histograms is the best approximation.
|
268
|
-
template<typename HistogramType>
|
269
|
-
void ClusterHistograms(const std::vector<HistogramType>& in,
|
270
|
-
size_t num_contexts, size_t num_blocks,
|
271
|
-
size_t max_histograms,
|
272
|
-
std::vector<HistogramType>* out,
|
273
|
-
std::vector<uint32_t>* histogram_symbols) {
|
274
|
-
const size_t in_size = num_contexts * num_blocks;
|
275
|
-
assert(in_size == in.size());
|
276
|
-
std::vector<uint32_t> cluster_size(in_size, 1);
|
277
|
-
std::vector<uint32_t> clusters(in_size);
|
278
|
-
size_t num_clusters = 0;
|
279
|
-
out->resize(in_size);
|
280
|
-
histogram_symbols->resize(in_size);
|
281
|
-
for (size_t i = 0; i < in_size; ++i) {
|
282
|
-
(*out)[i] = in[i];
|
283
|
-
(*out)[i].bit_cost_ = PopulationCost(in[i]);
|
284
|
-
(*histogram_symbols)[i] = static_cast<uint32_t>(i);
|
285
|
-
}
|
286
|
-
|
287
|
-
const size_t max_input_histograms = 64;
|
288
|
-
// For the first pass of clustering, we allow all pairs.
|
289
|
-
size_t max_num_pairs = max_input_histograms * max_input_histograms / 2;
|
290
|
-
std::vector<HistogramPair> pairs(max_num_pairs + 1);
|
26
|
+
} HistogramPair;
|
291
27
|
|
292
|
-
|
293
|
-
size_t num_to_combine = std::min(in_size - i, max_input_histograms);
|
294
|
-
for (size_t j = 0; j < num_to_combine; ++j) {
|
295
|
-
clusters[num_clusters + j] = static_cast<uint32_t>(i + j);
|
296
|
-
}
|
297
|
-
size_t num_new_clusters =
|
298
|
-
HistogramCombine(&(*out)[0], &cluster_size[0],
|
299
|
-
&(*histogram_symbols)[i],
|
300
|
-
&clusters[num_clusters], &pairs[0],
|
301
|
-
num_to_combine, num_to_combine,
|
302
|
-
max_histograms, max_num_pairs);
|
303
|
-
num_clusters += num_new_clusters;
|
304
|
-
}
|
28
|
+
#define CODE(X) /* Declaration */;
|
305
29
|
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
std::min(64 * num_clusters, (num_clusters / 2) * num_clusters);
|
310
|
-
pairs.resize(max_num_pairs + 1);
|
30
|
+
#define FN(X) X ## Literal
|
31
|
+
#include "./cluster_inc.h" /* NOLINT(build/include) */
|
32
|
+
#undef FN
|
311
33
|
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
&pairs[0], num_clusters, in_size,
|
316
|
-
max_histograms, max_num_pairs);
|
34
|
+
#define FN(X) X ## Command
|
35
|
+
#include "./cluster_inc.h" /* NOLINT(build/include) */
|
36
|
+
#undef FN
|
317
37
|
|
318
|
-
|
319
|
-
|
320
|
-
|
38
|
+
#define FN(X) X ## Distance
|
39
|
+
#include "./cluster_inc.h" /* NOLINT(build/include) */
|
40
|
+
#undef FN
|
321
41
|
|
322
|
-
|
323
|
-
size_t num_histograms =
|
324
|
-
HistogramReindex(&(*out)[0], &(*histogram_symbols)[0], in_size);
|
325
|
-
out->resize(num_histograms);
|
326
|
-
}
|
42
|
+
#undef CODE
|
327
43
|
|
328
|
-
|
44
|
+
#if defined(__cplusplus) || defined(c_plusplus)
|
45
|
+
} /* extern "C" */
|
46
|
+
#endif
|
329
47
|
|
330
|
-
#endif
|
48
|
+
#endif /* BROTLI_ENC_CLUSTER_H_ */
|
@@ -0,0 +1,315 @@
|
|
1
|
+
/* NOLINT(build/header_guard) */
|
2
|
+
/* Copyright 2013 Google Inc. All Rights Reserved.
|
3
|
+
|
4
|
+
Distributed under MIT license.
|
5
|
+
See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
|
6
|
+
*/
|
7
|
+
|
8
|
+
/* template parameters: FN, CODE */
|
9
|
+
|
10
|
+
#define HistogramType FN(Histogram)
|
11
|
+
|
12
|
+
/* Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
|
13
|
+
it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. */
|
14
|
+
BROTLI_INTERNAL void FN(BrotliCompareAndPushToQueue)(
|
15
|
+
const HistogramType* out, const uint32_t* cluster_size, uint32_t idx1,
|
16
|
+
uint32_t idx2, size_t max_num_pairs, HistogramPair* pairs,
|
17
|
+
size_t* num_pairs) CODE({
|
18
|
+
BROTLI_BOOL is_good_pair = BROTLI_FALSE;
|
19
|
+
HistogramPair p;
|
20
|
+
if (idx1 == idx2) {
|
21
|
+
return;
|
22
|
+
}
|
23
|
+
if (idx2 < idx1) {
|
24
|
+
uint32_t t = idx2;
|
25
|
+
idx2 = idx1;
|
26
|
+
idx1 = t;
|
27
|
+
}
|
28
|
+
p.idx1 = idx1;
|
29
|
+
p.idx2 = idx2;
|
30
|
+
p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
|
31
|
+
p.cost_diff -= out[idx1].bit_cost_;
|
32
|
+
p.cost_diff -= out[idx2].bit_cost_;
|
33
|
+
|
34
|
+
if (out[idx1].total_count_ == 0) {
|
35
|
+
p.cost_combo = out[idx2].bit_cost_;
|
36
|
+
is_good_pair = BROTLI_TRUE;
|
37
|
+
} else if (out[idx2].total_count_ == 0) {
|
38
|
+
p.cost_combo = out[idx1].bit_cost_;
|
39
|
+
is_good_pair = BROTLI_TRUE;
|
40
|
+
} else {
|
41
|
+
double threshold = *num_pairs == 0 ? 1e99 :
|
42
|
+
BROTLI_MAX(double, 0.0, pairs[0].cost_diff);
|
43
|
+
HistogramType combo = out[idx1];
|
44
|
+
double cost_combo;
|
45
|
+
FN(HistogramAddHistogram)(&combo, &out[idx2]);
|
46
|
+
cost_combo = FN(BrotliPopulationCost)(&combo);
|
47
|
+
if (cost_combo < threshold - p.cost_diff) {
|
48
|
+
p.cost_combo = cost_combo;
|
49
|
+
is_good_pair = BROTLI_TRUE;
|
50
|
+
}
|
51
|
+
}
|
52
|
+
if (is_good_pair) {
|
53
|
+
p.cost_diff += p.cost_combo;
|
54
|
+
if (*num_pairs > 0 && HistogramPairIsLess(&pairs[0], &p)) {
|
55
|
+
/* Replace the top of the queue if needed. */
|
56
|
+
if (*num_pairs < max_num_pairs) {
|
57
|
+
pairs[*num_pairs] = pairs[0];
|
58
|
+
++(*num_pairs);
|
59
|
+
}
|
60
|
+
pairs[0] = p;
|
61
|
+
} else if (*num_pairs < max_num_pairs) {
|
62
|
+
pairs[*num_pairs] = p;
|
63
|
+
++(*num_pairs);
|
64
|
+
}
|
65
|
+
}
|
66
|
+
})
|
67
|
+
|
68
|
+
BROTLI_INTERNAL size_t FN(BrotliHistogramCombine)(HistogramType* out,
|
69
|
+
uint32_t* cluster_size,
|
70
|
+
uint32_t* symbols,
|
71
|
+
uint32_t* clusters,
|
72
|
+
HistogramPair* pairs,
|
73
|
+
size_t num_clusters,
|
74
|
+
size_t symbols_size,
|
75
|
+
size_t max_clusters,
|
76
|
+
size_t max_num_pairs) CODE({
|
77
|
+
double cost_diff_threshold = 0.0;
|
78
|
+
size_t min_cluster_size = 1;
|
79
|
+
size_t num_pairs = 0;
|
80
|
+
|
81
|
+
{
|
82
|
+
/* We maintain a vector of histogram pairs, with the property that the pair
|
83
|
+
with the maximum bit cost reduction is the first. */
|
84
|
+
size_t idx1;
|
85
|
+
for (idx1 = 0; idx1 < num_clusters; ++idx1) {
|
86
|
+
size_t idx2;
|
87
|
+
for (idx2 = idx1 + 1; idx2 < num_clusters; ++idx2) {
|
88
|
+
FN(BrotliCompareAndPushToQueue)(out, cluster_size, clusters[idx1],
|
89
|
+
clusters[idx2], max_num_pairs, &pairs[0], &num_pairs);
|
90
|
+
}
|
91
|
+
}
|
92
|
+
}
|
93
|
+
|
94
|
+
while (num_clusters > min_cluster_size) {
|
95
|
+
uint32_t best_idx1;
|
96
|
+
uint32_t best_idx2;
|
97
|
+
size_t i;
|
98
|
+
if (pairs[0].cost_diff >= cost_diff_threshold) {
|
99
|
+
cost_diff_threshold = 1e99;
|
100
|
+
min_cluster_size = max_clusters;
|
101
|
+
continue;
|
102
|
+
}
|
103
|
+
/* Take the best pair from the top of heap. */
|
104
|
+
best_idx1 = pairs[0].idx1;
|
105
|
+
best_idx2 = pairs[0].idx2;
|
106
|
+
FN(HistogramAddHistogram)(&out[best_idx1], &out[best_idx2]);
|
107
|
+
out[best_idx1].bit_cost_ = pairs[0].cost_combo;
|
108
|
+
cluster_size[best_idx1] += cluster_size[best_idx2];
|
109
|
+
for (i = 0; i < symbols_size; ++i) {
|
110
|
+
if (symbols[i] == best_idx2) {
|
111
|
+
symbols[i] = best_idx1;
|
112
|
+
}
|
113
|
+
}
|
114
|
+
for (i = 0; i < num_clusters; ++i) {
|
115
|
+
if (clusters[i] == best_idx2) {
|
116
|
+
memmove(&clusters[i], &clusters[i + 1],
|
117
|
+
(num_clusters - i - 1) * sizeof(clusters[0]));
|
118
|
+
break;
|
119
|
+
}
|
120
|
+
}
|
121
|
+
--num_clusters;
|
122
|
+
{
|
123
|
+
/* Remove pairs intersecting the just combined best pair. */
|
124
|
+
size_t copy_to_idx = 0;
|
125
|
+
for (i = 0; i < num_pairs; ++i) {
|
126
|
+
HistogramPair* p = &pairs[i];
|
127
|
+
if (p->idx1 == best_idx1 || p->idx2 == best_idx1 ||
|
128
|
+
p->idx1 == best_idx2 || p->idx2 == best_idx2) {
|
129
|
+
/* Remove invalid pair from the queue. */
|
130
|
+
continue;
|
131
|
+
}
|
132
|
+
if (HistogramPairIsLess(&pairs[0], p)) {
|
133
|
+
/* Replace the top of the queue if needed. */
|
134
|
+
HistogramPair front = pairs[0];
|
135
|
+
pairs[0] = *p;
|
136
|
+
pairs[copy_to_idx] = front;
|
137
|
+
} else {
|
138
|
+
pairs[copy_to_idx] = *p;
|
139
|
+
}
|
140
|
+
++copy_to_idx;
|
141
|
+
}
|
142
|
+
num_pairs = copy_to_idx;
|
143
|
+
}
|
144
|
+
|
145
|
+
/* Push new pairs formed with the combined histogram to the heap. */
|
146
|
+
for (i = 0; i < num_clusters; ++i) {
|
147
|
+
FN(BrotliCompareAndPushToQueue)(out, cluster_size, best_idx1, clusters[i],
|
148
|
+
max_num_pairs, &pairs[0], &num_pairs);
|
149
|
+
}
|
150
|
+
}
|
151
|
+
return num_clusters;
|
152
|
+
})
|
153
|
+
|
154
|
+
/* What is the bit cost of moving histogram from cur_symbol to candidate. */
|
155
|
+
BROTLI_INTERNAL double FN(BrotliHistogramBitCostDistance)(
|
156
|
+
const HistogramType* histogram, const HistogramType* candidate) CODE({
|
157
|
+
if (histogram->total_count_ == 0) {
|
158
|
+
return 0.0;
|
159
|
+
} else {
|
160
|
+
HistogramType tmp = *histogram;
|
161
|
+
FN(HistogramAddHistogram)(&tmp, candidate);
|
162
|
+
return FN(BrotliPopulationCost)(&tmp) - candidate->bit_cost_;
|
163
|
+
}
|
164
|
+
})
|
165
|
+
|
166
|
+
/* Find the best 'out' histogram for each of the 'in' histograms.
|
167
|
+
When called, clusters[0..num_clusters) contains the unique values from
|
168
|
+
symbols[0..in_size), but this property is not preserved in this function.
|
169
|
+
Note: we assume that out[]->bit_cost_ is already up-to-date. */
|
170
|
+
BROTLI_INTERNAL void FN(BrotliHistogramRemap)(const HistogramType* in,
|
171
|
+
size_t in_size, const uint32_t* clusters, size_t num_clusters,
|
172
|
+
HistogramType* out, uint32_t* symbols) CODE({
|
173
|
+
size_t i;
|
174
|
+
for (i = 0; i < in_size; ++i) {
|
175
|
+
uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1];
|
176
|
+
double best_bits =
|
177
|
+
FN(BrotliHistogramBitCostDistance)(&in[i], &out[best_out]);
|
178
|
+
size_t j;
|
179
|
+
for (j = 0; j < num_clusters; ++j) {
|
180
|
+
const double cur_bits =
|
181
|
+
FN(BrotliHistogramBitCostDistance)(&in[i], &out[clusters[j]]);
|
182
|
+
if (cur_bits < best_bits) {
|
183
|
+
best_bits = cur_bits;
|
184
|
+
best_out = clusters[j];
|
185
|
+
}
|
186
|
+
}
|
187
|
+
symbols[i] = best_out;
|
188
|
+
}
|
189
|
+
|
190
|
+
/* Recompute each out based on raw and symbols. */
|
191
|
+
for (i = 0; i < num_clusters; ++i) {
|
192
|
+
FN(HistogramClear)(&out[clusters[i]]);
|
193
|
+
}
|
194
|
+
for (i = 0; i < in_size; ++i) {
|
195
|
+
FN(HistogramAddHistogram)(&out[symbols[i]], &in[i]);
|
196
|
+
}
|
197
|
+
})
|
198
|
+
|
199
|
+
/* Reorders elements of the out[0..length) array and changes values in
|
200
|
+
symbols[0..length) array in the following way:
|
201
|
+
* when called, symbols[] contains indexes into out[], and has N unique
|
202
|
+
values (possibly N < length)
|
203
|
+
* on return, symbols'[i] = f(symbols[i]) and
|
204
|
+
out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
|
205
|
+
where f is a bijection between the range of symbols[] and [0..N), and
|
206
|
+
the first occurrences of values in symbols'[i] come in consecutive
|
207
|
+
increasing order.
|
208
|
+
Returns N, the number of unique values in symbols[]. */
|
209
|
+
BROTLI_INTERNAL size_t FN(BrotliHistogramReindex)(MemoryManager* m,
|
210
|
+
HistogramType* out, uint32_t* symbols, size_t length) CODE({
|
211
|
+
static const uint32_t kInvalidIndex = BROTLI_UINT32_MAX;
|
212
|
+
uint32_t* new_index = BROTLI_ALLOC(m, uint32_t, length);
|
213
|
+
uint32_t next_index;
|
214
|
+
HistogramType* tmp;
|
215
|
+
size_t i;
|
216
|
+
if (BROTLI_IS_OOM(m)) return 0;
|
217
|
+
for (i = 0; i < length; ++i) {
|
218
|
+
new_index[i] = kInvalidIndex;
|
219
|
+
}
|
220
|
+
next_index = 0;
|
221
|
+
for (i = 0; i < length; ++i) {
|
222
|
+
if (new_index[symbols[i]] == kInvalidIndex) {
|
223
|
+
new_index[symbols[i]] = next_index;
|
224
|
+
++next_index;
|
225
|
+
}
|
226
|
+
}
|
227
|
+
/* TODO: by using idea of "cycle-sort" we can avoid allocation of
|
228
|
+
tmp and reduce the number of copying by the factor of 2. */
|
229
|
+
tmp = BROTLI_ALLOC(m, HistogramType, next_index);
|
230
|
+
if (BROTLI_IS_OOM(m)) return 0;
|
231
|
+
next_index = 0;
|
232
|
+
for (i = 0; i < length; ++i) {
|
233
|
+
if (new_index[symbols[i]] == next_index) {
|
234
|
+
tmp[next_index] = out[symbols[i]];
|
235
|
+
++next_index;
|
236
|
+
}
|
237
|
+
symbols[i] = new_index[symbols[i]];
|
238
|
+
}
|
239
|
+
BROTLI_FREE(m, new_index);
|
240
|
+
for (i = 0; i < next_index; ++i) {
|
241
|
+
out[i] = tmp[i];
|
242
|
+
}
|
243
|
+
BROTLI_FREE(m, tmp);
|
244
|
+
return next_index;
|
245
|
+
})
|
246
|
+
|
247
|
+
BROTLI_INTERNAL void FN(BrotliClusterHistograms)(
|
248
|
+
MemoryManager* m, const HistogramType* in, const size_t in_size,
|
249
|
+
size_t max_histograms, HistogramType* out, size_t* out_size,
|
250
|
+
uint32_t* histogram_symbols) CODE({
|
251
|
+
uint32_t* cluster_size = BROTLI_ALLOC(m, uint32_t, in_size);
|
252
|
+
uint32_t* clusters = BROTLI_ALLOC(m, uint32_t, in_size);
|
253
|
+
size_t num_clusters = 0;
|
254
|
+
const size_t max_input_histograms = 64;
|
255
|
+
size_t pairs_capacity = max_input_histograms * max_input_histograms / 2;
|
256
|
+
/* For the first pass of clustering, we allow all pairs. */
|
257
|
+
HistogramPair* pairs = BROTLI_ALLOC(m, HistogramPair, pairs_capacity + 1);
|
258
|
+
size_t i;
|
259
|
+
|
260
|
+
if (BROTLI_IS_OOM(m)) return;
|
261
|
+
|
262
|
+
for (i = 0; i < in_size; ++i) {
|
263
|
+
cluster_size[i] = 1;
|
264
|
+
}
|
265
|
+
|
266
|
+
for (i = 0; i < in_size; ++i) {
|
267
|
+
out[i] = in[i];
|
268
|
+
out[i].bit_cost_ = FN(BrotliPopulationCost)(&in[i]);
|
269
|
+
histogram_symbols[i] = (uint32_t)i;
|
270
|
+
}
|
271
|
+
|
272
|
+
for (i = 0; i < in_size; i += max_input_histograms) {
|
273
|
+
size_t num_to_combine =
|
274
|
+
BROTLI_MIN(size_t, in_size - i, max_input_histograms);
|
275
|
+
size_t num_new_clusters;
|
276
|
+
size_t j;
|
277
|
+
for (j = 0; j < num_to_combine; ++j) {
|
278
|
+
clusters[num_clusters + j] = (uint32_t)(i + j);
|
279
|
+
}
|
280
|
+
num_new_clusters =
|
281
|
+
FN(BrotliHistogramCombine)(out, cluster_size,
|
282
|
+
&histogram_symbols[i],
|
283
|
+
&clusters[num_clusters], pairs,
|
284
|
+
num_to_combine, num_to_combine,
|
285
|
+
max_histograms, pairs_capacity);
|
286
|
+
num_clusters += num_new_clusters;
|
287
|
+
}
|
288
|
+
|
289
|
+
{
|
290
|
+
/* For the second pass, we limit the total number of histogram pairs.
|
291
|
+
After this limit is reached, we only keep searching for the best pair. */
|
292
|
+
size_t max_num_pairs = BROTLI_MIN(size_t,
|
293
|
+
64 * num_clusters, (num_clusters / 2) * num_clusters);
|
294
|
+
BROTLI_ENSURE_CAPACITY(
|
295
|
+
m, HistogramPair, pairs, pairs_capacity, max_num_pairs + 1);
|
296
|
+
if (BROTLI_IS_OOM(m)) return;
|
297
|
+
|
298
|
+
/* Collapse similar histograms. */
|
299
|
+
num_clusters = FN(BrotliHistogramCombine)(out, cluster_size,
|
300
|
+
histogram_symbols, clusters,
|
301
|
+
pairs, num_clusters, in_size,
|
302
|
+
max_histograms, max_num_pairs);
|
303
|
+
}
|
304
|
+
BROTLI_FREE(m, pairs);
|
305
|
+
BROTLI_FREE(m, cluster_size);
|
306
|
+
/* Find the optimal map from original histograms to the final ones. */
|
307
|
+
FN(BrotliHistogramRemap)(in, in_size, clusters, num_clusters,
|
308
|
+
out, histogram_symbols);
|
309
|
+
BROTLI_FREE(m, clusters);
|
310
|
+
/* Convert the context map to a canonical form. */
|
311
|
+
*out_size = FN(BrotliHistogramReindex)(m, out, histogram_symbols, in_size);
|
312
|
+
if (BROTLI_IS_OOM(m)) return;
|
313
|
+
})
|
314
|
+
|
315
|
+
#undef HistogramType
|