brotli 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/brotli/brotli.cc +114 -24
- data/ext/brotli/brotli.h +0 -1
- data/ext/brotli/extconf.rb +30 -23
- data/lib/brotli/version.rb +1 -1
- data/vendor/brotli/LICENSE +1 -1
- data/vendor/brotli/dec/Makefile +1 -1
- data/vendor/brotli/dec/bit_reader.c +3 -3
- data/vendor/brotli/dec/bit_reader.h +25 -27
- data/vendor/brotli/dec/context.h +4 -4
- data/vendor/brotli/dec/decode.c +410 -486
- data/vendor/brotli/dec/decode.h +101 -105
- data/vendor/brotli/dec/dictionary.c +1 -1
- data/vendor/brotli/dec/dictionary.h +7 -8
- data/vendor/brotli/dec/huffman.c +103 -105
- data/vendor/brotli/dec/huffman.h +18 -18
- data/vendor/brotli/dec/port.h +52 -40
- data/vendor/brotli/dec/prefix.h +2 -0
- data/vendor/brotli/dec/state.c +13 -19
- data/vendor/brotli/dec/state.h +25 -39
- data/vendor/brotli/dec/transform.h +38 -44
- data/vendor/brotli/dec/types.h +2 -2
- data/vendor/brotli/enc/Makefile +1 -1
- data/vendor/brotli/enc/backward_references.cc +455 -359
- data/vendor/brotli/enc/backward_references.h +79 -3
- data/vendor/brotli/enc/bit_cost.h +54 -32
- data/vendor/brotli/enc/block_splitter.cc +285 -193
- data/vendor/brotli/enc/block_splitter.h +4 -12
- data/vendor/brotli/enc/brotli_bit_stream.cc +623 -324
- data/vendor/brotli/enc/brotli_bit_stream.h +76 -37
- data/vendor/brotli/enc/cluster.h +161 -120
- data/vendor/brotli/enc/command.h +60 -37
- data/vendor/brotli/enc/compress_fragment.cc +701 -0
- data/vendor/brotli/enc/compress_fragment.h +47 -0
- data/vendor/brotli/enc/compress_fragment_two_pass.cc +524 -0
- data/vendor/brotli/enc/compress_fragment_two_pass.h +40 -0
- data/vendor/brotli/enc/compressor.h +15 -0
- data/vendor/brotli/enc/context.h +1 -1
- data/vendor/brotli/enc/dictionary.h +2 -2
- data/vendor/brotli/enc/encode.cc +819 -286
- data/vendor/brotli/enc/encode.h +38 -15
- data/vendor/brotli/enc/encode_parallel.cc +40 -42
- data/vendor/brotli/enc/entropy_encode.cc +144 -147
- data/vendor/brotli/enc/entropy_encode.h +32 -8
- data/vendor/brotli/enc/entropy_encode_static.h +572 -0
- data/vendor/brotli/enc/fast_log.h +7 -40
- data/vendor/brotli/enc/find_match_length.h +9 -9
- data/vendor/brotli/enc/hash.h +462 -154
- data/vendor/brotli/enc/histogram.cc +6 -6
- data/vendor/brotli/enc/histogram.h +13 -13
- data/vendor/brotli/enc/literal_cost.cc +45 -45
- data/vendor/brotli/enc/metablock.cc +92 -89
- data/vendor/brotli/enc/metablock.h +12 -12
- data/vendor/brotli/enc/port.h +7 -16
- data/vendor/brotli/enc/prefix.h +23 -22
- data/vendor/brotli/enc/ringbuffer.h +75 -29
- data/vendor/brotli/enc/static_dict.cc +56 -48
- data/vendor/brotli/enc/static_dict.h +5 -5
- data/vendor/brotli/enc/streams.cc +1 -1
- data/vendor/brotli/enc/streams.h +5 -5
- data/vendor/brotli/enc/transform.h +40 -35
- data/vendor/brotli/enc/types.h +2 -0
- data/vendor/brotli/enc/utf8_util.cc +3 -2
- data/vendor/brotli/enc/write_bits.h +6 -6
- metadata +9 -5
- data/vendor/brotli/dec/streams.c +0 -102
- data/vendor/brotli/dec/streams.h +0 -95
@@ -18,6 +18,7 @@
|
|
18
18
|
|
19
19
|
#include <vector>
|
20
20
|
|
21
|
+
#include "./entropy_encode.h"
|
21
22
|
#include "./metablock.h"
|
22
23
|
#include "./types.h"
|
23
24
|
|
@@ -27,113 +28,151 @@ namespace brotli {
|
|
27
28
|
// position for the current storage.
|
28
29
|
|
29
30
|
// Stores a number between 0 and 255.
|
30
|
-
void StoreVarLenUint8(
|
31
|
+
void StoreVarLenUint8(size_t n, size_t* storage_ix, uint8_t* storage);
|
31
32
|
|
32
33
|
// Stores the compressed meta-block header.
|
33
|
-
|
34
|
+
// REQUIRES: length > 0
|
35
|
+
// REQUIRES: length <= (1 << 24)
|
36
|
+
void StoreCompressedMetaBlockHeader(bool final_block,
|
34
37
|
size_t length,
|
35
|
-
|
38
|
+
size_t* storage_ix,
|
36
39
|
uint8_t* storage);
|
37
40
|
|
38
41
|
// Stores the uncompressed meta-block header.
|
39
|
-
|
40
|
-
|
42
|
+
// REQUIRES: length > 0
|
43
|
+
// REQUIRES: length <= (1 << 24)
|
44
|
+
void StoreUncompressedMetaBlockHeader(size_t length,
|
45
|
+
size_t* storage_ix,
|
41
46
|
uint8_t* storage);
|
42
47
|
|
43
48
|
// Stores a context map where the histogram type is always the block type.
|
44
|
-
void StoreTrivialContextMap(
|
45
|
-
|
46
|
-
|
49
|
+
void StoreTrivialContextMap(size_t num_types,
|
50
|
+
size_t context_bits,
|
51
|
+
HuffmanTree* tree,
|
52
|
+
size_t* storage_ix,
|
47
53
|
uint8_t* storage);
|
48
54
|
|
49
55
|
void StoreHuffmanTreeOfHuffmanTreeToBitMask(
|
50
56
|
const int num_codes,
|
51
57
|
const uint8_t *code_length_bitdepth,
|
52
|
-
|
58
|
+
size_t *storage_ix,
|
53
59
|
uint8_t *storage);
|
54
60
|
|
61
|
+
void StoreHuffmanTree(const uint8_t* depths, size_t num, HuffmanTree* tree,
|
62
|
+
size_t *storage_ix, uint8_t *storage);
|
63
|
+
|
55
64
|
// Builds a Huffman tree from histogram[0:length] into depth[0:length] and
|
56
65
|
// bits[0:length] and stores the encoded tree to the bit stream.
|
57
|
-
void BuildAndStoreHuffmanTree(const
|
58
|
-
const
|
66
|
+
void BuildAndStoreHuffmanTree(const uint32_t *histogram,
|
67
|
+
const size_t length,
|
68
|
+
HuffmanTree* tree,
|
59
69
|
uint8_t* depth,
|
60
70
|
uint16_t* bits,
|
61
|
-
|
71
|
+
size_t* storage_ix,
|
62
72
|
uint8_t* storage);
|
63
73
|
|
74
|
+
void BuildAndStoreHuffmanTreeFast(const uint32_t *histogram,
|
75
|
+
const size_t histogram_total,
|
76
|
+
const size_t max_bits,
|
77
|
+
uint8_t* depth,
|
78
|
+
uint16_t* bits,
|
79
|
+
size_t* storage_ix,
|
80
|
+
uint8_t* storage);
|
81
|
+
|
64
82
|
// Encodes the given context map to the bit stream. The number of different
|
65
83
|
// histogram ids is given by num_clusters.
|
66
|
-
void EncodeContextMap(const std::vector<
|
67
|
-
|
68
|
-
|
84
|
+
void EncodeContextMap(const std::vector<uint32_t>& context_map,
|
85
|
+
size_t num_clusters,
|
86
|
+
HuffmanTree* tree,
|
87
|
+
size_t* storage_ix, uint8_t* storage);
|
69
88
|
|
70
89
|
// Data structure that stores everything that is needed to encode each block
|
71
90
|
// switch command.
|
72
91
|
struct BlockSplitCode {
|
73
|
-
std::vector<
|
74
|
-
std::vector<
|
75
|
-
std::vector<
|
76
|
-
std::vector<
|
92
|
+
std::vector<uint32_t> type_code;
|
93
|
+
std::vector<uint32_t> length_prefix;
|
94
|
+
std::vector<uint32_t> length_nextra;
|
95
|
+
std::vector<uint32_t> length_extra;
|
77
96
|
std::vector<uint8_t> type_depths;
|
78
97
|
std::vector<uint16_t> type_bits;
|
79
|
-
|
80
|
-
|
98
|
+
uint8_t length_depths[kNumBlockLenPrefixes];
|
99
|
+
uint16_t length_bits[kNumBlockLenPrefixes];
|
81
100
|
};
|
82
101
|
|
83
102
|
// Builds a BlockSplitCode data structure from the block split given by the
|
84
103
|
// vector of block types and block lengths and stores it to the bit stream.
|
85
|
-
void BuildAndStoreBlockSplitCode(const std::vector<
|
86
|
-
const std::vector<
|
87
|
-
const
|
104
|
+
void BuildAndStoreBlockSplitCode(const std::vector<uint8_t>& types,
|
105
|
+
const std::vector<uint32_t>& lengths,
|
106
|
+
const size_t num_types,
|
88
107
|
BlockSplitCode* code,
|
89
|
-
|
108
|
+
size_t* storage_ix,
|
90
109
|
uint8_t* storage);
|
91
110
|
|
92
111
|
// Stores the block switch command with index block_ix to the bit stream.
|
93
112
|
void StoreBlockSwitch(const BlockSplitCode& code,
|
94
|
-
const
|
95
|
-
|
113
|
+
const size_t block_ix,
|
114
|
+
size_t* storage_ix,
|
96
115
|
uint8_t* storage);
|
97
116
|
|
98
|
-
|
117
|
+
// REQUIRES: length > 0
|
118
|
+
// REQUIRES: length <= (1 << 24)
|
119
|
+
void StoreMetaBlock(const uint8_t* input,
|
99
120
|
size_t start_pos,
|
100
121
|
size_t length,
|
101
122
|
size_t mask,
|
102
123
|
uint8_t prev_byte,
|
103
124
|
uint8_t prev_byte2,
|
104
125
|
bool final_block,
|
105
|
-
|
106
|
-
|
107
|
-
|
126
|
+
uint32_t num_direct_distance_codes,
|
127
|
+
uint32_t distance_postfix_bits,
|
128
|
+
ContextType literal_context_mode,
|
108
129
|
const brotli::Command *commands,
|
109
130
|
size_t n_commands,
|
110
131
|
const MetaBlockSplit& mb,
|
111
|
-
|
132
|
+
size_t *storage_ix,
|
112
133
|
uint8_t *storage);
|
113
134
|
|
114
135
|
// Stores the meta-block without doing any block splitting, just collects
|
115
136
|
// one histogram per block category and uses that for entropy coding.
|
116
|
-
|
137
|
+
// REQUIRES: length > 0
|
138
|
+
// REQUIRES: length <= (1 << 24)
|
139
|
+
void StoreMetaBlockTrivial(const uint8_t* input,
|
117
140
|
size_t start_pos,
|
118
141
|
size_t length,
|
119
142
|
size_t mask,
|
120
143
|
bool is_last,
|
121
144
|
const brotli::Command *commands,
|
122
145
|
size_t n_commands,
|
123
|
-
|
146
|
+
size_t *storage_ix,
|
124
147
|
uint8_t *storage);
|
125
148
|
|
149
|
+
// Same as above, but uses static prefix codes for histograms with only a few
|
150
|
+
// symbols, and uses static code length prefix codes for all other histograms.
|
151
|
+
// REQUIRES: length > 0
|
152
|
+
// REQUIRES: length <= (1 << 24)
|
153
|
+
void StoreMetaBlockFast(const uint8_t* input,
|
154
|
+
size_t start_pos,
|
155
|
+
size_t length,
|
156
|
+
size_t mask,
|
157
|
+
bool is_last,
|
158
|
+
const brotli::Command *commands,
|
159
|
+
size_t n_commands,
|
160
|
+
size_t *storage_ix,
|
161
|
+
uint8_t *storage);
|
162
|
+
|
126
163
|
// This is for storing uncompressed blocks (simple raw storage of
|
127
164
|
// bytes-as-bytes).
|
128
|
-
|
165
|
+
// REQUIRES: length > 0
|
166
|
+
// REQUIRES: length <= (1 << 24)
|
167
|
+
void StoreUncompressedMetaBlock(bool final_block,
|
129
168
|
const uint8_t* input,
|
130
169
|
size_t position, size_t mask,
|
131
170
|
size_t len,
|
132
|
-
|
171
|
+
size_t* storage_ix,
|
133
172
|
uint8_t* storage);
|
134
173
|
|
135
174
|
// Stores an empty metadata meta-block and syncs to a byte boundary.
|
136
|
-
void StoreSyncMetaBlock(
|
175
|
+
void StoreSyncMetaBlock(size_t* storage_ix, uint8_t* storage);
|
137
176
|
|
138
177
|
} // namespace brotli
|
139
178
|
|
data/vendor/brotli/enc/cluster.h
CHANGED
@@ -10,11 +10,7 @@
|
|
10
10
|
#define BROTLI_ENC_CLUSTER_H_
|
11
11
|
|
12
12
|
#include <math.h>
|
13
|
-
#include <stdio.h>
|
14
13
|
#include <algorithm>
|
15
|
-
#include <complex>
|
16
|
-
#include <map>
|
17
|
-
#include <set>
|
18
14
|
#include <utility>
|
19
15
|
#include <vector>
|
20
16
|
|
@@ -28,41 +24,41 @@
|
|
28
24
|
namespace brotli {
|
29
25
|
|
30
26
|
struct HistogramPair {
|
31
|
-
|
32
|
-
|
33
|
-
bool valid;
|
27
|
+
uint32_t idx1;
|
28
|
+
uint32_t idx2;
|
34
29
|
double cost_combo;
|
35
30
|
double cost_diff;
|
36
31
|
};
|
37
32
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
return p1.cost_diff > p2.cost_diff;
|
42
|
-
}
|
43
|
-
return abs(p1.idx1 - p1.idx2) > abs(p2.idx1 - p2.idx2);
|
33
|
+
inline bool operator<(const HistogramPair& p1, const HistogramPair& p2) {
|
34
|
+
if (p1.cost_diff != p2.cost_diff) {
|
35
|
+
return p1.cost_diff > p2.cost_diff;
|
44
36
|
}
|
45
|
-
|
37
|
+
return (p1.idx2 - p1.idx1) > (p2.idx2 - p2.idx1);
|
38
|
+
}
|
46
39
|
|
47
40
|
// Returns entropy reduction of the context map when we combine two clusters.
|
48
|
-
inline double ClusterCostDiff(
|
49
|
-
|
50
|
-
return size_a * FastLog2(size_a) +
|
51
|
-
|
41
|
+
inline double ClusterCostDiff(size_t size_a, size_t size_b) {
|
42
|
+
size_t size_c = size_a + size_b;
|
43
|
+
return static_cast<double>(size_a) * FastLog2(size_a) +
|
44
|
+
static_cast<double>(size_b) * FastLog2(size_b) -
|
45
|
+
static_cast<double>(size_c) * FastLog2(size_c);
|
52
46
|
}
|
53
47
|
|
54
48
|
// Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
|
55
|
-
// it is below a threshold, stores the pair (idx1, idx2) in the *pairs
|
49
|
+
// it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue.
|
56
50
|
template<typename HistogramType>
|
57
|
-
void
|
58
|
-
|
59
|
-
|
60
|
-
|
51
|
+
void CompareAndPushToQueue(const HistogramType* out,
|
52
|
+
const uint32_t* cluster_size,
|
53
|
+
uint32_t idx1, uint32_t idx2,
|
54
|
+
size_t max_num_pairs,
|
55
|
+
HistogramPair* pairs,
|
56
|
+
size_t* num_pairs) {
|
61
57
|
if (idx1 == idx2) {
|
62
58
|
return;
|
63
59
|
}
|
64
60
|
if (idx2 < idx1) {
|
65
|
-
|
61
|
+
uint32_t t = idx2;
|
66
62
|
idx2 = idx1;
|
67
63
|
idx1 = t;
|
68
64
|
}
|
@@ -70,7 +66,6 @@ void CompareAndPushToHeap(const HistogramType* out,
|
|
70
66
|
HistogramPair p;
|
71
67
|
p.idx1 = idx1;
|
72
68
|
p.idx2 = idx2;
|
73
|
-
p.valid = true;
|
74
69
|
p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
|
75
70
|
p.cost_diff -= out[idx1].bit_cost_;
|
76
71
|
p.cost_diff -= out[idx2].bit_cost_;
|
@@ -82,8 +77,8 @@ void CompareAndPushToHeap(const HistogramType* out,
|
|
82
77
|
p.cost_combo = out[idx1].bit_cost_;
|
83
78
|
store_pair = true;
|
84
79
|
} else {
|
85
|
-
double threshold =
|
86
|
-
std::max(0.0,
|
80
|
+
double threshold = *num_pairs == 0 ? 1e99 :
|
81
|
+
std::max(0.0, pairs[0].cost_diff);
|
87
82
|
HistogramType combo = out[idx1];
|
88
83
|
combo.AddHistogram(out[idx2]);
|
89
84
|
double cost_combo = PopulationCost(combo);
|
@@ -94,81 +89,96 @@ void CompareAndPushToHeap(const HistogramType* out,
|
|
94
89
|
}
|
95
90
|
if (store_pair) {
|
96
91
|
p.cost_diff += p.cost_combo;
|
97
|
-
pairs
|
98
|
-
|
92
|
+
if (*num_pairs > 0 && pairs[0] < p) {
|
93
|
+
// Replace the top of the queue if needed.
|
94
|
+
if (*num_pairs < max_num_pairs) {
|
95
|
+
pairs[*num_pairs] = pairs[0];
|
96
|
+
++(*num_pairs);
|
97
|
+
}
|
98
|
+
pairs[0] = p;
|
99
|
+
} else if (*num_pairs < max_num_pairs) {
|
100
|
+
pairs[*num_pairs] = p;
|
101
|
+
++(*num_pairs);
|
102
|
+
}
|
99
103
|
}
|
100
104
|
}
|
101
105
|
|
102
106
|
template<typename HistogramType>
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
107
|
+
size_t HistogramCombine(HistogramType* out,
|
108
|
+
uint32_t* cluster_size,
|
109
|
+
uint32_t* symbols,
|
110
|
+
uint32_t* clusters,
|
111
|
+
HistogramPair* pairs,
|
112
|
+
size_t num_clusters,
|
113
|
+
size_t symbols_size,
|
114
|
+
size_t max_clusters,
|
115
|
+
size_t max_num_pairs) {
|
108
116
|
double cost_diff_threshold = 0.0;
|
109
117
|
size_t min_cluster_size = 1;
|
110
|
-
std::set<int> all_symbols;
|
111
|
-
std::vector<int> clusters;
|
112
|
-
for (int i = 0; i < symbols_size; ++i) {
|
113
|
-
if (all_symbols.find(symbols[i]) == all_symbols.end()) {
|
114
|
-
all_symbols.insert(symbols[i]);
|
115
|
-
if (!clusters.empty()) {
|
116
|
-
BROTLI_DCHECK(clusters.back() < symbols[i]);
|
117
|
-
}
|
118
|
-
clusters.push_back(symbols[i]);
|
119
|
-
}
|
120
|
-
}
|
121
118
|
|
122
|
-
// We maintain a
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
119
|
+
// We maintain a vector of histogram pairs, with the property that the pair
|
120
|
+
// with the maximum bit cost reduction is the first.
|
121
|
+
size_t num_pairs = 0;
|
122
|
+
for (size_t idx1 = 0; idx1 < num_clusters; ++idx1) {
|
123
|
+
for (size_t idx2 = idx1 + 1; idx2 < num_clusters; ++idx2) {
|
124
|
+
CompareAndPushToQueue(out, cluster_size, clusters[idx1], clusters[idx2],
|
125
|
+
max_num_pairs, &pairs[0], &num_pairs);
|
128
126
|
}
|
129
127
|
}
|
130
128
|
|
131
|
-
while (
|
129
|
+
while (num_clusters > min_cluster_size) {
|
132
130
|
if (pairs[0].cost_diff >= cost_diff_threshold) {
|
133
131
|
cost_diff_threshold = 1e99;
|
134
132
|
min_cluster_size = max_clusters;
|
135
133
|
continue;
|
136
134
|
}
|
137
135
|
// Take the best pair from the top of heap.
|
138
|
-
|
139
|
-
|
136
|
+
uint32_t best_idx1 = pairs[0].idx1;
|
137
|
+
uint32_t best_idx2 = pairs[0].idx2;
|
140
138
|
out[best_idx1].AddHistogram(out[best_idx2]);
|
141
139
|
out[best_idx1].bit_cost_ = pairs[0].cost_combo;
|
142
140
|
cluster_size[best_idx1] += cluster_size[best_idx2];
|
143
|
-
for (
|
141
|
+
for (size_t i = 0; i < symbols_size; ++i) {
|
144
142
|
if (symbols[i] == best_idx2) {
|
145
143
|
symbols[i] = best_idx1;
|
146
144
|
}
|
147
145
|
}
|
148
|
-
for (size_t i = 0; i
|
149
|
-
if (clusters[i]
|
150
|
-
clusters[i]
|
146
|
+
for (size_t i = 0; i < num_clusters; ++i) {
|
147
|
+
if (clusters[i] == best_idx2) {
|
148
|
+
memmove(&clusters[i], &clusters[i + 1],
|
149
|
+
(num_clusters - i - 1) * sizeof(clusters[0]));
|
150
|
+
break;
|
151
151
|
}
|
152
152
|
}
|
153
|
-
|
154
|
-
//
|
155
|
-
|
153
|
+
--num_clusters;
|
154
|
+
// Remove pairs intersecting the just combined best pair.
|
155
|
+
size_t copy_to_idx = 0;
|
156
|
+
for (size_t i = 0; i < num_pairs; ++i) {
|
156
157
|
HistogramPair& p = pairs[i];
|
157
158
|
if (p.idx1 == best_idx1 || p.idx2 == best_idx1 ||
|
158
159
|
p.idx1 == best_idx2 || p.idx2 == best_idx2) {
|
159
|
-
|
160
|
+
// Remove invalid pair from the queue.
|
161
|
+
continue;
|
160
162
|
}
|
163
|
+
if (pairs[0] < p) {
|
164
|
+
// Replace the top of the queue if needed.
|
165
|
+
HistogramPair front = pairs[0];
|
166
|
+
pairs[0] = p;
|
167
|
+
pairs[copy_to_idx] = front;
|
168
|
+
} else {
|
169
|
+
pairs[copy_to_idx] = p;
|
170
|
+
}
|
171
|
+
++copy_to_idx;
|
161
172
|
}
|
162
|
-
|
163
|
-
|
164
|
-
std::pop_heap(pairs.begin(), pairs.end(), HistogramPairComparator());
|
165
|
-
pairs.pop_back();
|
166
|
-
}
|
173
|
+
num_pairs = copy_to_idx;
|
174
|
+
|
167
175
|
// Push new pairs formed with the combined histogram to the heap.
|
168
|
-
for (size_t i = 0; i <
|
169
|
-
|
176
|
+
for (size_t i = 0; i < num_clusters; ++i) {
|
177
|
+
CompareAndPushToQueue(out, cluster_size, best_idx1, clusters[i],
|
178
|
+
max_num_pairs, &pairs[0], &num_pairs);
|
170
179
|
}
|
171
180
|
}
|
181
|
+
return num_clusters;
|
172
182
|
}
|
173
183
|
|
174
184
|
// -----------------------------------------------------------------------------
|
@@ -187,58 +197,69 @@ double HistogramBitCostDistance(const HistogramType& histogram,
|
|
187
197
|
}
|
188
198
|
|
189
199
|
// Find the best 'out' histogram for each of the 'in' histograms.
|
200
|
+
// When called, clusters[0..num_clusters) contains the unique values from
|
201
|
+
// symbols[0..in_size), but this property is not preserved in this function.
|
190
202
|
// Note: we assume that out[]->bit_cost_ is already up-to-date.
|
191
203
|
template<typename HistogramType>
|
192
|
-
void HistogramRemap(const HistogramType* in,
|
193
|
-
|
194
|
-
|
195
|
-
for (
|
196
|
-
|
197
|
-
}
|
198
|
-
for (int i = 0; i < in_size; ++i) {
|
199
|
-
int best_out = i == 0 ? symbols[0] : symbols[i - 1];
|
204
|
+
void HistogramRemap(const HistogramType* in, size_t in_size,
|
205
|
+
const uint32_t* clusters, size_t num_clusters,
|
206
|
+
HistogramType* out, uint32_t* symbols) {
|
207
|
+
for (size_t i = 0; i < in_size; ++i) {
|
208
|
+
uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1];
|
200
209
|
double best_bits = HistogramBitCostDistance(in[i], out[best_out]);
|
201
|
-
for (
|
202
|
-
|
203
|
-
const double cur_bits = HistogramBitCostDistance(in[i], out[*k]);
|
210
|
+
for (size_t j = 0; j < num_clusters; ++j) {
|
211
|
+
const double cur_bits = HistogramBitCostDistance(in[i], out[clusters[j]]);
|
204
212
|
if (cur_bits < best_bits) {
|
205
213
|
best_bits = cur_bits;
|
206
|
-
best_out =
|
214
|
+
best_out = clusters[j];
|
207
215
|
}
|
208
216
|
}
|
209
217
|
symbols[i] = best_out;
|
210
218
|
}
|
211
219
|
|
212
|
-
|
213
220
|
// Recompute each out based on raw and symbols.
|
214
|
-
for (
|
215
|
-
|
216
|
-
out[*k].Clear();
|
221
|
+
for (size_t j = 0; j < num_clusters; ++j) {
|
222
|
+
out[clusters[j]].Clear();
|
217
223
|
}
|
218
|
-
for (
|
224
|
+
for (size_t i = 0; i < in_size; ++i) {
|
219
225
|
out[symbols[i]].AddHistogram(in[i]);
|
220
226
|
}
|
221
227
|
}
|
222
228
|
|
223
|
-
//
|
224
|
-
//
|
229
|
+
// Reorders elements of the out[0..length) array and changes values in
|
230
|
+
// symbols[0..length) array in the following way:
|
231
|
+
// * when called, symbols[] contains indexes into out[], and has N unique
|
232
|
+
// values (possibly N < length)
|
233
|
+
// * on return, symbols'[i] = f(symbols[i]) and
|
234
|
+
// out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
|
235
|
+
// where f is a bijection between the range of symbols[] and [0..N), and
|
236
|
+
// the first occurrences of values in symbols'[i] come in consecutive
|
237
|
+
// increasing order.
|
238
|
+
// Returns N, the number of unique values in symbols[].
|
225
239
|
template<typename HistogramType>
|
226
|
-
|
227
|
-
|
228
|
-
std::vector<
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
240
|
+
size_t HistogramReindex(HistogramType* out, uint32_t* symbols, size_t length) {
|
241
|
+
static const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
|
242
|
+
std::vector<uint32_t> new_index(length, kInvalidIndex);
|
243
|
+
uint32_t next_index = 0;
|
244
|
+
for (size_t i = 0; i < length; ++i) {
|
245
|
+
if (new_index[symbols[i]] == kInvalidIndex) {
|
246
|
+
new_index[symbols[i]] = next_index;
|
247
|
+
++next_index;
|
248
|
+
}
|
249
|
+
}
|
250
|
+
std::vector<HistogramType> tmp(next_index);
|
251
|
+
next_index = 0;
|
252
|
+
for (size_t i = 0; i < length; ++i) {
|
253
|
+
if (new_index[symbols[i]] == next_index) {
|
254
|
+
tmp[next_index] = out[symbols[i]];
|
235
255
|
++next_index;
|
236
256
|
}
|
257
|
+
symbols[i] = new_index[symbols[i]];
|
237
258
|
}
|
238
|
-
|
239
|
-
|
240
|
-
(*symbols)[i] = new_index[(*symbols)[i]];
|
259
|
+
for (size_t i = 0; i < next_index; ++i) {
|
260
|
+
out[i] = tmp[i];
|
241
261
|
}
|
262
|
+
return next_index;
|
242
263
|
}
|
243
264
|
|
244
265
|
// Clusters similar histograms in 'in' together, the selected histograms are
|
@@ -246,44 +267,64 @@ void HistogramReindex(std::vector<HistogramType>* out,
|
|
246
267
|
// indicate which of the 'out' histograms is the best approximation.
|
247
268
|
template<typename HistogramType>
|
248
269
|
void ClusterHistograms(const std::vector<HistogramType>& in,
|
249
|
-
|
270
|
+
size_t num_contexts, size_t num_blocks,
|
250
271
|
size_t max_histograms,
|
251
272
|
std::vector<HistogramType>* out,
|
252
|
-
std::vector<
|
253
|
-
const
|
254
|
-
|
255
|
-
std::vector<
|
273
|
+
std::vector<uint32_t>* histogram_symbols) {
|
274
|
+
const size_t in_size = num_contexts * num_blocks;
|
275
|
+
assert(in_size == in.size());
|
276
|
+
std::vector<uint32_t> cluster_size(in_size, 1);
|
277
|
+
std::vector<uint32_t> clusters(in_size);
|
278
|
+
size_t num_clusters = 0;
|
256
279
|
out->resize(in_size);
|
257
280
|
histogram_symbols->resize(in_size);
|
258
|
-
for (
|
281
|
+
for (size_t i = 0; i < in_size; ++i) {
|
259
282
|
(*out)[i] = in[i];
|
260
283
|
(*out)[i].bit_cost_ = PopulationCost(in[i]);
|
261
|
-
(*histogram_symbols)[i] = i;
|
284
|
+
(*histogram_symbols)[i] = static_cast<uint32_t>(i);
|
262
285
|
}
|
263
286
|
|
287
|
+
const size_t max_input_histograms = 64;
|
288
|
+
// For the first pass of clustering, we allow all pairs.
|
289
|
+
size_t max_num_pairs = max_input_histograms * max_input_histograms / 2;
|
290
|
+
std::vector<HistogramPair> pairs(max_num_pairs + 1);
|
264
291
|
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
292
|
+
for (size_t i = 0; i < in_size; i += max_input_histograms) {
|
293
|
+
size_t num_to_combine = std::min(in_size - i, max_input_histograms);
|
294
|
+
for (size_t j = 0; j < num_to_combine; ++j) {
|
295
|
+
clusters[num_clusters + j] = static_cast<uint32_t>(i + j);
|
296
|
+
}
|
297
|
+
size_t num_new_clusters =
|
298
|
+
HistogramCombine(&(*out)[0], &cluster_size[0],
|
299
|
+
&(*histogram_symbols)[i],
|
300
|
+
&clusters[num_clusters], &pairs[0],
|
301
|
+
num_to_combine, num_to_combine,
|
302
|
+
max_histograms, max_num_pairs);
|
303
|
+
num_clusters += num_new_clusters;
|
271
304
|
}
|
272
305
|
|
306
|
+
// For the second pass, we limit the total number of histogram pairs.
|
307
|
+
// After this limit is reached, we only keep searching for the best pair.
|
308
|
+
max_num_pairs =
|
309
|
+
std::min(64 * num_clusters, (num_clusters / 2) * num_clusters);
|
310
|
+
pairs.resize(max_num_pairs + 1);
|
311
|
+
|
273
312
|
// Collapse similar histograms.
|
274
|
-
HistogramCombine(&(*out)[0], &cluster_size[0],
|
275
|
-
|
276
|
-
|
313
|
+
num_clusters = HistogramCombine(&(*out)[0], &cluster_size[0],
|
314
|
+
&(*histogram_symbols)[0], &clusters[0],
|
315
|
+
&pairs[0], num_clusters, in_size,
|
316
|
+
max_histograms, max_num_pairs);
|
277
317
|
|
278
318
|
// Find the optimal map from original histograms to the final ones.
|
279
|
-
HistogramRemap(&in[0], in_size, &
|
319
|
+
HistogramRemap(&in[0], in_size, &clusters[0], num_clusters,
|
320
|
+
&(*out)[0], &(*histogram_symbols)[0]);
|
280
321
|
|
281
322
|
// Convert the context map to a canonical form.
|
282
|
-
|
283
|
-
|
323
|
+
size_t num_histograms =
|
324
|
+
HistogramReindex(&(*out)[0], &(*histogram_symbols)[0], in_size);
|
325
|
+
out->resize(num_histograms);
|
284
326
|
}
|
285
327
|
|
286
|
-
|
287
328
|
} // namespace brotli
|
288
329
|
|
289
330
|
#endif // BROTLI_ENC_CLUSTER_H_
|