brotli 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +4 -4
  2. data/ext/brotli/brotli.cc +114 -24
  3. data/ext/brotli/brotli.h +0 -1
  4. data/ext/brotli/extconf.rb +30 -23
  5. data/lib/brotli/version.rb +1 -1
  6. data/vendor/brotli/LICENSE +1 -1
  7. data/vendor/brotli/dec/Makefile +1 -1
  8. data/vendor/brotli/dec/bit_reader.c +3 -3
  9. data/vendor/brotli/dec/bit_reader.h +25 -27
  10. data/vendor/brotli/dec/context.h +4 -4
  11. data/vendor/brotli/dec/decode.c +410 -486
  12. data/vendor/brotli/dec/decode.h +101 -105
  13. data/vendor/brotli/dec/dictionary.c +1 -1
  14. data/vendor/brotli/dec/dictionary.h +7 -8
  15. data/vendor/brotli/dec/huffman.c +103 -105
  16. data/vendor/brotli/dec/huffman.h +18 -18
  17. data/vendor/brotli/dec/port.h +52 -40
  18. data/vendor/brotli/dec/prefix.h +2 -0
  19. data/vendor/brotli/dec/state.c +13 -19
  20. data/vendor/brotli/dec/state.h +25 -39
  21. data/vendor/brotli/dec/transform.h +38 -44
  22. data/vendor/brotli/dec/types.h +2 -2
  23. data/vendor/brotli/enc/Makefile +1 -1
  24. data/vendor/brotli/enc/backward_references.cc +455 -359
  25. data/vendor/brotli/enc/backward_references.h +79 -3
  26. data/vendor/brotli/enc/bit_cost.h +54 -32
  27. data/vendor/brotli/enc/block_splitter.cc +285 -193
  28. data/vendor/brotli/enc/block_splitter.h +4 -12
  29. data/vendor/brotli/enc/brotli_bit_stream.cc +623 -324
  30. data/vendor/brotli/enc/brotli_bit_stream.h +76 -37
  31. data/vendor/brotli/enc/cluster.h +161 -120
  32. data/vendor/brotli/enc/command.h +60 -37
  33. data/vendor/brotli/enc/compress_fragment.cc +701 -0
  34. data/vendor/brotli/enc/compress_fragment.h +47 -0
  35. data/vendor/brotli/enc/compress_fragment_two_pass.cc +524 -0
  36. data/vendor/brotli/enc/compress_fragment_two_pass.h +40 -0
  37. data/vendor/brotli/enc/compressor.h +15 -0
  38. data/vendor/brotli/enc/context.h +1 -1
  39. data/vendor/brotli/enc/dictionary.h +2 -2
  40. data/vendor/brotli/enc/encode.cc +819 -286
  41. data/vendor/brotli/enc/encode.h +38 -15
  42. data/vendor/brotli/enc/encode_parallel.cc +40 -42
  43. data/vendor/brotli/enc/entropy_encode.cc +144 -147
  44. data/vendor/brotli/enc/entropy_encode.h +32 -8
  45. data/vendor/brotli/enc/entropy_encode_static.h +572 -0
  46. data/vendor/brotli/enc/fast_log.h +7 -40
  47. data/vendor/brotli/enc/find_match_length.h +9 -9
  48. data/vendor/brotli/enc/hash.h +462 -154
  49. data/vendor/brotli/enc/histogram.cc +6 -6
  50. data/vendor/brotli/enc/histogram.h +13 -13
  51. data/vendor/brotli/enc/literal_cost.cc +45 -45
  52. data/vendor/brotli/enc/metablock.cc +92 -89
  53. data/vendor/brotli/enc/metablock.h +12 -12
  54. data/vendor/brotli/enc/port.h +7 -16
  55. data/vendor/brotli/enc/prefix.h +23 -22
  56. data/vendor/brotli/enc/ringbuffer.h +75 -29
  57. data/vendor/brotli/enc/static_dict.cc +56 -48
  58. data/vendor/brotli/enc/static_dict.h +5 -5
  59. data/vendor/brotli/enc/streams.cc +1 -1
  60. data/vendor/brotli/enc/streams.h +5 -5
  61. data/vendor/brotli/enc/transform.h +40 -35
  62. data/vendor/brotli/enc/types.h +2 -0
  63. data/vendor/brotli/enc/utf8_util.cc +3 -2
  64. data/vendor/brotli/enc/write_bits.h +6 -6
  65. metadata +9 -5
  66. data/vendor/brotli/dec/streams.c +0 -102
  67. data/vendor/brotli/dec/streams.h +0 -95
@@ -18,6 +18,7 @@
18
18
 
19
19
  #include <vector>
20
20
 
21
+ #include "./entropy_encode.h"
21
22
  #include "./metablock.h"
22
23
  #include "./types.h"
23
24
 
@@ -27,113 +28,151 @@ namespace brotli {
27
28
  // position for the current storage.
28
29
 
29
30
  // Stores a number between 0 and 255.
30
- void StoreVarLenUint8(int n, int* storage_ix, uint8_t* storage);
31
+ void StoreVarLenUint8(size_t n, size_t* storage_ix, uint8_t* storage);
31
32
 
32
33
  // Stores the compressed meta-block header.
33
- bool StoreCompressedMetaBlockHeader(bool final_block,
34
+ // REQUIRES: length > 0
35
+ // REQUIRES: length <= (1 << 24)
36
+ void StoreCompressedMetaBlockHeader(bool final_block,
34
37
  size_t length,
35
- int* storage_ix,
38
+ size_t* storage_ix,
36
39
  uint8_t* storage);
37
40
 
38
41
  // Stores the uncompressed meta-block header.
39
- bool StoreUncompressedMetaBlockHeader(size_t length,
40
- int* storage_ix,
42
+ // REQUIRES: length > 0
43
+ // REQUIRES: length <= (1 << 24)
44
+ void StoreUncompressedMetaBlockHeader(size_t length,
45
+ size_t* storage_ix,
41
46
  uint8_t* storage);
42
47
 
43
48
  // Stores a context map where the histogram type is always the block type.
44
- void StoreTrivialContextMap(int num_types,
45
- int context_bits,
46
- int* storage_ix,
49
+ void StoreTrivialContextMap(size_t num_types,
50
+ size_t context_bits,
51
+ HuffmanTree* tree,
52
+ size_t* storage_ix,
47
53
  uint8_t* storage);
48
54
 
49
55
  void StoreHuffmanTreeOfHuffmanTreeToBitMask(
50
56
  const int num_codes,
51
57
  const uint8_t *code_length_bitdepth,
52
- int *storage_ix,
58
+ size_t *storage_ix,
53
59
  uint8_t *storage);
54
60
 
61
+ void StoreHuffmanTree(const uint8_t* depths, size_t num, HuffmanTree* tree,
62
+ size_t *storage_ix, uint8_t *storage);
63
+
55
64
  // Builds a Huffman tree from histogram[0:length] into depth[0:length] and
56
65
  // bits[0:length] and stores the encoded tree to the bit stream.
57
- void BuildAndStoreHuffmanTree(const int *histogram,
58
- const int length,
66
+ void BuildAndStoreHuffmanTree(const uint32_t *histogram,
67
+ const size_t length,
68
+ HuffmanTree* tree,
59
69
  uint8_t* depth,
60
70
  uint16_t* bits,
61
- int* storage_ix,
71
+ size_t* storage_ix,
62
72
  uint8_t* storage);
63
73
 
74
+ void BuildAndStoreHuffmanTreeFast(const uint32_t *histogram,
75
+ const size_t histogram_total,
76
+ const size_t max_bits,
77
+ uint8_t* depth,
78
+ uint16_t* bits,
79
+ size_t* storage_ix,
80
+ uint8_t* storage);
81
+
64
82
  // Encodes the given context map to the bit stream. The number of different
65
83
  // histogram ids is given by num_clusters.
66
- void EncodeContextMap(const std::vector<int>& context_map,
67
- int num_clusters,
68
- int* storage_ix, uint8_t* storage);
84
+ void EncodeContextMap(const std::vector<uint32_t>& context_map,
85
+ size_t num_clusters,
86
+ HuffmanTree* tree,
87
+ size_t* storage_ix, uint8_t* storage);
69
88
 
70
89
  // Data structure that stores everything that is needed to encode each block
71
90
  // switch command.
72
91
  struct BlockSplitCode {
73
- std::vector<int> type_code;
74
- std::vector<int> length_prefix;
75
- std::vector<int> length_nextra;
76
- std::vector<int> length_extra;
92
+ std::vector<uint32_t> type_code;
93
+ std::vector<uint32_t> length_prefix;
94
+ std::vector<uint32_t> length_nextra;
95
+ std::vector<uint32_t> length_extra;
77
96
  std::vector<uint8_t> type_depths;
78
97
  std::vector<uint16_t> type_bits;
79
- std::vector<uint8_t> length_depths;
80
- std::vector<uint16_t> length_bits;
98
+ uint8_t length_depths[kNumBlockLenPrefixes];
99
+ uint16_t length_bits[kNumBlockLenPrefixes];
81
100
  };
82
101
 
83
102
  // Builds a BlockSplitCode data structure from the block split given by the
84
103
  // vector of block types and block lengths and stores it to the bit stream.
85
- void BuildAndStoreBlockSplitCode(const std::vector<int>& types,
86
- const std::vector<int>& lengths,
87
- const int num_types,
104
+ void BuildAndStoreBlockSplitCode(const std::vector<uint8_t>& types,
105
+ const std::vector<uint32_t>& lengths,
106
+ const size_t num_types,
88
107
  BlockSplitCode* code,
89
- int* storage_ix,
108
+ size_t* storage_ix,
90
109
  uint8_t* storage);
91
110
 
92
111
  // Stores the block switch command with index block_ix to the bit stream.
93
112
  void StoreBlockSwitch(const BlockSplitCode& code,
94
- const int block_ix,
95
- int* storage_ix,
113
+ const size_t block_ix,
114
+ size_t* storage_ix,
96
115
  uint8_t* storage);
97
116
 
98
- bool StoreMetaBlock(const uint8_t* input,
117
+ // REQUIRES: length > 0
118
+ // REQUIRES: length <= (1 << 24)
119
+ void StoreMetaBlock(const uint8_t* input,
99
120
  size_t start_pos,
100
121
  size_t length,
101
122
  size_t mask,
102
123
  uint8_t prev_byte,
103
124
  uint8_t prev_byte2,
104
125
  bool final_block,
105
- int num_direct_distance_codes,
106
- int distance_postfix_bits,
107
- int literal_context_mode,
126
+ uint32_t num_direct_distance_codes,
127
+ uint32_t distance_postfix_bits,
128
+ ContextType literal_context_mode,
108
129
  const brotli::Command *commands,
109
130
  size_t n_commands,
110
131
  const MetaBlockSplit& mb,
111
- int *storage_ix,
132
+ size_t *storage_ix,
112
133
  uint8_t *storage);
113
134
 
114
135
  // Stores the meta-block without doing any block splitting, just collects
115
136
  // one histogram per block category and uses that for entropy coding.
116
- bool StoreMetaBlockTrivial(const uint8_t* input,
137
+ // REQUIRES: length > 0
138
+ // REQUIRES: length <= (1 << 24)
139
+ void StoreMetaBlockTrivial(const uint8_t* input,
117
140
  size_t start_pos,
118
141
  size_t length,
119
142
  size_t mask,
120
143
  bool is_last,
121
144
  const brotli::Command *commands,
122
145
  size_t n_commands,
123
- int *storage_ix,
146
+ size_t *storage_ix,
124
147
  uint8_t *storage);
125
148
 
149
+ // Same as above, but uses static prefix codes for histograms with only a few
150
+ // symbols, and uses static code length prefix codes for all other histograms.
151
+ // REQUIRES: length > 0
152
+ // REQUIRES: length <= (1 << 24)
153
+ void StoreMetaBlockFast(const uint8_t* input,
154
+ size_t start_pos,
155
+ size_t length,
156
+ size_t mask,
157
+ bool is_last,
158
+ const brotli::Command *commands,
159
+ size_t n_commands,
160
+ size_t *storage_ix,
161
+ uint8_t *storage);
162
+
126
163
  // This is for storing uncompressed blocks (simple raw storage of
127
164
  // bytes-as-bytes).
128
- bool StoreUncompressedMetaBlock(bool final_block,
165
+ // REQUIRES: length > 0
166
+ // REQUIRES: length <= (1 << 24)
167
+ void StoreUncompressedMetaBlock(bool final_block,
129
168
  const uint8_t* input,
130
169
  size_t position, size_t mask,
131
170
  size_t len,
132
- int* storage_ix,
171
+ size_t* storage_ix,
133
172
  uint8_t* storage);
134
173
 
135
174
  // Stores an empty metadata meta-block and syncs to a byte boundary.
136
- void StoreSyncMetaBlock(int* storage_ix, uint8_t* storage);
175
+ void StoreSyncMetaBlock(size_t* storage_ix, uint8_t* storage);
137
176
 
138
177
  } // namespace brotli
139
178
 
@@ -10,11 +10,7 @@
10
10
  #define BROTLI_ENC_CLUSTER_H_
11
11
 
12
12
  #include <math.h>
13
- #include <stdio.h>
14
13
  #include <algorithm>
15
- #include <complex>
16
- #include <map>
17
- #include <set>
18
14
  #include <utility>
19
15
  #include <vector>
20
16
 
@@ -28,41 +24,41 @@
28
24
  namespace brotli {
29
25
 
30
26
  struct HistogramPair {
31
- int idx1;
32
- int idx2;
33
- bool valid;
27
+ uint32_t idx1;
28
+ uint32_t idx2;
34
29
  double cost_combo;
35
30
  double cost_diff;
36
31
  };
37
32
 
38
- struct HistogramPairComparator {
39
- bool operator()(const HistogramPair& p1, const HistogramPair& p2) const {
40
- if (p1.cost_diff != p2.cost_diff) {
41
- return p1.cost_diff > p2.cost_diff;
42
- }
43
- return abs(p1.idx1 - p1.idx2) > abs(p2.idx1 - p2.idx2);
33
+ inline bool operator<(const HistogramPair& p1, const HistogramPair& p2) {
34
+ if (p1.cost_diff != p2.cost_diff) {
35
+ return p1.cost_diff > p2.cost_diff;
44
36
  }
45
- };
37
+ return (p1.idx2 - p1.idx1) > (p2.idx2 - p2.idx1);
38
+ }
46
39
 
47
40
  // Returns entropy reduction of the context map when we combine two clusters.
48
- inline double ClusterCostDiff(int size_a, int size_b) {
49
- int size_c = size_a + size_b;
50
- return size_a * FastLog2(size_a) + size_b * FastLog2(size_b) -
51
- size_c * FastLog2(size_c);
41
+ inline double ClusterCostDiff(size_t size_a, size_t size_b) {
42
+ size_t size_c = size_a + size_b;
43
+ return static_cast<double>(size_a) * FastLog2(size_a) +
44
+ static_cast<double>(size_b) * FastLog2(size_b) -
45
+ static_cast<double>(size_c) * FastLog2(size_c);
52
46
  }
53
47
 
54
48
  // Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
55
- // it is below a threshold, stores the pair (idx1, idx2) in the *pairs heap.
49
+ // it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue.
56
50
  template<typename HistogramType>
57
- void CompareAndPushToHeap(const HistogramType* out,
58
- const int* cluster_size,
59
- int idx1, int idx2,
60
- std::vector<HistogramPair>* pairs) {
51
+ void CompareAndPushToQueue(const HistogramType* out,
52
+ const uint32_t* cluster_size,
53
+ uint32_t idx1, uint32_t idx2,
54
+ size_t max_num_pairs,
55
+ HistogramPair* pairs,
56
+ size_t* num_pairs) {
61
57
  if (idx1 == idx2) {
62
58
  return;
63
59
  }
64
60
  if (idx2 < idx1) {
65
- int t = idx2;
61
+ uint32_t t = idx2;
66
62
  idx2 = idx1;
67
63
  idx1 = t;
68
64
  }
@@ -70,7 +66,6 @@ void CompareAndPushToHeap(const HistogramType* out,
70
66
  HistogramPair p;
71
67
  p.idx1 = idx1;
72
68
  p.idx2 = idx2;
73
- p.valid = true;
74
69
  p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
75
70
  p.cost_diff -= out[idx1].bit_cost_;
76
71
  p.cost_diff -= out[idx2].bit_cost_;
@@ -82,8 +77,8 @@ void CompareAndPushToHeap(const HistogramType* out,
82
77
  p.cost_combo = out[idx1].bit_cost_;
83
78
  store_pair = true;
84
79
  } else {
85
- double threshold = pairs->empty() ? 1e99 :
86
- std::max(0.0, (*pairs)[0].cost_diff);
80
+ double threshold = *num_pairs == 0 ? 1e99 :
81
+ std::max(0.0, pairs[0].cost_diff);
87
82
  HistogramType combo = out[idx1];
88
83
  combo.AddHistogram(out[idx2]);
89
84
  double cost_combo = PopulationCost(combo);
@@ -94,81 +89,96 @@ void CompareAndPushToHeap(const HistogramType* out,
94
89
  }
95
90
  if (store_pair) {
96
91
  p.cost_diff += p.cost_combo;
97
- pairs->push_back(p);
98
- std::push_heap(pairs->begin(), pairs->end(), HistogramPairComparator());
92
+ if (*num_pairs > 0 && pairs[0] < p) {
93
+ // Replace the top of the queue if needed.
94
+ if (*num_pairs < max_num_pairs) {
95
+ pairs[*num_pairs] = pairs[0];
96
+ ++(*num_pairs);
97
+ }
98
+ pairs[0] = p;
99
+ } else if (*num_pairs < max_num_pairs) {
100
+ pairs[*num_pairs] = p;
101
+ ++(*num_pairs);
102
+ }
99
103
  }
100
104
  }
101
105
 
102
106
  template<typename HistogramType>
103
- void HistogramCombine(HistogramType* out,
104
- int* cluster_size,
105
- int* symbols,
106
- int symbols_size,
107
- size_t max_clusters) {
107
+ size_t HistogramCombine(HistogramType* out,
108
+ uint32_t* cluster_size,
109
+ uint32_t* symbols,
110
+ uint32_t* clusters,
111
+ HistogramPair* pairs,
112
+ size_t num_clusters,
113
+ size_t symbols_size,
114
+ size_t max_clusters,
115
+ size_t max_num_pairs) {
108
116
  double cost_diff_threshold = 0.0;
109
117
  size_t min_cluster_size = 1;
110
- std::set<int> all_symbols;
111
- std::vector<int> clusters;
112
- for (int i = 0; i < symbols_size; ++i) {
113
- if (all_symbols.find(symbols[i]) == all_symbols.end()) {
114
- all_symbols.insert(symbols[i]);
115
- if (!clusters.empty()) {
116
- BROTLI_DCHECK(clusters.back() < symbols[i]);
117
- }
118
- clusters.push_back(symbols[i]);
119
- }
120
- }
121
118
 
122
- // We maintain a heap of histogram pairs, ordered by the bit cost reduction.
123
- std::vector<HistogramPair> pairs;
124
- for (size_t idx1 = 0; idx1 < clusters.size(); ++idx1) {
125
- for (size_t idx2 = idx1 + 1; idx2 < clusters.size(); ++idx2) {
126
- CompareAndPushToHeap(out, cluster_size, clusters[idx1], clusters[idx2],
127
- &pairs);
119
+ // We maintain a vector of histogram pairs, with the property that the pair
120
+ // with the maximum bit cost reduction is the first.
121
+ size_t num_pairs = 0;
122
+ for (size_t idx1 = 0; idx1 < num_clusters; ++idx1) {
123
+ for (size_t idx2 = idx1 + 1; idx2 < num_clusters; ++idx2) {
124
+ CompareAndPushToQueue(out, cluster_size, clusters[idx1], clusters[idx2],
125
+ max_num_pairs, &pairs[0], &num_pairs);
128
126
  }
129
127
  }
130
128
 
131
- while (clusters.size() > min_cluster_size) {
129
+ while (num_clusters > min_cluster_size) {
132
130
  if (pairs[0].cost_diff >= cost_diff_threshold) {
133
131
  cost_diff_threshold = 1e99;
134
132
  min_cluster_size = max_clusters;
135
133
  continue;
136
134
  }
137
135
  // Take the best pair from the top of heap.
138
- int best_idx1 = pairs[0].idx1;
139
- int best_idx2 = pairs[0].idx2;
136
+ uint32_t best_idx1 = pairs[0].idx1;
137
+ uint32_t best_idx2 = pairs[0].idx2;
140
138
  out[best_idx1].AddHistogram(out[best_idx2]);
141
139
  out[best_idx1].bit_cost_ = pairs[0].cost_combo;
142
140
  cluster_size[best_idx1] += cluster_size[best_idx2];
143
- for (int i = 0; i < symbols_size; ++i) {
141
+ for (size_t i = 0; i < symbols_size; ++i) {
144
142
  if (symbols[i] == best_idx2) {
145
143
  symbols[i] = best_idx1;
146
144
  }
147
145
  }
148
- for (size_t i = 0; i + 1 < clusters.size(); ++i) {
149
- if (clusters[i] >= best_idx2) {
150
- clusters[i] = clusters[i + 1];
146
+ for (size_t i = 0; i < num_clusters; ++i) {
147
+ if (clusters[i] == best_idx2) {
148
+ memmove(&clusters[i], &clusters[i + 1],
149
+ (num_clusters - i - 1) * sizeof(clusters[0]));
150
+ break;
151
151
  }
152
152
  }
153
- clusters.pop_back();
154
- // Invalidate pairs intersecting the just combined best pair.
155
- for (size_t i = 0; i < pairs.size(); ++i) {
153
+ --num_clusters;
154
+ // Remove pairs intersecting the just combined best pair.
155
+ size_t copy_to_idx = 0;
156
+ for (size_t i = 0; i < num_pairs; ++i) {
156
157
  HistogramPair& p = pairs[i];
157
158
  if (p.idx1 == best_idx1 || p.idx2 == best_idx1 ||
158
159
  p.idx1 == best_idx2 || p.idx2 == best_idx2) {
159
- p.valid = false;
160
+ // Remove invalid pair from the queue.
161
+ continue;
160
162
  }
163
+ if (pairs[0] < p) {
164
+ // Replace the top of the queue if needed.
165
+ HistogramPair front = pairs[0];
166
+ pairs[0] = p;
167
+ pairs[copy_to_idx] = front;
168
+ } else {
169
+ pairs[copy_to_idx] = p;
170
+ }
171
+ ++copy_to_idx;
161
172
  }
162
- // Pop invalid pairs from the top of the heap.
163
- while (!pairs.empty() && !pairs[0].valid) {
164
- std::pop_heap(pairs.begin(), pairs.end(), HistogramPairComparator());
165
- pairs.pop_back();
166
- }
173
+ num_pairs = copy_to_idx;
174
+
167
175
  // Push new pairs formed with the combined histogram to the heap.
168
- for (size_t i = 0; i < clusters.size(); ++i) {
169
- CompareAndPushToHeap(out, cluster_size, best_idx1, clusters[i], &pairs);
176
+ for (size_t i = 0; i < num_clusters; ++i) {
177
+ CompareAndPushToQueue(out, cluster_size, best_idx1, clusters[i],
178
+ max_num_pairs, &pairs[0], &num_pairs);
170
179
  }
171
180
  }
181
+ return num_clusters;
172
182
  }
173
183
 
174
184
  // -----------------------------------------------------------------------------
@@ -187,58 +197,69 @@ double HistogramBitCostDistance(const HistogramType& histogram,
187
197
  }
188
198
 
189
199
  // Find the best 'out' histogram for each of the 'in' histograms.
200
+ // When called, clusters[0..num_clusters) contains the unique values from
201
+ // symbols[0..in_size), but this property is not preserved in this function.
190
202
  // Note: we assume that out[]->bit_cost_ is already up-to-date.
191
203
  template<typename HistogramType>
192
- void HistogramRemap(const HistogramType* in, int in_size,
193
- HistogramType* out, int* symbols) {
194
- std::set<int> all_symbols;
195
- for (int i = 0; i < in_size; ++i) {
196
- all_symbols.insert(symbols[i]);
197
- }
198
- for (int i = 0; i < in_size; ++i) {
199
- int best_out = i == 0 ? symbols[0] : symbols[i - 1];
204
+ void HistogramRemap(const HistogramType* in, size_t in_size,
205
+ const uint32_t* clusters, size_t num_clusters,
206
+ HistogramType* out, uint32_t* symbols) {
207
+ for (size_t i = 0; i < in_size; ++i) {
208
+ uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1];
200
209
  double best_bits = HistogramBitCostDistance(in[i], out[best_out]);
201
- for (std::set<int>::const_iterator k = all_symbols.begin();
202
- k != all_symbols.end(); ++k) {
203
- const double cur_bits = HistogramBitCostDistance(in[i], out[*k]);
210
+ for (size_t j = 0; j < num_clusters; ++j) {
211
+ const double cur_bits = HistogramBitCostDistance(in[i], out[clusters[j]]);
204
212
  if (cur_bits < best_bits) {
205
213
  best_bits = cur_bits;
206
- best_out = *k;
214
+ best_out = clusters[j];
207
215
  }
208
216
  }
209
217
  symbols[i] = best_out;
210
218
  }
211
219
 
212
-
213
220
  // Recompute each out based on raw and symbols.
214
- for (std::set<int>::const_iterator k = all_symbols.begin();
215
- k != all_symbols.end(); ++k) {
216
- out[*k].Clear();
221
+ for (size_t j = 0; j < num_clusters; ++j) {
222
+ out[clusters[j]].Clear();
217
223
  }
218
- for (int i = 0; i < in_size; ++i) {
224
+ for (size_t i = 0; i < in_size; ++i) {
219
225
  out[symbols[i]].AddHistogram(in[i]);
220
226
  }
221
227
  }
222
228
 
223
- // Reorder histograms in *out so that the new symbols in *symbols come in
224
- // increasing order.
229
+ // Reorders elements of the out[0..length) array and changes values in
230
+ // symbols[0..length) array in the following way:
231
+ // * when called, symbols[] contains indexes into out[], and has N unique
232
+ // values (possibly N < length)
233
+ // * on return, symbols'[i] = f(symbols[i]) and
234
+ // out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
235
+ // where f is a bijection between the range of symbols[] and [0..N), and
236
+ // the first occurrences of values in symbols'[i] come in consecutive
237
+ // increasing order.
238
+ // Returns N, the number of unique values in symbols[].
225
239
  template<typename HistogramType>
226
- void HistogramReindex(std::vector<HistogramType>* out,
227
- std::vector<int>* symbols) {
228
- std::vector<HistogramType> tmp(*out);
229
- std::map<int, int> new_index;
230
- int next_index = 0;
231
- for (size_t i = 0; i < symbols->size(); ++i) {
232
- if (new_index.find((*symbols)[i]) == new_index.end()) {
233
- new_index[(*symbols)[i]] = next_index;
234
- (*out)[next_index] = tmp[(*symbols)[i]];
240
+ size_t HistogramReindex(HistogramType* out, uint32_t* symbols, size_t length) {
241
+ static const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
242
+ std::vector<uint32_t> new_index(length, kInvalidIndex);
243
+ uint32_t next_index = 0;
244
+ for (size_t i = 0; i < length; ++i) {
245
+ if (new_index[symbols[i]] == kInvalidIndex) {
246
+ new_index[symbols[i]] = next_index;
247
+ ++next_index;
248
+ }
249
+ }
250
+ std::vector<HistogramType> tmp(next_index);
251
+ next_index = 0;
252
+ for (size_t i = 0; i < length; ++i) {
253
+ if (new_index[symbols[i]] == next_index) {
254
+ tmp[next_index] = out[symbols[i]];
235
255
  ++next_index;
236
256
  }
257
+ symbols[i] = new_index[symbols[i]];
237
258
  }
238
- out->resize(next_index);
239
- for (size_t i = 0; i < symbols->size(); ++i) {
240
- (*symbols)[i] = new_index[(*symbols)[i]];
259
+ for (size_t i = 0; i < next_index; ++i) {
260
+ out[i] = tmp[i];
241
261
  }
262
+ return next_index;
242
263
  }
243
264
 
244
265
  // Clusters similar histograms in 'in' together, the selected histograms are
@@ -246,44 +267,64 @@ void HistogramReindex(std::vector<HistogramType>* out,
246
267
  // indicate which of the 'out' histograms is the best approximation.
247
268
  template<typename HistogramType>
248
269
  void ClusterHistograms(const std::vector<HistogramType>& in,
249
- int num_contexts, int num_blocks,
270
+ size_t num_contexts, size_t num_blocks,
250
271
  size_t max_histograms,
251
272
  std::vector<HistogramType>* out,
252
- std::vector<int>* histogram_symbols) {
253
- const int in_size = num_contexts * num_blocks;
254
- BROTLI_DCHECK(in_size == in.size());
255
- std::vector<int> cluster_size(in_size, 1);
273
+ std::vector<uint32_t>* histogram_symbols) {
274
+ const size_t in_size = num_contexts * num_blocks;
275
+ assert(in_size == in.size());
276
+ std::vector<uint32_t> cluster_size(in_size, 1);
277
+ std::vector<uint32_t> clusters(in_size);
278
+ size_t num_clusters = 0;
256
279
  out->resize(in_size);
257
280
  histogram_symbols->resize(in_size);
258
- for (int i = 0; i < in_size; ++i) {
281
+ for (size_t i = 0; i < in_size; ++i) {
259
282
  (*out)[i] = in[i];
260
283
  (*out)[i].bit_cost_ = PopulationCost(in[i]);
261
- (*histogram_symbols)[i] = i;
284
+ (*histogram_symbols)[i] = static_cast<uint32_t>(i);
262
285
  }
263
286
 
287
+ const size_t max_input_histograms = 64;
288
+ // For the first pass of clustering, we allow all pairs.
289
+ size_t max_num_pairs = max_input_histograms * max_input_histograms / 2;
290
+ std::vector<HistogramPair> pairs(max_num_pairs + 1);
264
291
 
265
- const int max_input_histograms = 64;
266
- for (int i = 0; i < in_size; i += max_input_histograms) {
267
- int num_to_combine = std::min(in_size - i, max_input_histograms);
268
- HistogramCombine(&(*out)[0], &cluster_size[0],
269
- &(*histogram_symbols)[i], num_to_combine,
270
- max_histograms);
292
+ for (size_t i = 0; i < in_size; i += max_input_histograms) {
293
+ size_t num_to_combine = std::min(in_size - i, max_input_histograms);
294
+ for (size_t j = 0; j < num_to_combine; ++j) {
295
+ clusters[num_clusters + j] = static_cast<uint32_t>(i + j);
296
+ }
297
+ size_t num_new_clusters =
298
+ HistogramCombine(&(*out)[0], &cluster_size[0],
299
+ &(*histogram_symbols)[i],
300
+ &clusters[num_clusters], &pairs[0],
301
+ num_to_combine, num_to_combine,
302
+ max_histograms, max_num_pairs);
303
+ num_clusters += num_new_clusters;
271
304
  }
272
305
 
306
+ // For the second pass, we limit the total number of histogram pairs.
307
+ // After this limit is reached, we only keep searching for the best pair.
308
+ max_num_pairs =
309
+ std::min(64 * num_clusters, (num_clusters / 2) * num_clusters);
310
+ pairs.resize(max_num_pairs + 1);
311
+
273
312
  // Collapse similar histograms.
274
- HistogramCombine(&(*out)[0], &cluster_size[0],
275
- &(*histogram_symbols)[0], in_size,
276
- max_histograms);
313
+ num_clusters = HistogramCombine(&(*out)[0], &cluster_size[0],
314
+ &(*histogram_symbols)[0], &clusters[0],
315
+ &pairs[0], num_clusters, in_size,
316
+ max_histograms, max_num_pairs);
277
317
 
278
318
  // Find the optimal map from original histograms to the final ones.
279
- HistogramRemap(&in[0], in_size, &(*out)[0], &(*histogram_symbols)[0]);
319
+ HistogramRemap(&in[0], in_size, &clusters[0], num_clusters,
320
+ &(*out)[0], &(*histogram_symbols)[0]);
280
321
 
281
322
  // Convert the context map to a canonical form.
282
- HistogramReindex(out, histogram_symbols);
283
-
323
+ size_t num_histograms =
324
+ HistogramReindex(&(*out)[0], &(*histogram_symbols)[0], in_size);
325
+ out->resize(num_histograms);
284
326
  }
285
327
 
286
-
287
328
  } // namespace brotli
288
329
 
289
330
  #endif // BROTLI_ENC_CLUSTER_H_