brotli 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +4 -4
  2. data/ext/brotli/brotli.cc +114 -24
  3. data/ext/brotli/brotli.h +0 -1
  4. data/ext/brotli/extconf.rb +30 -23
  5. data/lib/brotli/version.rb +1 -1
  6. data/vendor/brotli/LICENSE +1 -1
  7. data/vendor/brotli/dec/Makefile +1 -1
  8. data/vendor/brotli/dec/bit_reader.c +3 -3
  9. data/vendor/brotli/dec/bit_reader.h +25 -27
  10. data/vendor/brotli/dec/context.h +4 -4
  11. data/vendor/brotli/dec/decode.c +410 -486
  12. data/vendor/brotli/dec/decode.h +101 -105
  13. data/vendor/brotli/dec/dictionary.c +1 -1
  14. data/vendor/brotli/dec/dictionary.h +7 -8
  15. data/vendor/brotli/dec/huffman.c +103 -105
  16. data/vendor/brotli/dec/huffman.h +18 -18
  17. data/vendor/brotli/dec/port.h +52 -40
  18. data/vendor/brotli/dec/prefix.h +2 -0
  19. data/vendor/brotli/dec/state.c +13 -19
  20. data/vendor/brotli/dec/state.h +25 -39
  21. data/vendor/brotli/dec/transform.h +38 -44
  22. data/vendor/brotli/dec/types.h +2 -2
  23. data/vendor/brotli/enc/Makefile +1 -1
  24. data/vendor/brotli/enc/backward_references.cc +455 -359
  25. data/vendor/brotli/enc/backward_references.h +79 -3
  26. data/vendor/brotli/enc/bit_cost.h +54 -32
  27. data/vendor/brotli/enc/block_splitter.cc +285 -193
  28. data/vendor/brotli/enc/block_splitter.h +4 -12
  29. data/vendor/brotli/enc/brotli_bit_stream.cc +623 -324
  30. data/vendor/brotli/enc/brotli_bit_stream.h +76 -37
  31. data/vendor/brotli/enc/cluster.h +161 -120
  32. data/vendor/brotli/enc/command.h +60 -37
  33. data/vendor/brotli/enc/compress_fragment.cc +701 -0
  34. data/vendor/brotli/enc/compress_fragment.h +47 -0
  35. data/vendor/brotli/enc/compress_fragment_two_pass.cc +524 -0
  36. data/vendor/brotli/enc/compress_fragment_two_pass.h +40 -0
  37. data/vendor/brotli/enc/compressor.h +15 -0
  38. data/vendor/brotli/enc/context.h +1 -1
  39. data/vendor/brotli/enc/dictionary.h +2 -2
  40. data/vendor/brotli/enc/encode.cc +819 -286
  41. data/vendor/brotli/enc/encode.h +38 -15
  42. data/vendor/brotli/enc/encode_parallel.cc +40 -42
  43. data/vendor/brotli/enc/entropy_encode.cc +144 -147
  44. data/vendor/brotli/enc/entropy_encode.h +32 -8
  45. data/vendor/brotli/enc/entropy_encode_static.h +572 -0
  46. data/vendor/brotli/enc/fast_log.h +7 -40
  47. data/vendor/brotli/enc/find_match_length.h +9 -9
  48. data/vendor/brotli/enc/hash.h +462 -154
  49. data/vendor/brotli/enc/histogram.cc +6 -6
  50. data/vendor/brotli/enc/histogram.h +13 -13
  51. data/vendor/brotli/enc/literal_cost.cc +45 -45
  52. data/vendor/brotli/enc/metablock.cc +92 -89
  53. data/vendor/brotli/enc/metablock.h +12 -12
  54. data/vendor/brotli/enc/port.h +7 -16
  55. data/vendor/brotli/enc/prefix.h +23 -22
  56. data/vendor/brotli/enc/ringbuffer.h +75 -29
  57. data/vendor/brotli/enc/static_dict.cc +56 -48
  58. data/vendor/brotli/enc/static_dict.h +5 -5
  59. data/vendor/brotli/enc/streams.cc +1 -1
  60. data/vendor/brotli/enc/streams.h +5 -5
  61. data/vendor/brotli/enc/transform.h +40 -35
  62. data/vendor/brotli/enc/types.h +2 -0
  63. data/vendor/brotli/enc/utf8_util.cc +3 -2
  64. data/vendor/brotli/enc/write_bits.h +6 -6
  65. metadata +9 -5
  66. data/vendor/brotli/dec/streams.c +0 -102
  67. data/vendor/brotli/dec/streams.h +0 -95
@@ -10,12 +10,10 @@
10
10
 
11
11
  #include <assert.h>
12
12
  #include <math.h>
13
- #include <stdio.h>
14
- #include <stdlib.h>
15
- #include <string.h>
16
13
 
17
14
  #include <algorithm>
18
- #include <map>
15
+ #include <cstring>
16
+ #include <vector>
19
17
 
20
18
  #include "./cluster.h"
21
19
  #include "./command.h"
@@ -24,19 +22,19 @@
24
22
 
25
23
  namespace brotli {
26
24
 
27
- static const int kMaxLiteralHistograms = 100;
28
- static const int kMaxCommandHistograms = 50;
25
+ static const size_t kMaxLiteralHistograms = 100;
26
+ static const size_t kMaxCommandHistograms = 50;
29
27
  static const double kLiteralBlockSwitchCost = 28.1;
30
28
  static const double kCommandBlockSwitchCost = 13.5;
31
29
  static const double kDistanceBlockSwitchCost = 14.6;
32
- static const int kLiteralStrideLength = 70;
33
- static const int kCommandStrideLength = 40;
34
- static const int kSymbolsPerLiteralHistogram = 544;
35
- static const int kSymbolsPerCommandHistogram = 530;
36
- static const int kSymbolsPerDistanceHistogram = 544;
37
- static const int kMinLengthForBlockSplitting = 128;
38
- static const int kIterMulForRefining = 2;
39
- static const int kMinItersForRefining = 100;
30
+ static const size_t kLiteralStrideLength = 70;
31
+ static const size_t kCommandStrideLength = 40;
32
+ static const size_t kSymbolsPerLiteralHistogram = 544;
33
+ static const size_t kSymbolsPerCommandHistogram = 530;
34
+ static const size_t kSymbolsPerDistanceHistogram = 544;
35
+ static const size_t kMinLengthForBlockSplitting = 128;
36
+ static const size_t kIterMulForRefining = 2;
37
+ static const size_t kMinItersForRefining = 100;
40
38
 
41
39
  void CopyLiteralsToByteArray(const Command* cmds,
42
40
  const size_t num_commands,
@@ -72,20 +70,7 @@ void CopyLiteralsToByteArray(const Command* cmds,
72
70
  memcpy(&(*literals)[pos], data + from_pos, insert_len);
73
71
  pos += insert_len;
74
72
  }
75
- from_pos = (from_pos + insert_len + cmds[i].copy_len_) & mask;
76
- }
77
- }
78
-
79
- void CopyCommandsToByteArray(const Command* cmds,
80
- const size_t num_commands,
81
- std::vector<uint16_t>* insert_and_copy_codes,
82
- std::vector<uint16_t>* distance_prefixes) {
83
- for (size_t i = 0; i < num_commands; ++i) {
84
- const Command& cmd = cmds[i];
85
- insert_and_copy_codes->push_back(cmd.cmd_prefix_);
86
- if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
87
- distance_prefixes->push_back(cmd.dist_prefix_);
88
- }
73
+ from_pos = (from_pos + insert_len + cmds[i].copy_len()) & mask;
89
74
  }
90
75
  }
91
76
 
@@ -99,27 +84,23 @@ inline static unsigned int MyRand(unsigned int* seed) {
99
84
 
100
85
  template<typename HistogramType, typename DataType>
101
86
  void InitialEntropyCodes(const DataType* data, size_t length,
102
- int literals_per_histogram,
103
- int max_histograms,
104
87
  size_t stride,
105
- std::vector<HistogramType>* vec) {
106
- int total_histograms = static_cast<int>(length) / literals_per_histogram + 1;
107
- if (total_histograms > max_histograms) {
108
- total_histograms = max_histograms;
88
+ size_t num_histograms,
89
+ HistogramType* histograms) {
90
+ for (size_t i = 0; i < num_histograms; ++i) {
91
+ histograms[i].Clear();
109
92
  }
110
93
  unsigned int seed = 7;
111
- size_t block_length = length / total_histograms;
112
- for (int i = 0; i < total_histograms; ++i) {
113
- size_t pos = length * i / total_histograms;
94
+ size_t block_length = length / num_histograms;
95
+ for (size_t i = 0; i < num_histograms; ++i) {
96
+ size_t pos = length * i / num_histograms;
114
97
  if (i != 0) {
115
98
  pos += MyRand(&seed) % block_length;
116
99
  }
117
100
  if (pos + stride >= length) {
118
101
  pos = length - stride - 1;
119
102
  }
120
- HistogramType histo;
121
- histo.Add(data + pos, stride);
122
- vec->push_back(histo);
103
+ histograms[i].Add(data + pos, stride);
123
104
  }
124
105
  }
125
106
 
@@ -142,50 +123,58 @@ void RandomSample(unsigned int* seed,
142
123
  template<typename HistogramType, typename DataType>
143
124
  void RefineEntropyCodes(const DataType* data, size_t length,
144
125
  size_t stride,
145
- std::vector<HistogramType>* vec) {
126
+ size_t num_histograms,
127
+ HistogramType* histograms) {
146
128
  size_t iters =
147
129
  kIterMulForRefining * length / stride + kMinItersForRefining;
148
130
  unsigned int seed = 7;
149
- iters = ((iters + vec->size() - 1) / vec->size()) * vec->size();
131
+ iters = ((iters + num_histograms - 1) / num_histograms) * num_histograms;
150
132
  for (size_t iter = 0; iter < iters; ++iter) {
151
133
  HistogramType sample;
152
134
  RandomSample(&seed, data, length, stride, &sample);
153
- size_t ix = iter % vec->size();
154
- (*vec)[ix].AddHistogram(sample);
135
+ size_t ix = iter % num_histograms;
136
+ histograms[ix].AddHistogram(sample);
155
137
  }
156
138
  }
157
139
 
158
- inline static double BitCost(int count) {
159
- return count == 0 ? -2 : FastLog2(count);
140
+ inline static double BitCost(size_t count) {
141
+ return count == 0 ? -2.0 : FastLog2(count);
160
142
  }
161
143
 
144
+ // Assigns a block id from the range [0, vec.size()) to each data element
145
+ // in data[0..length) and fills in block_id[0..length) with the assigned values.
146
+ // Returns the number of blocks, i.e. one plus the number of block switches.
162
147
  template<typename DataType, int kSize>
163
- void FindBlocks(const DataType* data, const size_t length,
164
- const double block_switch_bitcost,
165
- const std::vector<Histogram<kSize> > &vec,
166
- uint8_t *block_id) {
167
- if (vec.size() <= 1) {
148
+ size_t FindBlocks(const DataType* data, const size_t length,
149
+ const double block_switch_bitcost,
150
+ const size_t num_histograms,
151
+ const Histogram<kSize>* histograms,
152
+ double* insert_cost,
153
+ double* cost,
154
+ uint8_t* switch_signal,
155
+ uint8_t *block_id) {
156
+ if (num_histograms <= 1) {
168
157
  for (size_t i = 0; i < length; ++i) {
169
158
  block_id[i] = 0;
170
159
  }
171
- return;
160
+ return 1;
172
161
  }
173
- int vecsize = static_cast<int>(vec.size());
174
- assert(vecsize <= 256);
175
- double* insert_cost = new double[kSize * vecsize];
176
- memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * vecsize);
177
- for (int j = 0; j < vecsize; ++j) {
178
- insert_cost[j] = FastLog2(vec[j].total_count_);
162
+ const size_t bitmaplen = (num_histograms + 7) >> 3;
163
+ assert(num_histograms <= 256);
164
+ memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * num_histograms);
165
+ for (size_t j = 0; j < num_histograms; ++j) {
166
+ insert_cost[j] = FastLog2(static_cast<uint32_t>(
167
+ histograms[j].total_count_));
179
168
  }
180
- for (int i = kSize - 1; i >= 0; --i) {
181
- for (int j = 0; j < vecsize; ++j) {
182
- insert_cost[i * vecsize + j] = insert_cost[j] - BitCost(vec[j].data_[i]);
169
+ for (size_t i = kSize; i != 0;) {
170
+ --i;
171
+ for (size_t j = 0; j < num_histograms; ++j) {
172
+ insert_cost[i * num_histograms + j] =
173
+ insert_cost[j] - BitCost(histograms[j].data_[i]);
183
174
  }
184
175
  }
185
- double *cost = new double[vecsize];
186
- memset(cost, 0, sizeof(cost[0]) * vecsize);
187
- bool* switch_signal = new bool[length * vecsize];
188
- memset(switch_signal, 0, sizeof(switch_signal[0]) * length * vecsize);
176
+ memset(cost, 0, sizeof(cost[0]) * num_histograms);
177
+ memset(switch_signal, 0, sizeof(switch_signal[0]) * length * bitmaplen);
189
178
  // After each iteration of this loop, cost[k] will contain the difference
190
179
  // between the minimum cost of arriving at the current byte position using
191
180
  // entropy code k, and the minimum cost of arriving at the current byte
@@ -193,10 +182,10 @@ void FindBlocks(const DataType* data, const size_t length,
193
182
  // reaches block switch cost, it means that when we trace back from the last
194
183
  // position, we need to switch here.
195
184
  for (size_t byte_ix = 0; byte_ix < length; ++byte_ix) {
196
- size_t ix = byte_ix * vecsize;
197
- int insert_cost_ix = data[byte_ix] * vecsize;
185
+ size_t ix = byte_ix * bitmaplen;
186
+ size_t insert_cost_ix = data[byte_ix] * num_histograms;
198
187
  double min_cost = 1e99;
199
- for (int k = 0; k < vecsize; ++k) {
188
+ for (size_t k = 0; k < num_histograms; ++k) {
200
189
  // We are coding the symbol in data[byte_ix] with entropy code k.
201
190
  cost[k] += insert_cost[insert_cost_ix + k];
202
191
  if (cost[k] < min_cost) {
@@ -207,116 +196,206 @@ void FindBlocks(const DataType* data, const size_t length,
207
196
  double block_switch_cost = block_switch_bitcost;
208
197
  // More blocks for the beginning.
209
198
  if (byte_ix < 2000) {
210
- block_switch_cost *= 0.77 + 0.07 * byte_ix / 2000;
199
+ block_switch_cost *= 0.77 + 0.07 * static_cast<double>(byte_ix) / 2000;
211
200
  }
212
- for (int k = 0; k < vecsize; ++k) {
201
+ for (size_t k = 0; k < num_histograms; ++k) {
213
202
  cost[k] -= min_cost;
214
203
  if (cost[k] >= block_switch_cost) {
215
204
  cost[k] = block_switch_cost;
216
- switch_signal[ix + k] = true;
205
+ const uint8_t mask = static_cast<uint8_t>(1u << (k & 7));
206
+ assert((k >> 3) < bitmaplen);
207
+ switch_signal[ix + (k >> 3)] |= mask;
217
208
  }
218
209
  }
219
210
  }
220
211
  // Now trace back from the last position and switch at the marked places.
221
212
  size_t byte_ix = length - 1;
222
- size_t ix = byte_ix * vecsize;
213
+ size_t ix = byte_ix * bitmaplen;
223
214
  uint8_t cur_id = block_id[byte_ix];
215
+ size_t num_blocks = 1;
224
216
  while (byte_ix > 0) {
225
217
  --byte_ix;
226
- ix -= vecsize;
227
- if (switch_signal[ix + cur_id]) {
228
- cur_id = block_id[byte_ix];
218
+ ix -= bitmaplen;
219
+ const uint8_t mask = static_cast<uint8_t>(1u << (cur_id & 7));
220
+ assert((static_cast<size_t>(cur_id) >> 3) < bitmaplen);
221
+ if (switch_signal[ix + (cur_id >> 3)] & mask) {
222
+ if (cur_id != block_id[byte_ix]) {
223
+ cur_id = block_id[byte_ix];
224
+ ++num_blocks;
225
+ }
229
226
  }
230
227
  block_id[byte_ix] = cur_id;
231
228
  }
232
- delete[] insert_cost;
233
- delete[] cost;
234
- delete[] switch_signal;
229
+ return num_blocks;
235
230
  }
236
231
 
237
- int RemapBlockIds(uint8_t* block_ids, const size_t length) {
238
- std::map<uint8_t, uint8_t> new_id;
239
- int next_id = 0;
232
+ static size_t RemapBlockIds(uint8_t* block_ids, const size_t length,
233
+ uint16_t* new_id, const size_t num_histograms) {
234
+ static const uint16_t kInvalidId = 256;
235
+ for (size_t i = 0; i < num_histograms; ++i) {
236
+ new_id[i] = kInvalidId;
237
+ }
238
+ uint16_t next_id = 0;
240
239
  for (size_t i = 0; i < length; ++i) {
241
- if (new_id.find(block_ids[i]) == new_id.end()) {
242
- new_id[block_ids[i]] = static_cast<uint8_t>(next_id);
243
- ++next_id;
240
+ assert(block_ids[i] < num_histograms);
241
+ if (new_id[block_ids[i]] == kInvalidId) {
242
+ new_id[block_ids[i]] = next_id++;
244
243
  }
245
244
  }
246
245
  for (size_t i = 0; i < length; ++i) {
247
- block_ids[i] = new_id[block_ids[i]];
246
+ block_ids[i] = static_cast<uint8_t>(new_id[block_ids[i]]);
247
+ assert(block_ids[i] < num_histograms);
248
248
  }
249
+ assert(next_id <= num_histograms);
249
250
  return next_id;
250
251
  }
251
252
 
252
253
  template<typename HistogramType, typename DataType>
253
254
  void BuildBlockHistograms(const DataType* data, const size_t length,
254
- uint8_t* block_ids,
255
- std::vector<HistogramType>* histograms) {
256
- int num_types = RemapBlockIds(block_ids, length);
257
- assert(num_types <= 256);
258
- histograms->clear();
259
- histograms->resize(num_types);
255
+ const uint8_t* block_ids,
256
+ const size_t num_histograms,
257
+ HistogramType* histograms) {
258
+ for (size_t i = 0; i < num_histograms; ++i) {
259
+ histograms[i].Clear();
260
+ }
260
261
  for (size_t i = 0; i < length; ++i) {
261
- (*histograms)[block_ids[i]].Add(data[i]);
262
+ histograms[block_ids[i]].Add(data[i]);
262
263
  }
263
264
  }
264
265
 
265
266
  template<typename HistogramType, typename DataType>
266
267
  void ClusterBlocks(const DataType* data, const size_t length,
267
- uint8_t* block_ids) {
268
- std::vector<HistogramType> histograms;
269
- std::vector<int> block_index(length);
270
- int cur_idx = 0;
271
- HistogramType cur_histogram;
268
+ const size_t num_blocks,
269
+ uint8_t* block_ids,
270
+ BlockSplit* split) {
271
+ static const size_t kMaxNumberOfBlockTypes = 256;
272
+ static const size_t kHistogramsPerBatch = 64;
273
+ static const size_t kClustersPerBatch = 16;
274
+ std::vector<uint32_t> histogram_symbols(num_blocks);
275
+ std::vector<uint32_t> block_lengths(num_blocks);
276
+
277
+ size_t block_idx = 0;
272
278
  for (size_t i = 0; i < length; ++i) {
273
- bool block_boundary = (i + 1 == length || block_ids[i] != block_ids[i + 1]);
274
- block_index[i] = cur_idx;
275
- cur_histogram.Add(data[i]);
276
- if (block_boundary) {
277
- histograms.push_back(cur_histogram);
278
- cur_histogram.Clear();
279
- ++cur_idx;
279
+ assert(block_idx < num_blocks);
280
+ ++block_lengths[block_idx];
281
+ if (i + 1 == length || block_ids[i] != block_ids[i + 1]) {
282
+ ++block_idx;
280
283
  }
281
284
  }
282
- std::vector<HistogramType> clustered_histograms;
283
- std::vector<int> histogram_symbols;
284
- // Block ids need to fit in one byte.
285
- static const size_t kMaxNumberOfBlockTypes = 256;
286
- ClusterHistograms(histograms, 1, static_cast<int>(histograms.size()),
287
- kMaxNumberOfBlockTypes,
288
- &clustered_histograms,
289
- &histogram_symbols);
290
- for (size_t i = 0; i < length; ++i) {
291
- block_ids[i] = static_cast<uint8_t>(histogram_symbols[block_index[i]]);
285
+ assert(block_idx == num_blocks);
286
+
287
+ const size_t expected_num_clusters =
288
+ kClustersPerBatch *
289
+ (num_blocks + kHistogramsPerBatch - 1) / kHistogramsPerBatch;
290
+ std::vector<HistogramType> all_histograms;
291
+ std::vector<uint32_t> cluster_size;
292
+ all_histograms.reserve(expected_num_clusters);
293
+ cluster_size.reserve(expected_num_clusters);
294
+ size_t num_clusters = 0;
295
+ std::vector<HistogramType> histograms(
296
+ std::min(num_blocks, kHistogramsPerBatch));
297
+ size_t max_num_pairs = kHistogramsPerBatch * kHistogramsPerBatch / 2;
298
+ std::vector<HistogramPair> pairs(max_num_pairs + 1);
299
+ size_t pos = 0;
300
+ for (size_t i = 0; i < num_blocks; i += kHistogramsPerBatch) {
301
+ const size_t num_to_combine = std::min(num_blocks - i, kHistogramsPerBatch);
302
+ uint32_t sizes[kHistogramsPerBatch];
303
+ uint32_t clusters[kHistogramsPerBatch];
304
+ uint32_t symbols[kHistogramsPerBatch];
305
+ uint32_t remap[kHistogramsPerBatch];
306
+ for (size_t j = 0; j < num_to_combine; ++j) {
307
+ histograms[j].Clear();
308
+ for (size_t k = 0; k < block_lengths[i + j]; ++k) {
309
+ histograms[j].Add(data[pos++]);
310
+ }
311
+ histograms[j].bit_cost_ = PopulationCost(histograms[j]);
312
+ symbols[j] = clusters[j] = static_cast<uint32_t>(j);
313
+ sizes[j] = 1;
314
+ }
315
+ size_t num_new_clusters = HistogramCombine(
316
+ &histograms[0], sizes, symbols, clusters, &pairs[0], num_to_combine,
317
+ num_to_combine, kHistogramsPerBatch, max_num_pairs);
318
+ for (size_t j = 0; j < num_new_clusters; ++j) {
319
+ all_histograms.push_back(histograms[clusters[j]]);
320
+ cluster_size.push_back(sizes[clusters[j]]);
321
+ remap[clusters[j]] = static_cast<uint32_t>(j);
322
+ }
323
+ for (size_t j = 0; j < num_to_combine; ++j) {
324
+ histogram_symbols[i + j] =
325
+ static_cast<uint32_t>(num_clusters) + remap[symbols[j]];
326
+ }
327
+ num_clusters += num_new_clusters;
328
+ assert(num_clusters == cluster_size.size());
329
+ assert(num_clusters == all_histograms.size());
292
330
  }
293
- }
294
331
 
295
- void BuildBlockSplit(const std::vector<uint8_t>& block_ids, BlockSplit* split) {
296
- int cur_id = block_ids[0];
297
- int cur_length = 1;
298
- split->num_types = -1;
299
- for (size_t i = 1; i < block_ids.size(); ++i) {
300
- if (block_ids[i] != cur_id) {
301
- split->types.push_back(cur_id);
302
- split->lengths.push_back(cur_length);
303
- split->num_types = std::max(split->num_types, cur_id);
304
- cur_id = block_ids[i];
332
+ max_num_pairs =
333
+ std::min(64 * num_clusters, (num_clusters / 2) * num_clusters);
334
+ pairs.resize(max_num_pairs + 1);
335
+
336
+ std::vector<uint32_t> clusters(num_clusters);
337
+ for (size_t i = 0; i < num_clusters; ++i) {
338
+ clusters[i] = static_cast<uint32_t>(i);
339
+ }
340
+ size_t num_final_clusters =
341
+ HistogramCombine(&all_histograms[0], &cluster_size[0],
342
+ &histogram_symbols[0],
343
+ &clusters[0], &pairs[0], num_clusters,
344
+ num_blocks, kMaxNumberOfBlockTypes, max_num_pairs);
345
+
346
+ static const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
347
+ std::vector<uint32_t> new_index(num_clusters, kInvalidIndex);
348
+ uint32_t next_index = 0;
349
+ pos = 0;
350
+ for (size_t i = 0; i < num_blocks; ++i) {
351
+ HistogramType histo;
352
+ for (size_t j = 0; j < block_lengths[i]; ++j) {
353
+ histo.Add(data[pos++]);
354
+ }
355
+ uint32_t best_out =
356
+ i == 0 ? histogram_symbols[0] : histogram_symbols[i - 1];
357
+ double best_bits = HistogramBitCostDistance(
358
+ histo, all_histograms[best_out]);
359
+ for (size_t j = 0; j < num_final_clusters; ++j) {
360
+ const double cur_bits = HistogramBitCostDistance(
361
+ histo, all_histograms[clusters[j]]);
362
+ if (cur_bits < best_bits) {
363
+ best_bits = cur_bits;
364
+ best_out = clusters[j];
365
+ }
366
+ }
367
+ histogram_symbols[i] = best_out;
368
+ if (new_index[best_out] == kInvalidIndex) {
369
+ new_index[best_out] = next_index++;
370
+ }
371
+ }
372
+ uint8_t max_type = 0;
373
+ uint32_t cur_length = 0;
374
+ block_idx = 0;
375
+ split->types.resize(num_blocks);
376
+ split->lengths.resize(num_blocks);
377
+ for (size_t i = 0; i < num_blocks; ++i) {
378
+ cur_length += block_lengths[i];
379
+ if (i + 1 == num_blocks ||
380
+ histogram_symbols[i] != histogram_symbols[i + 1]) {
381
+ const uint8_t id = static_cast<uint8_t>(new_index[histogram_symbols[i]]);
382
+ split->types[block_idx] = id;
383
+ split->lengths[block_idx] = cur_length;
384
+ max_type = std::max(max_type, id);
305
385
  cur_length = 0;
386
+ ++block_idx;
306
387
  }
307
- ++cur_length;
308
388
  }
309
- split->types.push_back(cur_id);
310
- split->lengths.push_back(cur_length);
311
- split->num_types = std::max(split->num_types, cur_id);
312
- ++split->num_types;
389
+ split->types.resize(block_idx);
390
+ split->lengths.resize(block_idx);
391
+ split->num_types = static_cast<size_t>(max_type) + 1;
313
392
  }
314
393
 
315
- template<typename HistogramType, typename DataType>
394
+ template<int kSize, typename DataType>
316
395
  void SplitByteVector(const std::vector<DataType>& data,
317
- const int literals_per_histogram,
318
- const int max_histograms,
319
- const int sampling_stride_length,
396
+ const size_t literals_per_histogram,
397
+ const size_t max_histograms,
398
+ const size_t sampling_stride_length,
320
399
  const double block_switch_cost,
321
400
  BlockSplit* split) {
322
401
  if (data.empty()) {
@@ -325,30 +404,47 @@ void SplitByteVector(const std::vector<DataType>& data,
325
404
  } else if (data.size() < kMinLengthForBlockSplitting) {
326
405
  split->num_types = 1;
327
406
  split->types.push_back(0);
328
- split->lengths.push_back(static_cast<int>(data.size()));
407
+ split->lengths.push_back(static_cast<uint32_t>(data.size()));
329
408
  return;
330
409
  }
331
- std::vector<HistogramType> histograms;
410
+ size_t num_histograms = data.size() / literals_per_histogram + 1;
411
+ if (num_histograms > max_histograms) {
412
+ num_histograms = max_histograms;
413
+ }
414
+ Histogram<kSize>* histograms = new Histogram<kSize>[num_histograms];
332
415
  // Find good entropy codes.
333
416
  InitialEntropyCodes(&data[0], data.size(),
334
- literals_per_histogram,
335
- max_histograms,
336
417
  sampling_stride_length,
337
- &histograms);
418
+ num_histograms, histograms);
338
419
  RefineEntropyCodes(&data[0], data.size(),
339
420
  sampling_stride_length,
340
- &histograms);
421
+ num_histograms, histograms);
341
422
  // Find a good path through literals with the good entropy codes.
342
423
  std::vector<uint8_t> block_ids(data.size());
343
- for (int i = 0; i < 10; ++i) {
344
- FindBlocks(&data[0], data.size(),
345
- block_switch_cost,
346
- histograms,
347
- &block_ids[0]);
348
- BuildBlockHistograms(&data[0], data.size(), &block_ids[0], &histograms);
424
+ size_t num_blocks;
425
+ const size_t bitmaplen = (num_histograms + 7) >> 3;
426
+ double* insert_cost = new double[kSize * num_histograms];
427
+ double *cost = new double[num_histograms];
428
+ uint8_t* switch_signal = new uint8_t[data.size() * bitmaplen];
429
+ uint16_t* new_id = new uint16_t[num_histograms];
430
+ for (size_t i = 0; i < 10; ++i) {
431
+ num_blocks = FindBlocks(&data[0], data.size(),
432
+ block_switch_cost,
433
+ num_histograms, histograms,
434
+ insert_cost, cost, switch_signal,
435
+ &block_ids[0]);
436
+ num_histograms = RemapBlockIds(&block_ids[0], data.size(),
437
+ new_id, num_histograms);
438
+ BuildBlockHistograms(&data[0], data.size(), &block_ids[0],
439
+ num_histograms, histograms);
349
440
  }
350
- ClusterBlocks<HistogramType>(&data[0], data.size(), &block_ids[0]);
351
- BuildBlockSplit(block_ids, split);
441
+ delete[] insert_cost;
442
+ delete[] cost;
443
+ delete[] switch_signal;
444
+ delete[] new_id;
445
+ delete[] histograms;
446
+ ClusterBlocks<Histogram<kSize> >(&data[0], data.size(), num_blocks,
447
+ &block_ids[0], split);
352
448
  }
353
449
 
354
450
  void SplitBlock(const Command* cmds,
@@ -359,55 +455,51 @@ void SplitBlock(const Command* cmds,
359
455
  BlockSplit* literal_split,
360
456
  BlockSplit* insert_and_copy_split,
361
457
  BlockSplit* dist_split) {
362
- // Create a continuous array of literals.
363
- std::vector<uint8_t> literals;
364
- CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, &literals);
365
-
366
- // Compute prefix codes for commands.
367
- std::vector<uint16_t> insert_and_copy_codes;
368
- std::vector<uint16_t> distance_prefixes;
369
- CopyCommandsToByteArray(cmds, num_commands,
370
- &insert_and_copy_codes,
371
- &distance_prefixes);
458
+ {
459
+ // Create a continuous array of literals.
460
+ std::vector<uint8_t> literals;
461
+ CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, &literals);
462
+ // Create the block split on the array of literals.
463
+ // Literal histograms have alphabet size 256.
464
+ SplitByteVector<256>(
465
+ literals,
466
+ kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
467
+ kLiteralStrideLength, kLiteralBlockSwitchCost,
468
+ literal_split);
469
+ }
372
470
 
373
- SplitByteVector<HistogramLiteral>(
374
- literals,
375
- kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
376
- kLiteralStrideLength, kLiteralBlockSwitchCost,
377
- literal_split);
378
- SplitByteVector<HistogramCommand>(
379
- insert_and_copy_codes,
380
- kSymbolsPerCommandHistogram, kMaxCommandHistograms,
381
- kCommandStrideLength, kCommandBlockSwitchCost,
382
- insert_and_copy_split);
383
- SplitByteVector<HistogramDistance>(
384
- distance_prefixes,
385
- kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
386
- kCommandStrideLength, kDistanceBlockSwitchCost,
387
- dist_split);
388
- }
471
+ {
472
+ // Compute prefix codes for commands.
473
+ std::vector<uint16_t> insert_and_copy_codes(num_commands);
474
+ for (size_t i = 0; i < num_commands; ++i) {
475
+ insert_and_copy_codes[i] = cmds[i].cmd_prefix_;
476
+ }
477
+ // Create the block split on the array of command prefixes.
478
+ SplitByteVector<kNumCommandPrefixes>(
479
+ insert_and_copy_codes,
480
+ kSymbolsPerCommandHistogram, kMaxCommandHistograms,
481
+ kCommandStrideLength, kCommandBlockSwitchCost,
482
+ insert_and_copy_split);
483
+ }
389
484
 
390
- void SplitBlockByTotalLength(const Command* all_commands,
391
- const size_t num_commands,
392
- int input_size,
393
- int target_length,
394
- std::vector<std::vector<Command> >* blocks) {
395
- int num_blocks = input_size / target_length + 1;
396
- int length_limit = input_size / num_blocks + 1;
397
- int total_length = 0;
398
- std::vector<Command> cur_block;
399
- for (size_t i = 0; i < num_commands; ++i) {
400
- const Command& cmd = all_commands[i];
401
- int cmd_length = cmd.insert_len_ + cmd.copy_len_;
402
- if (total_length > length_limit) {
403
- blocks->push_back(cur_block);
404
- cur_block.clear();
405
- total_length = 0;
485
+ {
486
+ // Create a continuous array of distance prefixes.
487
+ std::vector<uint16_t> distance_prefixes(num_commands);
488
+ size_t pos = 0;
489
+ for (size_t i = 0; i < num_commands; ++i) {
490
+ const Command& cmd = cmds[i];
491
+ if (cmd.copy_len() && cmd.cmd_prefix_ >= 128) {
492
+ distance_prefixes[pos++] = cmd.dist_prefix_;
493
+ }
406
494
  }
407
- cur_block.push_back(cmd);
408
- total_length += cmd_length;
495
+ distance_prefixes.resize(pos);
496
+ // Create the block split on the array of distance prefixes.
497
+ SplitByteVector<kNumDistancePrefixes>(
498
+ distance_prefixes,
499
+ kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
500
+ kCommandStrideLength, kDistanceBlockSwitchCost,
501
+ dist_split);
409
502
  }
410
- blocks->push_back(cur_block);
411
503
  }
412
504
 
413
505
  } // namespace brotli