extbrotli 0.0.1.PROTOTYPE

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +28 -0
  3. data/README.md +67 -0
  4. data/Rakefile +158 -0
  5. data/contrib/brotli/LICENSE +202 -0
  6. data/contrib/brotli/README.md +18 -0
  7. data/contrib/brotli/dec/bit_reader.c +55 -0
  8. data/contrib/brotli/dec/bit_reader.h +256 -0
  9. data/contrib/brotli/dec/context.h +260 -0
  10. data/contrib/brotli/dec/decode.c +1573 -0
  11. data/contrib/brotli/dec/decode.h +160 -0
  12. data/contrib/brotli/dec/dictionary.h +9494 -0
  13. data/contrib/brotli/dec/huffman.c +325 -0
  14. data/contrib/brotli/dec/huffman.h +77 -0
  15. data/contrib/brotli/dec/port.h +148 -0
  16. data/contrib/brotli/dec/prefix.h +756 -0
  17. data/contrib/brotli/dec/state.c +149 -0
  18. data/contrib/brotli/dec/state.h +185 -0
  19. data/contrib/brotli/dec/streams.c +99 -0
  20. data/contrib/brotli/dec/streams.h +100 -0
  21. data/contrib/brotli/dec/transform.h +315 -0
  22. data/contrib/brotli/dec/types.h +36 -0
  23. data/contrib/brotli/enc/backward_references.cc +769 -0
  24. data/contrib/brotli/enc/backward_references.h +50 -0
  25. data/contrib/brotli/enc/bit_cost.h +147 -0
  26. data/contrib/brotli/enc/block_splitter.cc +418 -0
  27. data/contrib/brotli/enc/block_splitter.h +78 -0
  28. data/contrib/brotli/enc/brotli_bit_stream.cc +884 -0
  29. data/contrib/brotli/enc/brotli_bit_stream.h +149 -0
  30. data/contrib/brotli/enc/cluster.h +290 -0
  31. data/contrib/brotli/enc/command.h +140 -0
  32. data/contrib/brotli/enc/context.h +185 -0
  33. data/contrib/brotli/enc/dictionary.h +9485 -0
  34. data/contrib/brotli/enc/dictionary_hash.h +4125 -0
  35. data/contrib/brotli/enc/encode.cc +715 -0
  36. data/contrib/brotli/enc/encode.h +196 -0
  37. data/contrib/brotli/enc/encode_parallel.cc +354 -0
  38. data/contrib/brotli/enc/encode_parallel.h +37 -0
  39. data/contrib/brotli/enc/entropy_encode.cc +492 -0
  40. data/contrib/brotli/enc/entropy_encode.h +88 -0
  41. data/contrib/brotli/enc/fast_log.h +179 -0
  42. data/contrib/brotli/enc/find_match_length.h +87 -0
  43. data/contrib/brotli/enc/hash.h +686 -0
  44. data/contrib/brotli/enc/histogram.cc +76 -0
  45. data/contrib/brotli/enc/histogram.h +100 -0
  46. data/contrib/brotli/enc/literal_cost.cc +172 -0
  47. data/contrib/brotli/enc/literal_cost.h +38 -0
  48. data/contrib/brotli/enc/metablock.cc +544 -0
  49. data/contrib/brotli/enc/metablock.h +88 -0
  50. data/contrib/brotli/enc/port.h +151 -0
  51. data/contrib/brotli/enc/prefix.h +85 -0
  52. data/contrib/brotli/enc/ringbuffer.h +108 -0
  53. data/contrib/brotli/enc/static_dict.cc +441 -0
  54. data/contrib/brotli/enc/static_dict.h +40 -0
  55. data/contrib/brotli/enc/static_dict_lut.h +12063 -0
  56. data/contrib/brotli/enc/streams.cc +127 -0
  57. data/contrib/brotli/enc/streams.h +129 -0
  58. data/contrib/brotli/enc/transform.h +250 -0
  59. data/contrib/brotli/enc/write_bits.h +91 -0
  60. data/ext/extbrotli.cc +24 -0
  61. data/ext/extbrotli.h +73 -0
  62. data/ext/extconf.rb +35 -0
  63. data/ext/lldecoder.c +220 -0
  64. data/ext/llencoder.cc +433 -0
  65. data/gemstub.rb +21 -0
  66. data/lib/extbrotli.rb +243 -0
  67. data/lib/extbrotli/version.rb +3 -0
  68. metadata +140 -0
@@ -0,0 +1,50 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Function to find backward reference copies.
16
+
17
+ #ifndef BROTLI_ENC_BACKWARD_REFERENCES_H_
18
+ #define BROTLI_ENC_BACKWARD_REFERENCES_H_
19
+
20
+ #include <stdint.h>
21
+ #include <vector>
22
+
23
+ #include "./hash.h"
24
+ #include "./command.h"
25
+
26
+ namespace brotli {
27
+
28
+ // "commands" points to the next output command to write to, "*num_commands" is
29
+ // initially the total amount of commands output by previous
30
+ // CreateBackwardReferences calls, and must be incremented by the amount written
31
+ // by this call.
32
+ void CreateBackwardReferences(size_t num_bytes,
33
+ size_t position,
34
+ const uint8_t* ringbuffer,
35
+ size_t ringbuffer_mask,
36
+ const float* literal_cost,
37
+ size_t literal_cost_mask,
38
+ const size_t max_backward_limit,
39
+ const int quality,
40
+ Hashers* hashers,
41
+ int hash_type,
42
+ int* dist_cache,
43
+ int* last_insert_len,
44
+ Command* commands,
45
+ int* num_commands,
46
+ int* num_literals);
47
+
48
+ } // namespace brotli
49
+
50
+ #endif // BROTLI_ENC_BACKWARD_REFERENCES_H_
@@ -0,0 +1,147 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Functions to estimate the bit cost of Huffman trees.
16
+
17
+ #ifndef BROTLI_ENC_BIT_COST_H_
18
+ #define BROTLI_ENC_BIT_COST_H_
19
+
20
+
21
+ #include <stdint.h>
22
+
23
+ #include "./entropy_encode.h"
24
+ #include "./fast_log.h"
25
+
26
+ namespace brotli {
27
+
28
+ static inline double ShannonEntropy(const int *population, int size,
29
+ int *total) {
30
+ int sum = 0;
31
+ double retval = 0;
32
+ const int *population_end = population + size;
33
+ int p;
34
+ if (size & 1) {
35
+ goto odd_number_of_elements_left;
36
+ }
37
+ while (population < population_end) {
38
+ p = *population++;
39
+ sum += p;
40
+ retval -= p * FastLog2(p);
41
+ odd_number_of_elements_left:
42
+ p = *population++;
43
+ sum += p;
44
+ retval -= p * FastLog2(p);
45
+ }
46
+ if (sum) retval += sum * FastLog2(sum);
47
+ *total = sum;
48
+ return retval;
49
+ }
50
+
51
+ static inline double BitsEntropy(const int *population, int size) {
52
+ int sum;
53
+ double retval = ShannonEntropy(population, size, &sum);
54
+ if (retval < sum) {
55
+ // At least one bit per literal is needed.
56
+ retval = sum;
57
+ }
58
+ return retval;
59
+ }
60
+
61
+
62
+ template<int kSize>
63
+ double PopulationCost(const Histogram<kSize>& histogram) {
64
+ if (histogram.total_count_ == 0) {
65
+ return 12;
66
+ }
67
+ int count = 0;
68
+ for (int i = 0; i < kSize; ++i) {
69
+ if (histogram.data_[i] > 0) {
70
+ ++count;
71
+ }
72
+ }
73
+ if (count == 1) {
74
+ return 12;
75
+ }
76
+ if (count == 2) {
77
+ return 20 + histogram.total_count_;
78
+ }
79
+ double bits = 0;
80
+ uint8_t depth[kSize] = { 0 };
81
+ if (count <= 4) {
82
+ // For very low symbol count we build the Huffman tree.
83
+ CreateHuffmanTree(&histogram.data_[0], kSize, 15, depth);
84
+ for (int i = 0; i < kSize; ++i) {
85
+ bits += histogram.data_[i] * depth[i];
86
+ }
87
+ return count == 3 ? bits + 28 : bits + 37;
88
+ }
89
+
90
+ // In this loop we compute the entropy of the histogram and simultaneously
91
+ // build a simplified histogram of the code length codes where we use the
92
+ // zero repeat code 17, but we don't use the non-zero repeat code 16.
93
+ int max_depth = 1;
94
+ int depth_histo[kCodeLengthCodes] = { 0 };
95
+ const double log2total = FastLog2(histogram.total_count_);
96
+ for (int i = 0; i < kSize;) {
97
+ if (histogram.data_[i] > 0) {
98
+ // Compute -log2(P(symbol)) = -log2(count(symbol)/total_count) =
99
+ // = log2(total_count) - log2(count(symbol))
100
+ double log2p = log2total - FastLog2(histogram.data_[i]);
101
+ // Approximate the bit depth by round(-log2(P(symbol)))
102
+ int depth = static_cast<int>(log2p + 0.5);
103
+ bits += histogram.data_[i] * log2p;
104
+ if (depth > 15) {
105
+ depth = 15;
106
+ }
107
+ if (depth > max_depth) {
108
+ max_depth = depth;
109
+ }
110
+ ++depth_histo[depth];
111
+ ++i;
112
+ } else {
113
+ // Compute the run length of zeros and add the appropiate number of 0 and
114
+ // 17 code length codes to the code length code histogram.
115
+ int reps = 1;
116
+ for (int k = i + 1; k < kSize && histogram.data_[k] == 0; ++k) {
117
+ ++reps;
118
+ }
119
+ i += reps;
120
+ if (i == kSize) {
121
+ // Don't add any cost for the last zero run, since these are encoded
122
+ // only implicitly.
123
+ break;
124
+ }
125
+ if (reps < 3) {
126
+ depth_histo[0] += reps;
127
+ } else {
128
+ reps -= 2;
129
+ while (reps > 0) {
130
+ ++depth_histo[17];
131
+ // Add the 3 extra bits for the 17 code length code.
132
+ bits += 3;
133
+ reps >>= 3;
134
+ }
135
+ }
136
+ }
137
+ }
138
+ // Add the estimated encoding cost of the code length code histogram.
139
+ bits += 18 + 2 * max_depth;
140
+ // Add the entropy of the code length code histogram.
141
+ bits += BitsEntropy(depth_histo, kCodeLengthCodes);
142
+ return bits;
143
+ }
144
+
145
+ } // namespace brotli
146
+
147
+ #endif // BROTLI_ENC_BIT_COST_H_
@@ -0,0 +1,418 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Block split point selection utilities.
16
+
17
+ #include "./block_splitter.h"
18
+
19
+ #include <math.h>
20
+ #include <stdio.h>
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+
24
+ #include <algorithm>
25
+ #include <map>
26
+
27
+ #include "./cluster.h"
28
+ #include "./command.h"
29
+ #include "./fast_log.h"
30
+ #include "./histogram.h"
31
+
32
+ namespace brotli {
33
+
34
// Upper bounds on the number of histograms passed to SplitByteVector.
static const int kMaxLiteralHistograms = 100;
static const int kMaxCommandHistograms = 50;

// Symbols-per-histogram values passed to SplitByteVector for each stream.
static const int kSymbolsPerLiteralHistogram = 544;
static const int kSymbolsPerCommandHistogram = 530;
static const int kSymbolsPerDistanceHistogram = 544;

// Sampling stride lengths passed to SplitByteVector.
static const int kLiteralStrideLength = 70;
static const int kCommandStrideLength = 40;

// Block switch costs passed to SplitByteVector for each stream.
static const double kLiteralBlockSwitchCost = 28.1;
static const double kCommandBlockSwitchCost = 13.5;
static const double kDistanceBlockSwitchCost = 14.6;

// Inputs shorter than this are emitted as a single block by SplitByteVector.
static const int kMinLengthForBlockSplitting = 128;

// RefineEntropyCodes runs about
// kIterMulForRefining * length / stride + kMinItersForRefining iterations.
static const int kIterMulForRefining = 2;
static const int kMinItersForRefining = 100;
47
+
48
+ void CopyLiteralsToByteArray(const Command* cmds,
49
+ const size_t num_commands,
50
+ const uint8_t* data,
51
+ const size_t offset,
52
+ const size_t mask,
53
+ std::vector<uint8_t>* literals) {
54
+ // Count how many we have.
55
+ size_t total_length = 0;
56
+ for (int i = 0; i < num_commands; ++i) {
57
+ total_length += cmds[i].insert_len_;
58
+ }
59
+ if (total_length == 0) {
60
+ return;
61
+ }
62
+
63
+ // Allocate.
64
+ literals->resize(total_length);
65
+
66
+ // Loop again, and copy this time.
67
+ size_t pos = 0;
68
+ size_t from_pos = offset & mask;
69
+ for (int i = 0; i < num_commands && pos < total_length; ++i) {
70
+ size_t insert_len = cmds[i].insert_len_;
71
+ if (from_pos + insert_len > mask) {
72
+ size_t head_size = mask + 1 - from_pos;
73
+ memcpy(&(*literals)[pos], data + from_pos, head_size);
74
+ from_pos = 0;
75
+ pos += head_size;
76
+ insert_len -= head_size;
77
+ }
78
+ if (insert_len > 0) {
79
+ memcpy(&(*literals)[pos], data + from_pos, insert_len);
80
+ pos += insert_len;
81
+ }
82
+ from_pos = (from_pos + insert_len + cmds[i].copy_len_) & mask;
83
+ }
84
+ }
85
+
86
+ void CopyCommandsToByteArray(const Command* cmds,
87
+ const size_t num_commands,
88
+ std::vector<uint16_t>* insert_and_copy_codes,
89
+ std::vector<uint16_t>* distance_prefixes) {
90
+ for (int i = 0; i < num_commands; ++i) {
91
+ const Command& cmd = cmds[i];
92
+ insert_and_copy_codes->push_back(cmd.cmd_prefix_);
93
+ if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
94
+ distance_prefixes->push_back(cmd.dist_prefix_);
95
+ }
96
+ }
97
+ }
98
+
99
// Advances *seed by multiplying it with 16807 (unsigned wrap-around) and
// returns the new value. A result of zero is replaced by 1, so the state can
// never get stuck at zero.
inline static unsigned int MyRand(unsigned int* seed) {
  unsigned int next = *seed * 16807U;
  if (next == 0) {
    next = 1;
  }
  *seed = next;
  return next;
}
106
+
107
+ template<typename HistogramType, typename DataType>
108
+ void InitialEntropyCodes(const DataType* data, size_t length,
109
+ int literals_per_histogram,
110
+ int max_histograms,
111
+ size_t stride,
112
+ std::vector<HistogramType>* vec) {
113
+ int total_histograms = length / literals_per_histogram + 1;
114
+ if (total_histograms > max_histograms) {
115
+ total_histograms = max_histograms;
116
+ }
117
+ unsigned int seed = 7;
118
+ int block_length = length / total_histograms;
119
+ for (int i = 0; i < total_histograms; ++i) {
120
+ int pos = length * i / total_histograms;
121
+ if (i != 0) {
122
+ pos += MyRand(&seed) % block_length;
123
+ }
124
+ if (pos + stride >= length) {
125
+ pos = length - stride - 1;
126
+ }
127
+ HistogramType histo;
128
+ histo.Add(data + pos, stride);
129
+ vec->push_back(histo);
130
+ }
131
+ }
132
+
133
+ template<typename HistogramType, typename DataType>
134
+ void RandomSample(unsigned int* seed,
135
+ const DataType* data,
136
+ size_t length,
137
+ size_t stride,
138
+ HistogramType* sample) {
139
+ size_t pos = 0;
140
+ if (stride >= length) {
141
+ pos = 0;
142
+ stride = length;
143
+ } else {
144
+ pos = MyRand(seed) % (length - stride + 1);
145
+ }
146
+ sample->Add(data + pos, stride);
147
+ }
148
+
149
+ template<typename HistogramType, typename DataType>
150
+ void RefineEntropyCodes(const DataType* data, size_t length,
151
+ size_t stride,
152
+ std::vector<HistogramType>* vec) {
153
+ int iters =
154
+ kIterMulForRefining * length / stride + kMinItersForRefining;
155
+ unsigned int seed = 7;
156
+ iters = ((iters + vec->size() - 1) / vec->size()) * vec->size();
157
+ for (int iter = 0; iter < iters; ++iter) {
158
+ HistogramType sample;
159
+ RandomSample(&seed, data, length, stride, &sample);
160
+ int ix = iter % vec->size();
161
+ (*vec)[ix].AddHistogram(sample);
162
+ }
163
+ }
164
+
165
+ inline static float BitCost(int count) {
166
+ return count == 0 ? -2 : FastLog2(count);
167
+ }
168
+
169
+ template<typename DataType, int kSize>
170
+ void FindBlocks(const DataType* data, const size_t length,
171
+ const double block_switch_bitcost,
172
+ const std::vector<Histogram<kSize> > &vec,
173
+ uint8_t *block_id) {
174
+ if (vec.size() <= 1) {
175
+ for (int i = 0; i < length; ++i) {
176
+ block_id[i] = 0;
177
+ }
178
+ return;
179
+ }
180
+ int vecsize = vec.size();
181
+ double* insert_cost = new double[kSize * vecsize];
182
+ memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * vecsize);
183
+ for (int j = 0; j < vecsize; ++j) {
184
+ insert_cost[j] = FastLog2(vec[j].total_count_);
185
+ }
186
+ for (int i = kSize - 1; i >= 0; --i) {
187
+ for (int j = 0; j < vecsize; ++j) {
188
+ insert_cost[i * vecsize + j] = insert_cost[j] - BitCost(vec[j].data_[i]);
189
+ }
190
+ }
191
+ double *cost = new double[vecsize];
192
+ memset(cost, 0, sizeof(cost[0]) * vecsize);
193
+ bool* switch_signal = new bool[length * vecsize];
194
+ memset(switch_signal, 0, sizeof(switch_signal[0]) * length * vecsize);
195
+ // After each iteration of this loop, cost[k] will contain the difference
196
+ // between the minimum cost of arriving at the current byte position using
197
+ // entropy code k, and the minimum cost of arriving at the current byte
198
+ // position. This difference is capped at the block switch cost, and if it
199
+ // reaches block switch cost, it means that when we trace back from the last
200
+ // position, we need to switch here.
201
+ for (size_t byte_ix = 0; byte_ix < length; ++byte_ix) {
202
+ int ix = byte_ix * vecsize;
203
+ int insert_cost_ix = data[byte_ix] * vecsize;
204
+ double min_cost = 1e99;
205
+ for (int k = 0; k < vecsize; ++k) {
206
+ // We are coding the symbol in data[byte_ix] with entropy code k.
207
+ cost[k] += insert_cost[insert_cost_ix + k];
208
+ if (cost[k] < min_cost) {
209
+ min_cost = cost[k];
210
+ block_id[byte_ix] = k;
211
+ }
212
+ }
213
+ double block_switch_cost = block_switch_bitcost;
214
+ // More blocks for the beginning.
215
+ if (byte_ix < 2000) {
216
+ block_switch_cost *= 0.77 + 0.07 * byte_ix / 2000;
217
+ }
218
+ for (int k = 0; k < vecsize; ++k) {
219
+ cost[k] -= min_cost;
220
+ if (cost[k] >= block_switch_cost) {
221
+ cost[k] = block_switch_cost;
222
+ switch_signal[ix + k] = true;
223
+ }
224
+ }
225
+ }
226
+ // Now trace back from the last position and switch at the marked places.
227
+ int byte_ix = length - 1;
228
+ int ix = byte_ix * vecsize;
229
+ int cur_id = block_id[byte_ix];
230
+ while (byte_ix > 0) {
231
+ --byte_ix;
232
+ ix -= vecsize;
233
+ if (switch_signal[ix + cur_id]) {
234
+ cur_id = block_id[byte_ix];
235
+ }
236
+ block_id[byte_ix] = cur_id;
237
+ }
238
+ delete[] insert_cost;
239
+ delete[] cost;
240
+ delete[] switch_signal;
241
+ }
242
+
243
// Renumbers the ids in block_ids[0 .. length - 1] in place so that they are
// consecutive, starting at 0, in order of first appearance.
// Returns the number of distinct ids found.
int RemapBlockIds(uint8_t* block_ids, const size_t length) {
  // An id is a single byte, so a 256-entry table covers every possible value;
  // -1 marks an id that has not been seen yet.
  int new_id[256];
  for (int id = 0; id < 256; ++id) {
    new_id[id] = -1;
  }
  int next_id = 0;
  // size_t index: same type as `length`, so no signed/unsigned comparison.
  for (size_t i = 0; i < length; ++i) {
    if (new_id[block_ids[i]] < 0) {
      new_id[block_ids[i]] = next_id;
      ++next_id;
    }
  }
  for (size_t i = 0; i < length; ++i) {
    block_ids[i] = static_cast<uint8_t>(new_id[block_ids[i]]);
  }
  return next_id;
}
257
+
258
+ template<typename HistogramType, typename DataType>
259
+ void BuildBlockHistograms(const DataType* data, const size_t length,
260
+ uint8_t* block_ids,
261
+ std::vector<HistogramType>* histograms) {
262
+ int num_types = RemapBlockIds(block_ids, length);
263
+ histograms->clear();
264
+ histograms->resize(num_types);
265
+ for (int i = 0; i < length; ++i) {
266
+ (*histograms)[block_ids[i]].Add(data[i]);
267
+ }
268
+ }
269
+
270
// Re-labels block_ids[0 .. length - 1] by clustering.
//
// The input is cut into runs of equal block id; one histogram is built per
// run. ClusterHistograms then groups those histograms into at most 256
// clusters, and every position receives the cluster symbol of its run.
template<typename HistogramType, typename DataType>
void ClusterBlocks(const DataType* data, const size_t length,
                   uint8_t* block_ids) {
  std::vector<HistogramType> histograms;
  // block_index[i]: index of the run (and of its histogram) containing i.
  std::vector<int> block_index(length);
  int cur_idx = 0;
  HistogramType cur_histogram;
  // size_t index: same type as `length`, so no signed/unsigned comparison.
  for (size_t i = 0; i < length; ++i) {
    // A run ends at the last position or where the next id differs; the
    // first test keeps block_ids[i + 1] from being read past the end.
    const bool block_boundary =
        (i + 1 == length || block_ids[i] != block_ids[i + 1]);
    block_index[i] = cur_idx;
    cur_histogram.Add(data[i]);
    if (block_boundary) {
      histograms.push_back(cur_histogram);
      cur_histogram.Clear();
      ++cur_idx;
    }
  }
  std::vector<HistogramType> clustered_histograms;
  std::vector<int> histogram_symbols;
  // Block ids need to fit in one byte.
  static const int kMaxNumberOfBlockTypes = 256;
  ClusterHistograms(histograms, 1, histograms.size(),
                    kMaxNumberOfBlockTypes,
                    &clustered_histograms,
                    &histogram_symbols);
  for (size_t i = 0; i < length; ++i) {
    block_ids[i] = histogram_symbols[block_index[i]];
  }
}
299
+
300
+ void BuildBlockSplit(const std::vector<uint8_t>& block_ids, BlockSplit* split) {
301
+ int cur_id = block_ids[0];
302
+ int cur_length = 1;
303
+ split->num_types = -1;
304
+ for (int i = 1; i < block_ids.size(); ++i) {
305
+ if (block_ids[i] != cur_id) {
306
+ split->types.push_back(cur_id);
307
+ split->lengths.push_back(cur_length);
308
+ split->num_types = std::max(split->num_types, cur_id);
309
+ cur_id = block_ids[i];
310
+ cur_length = 0;
311
+ }
312
+ ++cur_length;
313
+ }
314
+ split->types.push_back(cur_id);
315
+ split->lengths.push_back(cur_length);
316
+ split->num_types = std::max(split->num_types, cur_id);
317
+ ++split->num_types;
318
+ }
319
+
320
+ template<typename HistogramType, typename DataType>
321
+ void SplitByteVector(const std::vector<DataType>& data,
322
+ const int literals_per_histogram,
323
+ const int max_histograms,
324
+ const int sampling_stride_length,
325
+ const double block_switch_cost,
326
+ BlockSplit* split) {
327
+ if (data.empty()) {
328
+ split->num_types = 1;
329
+ return;
330
+ } else if (data.size() < kMinLengthForBlockSplitting) {
331
+ split->num_types = 1;
332
+ split->types.push_back(0);
333
+ split->lengths.push_back(data.size());
334
+ return;
335
+ }
336
+ std::vector<HistogramType> histograms;
337
+ // Find good entropy codes.
338
+ InitialEntropyCodes(data.data(), data.size(),
339
+ literals_per_histogram,
340
+ max_histograms,
341
+ sampling_stride_length,
342
+ &histograms);
343
+ RefineEntropyCodes(data.data(), data.size(),
344
+ sampling_stride_length,
345
+ &histograms);
346
+ // Find a good path through literals with the good entropy codes.
347
+ std::vector<uint8_t> block_ids(data.size());
348
+ for (int i = 0; i < 10; ++i) {
349
+ FindBlocks(data.data(), data.size(),
350
+ block_switch_cost,
351
+ histograms,
352
+ &block_ids[0]);
353
+ BuildBlockHistograms(data.data(), data.size(), &block_ids[0], &histograms);
354
+ }
355
+ ClusterBlocks<HistogramType>(data.data(), data.size(), &block_ids[0]);
356
+ BuildBlockSplit(block_ids, split);
357
+ }
358
+
359
+ void SplitBlock(const Command* cmds,
360
+ const size_t num_commands,
361
+ const uint8_t* data,
362
+ const size_t pos,
363
+ const size_t mask,
364
+ BlockSplit* literal_split,
365
+ BlockSplit* insert_and_copy_split,
366
+ BlockSplit* dist_split) {
367
+ // Create a continuous array of literals.
368
+ std::vector<uint8_t> literals;
369
+ CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, &literals);
370
+
371
+ // Compute prefix codes for commands.
372
+ std::vector<uint16_t> insert_and_copy_codes;
373
+ std::vector<uint16_t> distance_prefixes;
374
+ CopyCommandsToByteArray(cmds, num_commands,
375
+ &insert_and_copy_codes,
376
+ &distance_prefixes);
377
+
378
+ SplitByteVector<HistogramLiteral>(
379
+ literals,
380
+ kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
381
+ kLiteralStrideLength, kLiteralBlockSwitchCost,
382
+ literal_split);
383
+ SplitByteVector<HistogramCommand>(
384
+ insert_and_copy_codes,
385
+ kSymbolsPerCommandHistogram, kMaxCommandHistograms,
386
+ kCommandStrideLength, kCommandBlockSwitchCost,
387
+ insert_and_copy_split);
388
+ SplitByteVector<HistogramDistance>(
389
+ distance_prefixes,
390
+ kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
391
+ kCommandStrideLength, kDistanceBlockSwitchCost,
392
+ dist_split);
393
+ }
394
+
395
+ void SplitBlockByTotalLength(const Command* all_commands,
396
+ const size_t num_commands,
397
+ int input_size,
398
+ int target_length,
399
+ std::vector<std::vector<Command> >* blocks) {
400
+ int num_blocks = input_size / target_length + 1;
401
+ int length_limit = input_size / num_blocks + 1;
402
+ int total_length = 0;
403
+ std::vector<Command> cur_block;
404
+ for (int i = 0; i < num_commands; ++i) {
405
+ const Command& cmd = all_commands[i];
406
+ int cmd_length = cmd.insert_len_ + cmd.copy_len_;
407
+ if (total_length > length_limit) {
408
+ blocks->push_back(cur_block);
409
+ cur_block.clear();
410
+ total_length = 0;
411
+ }
412
+ cur_block.push_back(cmd);
413
+ total_length += cmd_length;
414
+ }
415
+ blocks->push_back(cur_block);
416
+ }
417
+
418
+ } // namespace brotli