extbrotli 0.0.1.PROTOTYPE

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +28 -0
  3. data/README.md +67 -0
  4. data/Rakefile +158 -0
  5. data/contrib/brotli/LICENSE +202 -0
  6. data/contrib/brotli/README.md +18 -0
  7. data/contrib/brotli/dec/bit_reader.c +55 -0
  8. data/contrib/brotli/dec/bit_reader.h +256 -0
  9. data/contrib/brotli/dec/context.h +260 -0
  10. data/contrib/brotli/dec/decode.c +1573 -0
  11. data/contrib/brotli/dec/decode.h +160 -0
  12. data/contrib/brotli/dec/dictionary.h +9494 -0
  13. data/contrib/brotli/dec/huffman.c +325 -0
  14. data/contrib/brotli/dec/huffman.h +77 -0
  15. data/contrib/brotli/dec/port.h +148 -0
  16. data/contrib/brotli/dec/prefix.h +756 -0
  17. data/contrib/brotli/dec/state.c +149 -0
  18. data/contrib/brotli/dec/state.h +185 -0
  19. data/contrib/brotli/dec/streams.c +99 -0
  20. data/contrib/brotli/dec/streams.h +100 -0
  21. data/contrib/brotli/dec/transform.h +315 -0
  22. data/contrib/brotli/dec/types.h +36 -0
  23. data/contrib/brotli/enc/backward_references.cc +769 -0
  24. data/contrib/brotli/enc/backward_references.h +50 -0
  25. data/contrib/brotli/enc/bit_cost.h +147 -0
  26. data/contrib/brotli/enc/block_splitter.cc +418 -0
  27. data/contrib/brotli/enc/block_splitter.h +78 -0
  28. data/contrib/brotli/enc/brotli_bit_stream.cc +884 -0
  29. data/contrib/brotli/enc/brotli_bit_stream.h +149 -0
  30. data/contrib/brotli/enc/cluster.h +290 -0
  31. data/contrib/brotli/enc/command.h +140 -0
  32. data/contrib/brotli/enc/context.h +185 -0
  33. data/contrib/brotli/enc/dictionary.h +9485 -0
  34. data/contrib/brotli/enc/dictionary_hash.h +4125 -0
  35. data/contrib/brotli/enc/encode.cc +715 -0
  36. data/contrib/brotli/enc/encode.h +196 -0
  37. data/contrib/brotli/enc/encode_parallel.cc +354 -0
  38. data/contrib/brotli/enc/encode_parallel.h +37 -0
  39. data/contrib/brotli/enc/entropy_encode.cc +492 -0
  40. data/contrib/brotli/enc/entropy_encode.h +88 -0
  41. data/contrib/brotli/enc/fast_log.h +179 -0
  42. data/contrib/brotli/enc/find_match_length.h +87 -0
  43. data/contrib/brotli/enc/hash.h +686 -0
  44. data/contrib/brotli/enc/histogram.cc +76 -0
  45. data/contrib/brotli/enc/histogram.h +100 -0
  46. data/contrib/brotli/enc/literal_cost.cc +172 -0
  47. data/contrib/brotli/enc/literal_cost.h +38 -0
  48. data/contrib/brotli/enc/metablock.cc +544 -0
  49. data/contrib/brotli/enc/metablock.h +88 -0
  50. data/contrib/brotli/enc/port.h +151 -0
  51. data/contrib/brotli/enc/prefix.h +85 -0
  52. data/contrib/brotli/enc/ringbuffer.h +108 -0
  53. data/contrib/brotli/enc/static_dict.cc +441 -0
  54. data/contrib/brotli/enc/static_dict.h +40 -0
  55. data/contrib/brotli/enc/static_dict_lut.h +12063 -0
  56. data/contrib/brotli/enc/streams.cc +127 -0
  57. data/contrib/brotli/enc/streams.h +129 -0
  58. data/contrib/brotli/enc/transform.h +250 -0
  59. data/contrib/brotli/enc/write_bits.h +91 -0
  60. data/ext/extbrotli.cc +24 -0
  61. data/ext/extbrotli.h +73 -0
  62. data/ext/extconf.rb +35 -0
  63. data/ext/lldecoder.c +220 -0
  64. data/ext/llencoder.cc +433 -0
  65. data/gemstub.rb +21 -0
  66. data/lib/extbrotli.rb +243 -0
  67. data/lib/extbrotli/version.rb +3 -0
  68. metadata +140 -0
@@ -0,0 +1,50 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Function to find backward reference copies.
16
+
17
+ #ifndef BROTLI_ENC_BACKWARD_REFERENCES_H_
18
+ #define BROTLI_ENC_BACKWARD_REFERENCES_H_
19
+
20
+ #include <stdint.h>
21
+ #include <vector>
22
+
23
+ #include "./hash.h"
24
+ #include "./command.h"
25
+
26
+ namespace brotli {
27
+
28
+ // Declaration only (no definition in this header). "commands" points to the
29
+ // next output command to write to. "*num_commands" is initially the total
30
+ // number of commands output by previous CreateBackwardReferences calls, and
31
+ // must be incremented by the number of commands written by this call.
32
+ void CreateBackwardReferences(size_t num_bytes,  // NOTE(review): presumably the number of input bytes to process - confirm
33
+ size_t position,  // NOTE(review): presumably the start position of the input in the ring buffer - confirm
34
+ const uint8_t* ringbuffer,
35
+ size_t ringbuffer_mask,  // NOTE(review): presumably (ring buffer size - 1), used to wrap positions - confirm
36
+ const float* literal_cost,
37
+ size_t literal_cost_mask,
38
+ const size_t max_backward_limit,
39
+ const int quality,
40
+ Hashers* hashers,  // Hashers is declared in ./hash.h
41
+ int hash_type,
42
+ int* dist_cache,
43
+ int* last_insert_len,
44
+ Command* commands,  // next output slot, see the comment above; Command is declared in ./command.h
45
+ int* num_commands,  // in/out running command count, see the comment above
46
+ int* num_literals);  // NOTE(review): presumably an in/out literal counter like num_commands - confirm
47
+
48
+ } // namespace brotli
49
+
50
+ #endif // BROTLI_ENC_BACKWARD_REFERENCES_H_
@@ -0,0 +1,147 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Functions to estimate the bit cost of Huffman trees.
16
+
17
+ #ifndef BROTLI_ENC_BIT_COST_H_
18
+ #define BROTLI_ENC_BIT_COST_H_
19
+
20
+
21
+ #include <stdint.h>
22
+
23
+ #include "./entropy_encode.h"
24
+ #include "./fast_log.h"
25
+
26
+ namespace brotli {
27
+
28
+ static inline double ShannonEntropy(const int *population, int size,
29
+ int *total) {
30
+ int sum = 0;
31
+ double retval = 0;
32
+ const int *population_end = population + size;
33
+ int p;
34
+ if (size & 1) {
35
+ goto odd_number_of_elements_left;
36
+ }
37
+ while (population < population_end) {
38
+ p = *population++;
39
+ sum += p;
40
+ retval -= p * FastLog2(p);
41
+ odd_number_of_elements_left:
42
+ p = *population++;
43
+ sum += p;
44
+ retval -= p * FastLog2(p);
45
+ }
46
+ if (sum) retval += sum * FastLog2(sum);
47
+ *total = sum;
48
+ return retval;
49
+ }
50
+
51
+ static inline double BitsEntropy(const int *population, int size) {
52
+ int sum;
53
+ double retval = ShannonEntropy(population, size, &sum);
54
+ if (retval < sum) {
55
+ // At least one bit per literal is needed.
56
+ retval = sum;
57
+ }
58
+ return retval;
59
+ }
60
+
61
+
62
+ template<int kSize>
63
+ double PopulationCost(const Histogram<kSize>& histogram) {
64
+ if (histogram.total_count_ == 0) {
65
+ return 12;
66
+ }
67
+ int count = 0;
68
+ for (int i = 0; i < kSize; ++i) {
69
+ if (histogram.data_[i] > 0) {
70
+ ++count;
71
+ }
72
+ }
73
+ if (count == 1) {
74
+ return 12;
75
+ }
76
+ if (count == 2) {
77
+ return 20 + histogram.total_count_;
78
+ }
79
+ double bits = 0;
80
+ uint8_t depth[kSize] = { 0 };
81
+ if (count <= 4) {
82
+ // For very low symbol count we build the Huffman tree.
83
+ CreateHuffmanTree(&histogram.data_[0], kSize, 15, depth);
84
+ for (int i = 0; i < kSize; ++i) {
85
+ bits += histogram.data_[i] * depth[i];
86
+ }
87
+ return count == 3 ? bits + 28 : bits + 37;
88
+ }
89
+
90
+ // In this loop we compute the entropy of the histogram and simultaneously
91
+ // build a simplified histogram of the code length codes where we use the
92
+ // zero repeat code 17, but we don't use the non-zero repeat code 16.
93
+ int max_depth = 1;
94
+ int depth_histo[kCodeLengthCodes] = { 0 };
95
+ const double log2total = FastLog2(histogram.total_count_);
96
+ for (int i = 0; i < kSize;) {
97
+ if (histogram.data_[i] > 0) {
98
+ // Compute -log2(P(symbol)) = -log2(count(symbol)/total_count) =
99
+ // = log2(total_count) - log2(count(symbol))
100
+ double log2p = log2total - FastLog2(histogram.data_[i]);
101
+ // Approximate the bit depth by round(-log2(P(symbol)))
102
+ int depth = static_cast<int>(log2p + 0.5);
103
+ bits += histogram.data_[i] * log2p;
104
+ if (depth > 15) {
105
+ depth = 15;
106
+ }
107
+ if (depth > max_depth) {
108
+ max_depth = depth;
109
+ }
110
+ ++depth_histo[depth];
111
+ ++i;
112
+ } else {
113
+ // Compute the run length of zeros and add the appropiate number of 0 and
114
+ // 17 code length codes to the code length code histogram.
115
+ int reps = 1;
116
+ for (int k = i + 1; k < kSize && histogram.data_[k] == 0; ++k) {
117
+ ++reps;
118
+ }
119
+ i += reps;
120
+ if (i == kSize) {
121
+ // Don't add any cost for the last zero run, since these are encoded
122
+ // only implicitly.
123
+ break;
124
+ }
125
+ if (reps < 3) {
126
+ depth_histo[0] += reps;
127
+ } else {
128
+ reps -= 2;
129
+ while (reps > 0) {
130
+ ++depth_histo[17];
131
+ // Add the 3 extra bits for the 17 code length code.
132
+ bits += 3;
133
+ reps >>= 3;
134
+ }
135
+ }
136
+ }
137
+ }
138
+ // Add the estimated encoding cost of the code length code histogram.
139
+ bits += 18 + 2 * max_depth;
140
+ // Add the entropy of the code length code histogram.
141
+ bits += BitsEntropy(depth_histo, kCodeLengthCodes);
142
+ return bits;
143
+ }
144
+
145
+ } // namespace brotli
146
+
147
+ #endif // BROTLI_ENC_BIT_COST_H_
@@ -0,0 +1,418 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Block split point selection utilities.
16
+
17
+ #include "./block_splitter.h"
18
+
19
+ #include <math.h>
20
+ #include <stdio.h>
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+
24
+ #include <algorithm>
25
+ #include <map>
26
+
27
+ #include "./cluster.h"
28
+ #include "./command.h"
29
+ #include "./fast_log.h"
30
+ #include "./histogram.h"
31
+
32
+ namespace brotli {
33
+
34
+ static const int kMaxLiteralHistograms = 100;  // max_histograms passed by SplitBlock for the literal split
35
+ static const int kMaxCommandHistograms = 50;  // max_histograms passed by SplitBlock for the command and distance splits
36
+ static const double kLiteralBlockSwitchCost = 28.1;  // block_switch_cost for the literal split
37
+ static const double kCommandBlockSwitchCost = 13.5;  // block_switch_cost for the insert-and-copy split
38
+ static const double kDistanceBlockSwitchCost = 14.6;  // block_switch_cost for the distance split
39
+ static const int kLiteralStrideLength = 70;  // sampling_stride_length for the literal split
40
+ static const int kCommandStrideLength = 40;  // sampling_stride_length for the command and distance splits
41
+ static const int kSymbolsPerLiteralHistogram = 544;  // literals_per_histogram for the literal split
42
+ static const int kSymbolsPerCommandHistogram = 530;  // literals_per_histogram for the insert-and-copy split
43
+ static const int kSymbolsPerDistanceHistogram = 544;  // literals_per_histogram for the distance split
44
+ static const int kMinLengthForBlockSplitting = 128;  // SplitByteVector emits a single block for shorter inputs
45
+ static const int kIterMulForRefining = 2;  // multiplier in the RefineEntropyCodes iteration count
46
+ static const int kMinItersForRefining = 100;  // constant term in the RefineEntropyCodes iteration count
47
+
48
+ void CopyLiteralsToByteArray(const Command* cmds,
49
+ const size_t num_commands,
50
+ const uint8_t* data,
51
+ const size_t offset,
52
+ const size_t mask,
53
+ std::vector<uint8_t>* literals) {
54
+ // Count how many we have.
55
+ size_t total_length = 0;
56
+ for (int i = 0; i < num_commands; ++i) {
57
+ total_length += cmds[i].insert_len_;
58
+ }
59
+ if (total_length == 0) {
60
+ return;
61
+ }
62
+
63
+ // Allocate.
64
+ literals->resize(total_length);
65
+
66
+ // Loop again, and copy this time.
67
+ size_t pos = 0;
68
+ size_t from_pos = offset & mask;
69
+ for (int i = 0; i < num_commands && pos < total_length; ++i) {
70
+ size_t insert_len = cmds[i].insert_len_;
71
+ if (from_pos + insert_len > mask) {
72
+ size_t head_size = mask + 1 - from_pos;
73
+ memcpy(&(*literals)[pos], data + from_pos, head_size);
74
+ from_pos = 0;
75
+ pos += head_size;
76
+ insert_len -= head_size;
77
+ }
78
+ if (insert_len > 0) {
79
+ memcpy(&(*literals)[pos], data + from_pos, insert_len);
80
+ pos += insert_len;
81
+ }
82
+ from_pos = (from_pos + insert_len + cmds[i].copy_len_) & mask;
83
+ }
84
+ }
85
+
86
+ void CopyCommandsToByteArray(const Command* cmds,
87
+ const size_t num_commands,
88
+ std::vector<uint16_t>* insert_and_copy_codes,
89
+ std::vector<uint16_t>* distance_prefixes) {
90
+ for (int i = 0; i < num_commands; ++i) {
91
+ const Command& cmd = cmds[i];
92
+ insert_and_copy_codes->push_back(cmd.cmd_prefix_);
93
+ if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
94
+ distance_prefixes->push_back(cmd.dist_prefix_);
95
+ }
96
+ }
97
+ }
98
+
99
// Small deterministic pseudo-random generator: multiplies the state by 16807
// (with unsigned wrap-around), replaces a zero state by 1, stores the new
// state back into *seed and returns it.
static inline unsigned int MyRand(unsigned int* seed) {
  unsigned int next = *seed * 16807U;
  if (next == 0) {
    // A zero state would stay zero forever.
    next = 1;
  }
  *seed = next;
  return next;
}
106
+
107
+ template<typename HistogramType, typename DataType>
108
+ void InitialEntropyCodes(const DataType* data, size_t length,
109
+ int literals_per_histogram,
110
+ int max_histograms,
111
+ size_t stride,
112
+ std::vector<HistogramType>* vec) {
113
+ int total_histograms = length / literals_per_histogram + 1;
114
+ if (total_histograms > max_histograms) {
115
+ total_histograms = max_histograms;
116
+ }
117
+ unsigned int seed = 7;
118
+ int block_length = length / total_histograms;
119
+ for (int i = 0; i < total_histograms; ++i) {
120
+ int pos = length * i / total_histograms;
121
+ if (i != 0) {
122
+ pos += MyRand(&seed) % block_length;
123
+ }
124
+ if (pos + stride >= length) {
125
+ pos = length - stride - 1;
126
+ }
127
+ HistogramType histo;
128
+ histo.Add(data + pos, stride);
129
+ vec->push_back(histo);
130
+ }
131
+ }
132
+
133
+ template<typename HistogramType, typename DataType>
134
+ void RandomSample(unsigned int* seed,
135
+ const DataType* data,
136
+ size_t length,
137
+ size_t stride,
138
+ HistogramType* sample) {
139
+ size_t pos = 0;
140
+ if (stride >= length) {
141
+ pos = 0;
142
+ stride = length;
143
+ } else {
144
+ pos = MyRand(seed) % (length - stride + 1);
145
+ }
146
+ sample->Add(data + pos, stride);
147
+ }
148
+
149
+ template<typename HistogramType, typename DataType>
150
+ void RefineEntropyCodes(const DataType* data, size_t length,
151
+ size_t stride,
152
+ std::vector<HistogramType>* vec) {
153
+ int iters =
154
+ kIterMulForRefining * length / stride + kMinItersForRefining;
155
+ unsigned int seed = 7;
156
+ iters = ((iters + vec->size() - 1) / vec->size()) * vec->size();
157
+ for (int iter = 0; iter < iters; ++iter) {
158
+ HistogramType sample;
159
+ RandomSample(&seed, data, length, stride, &sample);
160
+ int ix = iter % vec->size();
161
+ (*vec)[ix].AddHistogram(sample);
162
+ }
163
+ }
164
+
165
+ inline static float BitCost(int count) {
166
+ return count == 0 ? -2 : FastLog2(count);
167
+ }
168
+
169
+ template<typename DataType, int kSize>
170
+ void FindBlocks(const DataType* data, const size_t length,
171
+ const double block_switch_bitcost,
172
+ const std::vector<Histogram<kSize> > &vec,
173
+ uint8_t *block_id) {
174
+ if (vec.size() <= 1) {
175
+ for (int i = 0; i < length; ++i) {
176
+ block_id[i] = 0;
177
+ }
178
+ return;
179
+ }
180
+ int vecsize = vec.size();
181
+ double* insert_cost = new double[kSize * vecsize];
182
+ memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * vecsize);
183
+ for (int j = 0; j < vecsize; ++j) {
184
+ insert_cost[j] = FastLog2(vec[j].total_count_);
185
+ }
186
+ for (int i = kSize - 1; i >= 0; --i) {
187
+ for (int j = 0; j < vecsize; ++j) {
188
+ insert_cost[i * vecsize + j] = insert_cost[j] - BitCost(vec[j].data_[i]);
189
+ }
190
+ }
191
+ double *cost = new double[vecsize];
192
+ memset(cost, 0, sizeof(cost[0]) * vecsize);
193
+ bool* switch_signal = new bool[length * vecsize];
194
+ memset(switch_signal, 0, sizeof(switch_signal[0]) * length * vecsize);
195
+ // After each iteration of this loop, cost[k] will contain the difference
196
+ // between the minimum cost of arriving at the current byte position using
197
+ // entropy code k, and the minimum cost of arriving at the current byte
198
+ // position. This difference is capped at the block switch cost, and if it
199
+ // reaches block switch cost, it means that when we trace back from the last
200
+ // position, we need to switch here.
201
+ for (size_t byte_ix = 0; byte_ix < length; ++byte_ix) {
202
+ int ix = byte_ix * vecsize;
203
+ int insert_cost_ix = data[byte_ix] * vecsize;
204
+ double min_cost = 1e99;
205
+ for (int k = 0; k < vecsize; ++k) {
206
+ // We are coding the symbol in data[byte_ix] with entropy code k.
207
+ cost[k] += insert_cost[insert_cost_ix + k];
208
+ if (cost[k] < min_cost) {
209
+ min_cost = cost[k];
210
+ block_id[byte_ix] = k;
211
+ }
212
+ }
213
+ double block_switch_cost = block_switch_bitcost;
214
+ // More blocks for the beginning.
215
+ if (byte_ix < 2000) {
216
+ block_switch_cost *= 0.77 + 0.07 * byte_ix / 2000;
217
+ }
218
+ for (int k = 0; k < vecsize; ++k) {
219
+ cost[k] -= min_cost;
220
+ if (cost[k] >= block_switch_cost) {
221
+ cost[k] = block_switch_cost;
222
+ switch_signal[ix + k] = true;
223
+ }
224
+ }
225
+ }
226
+ // Now trace back from the last position and switch at the marked places.
227
+ int byte_ix = length - 1;
228
+ int ix = byte_ix * vecsize;
229
+ int cur_id = block_id[byte_ix];
230
+ while (byte_ix > 0) {
231
+ --byte_ix;
232
+ ix -= vecsize;
233
+ if (switch_signal[ix + cur_id]) {
234
+ cur_id = block_id[byte_ix];
235
+ }
236
+ block_id[byte_ix] = cur_id;
237
+ }
238
+ delete[] insert_cost;
239
+ delete[] cost;
240
+ delete[] switch_signal;
241
+ }
242
+
243
// Renumbers the ids in "block_ids" in order of first appearance, so that the
// ids in use become 0, 1, 2, ... without gaps.
//
// block_ids: array of "length" ids, rewritten in place.
// length:    number of entries in "block_ids".
//
// Returns the number of distinct ids found (0 for an empty array).
int RemapBlockIds(uint8_t* block_ids, const size_t length) {
  // An id is a single byte, so a 256-entry table covers every possible value.
  static const int kUnassigned = -1;
  int new_id[256];
  for (int id = 0; id < 256; ++id) {
    new_id[id] = kUnassigned;
  }
  int next_id = 0;
  for (size_t i = 0; i < length; ++i) {
    if (new_id[block_ids[i]] == kUnassigned) {
      new_id[block_ids[i]] = next_id;
      ++next_id;
    }
  }
  for (size_t i = 0; i < length; ++i) {
    // At most 256 distinct ids exist, so every new id fits in a byte.
    block_ids[i] = static_cast<uint8_t>(new_id[block_ids[i]]);
  }
  return next_id;
}
257
+
258
+ template<typename HistogramType, typename DataType>
259
+ void BuildBlockHistograms(const DataType* data, const size_t length,
260
+ uint8_t* block_ids,
261
+ std::vector<HistogramType>* histograms) {
262
+ int num_types = RemapBlockIds(block_ids, length);
263
+ histograms->clear();
264
+ histograms->resize(num_types);
265
+ for (int i = 0; i < length; ++i) {
266
+ (*histograms)[block_ids[i]].Add(data[i]);
267
+ }
268
+ }
269
+
270
// Merges similar blocks: builds one histogram per run of equal ids in
// "block_ids", clusters those histograms with ClusterHistograms into at most
// 256 clusters, and rewrites every entry of "block_ids" with the cluster
// index of the run it belongs to.
template<typename HistogramType, typename DataType>
void ClusterBlocks(const DataType* data, const size_t length,
                   uint8_t* block_ids) {
  std::vector<HistogramType> run_histograms;
  std::vector<int> run_of_position(length);
  HistogramType current;
  int run_ix = 0;
  for (size_t pos = 0; pos < length; ++pos) {
    run_of_position[pos] = run_ix;
    current.Add(data[pos]);
    // A run ends at the last symbol or right before the id changes.
    const bool last_of_run =
        (pos + 1 == length) || (block_ids[pos] != block_ids[pos + 1]);
    if (last_of_run) {
      run_histograms.push_back(current);
      current.Clear();
      ++run_ix;
    }
  }
  std::vector<HistogramType> clustered;
  std::vector<int> run_to_cluster;
  // Block ids need to fit in one byte.
  static const int kMaxNumberOfBlockTypes = 256;
  ClusterHistograms(run_histograms, 1, run_histograms.size(),
                    kMaxNumberOfBlockTypes,
                    &clustered,
                    &run_to_cluster);
  for (size_t pos = 0; pos < length; ++pos) {
    block_ids[pos] = run_to_cluster[run_of_position[pos]];
  }
}
299
+
300
+ void BuildBlockSplit(const std::vector<uint8_t>& block_ids, BlockSplit* split) {
301
+ int cur_id = block_ids[0];
302
+ int cur_length = 1;
303
+ split->num_types = -1;
304
+ for (int i = 1; i < block_ids.size(); ++i) {
305
+ if (block_ids[i] != cur_id) {
306
+ split->types.push_back(cur_id);
307
+ split->lengths.push_back(cur_length);
308
+ split->num_types = std::max(split->num_types, cur_id);
309
+ cur_id = block_ids[i];
310
+ cur_length = 0;
311
+ }
312
+ ++cur_length;
313
+ }
314
+ split->types.push_back(cur_id);
315
+ split->lengths.push_back(cur_length);
316
+ split->num_types = std::max(split->num_types, cur_id);
317
+ ++split->num_types;
318
+ }
319
+
320
+ template<typename HistogramType, typename DataType>
321
+ void SplitByteVector(const std::vector<DataType>& data,
322
+ const int literals_per_histogram,
323
+ const int max_histograms,
324
+ const int sampling_stride_length,
325
+ const double block_switch_cost,
326
+ BlockSplit* split) {
327
+ if (data.empty()) {
328
+ split->num_types = 1;
329
+ return;
330
+ } else if (data.size() < kMinLengthForBlockSplitting) {
331
+ split->num_types = 1;
332
+ split->types.push_back(0);
333
+ split->lengths.push_back(data.size());
334
+ return;
335
+ }
336
+ std::vector<HistogramType> histograms;
337
+ // Find good entropy codes.
338
+ InitialEntropyCodes(data.data(), data.size(),
339
+ literals_per_histogram,
340
+ max_histograms,
341
+ sampling_stride_length,
342
+ &histograms);
343
+ RefineEntropyCodes(data.data(), data.size(),
344
+ sampling_stride_length,
345
+ &histograms);
346
+ // Find a good path through literals with the good entropy codes.
347
+ std::vector<uint8_t> block_ids(data.size());
348
+ for (int i = 0; i < 10; ++i) {
349
+ FindBlocks(data.data(), data.size(),
350
+ block_switch_cost,
351
+ histograms,
352
+ &block_ids[0]);
353
+ BuildBlockHistograms(data.data(), data.size(), &block_ids[0], &histograms);
354
+ }
355
+ ClusterBlocks<HistogramType>(data.data(), data.size(), &block_ids[0]);
356
+ BuildBlockSplit(block_ids, split);
357
+ }
358
+
359
+ void SplitBlock(const Command* cmds,
360
+ const size_t num_commands,
361
+ const uint8_t* data,
362
+ const size_t pos,
363
+ const size_t mask,
364
+ BlockSplit* literal_split,
365
+ BlockSplit* insert_and_copy_split,
366
+ BlockSplit* dist_split) {
367
+ // Create a continuous array of literals.
368
+ std::vector<uint8_t> literals;
369
+ CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, &literals);
370
+
371
+ // Compute prefix codes for commands.
372
+ std::vector<uint16_t> insert_and_copy_codes;
373
+ std::vector<uint16_t> distance_prefixes;
374
+ CopyCommandsToByteArray(cmds, num_commands,
375
+ &insert_and_copy_codes,
376
+ &distance_prefixes);
377
+
378
+ SplitByteVector<HistogramLiteral>(
379
+ literals,
380
+ kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
381
+ kLiteralStrideLength, kLiteralBlockSwitchCost,
382
+ literal_split);
383
+ SplitByteVector<HistogramCommand>(
384
+ insert_and_copy_codes,
385
+ kSymbolsPerCommandHistogram, kMaxCommandHistograms,
386
+ kCommandStrideLength, kCommandBlockSwitchCost,
387
+ insert_and_copy_split);
388
+ SplitByteVector<HistogramDistance>(
389
+ distance_prefixes,
390
+ kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
391
+ kCommandStrideLength, kDistanceBlockSwitchCost,
392
+ dist_split);
393
+ }
394
+
395
+ void SplitBlockByTotalLength(const Command* all_commands,
396
+ const size_t num_commands,
397
+ int input_size,
398
+ int target_length,
399
+ std::vector<std::vector<Command> >* blocks) {
400
+ int num_blocks = input_size / target_length + 1;
401
+ int length_limit = input_size / num_blocks + 1;
402
+ int total_length = 0;
403
+ std::vector<Command> cur_block;
404
+ for (int i = 0; i < num_commands; ++i) {
405
+ const Command& cmd = all_commands[i];
406
+ int cmd_length = cmd.insert_len_ + cmd.copy_len_;
407
+ if (total_length > length_limit) {
408
+ blocks->push_back(cur_block);
409
+ cur_block.clear();
410
+ total_length = 0;
411
+ }
412
+ cur_block.push_back(cmd);
413
+ total_length += cmd_length;
414
+ }
415
+ blocks->push_back(cur_block);
416
+ }
417
+
418
+ } // namespace brotli