brotli 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/brotli/brotli.cc +114 -24
- data/ext/brotli/brotli.h +0 -1
- data/ext/brotli/extconf.rb +30 -23
- data/lib/brotli/version.rb +1 -1
- data/vendor/brotli/LICENSE +1 -1
- data/vendor/brotli/dec/Makefile +1 -1
- data/vendor/brotli/dec/bit_reader.c +3 -3
- data/vendor/brotli/dec/bit_reader.h +25 -27
- data/vendor/brotli/dec/context.h +4 -4
- data/vendor/brotli/dec/decode.c +410 -486
- data/vendor/brotli/dec/decode.h +101 -105
- data/vendor/brotli/dec/dictionary.c +1 -1
- data/vendor/brotli/dec/dictionary.h +7 -8
- data/vendor/brotli/dec/huffman.c +103 -105
- data/vendor/brotli/dec/huffman.h +18 -18
- data/vendor/brotli/dec/port.h +52 -40
- data/vendor/brotli/dec/prefix.h +2 -0
- data/vendor/brotli/dec/state.c +13 -19
- data/vendor/brotli/dec/state.h +25 -39
- data/vendor/brotli/dec/transform.h +38 -44
- data/vendor/brotli/dec/types.h +2 -2
- data/vendor/brotli/enc/Makefile +1 -1
- data/vendor/brotli/enc/backward_references.cc +455 -359
- data/vendor/brotli/enc/backward_references.h +79 -3
- data/vendor/brotli/enc/bit_cost.h +54 -32
- data/vendor/brotli/enc/block_splitter.cc +285 -193
- data/vendor/brotli/enc/block_splitter.h +4 -12
- data/vendor/brotli/enc/brotli_bit_stream.cc +623 -324
- data/vendor/brotli/enc/brotli_bit_stream.h +76 -37
- data/vendor/brotli/enc/cluster.h +161 -120
- data/vendor/brotli/enc/command.h +60 -37
- data/vendor/brotli/enc/compress_fragment.cc +701 -0
- data/vendor/brotli/enc/compress_fragment.h +47 -0
- data/vendor/brotli/enc/compress_fragment_two_pass.cc +524 -0
- data/vendor/brotli/enc/compress_fragment_two_pass.h +40 -0
- data/vendor/brotli/enc/compressor.h +15 -0
- data/vendor/brotli/enc/context.h +1 -1
- data/vendor/brotli/enc/dictionary.h +2 -2
- data/vendor/brotli/enc/encode.cc +819 -286
- data/vendor/brotli/enc/encode.h +38 -15
- data/vendor/brotli/enc/encode_parallel.cc +40 -42
- data/vendor/brotli/enc/entropy_encode.cc +144 -147
- data/vendor/brotli/enc/entropy_encode.h +32 -8
- data/vendor/brotli/enc/entropy_encode_static.h +572 -0
- data/vendor/brotli/enc/fast_log.h +7 -40
- data/vendor/brotli/enc/find_match_length.h +9 -9
- data/vendor/brotli/enc/hash.h +462 -154
- data/vendor/brotli/enc/histogram.cc +6 -6
- data/vendor/brotli/enc/histogram.h +13 -13
- data/vendor/brotli/enc/literal_cost.cc +45 -45
- data/vendor/brotli/enc/metablock.cc +92 -89
- data/vendor/brotli/enc/metablock.h +12 -12
- data/vendor/brotli/enc/port.h +7 -16
- data/vendor/brotli/enc/prefix.h +23 -22
- data/vendor/brotli/enc/ringbuffer.h +75 -29
- data/vendor/brotli/enc/static_dict.cc +56 -48
- data/vendor/brotli/enc/static_dict.h +5 -5
- data/vendor/brotli/enc/streams.cc +1 -1
- data/vendor/brotli/enc/streams.h +5 -5
- data/vendor/brotli/enc/transform.h +40 -35
- data/vendor/brotli/enc/types.h +2 -0
- data/vendor/brotli/enc/utf8_util.cc +3 -2
- data/vendor/brotli/enc/write_bits.h +6 -6
- metadata +9 -5
- data/vendor/brotli/dec/streams.c +0 -102
- data/vendor/brotli/dec/streams.h +0 -95
data/vendor/brotli/enc/block_splitter.cc

@@ -10,12 +10,10 @@
 
 #include <assert.h>
 #include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
 
 #include <algorithm>
-#include <
+#include <cstring>
+#include <vector>
 
 #include "./cluster.h"
 #include "./command.h"
@@ -24,19 +22,19 @@
 
 namespace brotli {
 
-static const
-static const
+static const size_t kMaxLiteralHistograms = 100;
+static const size_t kMaxCommandHistograms = 50;
 static const double kLiteralBlockSwitchCost = 28.1;
 static const double kCommandBlockSwitchCost = 13.5;
 static const double kDistanceBlockSwitchCost = 14.6;
-static const
-static const
-static const
-static const
-static const
-static const
-static const
-static const
+static const size_t kLiteralStrideLength = 70;
+static const size_t kCommandStrideLength = 40;
+static const size_t kSymbolsPerLiteralHistogram = 544;
+static const size_t kSymbolsPerCommandHistogram = 530;
+static const size_t kSymbolsPerDistanceHistogram = 544;
+static const size_t kMinLengthForBlockSplitting = 128;
+static const size_t kIterMulForRefining = 2;
+static const size_t kMinItersForRefining = 100;
 
 void CopyLiteralsToByteArray(const Command* cmds,
                              const size_t num_commands,
@@ -72,20 +70,7 @@ void CopyLiteralsToByteArray(const Command* cmds,
       memcpy(&(*literals)[pos], data + from_pos, insert_len);
       pos += insert_len;
     }
-    from_pos = (from_pos + insert_len + cmds[i].
-  }
-}
-
-void CopyCommandsToByteArray(const Command* cmds,
-                             const size_t num_commands,
-                             std::vector<uint16_t>* insert_and_copy_codes,
-                             std::vector<uint16_t>* distance_prefixes) {
-  for (size_t i = 0; i < num_commands; ++i) {
-    const Command& cmd = cmds[i];
-    insert_and_copy_codes->push_back(cmd.cmd_prefix_);
-    if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
-      distance_prefixes->push_back(cmd.dist_prefix_);
-    }
+    from_pos = (from_pos + insert_len + cmds[i].copy_len()) & mask;
   }
 }
 
@@ -99,27 +84,23 @@ inline static unsigned int MyRand(unsigned int* seed) {
 
 template<typename HistogramType, typename DataType>
 void InitialEntropyCodes(const DataType* data, size_t length,
-                         int literals_per_histogram,
-                         int max_histograms,
                          size_t stride,
-
-
-
-
+                         size_t num_histograms,
+                         HistogramType* histograms) {
+  for (size_t i = 0; i < num_histograms; ++i) {
+    histograms[i].Clear();
   }
   unsigned int seed = 7;
-  size_t block_length = length /
-  for (
-    size_t pos = length * i /
+  size_t block_length = length / num_histograms;
+  for (size_t i = 0; i < num_histograms; ++i) {
+    size_t pos = length * i / num_histograms;
     if (i != 0) {
       pos += MyRand(&seed) % block_length;
     }
     if (pos + stride >= length) {
       pos = length - stride - 1;
     }
-
-    histo.Add(data + pos, stride);
-    vec->push_back(histo);
+    histograms[i].Add(data + pos, stride);
   }
 }
 
@@ -142,50 +123,58 @@ void RandomSample(unsigned int* seed,
 template<typename HistogramType, typename DataType>
 void RefineEntropyCodes(const DataType* data, size_t length,
                         size_t stride,
-
+                        size_t num_histograms,
+                        HistogramType* histograms) {
   size_t iters =
       kIterMulForRefining * length / stride + kMinItersForRefining;
   unsigned int seed = 7;
-  iters = ((iters +
+  iters = ((iters + num_histograms - 1) / num_histograms) * num_histograms;
   for (size_t iter = 0; iter < iters; ++iter) {
     HistogramType sample;
     RandomSample(&seed, data, length, stride, &sample);
-    size_t ix = iter %
-
+    size_t ix = iter % num_histograms;
+    histograms[ix].AddHistogram(sample);
   }
 }
 
-inline static double BitCost(
-  return count == 0 ? -2 : FastLog2(count);
+inline static double BitCost(size_t count) {
+  return count == 0 ? -2.0 : FastLog2(count);
 }
 
+// Assigns a block id from the range [0, vec.size()) to each data element
+// in data[0..length) and fills in block_id[0..length) with the assigned values.
+// Returns the number of blocks, i.e. one plus the number of block switches.
 template<typename DataType, int kSize>
-
-
-
-
-
+size_t FindBlocks(const DataType* data, const size_t length,
+                  const double block_switch_bitcost,
+                  const size_t num_histograms,
+                  const Histogram<kSize>* histograms,
+                  double* insert_cost,
+                  double* cost,
+                  uint8_t* switch_signal,
+                  uint8_t *block_id) {
+  if (num_histograms <= 1) {
     for (size_t i = 0; i < length; ++i) {
       block_id[i] = 0;
     }
-    return;
+    return 1;
   }
-
-  assert(
-
-
-
-
+  const size_t bitmaplen = (num_histograms + 7) >> 3;
+  assert(num_histograms <= 256);
+  memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * num_histograms);
+  for (size_t j = 0; j < num_histograms; ++j) {
+    insert_cost[j] = FastLog2(static_cast<uint32_t>(
+        histograms[j].total_count_));
   }
-  for (
-
-
+  for (size_t i = kSize; i != 0;) {
+    --i;
+    for (size_t j = 0; j < num_histograms; ++j) {
+      insert_cost[i * num_histograms + j] =
+          insert_cost[j] - BitCost(histograms[j].data_[i]);
     }
   }
-
-  memset(
-  bool* switch_signal = new bool[length * vecsize];
-  memset(switch_signal, 0, sizeof(switch_signal[0]) * length * vecsize);
+  memset(cost, 0, sizeof(cost[0]) * num_histograms);
+  memset(switch_signal, 0, sizeof(switch_signal[0]) * length * bitmaplen);
   // After each iteration of this loop, cost[k] will contain the difference
   // between the minimum cost of arriving at the current byte position using
   // entropy code k, and the minimum cost of arriving at the current byte
@@ -193,10 +182,10 @@ void FindBlocks(const DataType* data, const size_t length,
   // reaches block switch cost, it means that when we trace back from the last
   // position, we need to switch here.
   for (size_t byte_ix = 0; byte_ix < length; ++byte_ix) {
-    size_t ix = byte_ix *
-
+    size_t ix = byte_ix * bitmaplen;
+    size_t insert_cost_ix = data[byte_ix] * num_histograms;
     double min_cost = 1e99;
-    for (
+    for (size_t k = 0; k < num_histograms; ++k) {
       // We are coding the symbol in data[byte_ix] with entropy code k.
       cost[k] += insert_cost[insert_cost_ix + k];
       if (cost[k] < min_cost) {
@@ -207,116 +196,206 @@ void FindBlocks(const DataType* data, const size_t length,
     double block_switch_cost = block_switch_bitcost;
     // More blocks for the beginning.
     if (byte_ix < 2000) {
-      block_switch_cost *= 0.77 + 0.07 * byte_ix / 2000;
+      block_switch_cost *= 0.77 + 0.07 * static_cast<double>(byte_ix) / 2000;
     }
-    for (
+    for (size_t k = 0; k < num_histograms; ++k) {
       cost[k] -= min_cost;
       if (cost[k] >= block_switch_cost) {
         cost[k] = block_switch_cost;
-
+        const uint8_t mask = static_cast<uint8_t>(1u << (k & 7));
+        assert((k >> 3) < bitmaplen);
+        switch_signal[ix + (k >> 3)] |= mask;
       }
     }
   }
   // Now trace back from the last position and switch at the marked places.
   size_t byte_ix = length - 1;
-  size_t ix = byte_ix *
+  size_t ix = byte_ix * bitmaplen;
   uint8_t cur_id = block_id[byte_ix];
+  size_t num_blocks = 1;
   while (byte_ix > 0) {
     --byte_ix;
-    ix -=
-
-
+    ix -= bitmaplen;
+    const uint8_t mask = static_cast<uint8_t>(1u << (cur_id & 7));
+    assert((static_cast<size_t>(cur_id) >> 3) < bitmaplen);
+    if (switch_signal[ix + (cur_id >> 3)] & mask) {
+      if (cur_id != block_id[byte_ix]) {
+        cur_id = block_id[byte_ix];
+        ++num_blocks;
+      }
     }
     block_id[byte_ix] = cur_id;
   }
-
-  delete[] cost;
-  delete[] switch_signal;
+  return num_blocks;
 }
 
-
-
-
+static size_t RemapBlockIds(uint8_t* block_ids, const size_t length,
+                            uint16_t* new_id, const size_t num_histograms) {
+  static const uint16_t kInvalidId = 256;
+  for (size_t i = 0; i < num_histograms; ++i) {
+    new_id[i] = kInvalidId;
+  }
+  uint16_t next_id = 0;
   for (size_t i = 0; i < length; ++i) {
-
-
-
+    assert(block_ids[i] < num_histograms);
+    if (new_id[block_ids[i]] == kInvalidId) {
+      new_id[block_ids[i]] = next_id++;
     }
   }
   for (size_t i = 0; i < length; ++i) {
-    block_ids[i] = new_id[block_ids[i]];
+    block_ids[i] = static_cast<uint8_t>(new_id[block_ids[i]]);
+    assert(block_ids[i] < num_histograms);
   }
+  assert(next_id <= num_histograms);
   return next_id;
 }
 
 template<typename HistogramType, typename DataType>
 void BuildBlockHistograms(const DataType* data, const size_t length,
-                          uint8_t* block_ids,
-
-
-
-
-
+                          const uint8_t* block_ids,
+                          const size_t num_histograms,
+                          HistogramType* histograms) {
+  for (size_t i = 0; i < num_histograms; ++i) {
+    histograms[i].Clear();
+  }
   for (size_t i = 0; i < length; ++i) {
-
+    histograms[block_ids[i]].Add(data[i]);
   }
 }
 
 template<typename HistogramType, typename DataType>
 void ClusterBlocks(const DataType* data, const size_t length,
-
-
-
-
-
+                   const size_t num_blocks,
+                   uint8_t* block_ids,
+                   BlockSplit* split) {
+  static const size_t kMaxNumberOfBlockTypes = 256;
+  static const size_t kHistogramsPerBatch = 64;
+  static const size_t kClustersPerBatch = 16;
+  std::vector<uint32_t> histogram_symbols(num_blocks);
+  std::vector<uint32_t> block_lengths(num_blocks);
+
+  size_t block_idx = 0;
   for (size_t i = 0; i < length; ++i) {
-
-
-
-
-      histograms.push_back(cur_histogram);
-      cur_histogram.Clear();
-      ++cur_idx;
+    assert(block_idx < num_blocks);
+    ++block_lengths[block_idx];
+    if (i + 1 == length || block_ids[i] != block_ids[i + 1]) {
+      ++block_idx;
     }
   }
-
-
-
-
-
-
-
-
-
-
+  assert(block_idx == num_blocks);
+
+  const size_t expected_num_clusters =
+      kClustersPerBatch *
+      (num_blocks + kHistogramsPerBatch - 1) / kHistogramsPerBatch;
+  std::vector<HistogramType> all_histograms;
+  std::vector<uint32_t> cluster_size;
+  all_histograms.reserve(expected_num_clusters);
+  cluster_size.reserve(expected_num_clusters);
+  size_t num_clusters = 0;
+  std::vector<HistogramType> histograms(
+      std::min(num_blocks, kHistogramsPerBatch));
+  size_t max_num_pairs = kHistogramsPerBatch * kHistogramsPerBatch / 2;
+  std::vector<HistogramPair> pairs(max_num_pairs + 1);
+  size_t pos = 0;
+  for (size_t i = 0; i < num_blocks; i += kHistogramsPerBatch) {
+    const size_t num_to_combine = std::min(num_blocks - i, kHistogramsPerBatch);
+    uint32_t sizes[kHistogramsPerBatch];
+    uint32_t clusters[kHistogramsPerBatch];
+    uint32_t symbols[kHistogramsPerBatch];
+    uint32_t remap[kHistogramsPerBatch];
+    for (size_t j = 0; j < num_to_combine; ++j) {
+      histograms[j].Clear();
+      for (size_t k = 0; k < block_lengths[i + j]; ++k) {
+        histograms[j].Add(data[pos++]);
+      }
+      histograms[j].bit_cost_ = PopulationCost(histograms[j]);
+      symbols[j] = clusters[j] = static_cast<uint32_t>(j);
+      sizes[j] = 1;
+    }
+    size_t num_new_clusters = HistogramCombine(
+        &histograms[0], sizes, symbols, clusters, &pairs[0], num_to_combine,
+        num_to_combine, kHistogramsPerBatch, max_num_pairs);
+    for (size_t j = 0; j < num_new_clusters; ++j) {
+      all_histograms.push_back(histograms[clusters[j]]);
+      cluster_size.push_back(sizes[clusters[j]]);
+      remap[clusters[j]] = static_cast<uint32_t>(j);
+    }
+    for (size_t j = 0; j < num_to_combine; ++j) {
+      histogram_symbols[i + j] =
+          static_cast<uint32_t>(num_clusters) + remap[symbols[j]];
+    }
+    num_clusters += num_new_clusters;
+    assert(num_clusters == cluster_size.size());
+    assert(num_clusters == all_histograms.size());
   }
-}
 
-
-
-
-
-
-
-
-
-
-
+  max_num_pairs =
+      std::min(64 * num_clusters, (num_clusters / 2) * num_clusters);
+  pairs.resize(max_num_pairs + 1);
+
+  std::vector<uint32_t> clusters(num_clusters);
+  for (size_t i = 0; i < num_clusters; ++i) {
+    clusters[i] = static_cast<uint32_t>(i);
+  }
+  size_t num_final_clusters =
+      HistogramCombine(&all_histograms[0], &cluster_size[0],
+                       &histogram_symbols[0],
+                       &clusters[0], &pairs[0], num_clusters,
+                       num_blocks, kMaxNumberOfBlockTypes, max_num_pairs);
+
+  static const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
+  std::vector<uint32_t> new_index(num_clusters, kInvalidIndex);
+  uint32_t next_index = 0;
+  pos = 0;
+  for (size_t i = 0; i < num_blocks; ++i) {
+    HistogramType histo;
+    for (size_t j = 0; j < block_lengths[i]; ++j) {
+      histo.Add(data[pos++]);
+    }
+    uint32_t best_out =
+        i == 0 ? histogram_symbols[0] : histogram_symbols[i - 1];
+    double best_bits = HistogramBitCostDistance(
+        histo, all_histograms[best_out]);
+    for (size_t j = 0; j < num_final_clusters; ++j) {
+      const double cur_bits = HistogramBitCostDistance(
+          histo, all_histograms[clusters[j]]);
+      if (cur_bits < best_bits) {
+        best_bits = cur_bits;
+        best_out = clusters[j];
+      }
+    }
+    histogram_symbols[i] = best_out;
+    if (new_index[best_out] == kInvalidIndex) {
+      new_index[best_out] = next_index++;
+    }
+  }
+  uint8_t max_type = 0;
+  uint32_t cur_length = 0;
+  block_idx = 0;
+  split->types.resize(num_blocks);
+  split->lengths.resize(num_blocks);
+  for (size_t i = 0; i < num_blocks; ++i) {
+    cur_length += block_lengths[i];
+    if (i + 1 == num_blocks ||
+        histogram_symbols[i] != histogram_symbols[i + 1]) {
+      const uint8_t id = static_cast<uint8_t>(new_index[histogram_symbols[i]]);
+      split->types[block_idx] = id;
+      split->lengths[block_idx] = cur_length;
+      max_type = std::max(max_type, id);
       cur_length = 0;
+      ++block_idx;
     }
-    ++cur_length;
   }
-  split->types.
-  split->lengths.
-  split->num_types =
-  ++split->num_types;
+  split->types.resize(block_idx);
+  split->lengths.resize(block_idx);
+  split->num_types = static_cast<size_t>(max_type) + 1;
 }
 
-template<
+template<int kSize, typename DataType>
 void SplitByteVector(const std::vector<DataType>& data,
-                     const
-                     const
-                     const
+                     const size_t literals_per_histogram,
+                     const size_t max_histograms,
+                     const size_t sampling_stride_length,
                      const double block_switch_cost,
                      BlockSplit* split) {
   if (data.empty()) {
@@ -325,30 +404,47 @@ void SplitByteVector(const std::vector<DataType>& data,
   } else if (data.size() < kMinLengthForBlockSplitting) {
     split->num_types = 1;
     split->types.push_back(0);
-    split->lengths.push_back(static_cast<
+    split->lengths.push_back(static_cast<uint32_t>(data.size()));
     return;
   }
-
+  size_t num_histograms = data.size() / literals_per_histogram + 1;
+  if (num_histograms > max_histograms) {
+    num_histograms = max_histograms;
+  }
+  Histogram<kSize>* histograms = new Histogram<kSize>[num_histograms];
   // Find good entropy codes.
   InitialEntropyCodes(&data[0], data.size(),
-                      literals_per_histogram,
-                      max_histograms,
                       sampling_stride_length,
-
+                      num_histograms, histograms);
   RefineEntropyCodes(&data[0], data.size(),
                      sampling_stride_length,
-
+                     num_histograms, histograms);
   // Find a good path through literals with the good entropy codes.
   std::vector<uint8_t> block_ids(data.size());
-
-
-
-
-
-
+  size_t num_blocks;
+  const size_t bitmaplen = (num_histograms + 7) >> 3;
+  double* insert_cost = new double[kSize * num_histograms];
+  double *cost = new double[num_histograms];
+  uint8_t* switch_signal = new uint8_t[data.size() * bitmaplen];
+  uint16_t* new_id = new uint16_t[num_histograms];
+  for (size_t i = 0; i < 10; ++i) {
+    num_blocks = FindBlocks(&data[0], data.size(),
+                            block_switch_cost,
+                            num_histograms, histograms,
+                            insert_cost, cost, switch_signal,
+                            &block_ids[0]);
+    num_histograms = RemapBlockIds(&block_ids[0], data.size(),
+                                   new_id, num_histograms);
+    BuildBlockHistograms(&data[0], data.size(), &block_ids[0],
+                         num_histograms, histograms);
   }
-
-
+  delete[] insert_cost;
+  delete[] cost;
+  delete[] switch_signal;
+  delete[] new_id;
+  delete[] histograms;
+  ClusterBlocks<Histogram<kSize> >(&data[0], data.size(), num_blocks,
+                                   &block_ids[0], split);
 }
 
 void SplitBlock(const Command* cmds,
@@ -359,55 +455,51 @@ void SplitBlock(const Command* cmds,
                 BlockSplit* literal_split,
                 BlockSplit* insert_and_copy_split,
                 BlockSplit* dist_split) {
-
-
-
-
-
-
-
-
-
-
+  {
+    // Create a continuous array of literals.
+    std::vector<uint8_t> literals;
+    CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, &literals);
+    // Create the block split on the array of literals.
+    // Literal histograms have alphabet size 256.
+    SplitByteVector<256>(
+        literals,
+        kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
+        kLiteralStrideLength, kLiteralBlockSwitchCost,
+        literal_split);
+  }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    kCommandStrideLength, kDistanceBlockSwitchCost,
-                    dist_split);
-}
+  {
+    // Compute prefix codes for commands.
+    std::vector<uint16_t> insert_and_copy_codes(num_commands);
+    for (size_t i = 0; i < num_commands; ++i) {
+      insert_and_copy_codes[i] = cmds[i].cmd_prefix_;
+    }
+    // Create the block split on the array of command prefixes.
+    SplitByteVector<kNumCommandPrefixes>(
+        insert_and_copy_codes,
+        kSymbolsPerCommandHistogram, kMaxCommandHistograms,
+        kCommandStrideLength, kCommandBlockSwitchCost,
+        insert_and_copy_split);
+  }
 
-
-
-
-
-
-
-
-
-
-  for (size_t i = 0; i < num_commands; ++i) {
-    const Command& cmd = all_commands[i];
-    int cmd_length = cmd.insert_len_ + cmd.copy_len_;
-    if (total_length > length_limit) {
-      blocks->push_back(cur_block);
-      cur_block.clear();
-      total_length = 0;
+  {
+    // Create a continuous array of distance prefixes.
+    std::vector<uint16_t> distance_prefixes(num_commands);
+    size_t pos = 0;
+    for (size_t i = 0; i < num_commands; ++i) {
+      const Command& cmd = cmds[i];
+      if (cmd.copy_len() && cmd.cmd_prefix_ >= 128) {
+        distance_prefixes[pos++] = cmd.dist_prefix_;
+      }
     }
-
-
+    distance_prefixes.resize(pos);
+    // Create the block split on the array of distance prefixes.
+    SplitByteVector<kNumDistancePrefixes>(
+        distance_prefixes,
+        kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
+        kCommandStrideLength, kDistanceBlockSwitchCost,
+        dist_split);
   }
-  blocks->push_back(cur_block);
 }
 
 } // namespace brotli
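Note: the comments added in this version describe FindBlocks as a dynamic program. For each byte position, cost[k] tracks how much worse it is to arrive there using entropy code k than using the cheapest code; once that gap reaches the block switch cost, it is capped and a switch is recorded in switch_signal, and a backward pass then turns the recorded switches into block ids. The standalone sketch below illustrates only that cost-tracking idea; the function name AssignBlockIds, the fixed per-symbol cost table, and the toy input are hypothetical and are not part of the gem or of the vendored brotli sources, which derive costs from histograms refined over several iterations.

#include <cstdint>
#include <cstdio>
#include <limits>
#include <string>
#include <vector>

// Toy version of the FindBlocks cost tracking (assumed, fixed symbol costs):
// cost_per_symbol[k][s] is the bit cost of coding symbol s with entropy code k.
static std::vector<uint8_t> AssignBlockIds(
    const std::vector<uint8_t>& data,
    const std::vector<std::vector<double> >& cost_per_symbol,
    double switch_cost) {
  const size_t num_codes = cost_per_symbol.size();
  std::vector<uint8_t> block_id(data.size(), 0);
  if (data.empty() || num_codes <= 1) return block_id;
  // cost[k] = (best cost of reaching the current position with code k)
  //         - (best cost of reaching it with any code), capped at switch_cost.
  std::vector<double> cost(num_codes, 0.0);
  std::vector<std::vector<bool> > switched(
      data.size(), std::vector<bool>(num_codes, false));
  for (size_t i = 0; i < data.size(); ++i) {
    double min_cost = std::numeric_limits<double>::infinity();
    for (size_t k = 0; k < num_codes; ++k) {
      cost[k] += cost_per_symbol[k][data[i]];
      if (cost[k] < min_cost) {
        min_cost = cost[k];
        block_id[i] = static_cast<uint8_t>(k);  // locally cheapest code
      }
    }
    for (size_t k = 0; k < num_codes; ++k) {
      cost[k] -= min_cost;
      if (cost[k] >= switch_cost) {
        // Staying with code k is never cheaper than switching here.
        cost[k] = switch_cost;
        switched[i][k] = true;
      }
    }
  }
  // Trace back from the end, switching only where a switch was recorded.
  uint8_t cur_id = block_id[data.size() - 1];
  for (size_t i = data.size() - 1; i-- > 0;) {
    if (switched[i][cur_id]) {
      cur_id = block_id[i];
    }
    block_id[i] = cur_id;
  }
  return block_id;
}

int main() {
  // Two toy entropy codes: code 0 is cheap for 'a', code 1 is cheap for 'b'.
  std::vector<std::vector<double> > costs(2, std::vector<double>(256, 8.0));
  costs[0]['a'] = 1.0;
  costs[1]['b'] = 1.0;
  const std::string s = "aaaaaaaabbbbbbbbaaaaaaaa";
  const std::vector<uint8_t> data(s.begin(), s.end());
  const std::vector<uint8_t> ids = AssignBlockIds(data, costs, 4.0);
  for (size_t i = 0; i < ids.size(); ++i) {
    std::printf("%u", static_cast<unsigned>(ids[i]));
  }
  std::printf("\n");  // prints 000000001111111100000000: three blocks
  return 0;
}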