brotli 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/README.md +36 -0
- data/Rakefile +13 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/brotli.gemspec +28 -0
- data/ext/brotli/brotli.cc +67 -0
- data/ext/brotli/brotli.h +9 -0
- data/ext/brotli/extconf.rb +34 -0
- data/lib/brotli.rb +2 -0
- data/lib/brotli/version.rb +3 -0
- data/vendor/brotli/LICENSE +202 -0
- data/vendor/brotli/dec/Makefile +12 -0
- data/vendor/brotli/dec/bit_reader.c +55 -0
- data/vendor/brotli/dec/bit_reader.h +256 -0
- data/vendor/brotli/dec/context.h +260 -0
- data/vendor/brotli/dec/decode.c +1573 -0
- data/vendor/brotli/dec/decode.h +160 -0
- data/vendor/brotli/dec/dictionary.h +9494 -0
- data/vendor/brotli/dec/huffman.c +325 -0
- data/vendor/brotli/dec/huffman.h +77 -0
- data/vendor/brotli/dec/port.h +148 -0
- data/vendor/brotli/dec/prefix.h +756 -0
- data/vendor/brotli/dec/state.c +149 -0
- data/vendor/brotli/dec/state.h +185 -0
- data/vendor/brotli/dec/streams.c +99 -0
- data/vendor/brotli/dec/streams.h +100 -0
- data/vendor/brotli/dec/transform.h +315 -0
- data/vendor/brotli/dec/types.h +36 -0
- data/vendor/brotli/enc/Makefile +11 -0
- data/vendor/brotli/enc/backward_references.cc +769 -0
- data/vendor/brotli/enc/backward_references.h +50 -0
- data/vendor/brotli/enc/bit_cost.h +147 -0
- data/vendor/brotli/enc/block_splitter.cc +418 -0
- data/vendor/brotli/enc/block_splitter.h +78 -0
- data/vendor/brotli/enc/brotli_bit_stream.cc +884 -0
- data/vendor/brotli/enc/brotli_bit_stream.h +149 -0
- data/vendor/brotli/enc/cluster.h +290 -0
- data/vendor/brotli/enc/command.h +140 -0
- data/vendor/brotli/enc/context.h +185 -0
- data/vendor/brotli/enc/dictionary.h +9485 -0
- data/vendor/brotli/enc/dictionary_hash.h +4125 -0
- data/vendor/brotli/enc/encode.cc +715 -0
- data/vendor/brotli/enc/encode.h +196 -0
- data/vendor/brotli/enc/encode_parallel.cc +354 -0
- data/vendor/brotli/enc/encode_parallel.h +37 -0
- data/vendor/brotli/enc/entropy_encode.cc +492 -0
- data/vendor/brotli/enc/entropy_encode.h +88 -0
- data/vendor/brotli/enc/fast_log.h +179 -0
- data/vendor/brotli/enc/find_match_length.h +87 -0
- data/vendor/brotli/enc/hash.h +686 -0
- data/vendor/brotli/enc/histogram.cc +76 -0
- data/vendor/brotli/enc/histogram.h +100 -0
- data/vendor/brotli/enc/literal_cost.cc +172 -0
- data/vendor/brotli/enc/literal_cost.h +38 -0
- data/vendor/brotli/enc/metablock.cc +544 -0
- data/vendor/brotli/enc/metablock.h +88 -0
- data/vendor/brotli/enc/port.h +151 -0
- data/vendor/brotli/enc/prefix.h +85 -0
- data/vendor/brotli/enc/ringbuffer.h +108 -0
- data/vendor/brotli/enc/static_dict.cc +441 -0
- data/vendor/brotli/enc/static_dict.h +40 -0
- data/vendor/brotli/enc/static_dict_lut.h +12063 -0
- data/vendor/brotli/enc/streams.cc +127 -0
- data/vendor/brotli/enc/streams.h +129 -0
- data/vendor/brotli/enc/transform.h +250 -0
- data/vendor/brotli/enc/write_bits.h +91 -0
- metadata +171 -0
@@ -0,0 +1,715 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Implementation of Brotli compressor.
|
16
|
+
|
17
|
+
#include "./encode.h"
|
18
|
+
|
19
|
+
#include <algorithm>
|
20
|
+
#include <limits>
|
21
|
+
|
22
|
+
#include "./backward_references.h"
|
23
|
+
#include "./bit_cost.h"
|
24
|
+
#include "./block_splitter.h"
|
25
|
+
#include "./brotli_bit_stream.h"
|
26
|
+
#include "./cluster.h"
|
27
|
+
#include "./context.h"
|
28
|
+
#include "./metablock.h"
|
29
|
+
#include "./transform.h"
|
30
|
+
#include "./entropy_encode.h"
|
31
|
+
#include "./fast_log.h"
|
32
|
+
#include "./hash.h"
|
33
|
+
#include "./histogram.h"
|
34
|
+
#include "./literal_cost.h"
|
35
|
+
#include "./prefix.h"
|
36
|
+
#include "./write_bits.h"
|
37
|
+
|
38
|
+
namespace brotli {
|
39
|
+
|
40
|
+
// Fraction of a block's bytes that must parse as UTF-8 for the block to be
// handled in UTF-8 mode (passed as min_fraction to IsMostlyUTF8).
static const double kMinUTF8Ratio = 0.75;
// Quality thresholds: each feature below is enabled only when
// params_.quality is at least the given value.
static const int kMinQualityForBlockSplit = 4;
static const int kMinQualityForContextModeling = 5;
static const int kMinQualityForOptimizeHistograms = 4;
|
44
|
+
|
45
|
+
// Decodes one symbol from the start of `input`, which holds `size` readable
// bytes (at least one byte is read unconditionally).
//
// On a well-formed, non-overlong UTF-8 sequence of 1 to 4 bytes, stores the
// decoded code point in *symbol and returns the sequence length. A NUL byte,
// an overlong form, a truncated sequence or any other malformed lead byte is
// reported as the single-byte symbol 0x110000 | input[0], which lies above
// the Unicode code space, and 1 is returned.
int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
  const int lead = input[0];

  // One byte: plain ASCII, excluding NUL.
  if ((lead & 0x80) == 0 && lead != 0) {
    *symbol = lead;
    return 1;
  }

  // Two bytes: 110xxxxx 10xxxxxx; must decode above 0x7f.
  if (size > 1 &&
      (lead & 0xe0) == 0xc0 &&
      (input[1] & 0xc0) == 0x80) {
    const int code = ((lead & 0x1f) << 6) | (input[1] & 0x3f);
    if (code > 0x7f) {
      *symbol = code;
      return 2;
    }
  }

  // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx; must decode above 0x7ff.
  if (size > 2 &&
      (lead & 0xf0) == 0xe0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80) {
    const int code = ((lead & 0x0f) << 12) |
                     ((input[1] & 0x3f) << 6) |
                     (input[2] & 0x3f);
    if (code > 0x7ff) {
      *symbol = code;
      return 3;
    }
  }

  // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx; must decode into
  // (0xffff, 0x10ffff].
  if (size > 3 &&
      (lead & 0xf8) == 0xf0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80 &&
      (input[3] & 0xc0) == 0x80) {
    const int code = ((lead & 0x07) << 18) |
                     ((input[1] & 0x3f) << 12) |
                     ((input[2] & 0x3f) << 6) |
                     (input[3] & 0x3f);
    if (code > 0xffff && code <= 0x10ffff) {
      *symbol = code;
      return 4;
    }
  }

  // Not UTF-8: emit a special symbol above the UTF-8 code space.
  *symbol = 0x110000 | lead;
  return 1;
}
|
93
|
+
|
94
|
+
// Returns true if at least min_fraction of the data is UTF8-encoded.
|
95
|
+
bool IsMostlyUTF8(const uint8_t* data, size_t length, double min_fraction) {
|
96
|
+
size_t size_utf8 = 0;
|
97
|
+
size_t pos = 0;
|
98
|
+
while (pos < length) {
|
99
|
+
int symbol;
|
100
|
+
int bytes_read = ParseAsUTF8(&symbol, data + pos, length - pos);
|
101
|
+
pos += bytes_read;
|
102
|
+
if (symbol < 0x110000) size_utf8 += bytes_read;
|
103
|
+
}
|
104
|
+
return size_utf8 > min_fraction * length;
|
105
|
+
}
|
106
|
+
|
107
|
+
// Re-encodes the distance prefix and extra bits of the first num_commands
// entries of cmds for the given distance parameters.
//
// Does nothing when both parameters are zero. Otherwise every command with a
// non-zero copy length and a command prefix of at least 128 has its
// dist_prefix_ / dist_extra_ fields rewritten by PrefixEncodeCopyDistance.
void RecomputeDistancePrefixes(Command* cmds,
                               size_t num_commands,
                               int num_direct_distance_codes,
                               int distance_postfix_bits) {
  if (num_direct_distance_codes == 0 && distance_postfix_bits == 0) {
    return;
  }
  // The index is size_t to match num_commands; an int index would be
  // compared signed-vs-unsigned and could overflow on very large counts.
  for (size_t i = 0; i < num_commands; ++i) {
    Command* cmd = &cmds[i];
    if (cmd->copy_len_ > 0 && cmd->cmd_prefix_ >= 128) {
      PrefixEncodeCopyDistance(cmd->DistanceCode(),
                               num_direct_distance_codes,
                               distance_postfix_bits,
                               &cmd->dist_prefix_,
                               &cmd->dist_extra_);
    }
  }
}
|
125
|
+
|
126
|
+
uint8_t* BrotliCompressor::GetBrotliStorage(size_t size) {
|
127
|
+
if (storage_size_ < size) {
|
128
|
+
storage_.reset(new uint8_t[size]);
|
129
|
+
storage_size_ = size;
|
130
|
+
}
|
131
|
+
return &storage_[0];
|
132
|
+
}
|
133
|
+
|
134
|
+
// Builds a compressor from `params`.
//
// The parameters are sanitized first (quality raised to at least 1, lgwin
// clamped to [kMinWindowBits, kMaxWindowBits], lgblock defaulted or clamped).
// The ring buffer, optional literal-cost buffer, command buffer, pending
// stream-header bits, distance cache and hashers are then set up from the
// sanitized values.
BrotliCompressor::BrotliCompressor(BrotliParams params)
    : params_(params),
      hashers_(new Hashers()),
      input_pos_(0),
      num_commands_(0),
      num_literals_(0),
      last_insert_len_(0),
      last_flush_pos_(0),
      last_processed_pos_(0),
      prev_byte_(0),
      prev_byte2_(0),
      storage_size_(0) {
  // Sanitize params.
  params_.quality = std::max(1, params_.quality);
  if (params_.lgwin < kMinWindowBits) {
    params_.lgwin = kMinWindowBits;
  } else if (params_.lgwin > kMaxWindowBits) {
    params_.lgwin = kMaxWindowBits;
  }
  if (params_.lgblock == 0) {
    // No explicit block size requested: derive one from the quality level.
    params_.lgblock = params_.quality < kMinQualityForBlockSplit ? 14 : 16;
    if (params_.quality >= 9 && params_.lgwin > params_.lgblock) {
      params_.lgblock = std::min(21, params_.lgwin);
    }
  } else {
    params_.lgblock = std::min(kMaxInputBlockBits,
                               std::max(kMinInputBlockBits, params_.lgblock));
  }

  // Set maximum distance, see section 9.1. of the spec.
  max_backward_distance_ = (1 << params_.lgwin) - 16;

  // Initialize input and literal cost ring buffers.
  // We allocate at least lgwin + 1 bits for the ring buffer so that the newly
  // added block fits there completely and we still get lgwin bits and at least
  // read_block_size_bits + 1 bits because the copy tail length needs to be
  // smaller than ringbuffer size.
  int ringbuffer_bits = std::max(params_.lgwin + 1, params_.lgblock + 1);
  ringbuffer_.reset(new RingBuffer(ringbuffer_bits, params_.lgblock));

  // Always give the mask a defined value: WriteBrotliData passes it by value
  // to CreateBackwardReferences even when no literal-cost buffer exists.
  literal_cost_mask_ = 0;
  if (params_.quality > 9) {
    literal_cost_mask_ = (1 << params_.lgblock) - 1;
    literal_cost_.reset(new float[literal_cost_mask_ + 1]);
  }

  // Allocate command buffer.
  cmd_buffer_size_ = std::max(1 << 18, 1 << params_.lgblock);
  commands_.reset(new brotli::Command[cmd_buffer_size_]);

  // Initialize last byte with stream header (the window-size code); the
  // number of valid bits depends on which form of the code is used.
  if (params_.lgwin == 16) {
    last_byte_ = 0;
    last_byte_bits_ = 1;
  } else if (params_.lgwin == 17) {
    last_byte_ = 1;
    last_byte_bits_ = 7;
  } else if (params_.lgwin > 17) {
    last_byte_ = ((params_.lgwin - 17) << 1) | 1;
    last_byte_bits_ = 4;
  } else {
    last_byte_ = ((params_.lgwin - 8) << 4) | 1;
    last_byte_bits_ = 7;
  }

  // Initialize distance cache.
  dist_cache_[0] = 4;
  dist_cache_[1] = 11;
  dist_cache_[2] = 15;
  dist_cache_[3] = 16;
  // Save the state of the distance cache in case we need to restore it for
  // emitting an uncompressed block.
  memcpy(saved_dist_cache_, dist_cache_, sizeof(dist_cache_));

  // Initialize hashers; the hasher type follows the quality, capped at 9.
  hash_type_ = std::min(9, params_.quality);
  hashers_->Init(hash_type_);
}
|
210
|
+
|
211
|
+
// Intentionally empty: no manual cleanup is performed here.
// NOTE(review): the buffers are managed through members that expose
// reset()/get(), presumably owning smart pointers -- confirm in encode.h.
BrotliCompressor::~BrotliCompressor() {}
|
213
|
+
|
214
|
+
// Appends input_size bytes from input_buffer to the ring buffer and advances
// input_pos_ by the same amount.
void BrotliCompressor::CopyInputToRingBuffer(const size_t input_size,
                                             const uint8_t* input_buffer) {
  ringbuffer_->Write(input_buffer, input_size);
  input_pos_ += input_size;

  // Zero a few bytes past the data just written so that hashing never
  // depends on uninitialized memory. This keeps compression deterministic
  // and avoids uninitialized-memory warnings in Valgrind. Even without it
  // the output would be valid, only nondeterministic.
  //
  // Background: the compressor stores short (at most 8 bytes) substrings of
  // the input already read in a hash table and finds repetitions by looking
  // such substrings up. Every hit is verified against the ring buffer, so a
  // corrupt hash table can only hurt the compression ratio, never the
  // validity of the output.
  //
  // The hash table is populated from the ring buffer as new input arrives,
  // but at the last few indexes there are not enough bytes to form a
  // full-length substring. Because hashing reads with a LOAD32 for
  // performance, it can reach 3 bytes beyond the written data; those bytes
  // are set to 0 here.
  //
  // This is only needed, and only safe, during the first pass over the ring
  // buffer. Once the buffer has wrapped, the memory is already initialized
  // (the ring buffer keeps a `tail' that holds a copy of the beginning), and
  // clearing would overwrite live data.
  const size_t write_pos = ringbuffer_->position();
  const bool first_pass = write_pos <= ringbuffer_->mask();
  if (!first_pass) {
    return;
  }
  // First pass: clear the 3 bytes just after the copied input. The index may
  // exceed mask, i.e. land in the tail region.
  memset(ringbuffer_->start() + write_pos, 0, 3);
}
|
264
|
+
|
265
|
+
void BrotliCompressor::BrotliSetCustomDictionary(
|
266
|
+
const size_t size, const uint8_t* dict) {
|
267
|
+
CopyInputToRingBuffer(size, dict);
|
268
|
+
last_flush_pos_ = size;
|
269
|
+
last_processed_pos_ = size;
|
270
|
+
if (size > 0) {
|
271
|
+
prev_byte_ = dict[size - 1];
|
272
|
+
}
|
273
|
+
if (size > 1) {
|
274
|
+
prev_byte2_ = dict[size - 2];
|
275
|
+
}
|
276
|
+
hashers_->PrependCustomDictionary(hash_type_, size, dict);
|
277
|
+
}
|
278
|
+
|
279
|
+
bool BrotliCompressor::WriteBrotliData(const bool is_last,
|
280
|
+
const bool force_flush,
|
281
|
+
size_t* out_size,
|
282
|
+
uint8_t** output) {
|
283
|
+
const size_t bytes = input_pos_ - last_processed_pos_;
|
284
|
+
const uint8_t* data = ringbuffer_->start();
|
285
|
+
const size_t mask = ringbuffer_->mask();
|
286
|
+
|
287
|
+
if (bytes > input_block_size()) {
|
288
|
+
return false;
|
289
|
+
}
|
290
|
+
|
291
|
+
bool utf8_mode =
|
292
|
+
params_.quality >= 9 &&
|
293
|
+
IsMostlyUTF8(&data[last_processed_pos_ & mask], bytes, kMinUTF8Ratio);
|
294
|
+
|
295
|
+
if (literal_cost_.get()) {
|
296
|
+
if (utf8_mode) {
|
297
|
+
EstimateBitCostsForLiteralsUTF8(last_processed_pos_, bytes, mask,
|
298
|
+
literal_cost_mask_, data,
|
299
|
+
literal_cost_.get());
|
300
|
+
} else {
|
301
|
+
EstimateBitCostsForLiterals(last_processed_pos_, bytes, mask,
|
302
|
+
literal_cost_mask_,
|
303
|
+
data, literal_cost_.get());
|
304
|
+
}
|
305
|
+
}
|
306
|
+
CreateBackwardReferences(bytes, last_processed_pos_, data, mask,
|
307
|
+
literal_cost_.get(),
|
308
|
+
literal_cost_mask_,
|
309
|
+
max_backward_distance_,
|
310
|
+
params_.quality,
|
311
|
+
hashers_.get(),
|
312
|
+
hash_type_,
|
313
|
+
dist_cache_,
|
314
|
+
&last_insert_len_,
|
315
|
+
&commands_[num_commands_],
|
316
|
+
&num_commands_,
|
317
|
+
&num_literals_);
|
318
|
+
|
319
|
+
// For quality 1 there is no block splitting, so we buffer at most this much
|
320
|
+
// literals and commands.
|
321
|
+
static const int kMaxNumDelayedSymbols = 0x2fff;
|
322
|
+
int max_length = std::min<int>(mask + 1, 1 << kMaxInputBlockBits);
|
323
|
+
if (!is_last && !force_flush &&
|
324
|
+
(params_.quality >= kMinQualityForBlockSplit ||
|
325
|
+
(num_literals_ + num_commands_ < kMaxNumDelayedSymbols)) &&
|
326
|
+
num_commands_ + (input_block_size() >> 1) < cmd_buffer_size_ &&
|
327
|
+
input_pos_ + input_block_size() <= last_flush_pos_ + max_length) {
|
328
|
+
// Everything will happen later.
|
329
|
+
last_processed_pos_ = input_pos_;
|
330
|
+
*out_size = 0;
|
331
|
+
return true;
|
332
|
+
}
|
333
|
+
|
334
|
+
// Create the last insert-only command.
|
335
|
+
if (last_insert_len_ > 0) {
|
336
|
+
brotli::Command cmd(last_insert_len_);
|
337
|
+
commands_[num_commands_++] = cmd;
|
338
|
+
num_literals_ += last_insert_len_;
|
339
|
+
last_insert_len_ = 0;
|
340
|
+
}
|
341
|
+
|
342
|
+
return WriteMetaBlockInternal(is_last, utf8_mode, out_size, output);
|
343
|
+
}
|
344
|
+
|
345
|
+
// Decide about the context map based on the ability of the prediction
|
346
|
+
// ability of the previous byte UTF8-prefix on the next byte. The
|
347
|
+
// prediction ability is calculated as shannon entropy. Here we need
|
348
|
+
// shannon entropy instead of 'BitsEntropy' since the prefix will be
|
349
|
+
// encoded with the remaining 6 bits of the following byte, and
|
350
|
+
// BitsEntropy will assume that symbol to be stored alone using Huffman
|
351
|
+
// coding.
|
352
|
+
void ChooseContextMap(int quality,
|
353
|
+
int* bigram_histo,
|
354
|
+
int* num_literal_contexts,
|
355
|
+
const int** literal_context_map) {
|
356
|
+
int monogram_histo[3] = { 0 };
|
357
|
+
int two_prefix_histo[6] = { 0 };
|
358
|
+
int total = 0;
|
359
|
+
for (int i = 0; i < 9; ++i) {
|
360
|
+
total += bigram_histo[i];
|
361
|
+
monogram_histo[i % 3] += bigram_histo[i];
|
362
|
+
int j = i;
|
363
|
+
if (j >= 6) {
|
364
|
+
j -= 6;
|
365
|
+
}
|
366
|
+
two_prefix_histo[j] += bigram_histo[i];
|
367
|
+
}
|
368
|
+
int dummy;
|
369
|
+
double entropy1 = ShannonEntropy(monogram_histo, 3, &dummy);
|
370
|
+
double entropy2 = (ShannonEntropy(two_prefix_histo, 3, &dummy) +
|
371
|
+
ShannonEntropy(two_prefix_histo + 3, 3, &dummy));
|
372
|
+
double entropy3 = 0;
|
373
|
+
for (int k = 0; k < 3; ++k) {
|
374
|
+
entropy3 += ShannonEntropy(bigram_histo + 3 * k, 3, &dummy);
|
375
|
+
}
|
376
|
+
entropy1 *= (1.0 / total);
|
377
|
+
entropy2 *= (1.0 / total);
|
378
|
+
entropy3 *= (1.0 / total);
|
379
|
+
|
380
|
+
static const int kStaticContextMapContinuation[64] = {
|
381
|
+
1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
382
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
383
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
384
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
385
|
+
};
|
386
|
+
static const int kStaticContextMapSimpleUTF8[64] = {
|
387
|
+
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
388
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
389
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
390
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
391
|
+
};
|
392
|
+
if (quality < 7) {
|
393
|
+
// 3 context models is a bit slower, don't use it at lower qualities.
|
394
|
+
entropy3 = entropy1 * 10;
|
395
|
+
}
|
396
|
+
// If expected savings by symbol are less than 0.2 bits, skip the
|
397
|
+
// context modeling -- in exchange for faster decoding speed.
|
398
|
+
if (entropy1 - entropy2 < 0.2 &&
|
399
|
+
entropy1 - entropy3 < 0.2) {
|
400
|
+
*num_literal_contexts = 1;
|
401
|
+
} else if (entropy2 - entropy3 < 0.02) {
|
402
|
+
*num_literal_contexts = 2;
|
403
|
+
*literal_context_map = kStaticContextMapSimpleUTF8;
|
404
|
+
} else {
|
405
|
+
*num_literal_contexts = 3;
|
406
|
+
*literal_context_map = kStaticContextMapContinuation;
|
407
|
+
}
|
408
|
+
}
|
409
|
+
|
410
|
+
void DecideOverLiteralContextModeling(const uint8_t* input,
|
411
|
+
size_t start_pos,
|
412
|
+
size_t length,
|
413
|
+
size_t mask,
|
414
|
+
int quality,
|
415
|
+
int* literal_context_mode,
|
416
|
+
int* num_literal_contexts,
|
417
|
+
const int** literal_context_map) {
|
418
|
+
if (quality < kMinQualityForContextModeling || length < 64) {
|
419
|
+
return;
|
420
|
+
}
|
421
|
+
// Gather bigram data of the UTF8 byte prefixes. To make the analysis of
|
422
|
+
// UTF8 data faster we only examine 64 byte long strides at every 4kB
|
423
|
+
// intervals.
|
424
|
+
const size_t end_pos = start_pos + length;
|
425
|
+
int bigram_prefix_histo[9] = { 0 };
|
426
|
+
for (; start_pos + 64 < end_pos; start_pos += 4096) {
|
427
|
+
static const int lut[4] = { 0, 0, 1, 2 };
|
428
|
+
const size_t stride_end_pos = start_pos + 64;
|
429
|
+
int prev = lut[input[start_pos & mask] >> 6] * 3;
|
430
|
+
for (size_t pos = start_pos + 1; pos < stride_end_pos; ++pos) {
|
431
|
+
const uint8_t literal = input[pos & mask];
|
432
|
+
++bigram_prefix_histo[prev + lut[literal >> 6]];
|
433
|
+
prev = lut[literal >> 6] * 3;
|
434
|
+
}
|
435
|
+
}
|
436
|
+
*literal_context_mode = CONTEXT_UTF8;
|
437
|
+
ChooseContextMap(quality, &bigram_prefix_histo[0], num_literal_contexts,
|
438
|
+
literal_context_map);
|
439
|
+
}
|
440
|
+
|
441
|
+
bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
|
442
|
+
const bool utf8_mode,
|
443
|
+
size_t* out_size,
|
444
|
+
uint8_t** output) {
|
445
|
+
const size_t bytes = input_pos_ - last_flush_pos_;
|
446
|
+
const uint8_t* data = ringbuffer_->start();
|
447
|
+
const size_t mask = ringbuffer_->mask();
|
448
|
+
const size_t max_out_size = 2 * bytes + 500;
|
449
|
+
uint8_t* storage = GetBrotliStorage(max_out_size);
|
450
|
+
storage[0] = last_byte_;
|
451
|
+
int storage_ix = last_byte_bits_;
|
452
|
+
|
453
|
+
bool uncompressed = false;
|
454
|
+
if (num_commands_ < (bytes >> 8) + 2) {
|
455
|
+
if (num_literals_ > 0.99 * bytes) {
|
456
|
+
int literal_histo[256] = { 0 };
|
457
|
+
static const int kSampleRate = 13;
|
458
|
+
static const double kMinEntropy = 7.92;
|
459
|
+
static const double kBitCostThreshold = bytes * kMinEntropy / kSampleRate;
|
460
|
+
for (int i = last_flush_pos_; i < input_pos_; i += kSampleRate) {
|
461
|
+
++literal_histo[data[i & mask]];
|
462
|
+
}
|
463
|
+
if (BitsEntropy(literal_histo, 256) > kBitCostThreshold) {
|
464
|
+
uncompressed = true;
|
465
|
+
}
|
466
|
+
}
|
467
|
+
}
|
468
|
+
|
469
|
+
if (bytes == 0) {
|
470
|
+
if (!StoreCompressedMetaBlockHeader(is_last, 0, &storage_ix, &storage[0])) {
|
471
|
+
return false;
|
472
|
+
}
|
473
|
+
storage_ix = (storage_ix + 7) & ~7;
|
474
|
+
} else if (uncompressed) {
|
475
|
+
// Restore the distance cache, as its last update by
|
476
|
+
// CreateBackwardReferences is now unused.
|
477
|
+
memcpy(dist_cache_, saved_dist_cache_, sizeof(dist_cache_));
|
478
|
+
if (!StoreUncompressedMetaBlock(is_last,
|
479
|
+
data, last_flush_pos_, mask, bytes,
|
480
|
+
&storage_ix,
|
481
|
+
&storage[0])) {
|
482
|
+
return false;
|
483
|
+
}
|
484
|
+
} else {
|
485
|
+
int num_direct_distance_codes = 0;
|
486
|
+
int distance_postfix_bits = 0;
|
487
|
+
if (params_.quality > 9 && params_.mode == BrotliParams::MODE_FONT) {
|
488
|
+
num_direct_distance_codes = 12;
|
489
|
+
distance_postfix_bits = 1;
|
490
|
+
RecomputeDistancePrefixes(commands_.get(),
|
491
|
+
num_commands_,
|
492
|
+
num_direct_distance_codes,
|
493
|
+
distance_postfix_bits);
|
494
|
+
}
|
495
|
+
if (params_.quality < kMinQualityForBlockSplit) {
|
496
|
+
if (!StoreMetaBlockTrivial(data, last_flush_pos_, bytes, mask, is_last,
|
497
|
+
commands_.get(), num_commands_,
|
498
|
+
&storage_ix,
|
499
|
+
&storage[0])) {
|
500
|
+
return false;
|
501
|
+
}
|
502
|
+
} else {
|
503
|
+
MetaBlockSplit mb;
|
504
|
+
int literal_context_mode = utf8_mode ? CONTEXT_UTF8 : CONTEXT_SIGNED;
|
505
|
+
if (params_.quality <= 9) {
|
506
|
+
int num_literal_contexts = 1;
|
507
|
+
const int* literal_context_map = NULL;
|
508
|
+
DecideOverLiteralContextModeling(data, last_flush_pos_, bytes, mask,
|
509
|
+
params_.quality,
|
510
|
+
&literal_context_mode,
|
511
|
+
&num_literal_contexts,
|
512
|
+
&literal_context_map);
|
513
|
+
if (literal_context_map == NULL) {
|
514
|
+
BuildMetaBlockGreedy(data, last_flush_pos_, mask,
|
515
|
+
commands_.get(), num_commands_,
|
516
|
+
&mb);
|
517
|
+
} else {
|
518
|
+
BuildMetaBlockGreedyWithContexts(data, last_flush_pos_, mask,
|
519
|
+
prev_byte_, prev_byte2_,
|
520
|
+
literal_context_mode,
|
521
|
+
num_literal_contexts,
|
522
|
+
literal_context_map,
|
523
|
+
commands_.get(), num_commands_,
|
524
|
+
&mb);
|
525
|
+
}
|
526
|
+
} else {
|
527
|
+
BuildMetaBlock(data, last_flush_pos_, mask,
|
528
|
+
prev_byte_, prev_byte2_,
|
529
|
+
commands_.get(), num_commands_,
|
530
|
+
literal_context_mode,
|
531
|
+
&mb);
|
532
|
+
}
|
533
|
+
if (params_.quality >= kMinQualityForOptimizeHistograms) {
|
534
|
+
OptimizeHistograms(num_direct_distance_codes,
|
535
|
+
distance_postfix_bits,
|
536
|
+
&mb);
|
537
|
+
}
|
538
|
+
if (!StoreMetaBlock(data, last_flush_pos_, bytes, mask,
|
539
|
+
prev_byte_, prev_byte2_,
|
540
|
+
is_last,
|
541
|
+
num_direct_distance_codes,
|
542
|
+
distance_postfix_bits,
|
543
|
+
literal_context_mode,
|
544
|
+
commands_.get(), num_commands_,
|
545
|
+
mb,
|
546
|
+
&storage_ix,
|
547
|
+
&storage[0])) {
|
548
|
+
return false;
|
549
|
+
}
|
550
|
+
}
|
551
|
+
if (bytes + 4 < (storage_ix >> 3)) {
|
552
|
+
// Restore the distance cache and last byte.
|
553
|
+
memcpy(dist_cache_, saved_dist_cache_, sizeof(dist_cache_));
|
554
|
+
storage[0] = last_byte_;
|
555
|
+
storage_ix = last_byte_bits_;
|
556
|
+
if (!StoreUncompressedMetaBlock(is_last, data, last_flush_pos_, mask,
|
557
|
+
bytes, &storage_ix, &storage[0])) {
|
558
|
+
return false;
|
559
|
+
}
|
560
|
+
}
|
561
|
+
}
|
562
|
+
last_byte_ = storage[storage_ix >> 3];
|
563
|
+
last_byte_bits_ = storage_ix & 7;
|
564
|
+
last_flush_pos_ = input_pos_;
|
565
|
+
last_processed_pos_ = input_pos_;
|
566
|
+
prev_byte_ = data[(last_flush_pos_ - 1) & mask];
|
567
|
+
prev_byte2_ = data[(last_flush_pos_ - 2) & mask];
|
568
|
+
num_commands_ = 0;
|
569
|
+
num_literals_ = 0;
|
570
|
+
// Save the state of the distance cache in case we need to restore it for
|
571
|
+
// emitting an uncompressed block.
|
572
|
+
memcpy(saved_dist_cache_, dist_cache_, sizeof(dist_cache_));
|
573
|
+
*output = &storage[0];
|
574
|
+
*out_size = storage_ix >> 3;
|
575
|
+
return true;
|
576
|
+
}
|
577
|
+
|
578
|
+
bool BrotliCompressor::WriteMetaBlock(const size_t input_size,
|
579
|
+
const uint8_t* input_buffer,
|
580
|
+
const bool is_last,
|
581
|
+
size_t* encoded_size,
|
582
|
+
uint8_t* encoded_buffer) {
|
583
|
+
CopyInputToRingBuffer(input_size, input_buffer);
|
584
|
+
size_t out_size = 0;
|
585
|
+
uint8_t* output;
|
586
|
+
if (!WriteBrotliData(is_last, /* force_flush = */ true, &out_size, &output) ||
|
587
|
+
out_size > *encoded_size) {
|
588
|
+
return false;
|
589
|
+
}
|
590
|
+
if (out_size > 0) {
|
591
|
+
memcpy(encoded_buffer, output, out_size);
|
592
|
+
}
|
593
|
+
*encoded_size = out_size;
|
594
|
+
return true;
|
595
|
+
}
|
596
|
+
|
597
|
+
bool BrotliCompressor::WriteMetadata(const size_t input_size,
|
598
|
+
const uint8_t* input_buffer,
|
599
|
+
const bool is_last,
|
600
|
+
size_t* encoded_size,
|
601
|
+
uint8_t* encoded_buffer) {
|
602
|
+
if (input_size > (1 << 24) || input_size + 6 > *encoded_size) {
|
603
|
+
return false;
|
604
|
+
}
|
605
|
+
int storage_ix = last_byte_bits_;
|
606
|
+
encoded_buffer[0] = last_byte_;
|
607
|
+
WriteBits(1, 0, &storage_ix, encoded_buffer);
|
608
|
+
WriteBits(2, 3, &storage_ix, encoded_buffer);
|
609
|
+
WriteBits(1, 0, &storage_ix, encoded_buffer);
|
610
|
+
if (input_size == 0) {
|
611
|
+
WriteBits(2, 0, &storage_ix, encoded_buffer);
|
612
|
+
*encoded_size = (storage_ix + 7) >> 3;
|
613
|
+
} else {
|
614
|
+
size_t nbits = Log2Floor(input_size - 1) + 1;
|
615
|
+
size_t nbytes = (nbits + 7) / 8;
|
616
|
+
WriteBits(2, nbytes, &storage_ix, encoded_buffer);
|
617
|
+
WriteBits(8 * nbytes, input_size - 1, &storage_ix, encoded_buffer);
|
618
|
+
size_t hdr_size = (storage_ix + 7) >> 3;
|
619
|
+
memcpy(&encoded_buffer[hdr_size], input_buffer, input_size);
|
620
|
+
*encoded_size = hdr_size + input_size;
|
621
|
+
}
|
622
|
+
if (is_last) {
|
623
|
+
encoded_buffer[(*encoded_size)++] = 3;
|
624
|
+
}
|
625
|
+
last_byte_ = 0;
|
626
|
+
last_byte_bits_ = 0;
|
627
|
+
return true;
|
628
|
+
}
|
629
|
+
|
630
|
+
bool BrotliCompressor::FinishStream(
|
631
|
+
size_t* encoded_size, uint8_t* encoded_buffer) {
|
632
|
+
return WriteMetaBlock(0, NULL, true, encoded_size, encoded_buffer);
|
633
|
+
}
|
634
|
+
|
635
|
+
// Compresses input_size bytes from input_buffer into encoded_buffer.
//
// On entry *encoded_size is the capacity of encoded_buffer; on success it is
// set to the number of bytes written. Returns 1 on success and 0 on failure
// (including a zero-capacity output buffer).
int BrotliCompressBuffer(BrotliParams params,
                         size_t input_size,
                         const uint8_t* input_buffer,
                         size_t* encoded_size,
                         uint8_t* encoded_buffer) {
  if (*encoded_size == 0) {
    // Output buffer needs at least one byte.
    return 0;
  }
  // No compressor is constructed here: BrotliCompress builds its own, so a
  // local one would only allocate buffers that are never used.
  BrotliMemIn in(input_buffer, input_size);
  BrotliMemOut out(encoded_buffer, *encoded_size);
  if (!BrotliCompress(params, &in, &out)) {
    return 0;
  }
  *encoded_size = out.position();
  return 1;
}
|
653
|
+
|
654
|
+
// Reads up to one input block (compressor->input_block_size() bytes) from
// `r` into the compressor's ring buffer and returns the number of bytes
// copied. Returns 0 if the very first read yields no data (NULL).
size_t CopyOneBlockToRingBuffer(BrotliIn* r, BrotliCompressor* compressor) {
  const size_t block_size = compressor->input_block_size();

  size_t total_read = 0;
  const uint8_t* chunk = reinterpret_cast<const uint8_t*>(
      r->Read(block_size, &total_read));
  if (chunk == NULL) {
    return 0;
  }
  compressor->CopyInputToRingBuffer(total_read, chunk);

  // Keep reading until the block is full or the reader signals EOF by
  // returning NULL. This makes the compressed output independent of how
  // r->Read happens to split the input into chunks.
  size_t remaining = block_size - total_read;
  while (remaining > 0) {
    size_t chunk_size = 0;
    chunk = reinterpret_cast<const uint8_t*>(
        r->Read(remaining, &chunk_size));
    if (chunk == NULL) {
      break;
    }
    compressor->CopyInputToRingBuffer(chunk_size, chunk);
    total_read += chunk_size;
    remaining -= chunk_size;
  }
  return total_read;
}
|
680
|
+
|
681
|
+
// Reports whether the input is exhausted by issuing a zero-length read; a
// NULL result is treated as end of input.
bool BrotliInIsFinished(BrotliIn* r) {
  size_t ignored_count;
  const void* const probe = r->Read(0, &ignored_count);
  return probe == NULL;
}
|
685
|
+
|
686
|
+
// Compresses everything readable from `in` and writes the result to `out`,
// without a custom dictionary. Returns the result of
// BrotliCompressWithCustomDictionary.
int BrotliCompress(BrotliParams params, BrotliIn* in, BrotliOut* out) {
  const size_t no_dict_size = 0;
  const uint8_t* const no_dict = nullptr;
  return BrotliCompressWithCustomDictionary(no_dict_size, no_dict,
                                            params, in, out);
}
|
689
|
+
|
690
|
+
int BrotliCompressWithCustomDictionary(size_t dictsize, const uint8_t* dict,
|
691
|
+
BrotliParams params,
|
692
|
+
BrotliIn* in, BrotliOut* out) {
|
693
|
+
size_t in_bytes = 0;
|
694
|
+
size_t out_bytes = 0;
|
695
|
+
uint8_t* output;
|
696
|
+
bool final_block = false;
|
697
|
+
BrotliCompressor compressor(params);
|
698
|
+
if (dictsize != 0) compressor.BrotliSetCustomDictionary(dictsize, dict);
|
699
|
+
while (!final_block) {
|
700
|
+
in_bytes = CopyOneBlockToRingBuffer(in, &compressor);
|
701
|
+
final_block = in_bytes == 0 || BrotliInIsFinished(in);
|
702
|
+
out_bytes = 0;
|
703
|
+
if (!compressor.WriteBrotliData(final_block,
|
704
|
+
/* force_flush = */ false,
|
705
|
+
&out_bytes, &output)) {
|
706
|
+
return false;
|
707
|
+
}
|
708
|
+
if (out_bytes > 0 && !out->Write(output, out_bytes)) {
|
709
|
+
return false;
|
710
|
+
}
|
711
|
+
}
|
712
|
+
return true;
|
713
|
+
}
|
714
|
+
|
715
|
+
} // namespace brotli
|