extbrotli 0.0.1.PROTOTYPE
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +28 -0
- data/README.md +67 -0
- data/Rakefile +158 -0
- data/contrib/brotli/LICENSE +202 -0
- data/contrib/brotli/README.md +18 -0
- data/contrib/brotli/dec/bit_reader.c +55 -0
- data/contrib/brotli/dec/bit_reader.h +256 -0
- data/contrib/brotli/dec/context.h +260 -0
- data/contrib/brotli/dec/decode.c +1573 -0
- data/contrib/brotli/dec/decode.h +160 -0
- data/contrib/brotli/dec/dictionary.h +9494 -0
- data/contrib/brotli/dec/huffman.c +325 -0
- data/contrib/brotli/dec/huffman.h +77 -0
- data/contrib/brotli/dec/port.h +148 -0
- data/contrib/brotli/dec/prefix.h +756 -0
- data/contrib/brotli/dec/state.c +149 -0
- data/contrib/brotli/dec/state.h +185 -0
- data/contrib/brotli/dec/streams.c +99 -0
- data/contrib/brotli/dec/streams.h +100 -0
- data/contrib/brotli/dec/transform.h +315 -0
- data/contrib/brotli/dec/types.h +36 -0
- data/contrib/brotli/enc/backward_references.cc +769 -0
- data/contrib/brotli/enc/backward_references.h +50 -0
- data/contrib/brotli/enc/bit_cost.h +147 -0
- data/contrib/brotli/enc/block_splitter.cc +418 -0
- data/contrib/brotli/enc/block_splitter.h +78 -0
- data/contrib/brotli/enc/brotli_bit_stream.cc +884 -0
- data/contrib/brotli/enc/brotli_bit_stream.h +149 -0
- data/contrib/brotli/enc/cluster.h +290 -0
- data/contrib/brotli/enc/command.h +140 -0
- data/contrib/brotli/enc/context.h +185 -0
- data/contrib/brotli/enc/dictionary.h +9485 -0
- data/contrib/brotli/enc/dictionary_hash.h +4125 -0
- data/contrib/brotli/enc/encode.cc +715 -0
- data/contrib/brotli/enc/encode.h +196 -0
- data/contrib/brotli/enc/encode_parallel.cc +354 -0
- data/contrib/brotli/enc/encode_parallel.h +37 -0
- data/contrib/brotli/enc/entropy_encode.cc +492 -0
- data/contrib/brotli/enc/entropy_encode.h +88 -0
- data/contrib/brotli/enc/fast_log.h +179 -0
- data/contrib/brotli/enc/find_match_length.h +87 -0
- data/contrib/brotli/enc/hash.h +686 -0
- data/contrib/brotli/enc/histogram.cc +76 -0
- data/contrib/brotli/enc/histogram.h +100 -0
- data/contrib/brotli/enc/literal_cost.cc +172 -0
- data/contrib/brotli/enc/literal_cost.h +38 -0
- data/contrib/brotli/enc/metablock.cc +544 -0
- data/contrib/brotli/enc/metablock.h +88 -0
- data/contrib/brotli/enc/port.h +151 -0
- data/contrib/brotli/enc/prefix.h +85 -0
- data/contrib/brotli/enc/ringbuffer.h +108 -0
- data/contrib/brotli/enc/static_dict.cc +441 -0
- data/contrib/brotli/enc/static_dict.h +40 -0
- data/contrib/brotli/enc/static_dict_lut.h +12063 -0
- data/contrib/brotli/enc/streams.cc +127 -0
- data/contrib/brotli/enc/streams.h +129 -0
- data/contrib/brotli/enc/transform.h +250 -0
- data/contrib/brotli/enc/write_bits.h +91 -0
- data/ext/extbrotli.cc +24 -0
- data/ext/extbrotli.h +73 -0
- data/ext/extconf.rb +35 -0
- data/ext/lldecoder.c +220 -0
- data/ext/llencoder.cc +433 -0
- data/gemstub.rb +21 -0
- data/lib/extbrotli.rb +243 -0
- data/lib/extbrotli/version.rb +3 -0
- metadata +140 -0
@@ -0,0 +1,50 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Function to find backward reference copies.
|
16
|
+
|
17
|
+
#ifndef BROTLI_ENC_BACKWARD_REFERENCES_H_
|
18
|
+
#define BROTLI_ENC_BACKWARD_REFERENCES_H_
|
19
|
+
|
20
|
+
#include <stdint.h>
|
21
|
+
#include <vector>
|
22
|
+
|
23
|
+
#include "./hash.h"
|
24
|
+
#include "./command.h"
|
25
|
+
|
26
|
+
namespace brotli {
|
27
|
+
|
28
|
+
// "commands" points to the next output command to write to, "*num_commands" is
|
29
|
+
// initially the total amount of commands output by previous
|
30
|
+
// CreateBackwardReferences calls, and must be incremented by the amount written
|
31
|
+
// by this call.
|
32
|
+
void CreateBackwardReferences(size_t num_bytes,
|
33
|
+
size_t position,
|
34
|
+
const uint8_t* ringbuffer,
|
35
|
+
size_t ringbuffer_mask,
|
36
|
+
const float* literal_cost,
|
37
|
+
size_t literal_cost_mask,
|
38
|
+
const size_t max_backward_limit,
|
39
|
+
const int quality,
|
40
|
+
Hashers* hashers,
|
41
|
+
int hash_type,
|
42
|
+
int* dist_cache,
|
43
|
+
int* last_insert_len,
|
44
|
+
Command* commands,
|
45
|
+
int* num_commands,
|
46
|
+
int* num_literals);
|
47
|
+
|
48
|
+
} // namespace brotli
|
49
|
+
|
50
|
+
#endif // BROTLI_ENC_BACKWARD_REFERENCES_H_
|
@@ -0,0 +1,147 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Functions to estimate the bit cost of Huffman trees.
|
16
|
+
|
17
|
+
#ifndef BROTLI_ENC_BIT_COST_H_
|
18
|
+
#define BROTLI_ENC_BIT_COST_H_
|
19
|
+
|
20
|
+
|
21
|
+
#include <stdint.h>
|
22
|
+
|
23
|
+
#include "./entropy_encode.h"
|
24
|
+
#include "./fast_log.h"
|
25
|
+
|
26
|
+
namespace brotli {
|
27
|
+
|
28
|
+
static inline double ShannonEntropy(const int *population, int size,
|
29
|
+
int *total) {
|
30
|
+
int sum = 0;
|
31
|
+
double retval = 0;
|
32
|
+
const int *population_end = population + size;
|
33
|
+
int p;
|
34
|
+
if (size & 1) {
|
35
|
+
goto odd_number_of_elements_left;
|
36
|
+
}
|
37
|
+
while (population < population_end) {
|
38
|
+
p = *population++;
|
39
|
+
sum += p;
|
40
|
+
retval -= p * FastLog2(p);
|
41
|
+
odd_number_of_elements_left:
|
42
|
+
p = *population++;
|
43
|
+
sum += p;
|
44
|
+
retval -= p * FastLog2(p);
|
45
|
+
}
|
46
|
+
if (sum) retval += sum * FastLog2(sum);
|
47
|
+
*total = sum;
|
48
|
+
return retval;
|
49
|
+
}
|
50
|
+
|
51
|
+
static inline double BitsEntropy(const int *population, int size) {
|
52
|
+
int sum;
|
53
|
+
double retval = ShannonEntropy(population, size, &sum);
|
54
|
+
if (retval < sum) {
|
55
|
+
// At least one bit per literal is needed.
|
56
|
+
retval = sum;
|
57
|
+
}
|
58
|
+
return retval;
|
59
|
+
}
|
60
|
+
|
61
|
+
|
62
|
+
template<int kSize>
|
63
|
+
double PopulationCost(const Histogram<kSize>& histogram) {
|
64
|
+
if (histogram.total_count_ == 0) {
|
65
|
+
return 12;
|
66
|
+
}
|
67
|
+
int count = 0;
|
68
|
+
for (int i = 0; i < kSize; ++i) {
|
69
|
+
if (histogram.data_[i] > 0) {
|
70
|
+
++count;
|
71
|
+
}
|
72
|
+
}
|
73
|
+
if (count == 1) {
|
74
|
+
return 12;
|
75
|
+
}
|
76
|
+
if (count == 2) {
|
77
|
+
return 20 + histogram.total_count_;
|
78
|
+
}
|
79
|
+
double bits = 0;
|
80
|
+
uint8_t depth[kSize] = { 0 };
|
81
|
+
if (count <= 4) {
|
82
|
+
// For very low symbol count we build the Huffman tree.
|
83
|
+
CreateHuffmanTree(&histogram.data_[0], kSize, 15, depth);
|
84
|
+
for (int i = 0; i < kSize; ++i) {
|
85
|
+
bits += histogram.data_[i] * depth[i];
|
86
|
+
}
|
87
|
+
return count == 3 ? bits + 28 : bits + 37;
|
88
|
+
}
|
89
|
+
|
90
|
+
// In this loop we compute the entropy of the histogram and simultaneously
|
91
|
+
// build a simplified histogram of the code length codes where we use the
|
92
|
+
// zero repeat code 17, but we don't use the non-zero repeat code 16.
|
93
|
+
int max_depth = 1;
|
94
|
+
int depth_histo[kCodeLengthCodes] = { 0 };
|
95
|
+
const double log2total = FastLog2(histogram.total_count_);
|
96
|
+
for (int i = 0; i < kSize;) {
|
97
|
+
if (histogram.data_[i] > 0) {
|
98
|
+
// Compute -log2(P(symbol)) = -log2(count(symbol)/total_count) =
|
99
|
+
// = log2(total_count) - log2(count(symbol))
|
100
|
+
double log2p = log2total - FastLog2(histogram.data_[i]);
|
101
|
+
// Approximate the bit depth by round(-log2(P(symbol)))
|
102
|
+
int depth = static_cast<int>(log2p + 0.5);
|
103
|
+
bits += histogram.data_[i] * log2p;
|
104
|
+
if (depth > 15) {
|
105
|
+
depth = 15;
|
106
|
+
}
|
107
|
+
if (depth > max_depth) {
|
108
|
+
max_depth = depth;
|
109
|
+
}
|
110
|
+
++depth_histo[depth];
|
111
|
+
++i;
|
112
|
+
} else {
|
113
|
+
// Compute the run length of zeros and add the appropiate number of 0 and
|
114
|
+
// 17 code length codes to the code length code histogram.
|
115
|
+
int reps = 1;
|
116
|
+
for (int k = i + 1; k < kSize && histogram.data_[k] == 0; ++k) {
|
117
|
+
++reps;
|
118
|
+
}
|
119
|
+
i += reps;
|
120
|
+
if (i == kSize) {
|
121
|
+
// Don't add any cost for the last zero run, since these are encoded
|
122
|
+
// only implicitly.
|
123
|
+
break;
|
124
|
+
}
|
125
|
+
if (reps < 3) {
|
126
|
+
depth_histo[0] += reps;
|
127
|
+
} else {
|
128
|
+
reps -= 2;
|
129
|
+
while (reps > 0) {
|
130
|
+
++depth_histo[17];
|
131
|
+
// Add the 3 extra bits for the 17 code length code.
|
132
|
+
bits += 3;
|
133
|
+
reps >>= 3;
|
134
|
+
}
|
135
|
+
}
|
136
|
+
}
|
137
|
+
}
|
138
|
+
// Add the estimated encoding cost of the code length code histogram.
|
139
|
+
bits += 18 + 2 * max_depth;
|
140
|
+
// Add the entropy of the code length code histogram.
|
141
|
+
bits += BitsEntropy(depth_histo, kCodeLengthCodes);
|
142
|
+
return bits;
|
143
|
+
}
|
144
|
+
|
145
|
+
} // namespace brotli
|
146
|
+
|
147
|
+
#endif // BROTLI_ENC_BIT_COST_H_
|
@@ -0,0 +1,418 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Block split point selection utilities.
|
16
|
+
|
17
|
+
#include "./block_splitter.h"
|
18
|
+
|
19
|
+
#include <math.h>
|
20
|
+
#include <stdio.h>
|
21
|
+
#include <stdlib.h>
|
22
|
+
#include <string.h>
|
23
|
+
|
24
|
+
#include <algorithm>
|
25
|
+
#include <map>
|
26
|
+
|
27
|
+
#include "./cluster.h"
|
28
|
+
#include "./command.h"
|
29
|
+
#include "./fast_log.h"
|
30
|
+
#include "./histogram.h"
|
31
|
+
|
32
|
+
namespace brotli {
|
33
|
+
|
34
|
+
// Tuning constants for block splitting, grouped by the stream they apply to.

// Literal stream (see SplitBlock): histogram cap, symbols per initial
// histogram, sampling stride and block switch cost.
static const int kMaxLiteralHistograms = 100;
static const int kSymbolsPerLiteralHistogram = 544;
static const int kLiteralStrideLength = 70;
static const double kLiteralBlockSwitchCost = 28.1;

// Insert-and-copy command stream. SplitBlock also passes the command histogram
// cap and the command stride for the distance stream.
static const int kMaxCommandHistograms = 50;
static const int kSymbolsPerCommandHistogram = 530;
static const int kCommandStrideLength = 40;
static const double kCommandBlockSwitchCost = 13.5;

// Distance stream.
static const int kSymbolsPerDistanceHistogram = 544;
static const double kDistanceBlockSwitchCost = 14.6;

// Inputs shorter than this are kept as a single block (see SplitByteVector).
static const int kMinLengthForBlockSplitting = 128;

// RefineEntropyCodes draws about
//   kIterMulForRefining * length / stride + kMinItersForRefining
// random samples.
static const int kIterMulForRefining = 2;
static const int kMinItersForRefining = 100;
|
47
|
+
|
48
|
+
void CopyLiteralsToByteArray(const Command* cmds,
|
49
|
+
const size_t num_commands,
|
50
|
+
const uint8_t* data,
|
51
|
+
const size_t offset,
|
52
|
+
const size_t mask,
|
53
|
+
std::vector<uint8_t>* literals) {
|
54
|
+
// Count how many we have.
|
55
|
+
size_t total_length = 0;
|
56
|
+
for (int i = 0; i < num_commands; ++i) {
|
57
|
+
total_length += cmds[i].insert_len_;
|
58
|
+
}
|
59
|
+
if (total_length == 0) {
|
60
|
+
return;
|
61
|
+
}
|
62
|
+
|
63
|
+
// Allocate.
|
64
|
+
literals->resize(total_length);
|
65
|
+
|
66
|
+
// Loop again, and copy this time.
|
67
|
+
size_t pos = 0;
|
68
|
+
size_t from_pos = offset & mask;
|
69
|
+
for (int i = 0; i < num_commands && pos < total_length; ++i) {
|
70
|
+
size_t insert_len = cmds[i].insert_len_;
|
71
|
+
if (from_pos + insert_len > mask) {
|
72
|
+
size_t head_size = mask + 1 - from_pos;
|
73
|
+
memcpy(&(*literals)[pos], data + from_pos, head_size);
|
74
|
+
from_pos = 0;
|
75
|
+
pos += head_size;
|
76
|
+
insert_len -= head_size;
|
77
|
+
}
|
78
|
+
if (insert_len > 0) {
|
79
|
+
memcpy(&(*literals)[pos], data + from_pos, insert_len);
|
80
|
+
pos += insert_len;
|
81
|
+
}
|
82
|
+
from_pos = (from_pos + insert_len + cmds[i].copy_len_) & mask;
|
83
|
+
}
|
84
|
+
}
|
85
|
+
|
86
|
+
void CopyCommandsToByteArray(const Command* cmds,
|
87
|
+
const size_t num_commands,
|
88
|
+
std::vector<uint16_t>* insert_and_copy_codes,
|
89
|
+
std::vector<uint16_t>* distance_prefixes) {
|
90
|
+
for (int i = 0; i < num_commands; ++i) {
|
91
|
+
const Command& cmd = cmds[i];
|
92
|
+
insert_and_copy_codes->push_back(cmd.cmd_prefix_);
|
93
|
+
if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
|
94
|
+
distance_prefixes->push_back(cmd.dist_prefix_);
|
95
|
+
}
|
96
|
+
}
|
97
|
+
}
|
98
|
+
|
99
|
+
// Advances "*seed" by multiplying it with 16807 (with unsigned wraparound) and
// returns the new value. A result of zero is replaced by 1 so that the state
// can never get stuck at zero.
inline static unsigned int MyRand(unsigned int* seed) {
  const unsigned int next = *seed * 16807U;
  *seed = (next != 0) ? next : 1;
  return *seed;
}
|
106
|
+
|
107
|
+
template<typename HistogramType, typename DataType>
|
108
|
+
void InitialEntropyCodes(const DataType* data, size_t length,
|
109
|
+
int literals_per_histogram,
|
110
|
+
int max_histograms,
|
111
|
+
size_t stride,
|
112
|
+
std::vector<HistogramType>* vec) {
|
113
|
+
int total_histograms = length / literals_per_histogram + 1;
|
114
|
+
if (total_histograms > max_histograms) {
|
115
|
+
total_histograms = max_histograms;
|
116
|
+
}
|
117
|
+
unsigned int seed = 7;
|
118
|
+
int block_length = length / total_histograms;
|
119
|
+
for (int i = 0; i < total_histograms; ++i) {
|
120
|
+
int pos = length * i / total_histograms;
|
121
|
+
if (i != 0) {
|
122
|
+
pos += MyRand(&seed) % block_length;
|
123
|
+
}
|
124
|
+
if (pos + stride >= length) {
|
125
|
+
pos = length - stride - 1;
|
126
|
+
}
|
127
|
+
HistogramType histo;
|
128
|
+
histo.Add(data + pos, stride);
|
129
|
+
vec->push_back(histo);
|
130
|
+
}
|
131
|
+
}
|
132
|
+
|
133
|
+
template<typename HistogramType, typename DataType>
|
134
|
+
void RandomSample(unsigned int* seed,
|
135
|
+
const DataType* data,
|
136
|
+
size_t length,
|
137
|
+
size_t stride,
|
138
|
+
HistogramType* sample) {
|
139
|
+
size_t pos = 0;
|
140
|
+
if (stride >= length) {
|
141
|
+
pos = 0;
|
142
|
+
stride = length;
|
143
|
+
} else {
|
144
|
+
pos = MyRand(seed) % (length - stride + 1);
|
145
|
+
}
|
146
|
+
sample->Add(data + pos, stride);
|
147
|
+
}
|
148
|
+
|
149
|
+
template<typename HistogramType, typename DataType>
|
150
|
+
void RefineEntropyCodes(const DataType* data, size_t length,
|
151
|
+
size_t stride,
|
152
|
+
std::vector<HistogramType>* vec) {
|
153
|
+
int iters =
|
154
|
+
kIterMulForRefining * length / stride + kMinItersForRefining;
|
155
|
+
unsigned int seed = 7;
|
156
|
+
iters = ((iters + vec->size() - 1) / vec->size()) * vec->size();
|
157
|
+
for (int iter = 0; iter < iters; ++iter) {
|
158
|
+
HistogramType sample;
|
159
|
+
RandomSample(&seed, data, length, stride, &sample);
|
160
|
+
int ix = iter % vec->size();
|
161
|
+
(*vec)[ix].AddHistogram(sample);
|
162
|
+
}
|
163
|
+
}
|
164
|
+
|
165
|
+
inline static float BitCost(int count) {
|
166
|
+
return count == 0 ? -2 : FastLog2(count);
|
167
|
+
}
|
168
|
+
|
169
|
+
template<typename DataType, int kSize>
|
170
|
+
void FindBlocks(const DataType* data, const size_t length,
|
171
|
+
const double block_switch_bitcost,
|
172
|
+
const std::vector<Histogram<kSize> > &vec,
|
173
|
+
uint8_t *block_id) {
|
174
|
+
if (vec.size() <= 1) {
|
175
|
+
for (int i = 0; i < length; ++i) {
|
176
|
+
block_id[i] = 0;
|
177
|
+
}
|
178
|
+
return;
|
179
|
+
}
|
180
|
+
int vecsize = vec.size();
|
181
|
+
double* insert_cost = new double[kSize * vecsize];
|
182
|
+
memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * vecsize);
|
183
|
+
for (int j = 0; j < vecsize; ++j) {
|
184
|
+
insert_cost[j] = FastLog2(vec[j].total_count_);
|
185
|
+
}
|
186
|
+
for (int i = kSize - 1; i >= 0; --i) {
|
187
|
+
for (int j = 0; j < vecsize; ++j) {
|
188
|
+
insert_cost[i * vecsize + j] = insert_cost[j] - BitCost(vec[j].data_[i]);
|
189
|
+
}
|
190
|
+
}
|
191
|
+
double *cost = new double[vecsize];
|
192
|
+
memset(cost, 0, sizeof(cost[0]) * vecsize);
|
193
|
+
bool* switch_signal = new bool[length * vecsize];
|
194
|
+
memset(switch_signal, 0, sizeof(switch_signal[0]) * length * vecsize);
|
195
|
+
// After each iteration of this loop, cost[k] will contain the difference
|
196
|
+
// between the minimum cost of arriving at the current byte position using
|
197
|
+
// entropy code k, and the minimum cost of arriving at the current byte
|
198
|
+
// position. This difference is capped at the block switch cost, and if it
|
199
|
+
// reaches block switch cost, it means that when we trace back from the last
|
200
|
+
// position, we need to switch here.
|
201
|
+
for (size_t byte_ix = 0; byte_ix < length; ++byte_ix) {
|
202
|
+
int ix = byte_ix * vecsize;
|
203
|
+
int insert_cost_ix = data[byte_ix] * vecsize;
|
204
|
+
double min_cost = 1e99;
|
205
|
+
for (int k = 0; k < vecsize; ++k) {
|
206
|
+
// We are coding the symbol in data[byte_ix] with entropy code k.
|
207
|
+
cost[k] += insert_cost[insert_cost_ix + k];
|
208
|
+
if (cost[k] < min_cost) {
|
209
|
+
min_cost = cost[k];
|
210
|
+
block_id[byte_ix] = k;
|
211
|
+
}
|
212
|
+
}
|
213
|
+
double block_switch_cost = block_switch_bitcost;
|
214
|
+
// More blocks for the beginning.
|
215
|
+
if (byte_ix < 2000) {
|
216
|
+
block_switch_cost *= 0.77 + 0.07 * byte_ix / 2000;
|
217
|
+
}
|
218
|
+
for (int k = 0; k < vecsize; ++k) {
|
219
|
+
cost[k] -= min_cost;
|
220
|
+
if (cost[k] >= block_switch_cost) {
|
221
|
+
cost[k] = block_switch_cost;
|
222
|
+
switch_signal[ix + k] = true;
|
223
|
+
}
|
224
|
+
}
|
225
|
+
}
|
226
|
+
// Now trace back from the last position and switch at the marked places.
|
227
|
+
int byte_ix = length - 1;
|
228
|
+
int ix = byte_ix * vecsize;
|
229
|
+
int cur_id = block_id[byte_ix];
|
230
|
+
while (byte_ix > 0) {
|
231
|
+
--byte_ix;
|
232
|
+
ix -= vecsize;
|
233
|
+
if (switch_signal[ix + cur_id]) {
|
234
|
+
cur_id = block_id[byte_ix];
|
235
|
+
}
|
236
|
+
block_id[byte_ix] = cur_id;
|
237
|
+
}
|
238
|
+
delete[] insert_cost;
|
239
|
+
delete[] cost;
|
240
|
+
delete[] switch_signal;
|
241
|
+
}
|
242
|
+
|
243
|
+
// Renumbers block_ids[0..length) in place so that the ids in use become
// 0, 1, 2, ... in order of first appearance. Returns the number of distinct
// ids, which is 0 for an empty input.
int RemapBlockIds(uint8_t* block_ids, const size_t length) {
  static const int kUnassigned = -1;
  // Ids are single bytes, so a 256-entry table covers every possible value.
  int new_id[256];
  for (int v = 0; v < 256; ++v) {
    new_id[v] = kUnassigned;
  }
  int next_id = 0;
  // An id's new value is fixed at its first appearance, so a single pass can
  // both assign and rewrite.
  for (size_t i = 0; i < length; ++i) {
    int& mapped = new_id[block_ids[i]];
    if (mapped == kUnassigned) {
      mapped = next_id++;
    }
    block_ids[i] = static_cast<uint8_t>(mapped);
  }
  return next_id;
}
|
257
|
+
|
258
|
+
template<typename HistogramType, typename DataType>
|
259
|
+
void BuildBlockHistograms(const DataType* data, const size_t length,
|
260
|
+
uint8_t* block_ids,
|
261
|
+
std::vector<HistogramType>* histograms) {
|
262
|
+
int num_types = RemapBlockIds(block_ids, length);
|
263
|
+
histograms->clear();
|
264
|
+
histograms->resize(num_types);
|
265
|
+
for (int i = 0; i < length; ++i) {
|
266
|
+
(*histograms)[block_ids[i]].Add(data[i]);
|
267
|
+
}
|
268
|
+
}
|
269
|
+
|
270
|
+
// Builds one histogram per run of equal consecutive block ids, clusters those
// histograms with ClusterHistograms, and overwrites every block id with the
// cluster symbol of the run it belongs to.
template<typename HistogramType, typename DataType>
void ClusterBlocks(const DataType* data, const size_t length,
                   uint8_t* block_ids) {
  std::vector<HistogramType> histograms;
  // block_index[i] is the index of the run (and of its histogram) containing
  // position i.
  std::vector<int> block_index(length);
  int cur_idx = 0;
  HistogramType cur_histogram;
  for (size_t i = 0; i < length; ++i) {
    block_index[i] = cur_idx;
    cur_histogram.Add(data[i]);
    const bool block_boundary =
        (i + 1 == length || block_ids[i] != block_ids[i + 1]);
    if (block_boundary) {
      histograms.push_back(cur_histogram);
      cur_histogram.Clear();
      ++cur_idx;
    }
  }
  std::vector<HistogramType> clustered_histograms;
  std::vector<int> histogram_symbols;
  // Block ids need to fit in one byte.
  static const int kMaxNumberOfBlockTypes = 256;
  ClusterHistograms(histograms, 1, histograms.size(),
                    kMaxNumberOfBlockTypes,
                    &clustered_histograms,
                    &histogram_symbols);
  for (size_t i = 0; i < length; ++i) {
    block_ids[i] = histogram_symbols[block_index[i]];
  }
}
|
299
|
+
|
300
|
+
// Run-length encodes "block_ids" into "split": for every run of equal ids one
// entry is appended to split->types (the id) and split->lengths (the run
// length). split->num_types is set to the largest id seen plus one.
//
// For an empty "block_ids" nothing is appended and num_types is set to 1,
// which is what SplitByteVector reports for empty data.
void BuildBlockSplit(const std::vector<uint8_t>& block_ids, BlockSplit* split) {
  if (block_ids.empty()) {
    split->num_types = 1;
    return;
  }
  int cur_id = block_ids[0];
  int cur_length = 1;
  int max_id = -1;
  for (size_t i = 1; i < block_ids.size(); ++i) {
    if (block_ids[i] != cur_id) {
      // The current run ends just before position i.
      split->types.push_back(cur_id);
      split->lengths.push_back(cur_length);
      max_id = std::max(max_id, cur_id);
      cur_id = block_ids[i];
      cur_length = 0;
    }
    ++cur_length;
  }
  // Flush the final run.
  split->types.push_back(cur_id);
  split->lengths.push_back(cur_length);
  max_id = std::max(max_id, cur_id);
  split->num_types = max_id + 1;
}
|
319
|
+
|
320
|
+
template<typename HistogramType, typename DataType>
|
321
|
+
void SplitByteVector(const std::vector<DataType>& data,
|
322
|
+
const int literals_per_histogram,
|
323
|
+
const int max_histograms,
|
324
|
+
const int sampling_stride_length,
|
325
|
+
const double block_switch_cost,
|
326
|
+
BlockSplit* split) {
|
327
|
+
if (data.empty()) {
|
328
|
+
split->num_types = 1;
|
329
|
+
return;
|
330
|
+
} else if (data.size() < kMinLengthForBlockSplitting) {
|
331
|
+
split->num_types = 1;
|
332
|
+
split->types.push_back(0);
|
333
|
+
split->lengths.push_back(data.size());
|
334
|
+
return;
|
335
|
+
}
|
336
|
+
std::vector<HistogramType> histograms;
|
337
|
+
// Find good entropy codes.
|
338
|
+
InitialEntropyCodes(data.data(), data.size(),
|
339
|
+
literals_per_histogram,
|
340
|
+
max_histograms,
|
341
|
+
sampling_stride_length,
|
342
|
+
&histograms);
|
343
|
+
RefineEntropyCodes(data.data(), data.size(),
|
344
|
+
sampling_stride_length,
|
345
|
+
&histograms);
|
346
|
+
// Find a good path through literals with the good entropy codes.
|
347
|
+
std::vector<uint8_t> block_ids(data.size());
|
348
|
+
for (int i = 0; i < 10; ++i) {
|
349
|
+
FindBlocks(data.data(), data.size(),
|
350
|
+
block_switch_cost,
|
351
|
+
histograms,
|
352
|
+
&block_ids[0]);
|
353
|
+
BuildBlockHistograms(data.data(), data.size(), &block_ids[0], &histograms);
|
354
|
+
}
|
355
|
+
ClusterBlocks<HistogramType>(data.data(), data.size(), &block_ids[0]);
|
356
|
+
BuildBlockSplit(block_ids, split);
|
357
|
+
}
|
358
|
+
|
359
|
+
void SplitBlock(const Command* cmds,
|
360
|
+
const size_t num_commands,
|
361
|
+
const uint8_t* data,
|
362
|
+
const size_t pos,
|
363
|
+
const size_t mask,
|
364
|
+
BlockSplit* literal_split,
|
365
|
+
BlockSplit* insert_and_copy_split,
|
366
|
+
BlockSplit* dist_split) {
|
367
|
+
// Create a continuous array of literals.
|
368
|
+
std::vector<uint8_t> literals;
|
369
|
+
CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, &literals);
|
370
|
+
|
371
|
+
// Compute prefix codes for commands.
|
372
|
+
std::vector<uint16_t> insert_and_copy_codes;
|
373
|
+
std::vector<uint16_t> distance_prefixes;
|
374
|
+
CopyCommandsToByteArray(cmds, num_commands,
|
375
|
+
&insert_and_copy_codes,
|
376
|
+
&distance_prefixes);
|
377
|
+
|
378
|
+
SplitByteVector<HistogramLiteral>(
|
379
|
+
literals,
|
380
|
+
kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
|
381
|
+
kLiteralStrideLength, kLiteralBlockSwitchCost,
|
382
|
+
literal_split);
|
383
|
+
SplitByteVector<HistogramCommand>(
|
384
|
+
insert_and_copy_codes,
|
385
|
+
kSymbolsPerCommandHistogram, kMaxCommandHistograms,
|
386
|
+
kCommandStrideLength, kCommandBlockSwitchCost,
|
387
|
+
insert_and_copy_split);
|
388
|
+
SplitByteVector<HistogramDistance>(
|
389
|
+
distance_prefixes,
|
390
|
+
kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
|
391
|
+
kCommandStrideLength, kDistanceBlockSwitchCost,
|
392
|
+
dist_split);
|
393
|
+
}
|
394
|
+
|
395
|
+
void SplitBlockByTotalLength(const Command* all_commands,
|
396
|
+
const size_t num_commands,
|
397
|
+
int input_size,
|
398
|
+
int target_length,
|
399
|
+
std::vector<std::vector<Command> >* blocks) {
|
400
|
+
int num_blocks = input_size / target_length + 1;
|
401
|
+
int length_limit = input_size / num_blocks + 1;
|
402
|
+
int total_length = 0;
|
403
|
+
std::vector<Command> cur_block;
|
404
|
+
for (int i = 0; i < num_commands; ++i) {
|
405
|
+
const Command& cmd = all_commands[i];
|
406
|
+
int cmd_length = cmd.insert_len_ + cmd.copy_len_;
|
407
|
+
if (total_length > length_limit) {
|
408
|
+
blocks->push_back(cur_block);
|
409
|
+
cur_block.clear();
|
410
|
+
total_length = 0;
|
411
|
+
}
|
412
|
+
cur_block.push_back(cmd);
|
413
|
+
total_length += cmd_length;
|
414
|
+
}
|
415
|
+
blocks->push_back(cur_block);
|
416
|
+
}
|
417
|
+
|
418
|
+
} // namespace brotli
|