extbrotli 0.0.1.PROTOTYPE
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +28 -0
- data/README.md +67 -0
- data/Rakefile +158 -0
- data/contrib/brotli/LICENSE +202 -0
- data/contrib/brotli/README.md +18 -0
- data/contrib/brotli/dec/bit_reader.c +55 -0
- data/contrib/brotli/dec/bit_reader.h +256 -0
- data/contrib/brotli/dec/context.h +260 -0
- data/contrib/brotli/dec/decode.c +1573 -0
- data/contrib/brotli/dec/decode.h +160 -0
- data/contrib/brotli/dec/dictionary.h +9494 -0
- data/contrib/brotli/dec/huffman.c +325 -0
- data/contrib/brotli/dec/huffman.h +77 -0
- data/contrib/brotli/dec/port.h +148 -0
- data/contrib/brotli/dec/prefix.h +756 -0
- data/contrib/brotli/dec/state.c +149 -0
- data/contrib/brotli/dec/state.h +185 -0
- data/contrib/brotli/dec/streams.c +99 -0
- data/contrib/brotli/dec/streams.h +100 -0
- data/contrib/brotli/dec/transform.h +315 -0
- data/contrib/brotli/dec/types.h +36 -0
- data/contrib/brotli/enc/backward_references.cc +769 -0
- data/contrib/brotli/enc/backward_references.h +50 -0
- data/contrib/brotli/enc/bit_cost.h +147 -0
- data/contrib/brotli/enc/block_splitter.cc +418 -0
- data/contrib/brotli/enc/block_splitter.h +78 -0
- data/contrib/brotli/enc/brotli_bit_stream.cc +884 -0
- data/contrib/brotli/enc/brotli_bit_stream.h +149 -0
- data/contrib/brotli/enc/cluster.h +290 -0
- data/contrib/brotli/enc/command.h +140 -0
- data/contrib/brotli/enc/context.h +185 -0
- data/contrib/brotli/enc/dictionary.h +9485 -0
- data/contrib/brotli/enc/dictionary_hash.h +4125 -0
- data/contrib/brotli/enc/encode.cc +715 -0
- data/contrib/brotli/enc/encode.h +196 -0
- data/contrib/brotli/enc/encode_parallel.cc +354 -0
- data/contrib/brotli/enc/encode_parallel.h +37 -0
- data/contrib/brotli/enc/entropy_encode.cc +492 -0
- data/contrib/brotli/enc/entropy_encode.h +88 -0
- data/contrib/brotli/enc/fast_log.h +179 -0
- data/contrib/brotli/enc/find_match_length.h +87 -0
- data/contrib/brotli/enc/hash.h +686 -0
- data/contrib/brotli/enc/histogram.cc +76 -0
- data/contrib/brotli/enc/histogram.h +100 -0
- data/contrib/brotli/enc/literal_cost.cc +172 -0
- data/contrib/brotli/enc/literal_cost.h +38 -0
- data/contrib/brotli/enc/metablock.cc +544 -0
- data/contrib/brotli/enc/metablock.h +88 -0
- data/contrib/brotli/enc/port.h +151 -0
- data/contrib/brotli/enc/prefix.h +85 -0
- data/contrib/brotli/enc/ringbuffer.h +108 -0
- data/contrib/brotli/enc/static_dict.cc +441 -0
- data/contrib/brotli/enc/static_dict.h +40 -0
- data/contrib/brotli/enc/static_dict_lut.h +12063 -0
- data/contrib/brotli/enc/streams.cc +127 -0
- data/contrib/brotli/enc/streams.h +129 -0
- data/contrib/brotli/enc/transform.h +250 -0
- data/contrib/brotli/enc/write_bits.h +91 -0
- data/ext/extbrotli.cc +24 -0
- data/ext/extbrotli.h +73 -0
- data/ext/extconf.rb +35 -0
- data/ext/lldecoder.c +220 -0
- data/ext/llencoder.cc +433 -0
- data/gemstub.rb +21 -0
- data/lib/extbrotli.rb +243 -0
- data/lib/extbrotli/version.rb +3 -0
- metadata +140 -0
@@ -0,0 +1,50 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Function to find backward reference copies.
|
16
|
+
|
17
|
+
#ifndef BROTLI_ENC_BACKWARD_REFERENCES_H_
|
18
|
+
#define BROTLI_ENC_BACKWARD_REFERENCES_H_
|
19
|
+
|
20
|
+
#include <stdint.h>
|
21
|
+
#include <vector>
|
22
|
+
|
23
|
+
#include "./hash.h"
|
24
|
+
#include "./command.h"
|
25
|
+
|
26
|
+
namespace brotli {
|
27
|
+
|
28
|
+
// "commands" points to the next output command to write to, "*num_commands" is
|
29
|
+
// initially the total amount of commands output by previous
|
30
|
+
// CreateBackwardReferences calls, and must be incremented by the amount written
|
31
|
+
// by this call.
|
32
|
+
void CreateBackwardReferences(size_t num_bytes,
|
33
|
+
size_t position,
|
34
|
+
const uint8_t* ringbuffer,
|
35
|
+
size_t ringbuffer_mask,
|
36
|
+
const float* literal_cost,
|
37
|
+
size_t literal_cost_mask,
|
38
|
+
const size_t max_backward_limit,
|
39
|
+
const int quality,
|
40
|
+
Hashers* hashers,
|
41
|
+
int hash_type,
|
42
|
+
int* dist_cache,
|
43
|
+
int* last_insert_len,
|
44
|
+
Command* commands,
|
45
|
+
int* num_commands,
|
46
|
+
int* num_literals);
|
47
|
+
|
48
|
+
} // namespace brotli
|
49
|
+
|
50
|
+
#endif // BROTLI_ENC_BACKWARD_REFERENCES_H_
|
@@ -0,0 +1,147 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Functions to estimate the bit cost of Huffman trees.
|
16
|
+
|
17
|
+
#ifndef BROTLI_ENC_BIT_COST_H_
|
18
|
+
#define BROTLI_ENC_BIT_COST_H_
|
19
|
+
|
20
|
+
|
21
|
+
#include <stdint.h>
|
22
|
+
|
23
|
+
#include "./entropy_encode.h"
|
24
|
+
#include "./fast_log.h"
|
25
|
+
|
26
|
+
namespace brotli {
|
27
|
+
|
28
|
+
static inline double ShannonEntropy(const int *population, int size,
|
29
|
+
int *total) {
|
30
|
+
int sum = 0;
|
31
|
+
double retval = 0;
|
32
|
+
const int *population_end = population + size;
|
33
|
+
int p;
|
34
|
+
if (size & 1) {
|
35
|
+
goto odd_number_of_elements_left;
|
36
|
+
}
|
37
|
+
while (population < population_end) {
|
38
|
+
p = *population++;
|
39
|
+
sum += p;
|
40
|
+
retval -= p * FastLog2(p);
|
41
|
+
odd_number_of_elements_left:
|
42
|
+
p = *population++;
|
43
|
+
sum += p;
|
44
|
+
retval -= p * FastLog2(p);
|
45
|
+
}
|
46
|
+
if (sum) retval += sum * FastLog2(sum);
|
47
|
+
*total = sum;
|
48
|
+
return retval;
|
49
|
+
}
|
50
|
+
|
51
|
+
static inline double BitsEntropy(const int *population, int size) {
|
52
|
+
int sum;
|
53
|
+
double retval = ShannonEntropy(population, size, &sum);
|
54
|
+
if (retval < sum) {
|
55
|
+
// At least one bit per literal is needed.
|
56
|
+
retval = sum;
|
57
|
+
}
|
58
|
+
return retval;
|
59
|
+
}
|
60
|
+
|
61
|
+
|
62
|
+
template<int kSize>
|
63
|
+
double PopulationCost(const Histogram<kSize>& histogram) {
|
64
|
+
if (histogram.total_count_ == 0) {
|
65
|
+
return 12;
|
66
|
+
}
|
67
|
+
int count = 0;
|
68
|
+
for (int i = 0; i < kSize; ++i) {
|
69
|
+
if (histogram.data_[i] > 0) {
|
70
|
+
++count;
|
71
|
+
}
|
72
|
+
}
|
73
|
+
if (count == 1) {
|
74
|
+
return 12;
|
75
|
+
}
|
76
|
+
if (count == 2) {
|
77
|
+
return 20 + histogram.total_count_;
|
78
|
+
}
|
79
|
+
double bits = 0;
|
80
|
+
uint8_t depth[kSize] = { 0 };
|
81
|
+
if (count <= 4) {
|
82
|
+
// For very low symbol count we build the Huffman tree.
|
83
|
+
CreateHuffmanTree(&histogram.data_[0], kSize, 15, depth);
|
84
|
+
for (int i = 0; i < kSize; ++i) {
|
85
|
+
bits += histogram.data_[i] * depth[i];
|
86
|
+
}
|
87
|
+
return count == 3 ? bits + 28 : bits + 37;
|
88
|
+
}
|
89
|
+
|
90
|
+
// In this loop we compute the entropy of the histogram and simultaneously
|
91
|
+
// build a simplified histogram of the code length codes where we use the
|
92
|
+
// zero repeat code 17, but we don't use the non-zero repeat code 16.
|
93
|
+
int max_depth = 1;
|
94
|
+
int depth_histo[kCodeLengthCodes] = { 0 };
|
95
|
+
const double log2total = FastLog2(histogram.total_count_);
|
96
|
+
for (int i = 0; i < kSize;) {
|
97
|
+
if (histogram.data_[i] > 0) {
|
98
|
+
// Compute -log2(P(symbol)) = -log2(count(symbol)/total_count) =
|
99
|
+
// = log2(total_count) - log2(count(symbol))
|
100
|
+
double log2p = log2total - FastLog2(histogram.data_[i]);
|
101
|
+
// Approximate the bit depth by round(-log2(P(symbol)))
|
102
|
+
int depth = static_cast<int>(log2p + 0.5);
|
103
|
+
bits += histogram.data_[i] * log2p;
|
104
|
+
if (depth > 15) {
|
105
|
+
depth = 15;
|
106
|
+
}
|
107
|
+
if (depth > max_depth) {
|
108
|
+
max_depth = depth;
|
109
|
+
}
|
110
|
+
++depth_histo[depth];
|
111
|
+
++i;
|
112
|
+
} else {
|
113
|
+
// Compute the run length of zeros and add the appropiate number of 0 and
|
114
|
+
// 17 code length codes to the code length code histogram.
|
115
|
+
int reps = 1;
|
116
|
+
for (int k = i + 1; k < kSize && histogram.data_[k] == 0; ++k) {
|
117
|
+
++reps;
|
118
|
+
}
|
119
|
+
i += reps;
|
120
|
+
if (i == kSize) {
|
121
|
+
// Don't add any cost for the last zero run, since these are encoded
|
122
|
+
// only implicitly.
|
123
|
+
break;
|
124
|
+
}
|
125
|
+
if (reps < 3) {
|
126
|
+
depth_histo[0] += reps;
|
127
|
+
} else {
|
128
|
+
reps -= 2;
|
129
|
+
while (reps > 0) {
|
130
|
+
++depth_histo[17];
|
131
|
+
// Add the 3 extra bits for the 17 code length code.
|
132
|
+
bits += 3;
|
133
|
+
reps >>= 3;
|
134
|
+
}
|
135
|
+
}
|
136
|
+
}
|
137
|
+
}
|
138
|
+
// Add the estimated encoding cost of the code length code histogram.
|
139
|
+
bits += 18 + 2 * max_depth;
|
140
|
+
// Add the entropy of the code length code histogram.
|
141
|
+
bits += BitsEntropy(depth_histo, kCodeLengthCodes);
|
142
|
+
return bits;
|
143
|
+
}
|
144
|
+
|
145
|
+
} // namespace brotli
|
146
|
+
|
147
|
+
#endif // BROTLI_ENC_BIT_COST_H_
|
@@ -0,0 +1,418 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Block split point selection utilities.
|
16
|
+
|
17
|
+
#include "./block_splitter.h"
|
18
|
+
|
19
|
+
#include <math.h>
|
20
|
+
#include <stdio.h>
|
21
|
+
#include <stdlib.h>
|
22
|
+
#include <string.h>
|
23
|
+
|
24
|
+
#include <algorithm>
|
25
|
+
#include <map>
|
26
|
+
|
27
|
+
#include "./cluster.h"
|
28
|
+
#include "./command.h"
|
29
|
+
#include "./fast_log.h"
|
30
|
+
#include "./histogram.h"
|
31
|
+
|
32
|
+
namespace brotli {
|
33
|
+
|
34
|
+
// Upper bounds on the number of initial histograms (entropy codes) tried per
// stream; the command bound is also used for the distance stream.
static const int kMaxLiteralHistograms = 100;
static const int kMaxCommandHistograms = 50;

// Block switch costs handed to the block finder for each stream.
static const double kLiteralBlockSwitchCost = 28.1;
static const double kCommandBlockSwitchCost = 13.5;
static const double kDistanceBlockSwitchCost = 14.6;

// Sampling window lengths used when seeding and refining histograms; the
// command stride is also used for the distance stream.
static const int kLiteralStrideLength = 70;
static const int kCommandStrideLength = 40;

// Symbols per initial histogram: the stream length divided by this value
// (plus one) gives the number of initial histograms before capping.
static const int kSymbolsPerLiteralHistogram = 544;
static const int kSymbolsPerCommandHistogram = 530;
static const int kSymbolsPerDistanceHistogram = 544;

// Streams shorter than this are kept as a single block.
static const int kMinLengthForBlockSplitting = 128;

// Refinement runs kIterMulForRefining * length / stride +
// kMinItersForRefining sampling iterations (rounded up to a multiple of the
// number of histograms).
static const int kIterMulForRefining = 2;
static const int kMinItersForRefining = 100;
|
47
|
+
|
48
|
+
void CopyLiteralsToByteArray(const Command* cmds,
|
49
|
+
const size_t num_commands,
|
50
|
+
const uint8_t* data,
|
51
|
+
const size_t offset,
|
52
|
+
const size_t mask,
|
53
|
+
std::vector<uint8_t>* literals) {
|
54
|
+
// Count how many we have.
|
55
|
+
size_t total_length = 0;
|
56
|
+
for (int i = 0; i < num_commands; ++i) {
|
57
|
+
total_length += cmds[i].insert_len_;
|
58
|
+
}
|
59
|
+
if (total_length == 0) {
|
60
|
+
return;
|
61
|
+
}
|
62
|
+
|
63
|
+
// Allocate.
|
64
|
+
literals->resize(total_length);
|
65
|
+
|
66
|
+
// Loop again, and copy this time.
|
67
|
+
size_t pos = 0;
|
68
|
+
size_t from_pos = offset & mask;
|
69
|
+
for (int i = 0; i < num_commands && pos < total_length; ++i) {
|
70
|
+
size_t insert_len = cmds[i].insert_len_;
|
71
|
+
if (from_pos + insert_len > mask) {
|
72
|
+
size_t head_size = mask + 1 - from_pos;
|
73
|
+
memcpy(&(*literals)[pos], data + from_pos, head_size);
|
74
|
+
from_pos = 0;
|
75
|
+
pos += head_size;
|
76
|
+
insert_len -= head_size;
|
77
|
+
}
|
78
|
+
if (insert_len > 0) {
|
79
|
+
memcpy(&(*literals)[pos], data + from_pos, insert_len);
|
80
|
+
pos += insert_len;
|
81
|
+
}
|
82
|
+
from_pos = (from_pos + insert_len + cmds[i].copy_len_) & mask;
|
83
|
+
}
|
84
|
+
}
|
85
|
+
|
86
|
+
void CopyCommandsToByteArray(const Command* cmds,
|
87
|
+
const size_t num_commands,
|
88
|
+
std::vector<uint16_t>* insert_and_copy_codes,
|
89
|
+
std::vector<uint16_t>* distance_prefixes) {
|
90
|
+
for (int i = 0; i < num_commands; ++i) {
|
91
|
+
const Command& cmd = cmds[i];
|
92
|
+
insert_and_copy_codes->push_back(cmd.cmd_prefix_);
|
93
|
+
if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
|
94
|
+
distance_prefixes->push_back(cmd.dist_prefix_);
|
95
|
+
}
|
96
|
+
}
|
97
|
+
}
|
98
|
+
|
99
|
+
// Small multiplicative pseudo-random generator: advances *seed by multiplying
// it with 16807 (unsigned wrap-around) and returns the new state. A state of
// zero is replaced by 1 so the sequence cannot get stuck at zero.
inline static unsigned int MyRand(unsigned int* seed) {
  const unsigned int next = *seed * 16807U;
  *seed = (next != 0) ? next : 1;
  return *seed;
}
|
106
|
+
|
107
|
+
template<typename HistogramType, typename DataType>
|
108
|
+
void InitialEntropyCodes(const DataType* data, size_t length,
|
109
|
+
int literals_per_histogram,
|
110
|
+
int max_histograms,
|
111
|
+
size_t stride,
|
112
|
+
std::vector<HistogramType>* vec) {
|
113
|
+
int total_histograms = length / literals_per_histogram + 1;
|
114
|
+
if (total_histograms > max_histograms) {
|
115
|
+
total_histograms = max_histograms;
|
116
|
+
}
|
117
|
+
unsigned int seed = 7;
|
118
|
+
int block_length = length / total_histograms;
|
119
|
+
for (int i = 0; i < total_histograms; ++i) {
|
120
|
+
int pos = length * i / total_histograms;
|
121
|
+
if (i != 0) {
|
122
|
+
pos += MyRand(&seed) % block_length;
|
123
|
+
}
|
124
|
+
if (pos + stride >= length) {
|
125
|
+
pos = length - stride - 1;
|
126
|
+
}
|
127
|
+
HistogramType histo;
|
128
|
+
histo.Add(data + pos, stride);
|
129
|
+
vec->push_back(histo);
|
130
|
+
}
|
131
|
+
}
|
132
|
+
|
133
|
+
template<typename HistogramType, typename DataType>
|
134
|
+
void RandomSample(unsigned int* seed,
|
135
|
+
const DataType* data,
|
136
|
+
size_t length,
|
137
|
+
size_t stride,
|
138
|
+
HistogramType* sample) {
|
139
|
+
size_t pos = 0;
|
140
|
+
if (stride >= length) {
|
141
|
+
pos = 0;
|
142
|
+
stride = length;
|
143
|
+
} else {
|
144
|
+
pos = MyRand(seed) % (length - stride + 1);
|
145
|
+
}
|
146
|
+
sample->Add(data + pos, stride);
|
147
|
+
}
|
148
|
+
|
149
|
+
template<typename HistogramType, typename DataType>
|
150
|
+
void RefineEntropyCodes(const DataType* data, size_t length,
|
151
|
+
size_t stride,
|
152
|
+
std::vector<HistogramType>* vec) {
|
153
|
+
int iters =
|
154
|
+
kIterMulForRefining * length / stride + kMinItersForRefining;
|
155
|
+
unsigned int seed = 7;
|
156
|
+
iters = ((iters + vec->size() - 1) / vec->size()) * vec->size();
|
157
|
+
for (int iter = 0; iter < iters; ++iter) {
|
158
|
+
HistogramType sample;
|
159
|
+
RandomSample(&seed, data, length, stride, &sample);
|
160
|
+
int ix = iter % vec->size();
|
161
|
+
(*vec)[ix].AddHistogram(sample);
|
162
|
+
}
|
163
|
+
}
|
164
|
+
|
165
|
+
inline static float BitCost(int count) {
|
166
|
+
return count == 0 ? -2 : FastLog2(count);
|
167
|
+
}
|
168
|
+
|
169
|
+
template<typename DataType, int kSize>
|
170
|
+
void FindBlocks(const DataType* data, const size_t length,
|
171
|
+
const double block_switch_bitcost,
|
172
|
+
const std::vector<Histogram<kSize> > &vec,
|
173
|
+
uint8_t *block_id) {
|
174
|
+
if (vec.size() <= 1) {
|
175
|
+
for (int i = 0; i < length; ++i) {
|
176
|
+
block_id[i] = 0;
|
177
|
+
}
|
178
|
+
return;
|
179
|
+
}
|
180
|
+
int vecsize = vec.size();
|
181
|
+
double* insert_cost = new double[kSize * vecsize];
|
182
|
+
memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * vecsize);
|
183
|
+
for (int j = 0; j < vecsize; ++j) {
|
184
|
+
insert_cost[j] = FastLog2(vec[j].total_count_);
|
185
|
+
}
|
186
|
+
for (int i = kSize - 1; i >= 0; --i) {
|
187
|
+
for (int j = 0; j < vecsize; ++j) {
|
188
|
+
insert_cost[i * vecsize + j] = insert_cost[j] - BitCost(vec[j].data_[i]);
|
189
|
+
}
|
190
|
+
}
|
191
|
+
double *cost = new double[vecsize];
|
192
|
+
memset(cost, 0, sizeof(cost[0]) * vecsize);
|
193
|
+
bool* switch_signal = new bool[length * vecsize];
|
194
|
+
memset(switch_signal, 0, sizeof(switch_signal[0]) * length * vecsize);
|
195
|
+
// After each iteration of this loop, cost[k] will contain the difference
|
196
|
+
// between the minimum cost of arriving at the current byte position using
|
197
|
+
// entropy code k, and the minimum cost of arriving at the current byte
|
198
|
+
// position. This difference is capped at the block switch cost, and if it
|
199
|
+
// reaches block switch cost, it means that when we trace back from the last
|
200
|
+
// position, we need to switch here.
|
201
|
+
for (size_t byte_ix = 0; byte_ix < length; ++byte_ix) {
|
202
|
+
int ix = byte_ix * vecsize;
|
203
|
+
int insert_cost_ix = data[byte_ix] * vecsize;
|
204
|
+
double min_cost = 1e99;
|
205
|
+
for (int k = 0; k < vecsize; ++k) {
|
206
|
+
// We are coding the symbol in data[byte_ix] with entropy code k.
|
207
|
+
cost[k] += insert_cost[insert_cost_ix + k];
|
208
|
+
if (cost[k] < min_cost) {
|
209
|
+
min_cost = cost[k];
|
210
|
+
block_id[byte_ix] = k;
|
211
|
+
}
|
212
|
+
}
|
213
|
+
double block_switch_cost = block_switch_bitcost;
|
214
|
+
// More blocks for the beginning.
|
215
|
+
if (byte_ix < 2000) {
|
216
|
+
block_switch_cost *= 0.77 + 0.07 * byte_ix / 2000;
|
217
|
+
}
|
218
|
+
for (int k = 0; k < vecsize; ++k) {
|
219
|
+
cost[k] -= min_cost;
|
220
|
+
if (cost[k] >= block_switch_cost) {
|
221
|
+
cost[k] = block_switch_cost;
|
222
|
+
switch_signal[ix + k] = true;
|
223
|
+
}
|
224
|
+
}
|
225
|
+
}
|
226
|
+
// Now trace back from the last position and switch at the marked places.
|
227
|
+
int byte_ix = length - 1;
|
228
|
+
int ix = byte_ix * vecsize;
|
229
|
+
int cur_id = block_id[byte_ix];
|
230
|
+
while (byte_ix > 0) {
|
231
|
+
--byte_ix;
|
232
|
+
ix -= vecsize;
|
233
|
+
if (switch_signal[ix + cur_id]) {
|
234
|
+
cur_id = block_id[byte_ix];
|
235
|
+
}
|
236
|
+
block_id[byte_ix] = cur_id;
|
237
|
+
}
|
238
|
+
delete[] insert_cost;
|
239
|
+
delete[] cost;
|
240
|
+
delete[] switch_signal;
|
241
|
+
}
|
242
|
+
|
243
|
+
// Renumbers the ids in block_ids[0..length) in place so that they become
// 0, 1, 2, ... in order of first appearance, and returns the number of
// distinct ids found (0 for an empty input).
//
// A 256-entry lookup table replaces a tree map (ids are single bytes), and
// the position index is size_t to match `length`.
int RemapBlockIds(uint8_t* block_ids, const size_t length) {
  static const int kUnassigned = -1;
  int new_id[256];
  for (int v = 0; v < 256; ++v) {
    new_id[v] = kUnassigned;
  }
  int next_id = 0;
  for (size_t i = 0; i < length; ++i) {
    int& mapped = new_id[block_ids[i]];
    if (mapped == kUnassigned) {
      // First time this id is seen: give it the next free number.
      mapped = next_id;
      ++next_id;
    }
    block_ids[i] = static_cast<uint8_t>(mapped);
  }
  return next_id;
}
|
257
|
+
|
258
|
+
template<typename HistogramType, typename DataType>
|
259
|
+
void BuildBlockHistograms(const DataType* data, const size_t length,
|
260
|
+
uint8_t* block_ids,
|
261
|
+
std::vector<HistogramType>* histograms) {
|
262
|
+
int num_types = RemapBlockIds(block_ids, length);
|
263
|
+
histograms->clear();
|
264
|
+
histograms->resize(num_types);
|
265
|
+
for (int i = 0; i < length; ++i) {
|
266
|
+
(*histograms)[block_ids[i]].Add(data[i]);
|
267
|
+
}
|
268
|
+
}
|
269
|
+
|
270
|
+
// Merges similar blocks: builds one histogram per run of equal ids in
// `block_ids`, clusters those histograms with ClusterHistograms (at most 256
// clusters), and rewrites `block_ids` so each position carries the cluster
// index of its run.
//
// Position indices are size_t to match `length`.
template<typename HistogramType, typename DataType>
void ClusterBlocks(const DataType* data, const size_t length,
                   uint8_t* block_ids) {
  std::vector<HistogramType> histograms;
  // block_index[i] is the index of the run (block) that position i is in.
  std::vector<int> block_index(length);
  int cur_idx = 0;
  HistogramType cur_histogram;
  for (size_t i = 0; i < length; ++i) {
    block_index[i] = cur_idx;
    cur_histogram.Add(data[i]);
    // A run ends at the last position or where the next id differs.
    const bool block_boundary =
        (i + 1 == length || block_ids[i] != block_ids[i + 1]);
    if (block_boundary) {
      histograms.push_back(cur_histogram);
      cur_histogram.Clear();
      ++cur_idx;
    }
  }
  std::vector<HistogramType> clustered_histograms;
  std::vector<int> histogram_symbols;
  // Block ids need to fit in one byte.
  static const int kMaxNumberOfBlockTypes = 256;
  ClusterHistograms(histograms, 1, histograms.size(),
                    kMaxNumberOfBlockTypes,
                    &clustered_histograms,
                    &histogram_symbols);
  for (size_t i = 0; i < length; ++i) {
    block_ids[i] = histogram_symbols[block_index[i]];
  }
}
|
299
|
+
|
300
|
+
// Converts a per-position block id array into run-length form.
//
// For every run of equal ids, the id is appended to split->types and the run
// length to split->lengths. split->num_types is set to the largest id seen
// plus one.
//
// An empty `block_ids` appends nothing and sets num_types to 1, matching what
// SplitByteVector reports for empty input (the vector used to be indexed
// unconditionally).
void BuildBlockSplit(const std::vector<uint8_t>& block_ids, BlockSplit* split) {
  if (block_ids.empty()) {
    split->num_types = 1;
    return;
  }
  int cur_id = block_ids[0];
  int cur_length = 1;
  split->num_types = -1;
  for (size_t i = 1; i < block_ids.size(); ++i) {
    if (block_ids[i] != cur_id) {
      // The run ended at the previous position: flush it.
      split->types.push_back(cur_id);
      split->lengths.push_back(cur_length);
      split->num_types = std::max(split->num_types, cur_id);
      cur_id = block_ids[i];
      cur_length = 0;
    }
    ++cur_length;
  }
  // Flush the final run.
  split->types.push_back(cur_id);
  split->lengths.push_back(cur_length);
  split->num_types = std::max(split->num_types, cur_id);
  // Turn "largest id" into "number of ids".
  ++split->num_types;
}
|
319
|
+
|
320
|
+
template<typename HistogramType, typename DataType>
|
321
|
+
void SplitByteVector(const std::vector<DataType>& data,
|
322
|
+
const int literals_per_histogram,
|
323
|
+
const int max_histograms,
|
324
|
+
const int sampling_stride_length,
|
325
|
+
const double block_switch_cost,
|
326
|
+
BlockSplit* split) {
|
327
|
+
if (data.empty()) {
|
328
|
+
split->num_types = 1;
|
329
|
+
return;
|
330
|
+
} else if (data.size() < kMinLengthForBlockSplitting) {
|
331
|
+
split->num_types = 1;
|
332
|
+
split->types.push_back(0);
|
333
|
+
split->lengths.push_back(data.size());
|
334
|
+
return;
|
335
|
+
}
|
336
|
+
std::vector<HistogramType> histograms;
|
337
|
+
// Find good entropy codes.
|
338
|
+
InitialEntropyCodes(data.data(), data.size(),
|
339
|
+
literals_per_histogram,
|
340
|
+
max_histograms,
|
341
|
+
sampling_stride_length,
|
342
|
+
&histograms);
|
343
|
+
RefineEntropyCodes(data.data(), data.size(),
|
344
|
+
sampling_stride_length,
|
345
|
+
&histograms);
|
346
|
+
// Find a good path through literals with the good entropy codes.
|
347
|
+
std::vector<uint8_t> block_ids(data.size());
|
348
|
+
for (int i = 0; i < 10; ++i) {
|
349
|
+
FindBlocks(data.data(), data.size(),
|
350
|
+
block_switch_cost,
|
351
|
+
histograms,
|
352
|
+
&block_ids[0]);
|
353
|
+
BuildBlockHistograms(data.data(), data.size(), &block_ids[0], &histograms);
|
354
|
+
}
|
355
|
+
ClusterBlocks<HistogramType>(data.data(), data.size(), &block_ids[0]);
|
356
|
+
BuildBlockSplit(block_ids, split);
|
357
|
+
}
|
358
|
+
|
359
|
+
void SplitBlock(const Command* cmds,
|
360
|
+
const size_t num_commands,
|
361
|
+
const uint8_t* data,
|
362
|
+
const size_t pos,
|
363
|
+
const size_t mask,
|
364
|
+
BlockSplit* literal_split,
|
365
|
+
BlockSplit* insert_and_copy_split,
|
366
|
+
BlockSplit* dist_split) {
|
367
|
+
// Create a continuous array of literals.
|
368
|
+
std::vector<uint8_t> literals;
|
369
|
+
CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, &literals);
|
370
|
+
|
371
|
+
// Compute prefix codes for commands.
|
372
|
+
std::vector<uint16_t> insert_and_copy_codes;
|
373
|
+
std::vector<uint16_t> distance_prefixes;
|
374
|
+
CopyCommandsToByteArray(cmds, num_commands,
|
375
|
+
&insert_and_copy_codes,
|
376
|
+
&distance_prefixes);
|
377
|
+
|
378
|
+
SplitByteVector<HistogramLiteral>(
|
379
|
+
literals,
|
380
|
+
kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
|
381
|
+
kLiteralStrideLength, kLiteralBlockSwitchCost,
|
382
|
+
literal_split);
|
383
|
+
SplitByteVector<HistogramCommand>(
|
384
|
+
insert_and_copy_codes,
|
385
|
+
kSymbolsPerCommandHistogram, kMaxCommandHistograms,
|
386
|
+
kCommandStrideLength, kCommandBlockSwitchCost,
|
387
|
+
insert_and_copy_split);
|
388
|
+
SplitByteVector<HistogramDistance>(
|
389
|
+
distance_prefixes,
|
390
|
+
kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
|
391
|
+
kCommandStrideLength, kDistanceBlockSwitchCost,
|
392
|
+
dist_split);
|
393
|
+
}
|
394
|
+
|
395
|
+
void SplitBlockByTotalLength(const Command* all_commands,
|
396
|
+
const size_t num_commands,
|
397
|
+
int input_size,
|
398
|
+
int target_length,
|
399
|
+
std::vector<std::vector<Command> >* blocks) {
|
400
|
+
int num_blocks = input_size / target_length + 1;
|
401
|
+
int length_limit = input_size / num_blocks + 1;
|
402
|
+
int total_length = 0;
|
403
|
+
std::vector<Command> cur_block;
|
404
|
+
for (int i = 0; i < num_commands; ++i) {
|
405
|
+
const Command& cmd = all_commands[i];
|
406
|
+
int cmd_length = cmd.insert_len_ + cmd.copy_len_;
|
407
|
+
if (total_length > length_limit) {
|
408
|
+
blocks->push_back(cur_block);
|
409
|
+
cur_block.clear();
|
410
|
+
total_length = 0;
|
411
|
+
}
|
412
|
+
cur_block.push_back(cmd);
|
413
|
+
total_length += cmd_length;
|
414
|
+
}
|
415
|
+
blocks->push_back(cur_block);
|
416
|
+
}
|
417
|
+
|
418
|
+
} // namespace brotli
|