brotli 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (73) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +4 -0
  6. data/Gemfile +4 -0
  7. data/README.md +36 -0
  8. data/Rakefile +13 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +7 -0
  11. data/brotli.gemspec +28 -0
  12. data/ext/brotli/brotli.cc +67 -0
  13. data/ext/brotli/brotli.h +9 -0
  14. data/ext/brotli/extconf.rb +34 -0
  15. data/lib/brotli.rb +2 -0
  16. data/lib/brotli/version.rb +3 -0
  17. data/vendor/brotli/LICENSE +202 -0
  18. data/vendor/brotli/dec/Makefile +12 -0
  19. data/vendor/brotli/dec/bit_reader.c +55 -0
  20. data/vendor/brotli/dec/bit_reader.h +256 -0
  21. data/vendor/brotli/dec/context.h +260 -0
  22. data/vendor/brotli/dec/decode.c +1573 -0
  23. data/vendor/brotli/dec/decode.h +160 -0
  24. data/vendor/brotli/dec/dictionary.h +9494 -0
  25. data/vendor/brotli/dec/huffman.c +325 -0
  26. data/vendor/brotli/dec/huffman.h +77 -0
  27. data/vendor/brotli/dec/port.h +148 -0
  28. data/vendor/brotli/dec/prefix.h +756 -0
  29. data/vendor/brotli/dec/state.c +149 -0
  30. data/vendor/brotli/dec/state.h +185 -0
  31. data/vendor/brotli/dec/streams.c +99 -0
  32. data/vendor/brotli/dec/streams.h +100 -0
  33. data/vendor/brotli/dec/transform.h +315 -0
  34. data/vendor/brotli/dec/types.h +36 -0
  35. data/vendor/brotli/enc/Makefile +11 -0
  36. data/vendor/brotli/enc/backward_references.cc +769 -0
  37. data/vendor/brotli/enc/backward_references.h +50 -0
  38. data/vendor/brotli/enc/bit_cost.h +147 -0
  39. data/vendor/brotli/enc/block_splitter.cc +418 -0
  40. data/vendor/brotli/enc/block_splitter.h +78 -0
  41. data/vendor/brotli/enc/brotli_bit_stream.cc +884 -0
  42. data/vendor/brotli/enc/brotli_bit_stream.h +149 -0
  43. data/vendor/brotli/enc/cluster.h +290 -0
  44. data/vendor/brotli/enc/command.h +140 -0
  45. data/vendor/brotli/enc/context.h +185 -0
  46. data/vendor/brotli/enc/dictionary.h +9485 -0
  47. data/vendor/brotli/enc/dictionary_hash.h +4125 -0
  48. data/vendor/brotli/enc/encode.cc +715 -0
  49. data/vendor/brotli/enc/encode.h +196 -0
  50. data/vendor/brotli/enc/encode_parallel.cc +354 -0
  51. data/vendor/brotli/enc/encode_parallel.h +37 -0
  52. data/vendor/brotli/enc/entropy_encode.cc +492 -0
  53. data/vendor/brotli/enc/entropy_encode.h +88 -0
  54. data/vendor/brotli/enc/fast_log.h +179 -0
  55. data/vendor/brotli/enc/find_match_length.h +87 -0
  56. data/vendor/brotli/enc/hash.h +686 -0
  57. data/vendor/brotli/enc/histogram.cc +76 -0
  58. data/vendor/brotli/enc/histogram.h +100 -0
  59. data/vendor/brotli/enc/literal_cost.cc +172 -0
  60. data/vendor/brotli/enc/literal_cost.h +38 -0
  61. data/vendor/brotli/enc/metablock.cc +544 -0
  62. data/vendor/brotli/enc/metablock.h +88 -0
  63. data/vendor/brotli/enc/port.h +151 -0
  64. data/vendor/brotli/enc/prefix.h +85 -0
  65. data/vendor/brotli/enc/ringbuffer.h +108 -0
  66. data/vendor/brotli/enc/static_dict.cc +441 -0
  67. data/vendor/brotli/enc/static_dict.h +40 -0
  68. data/vendor/brotli/enc/static_dict_lut.h +12063 -0
  69. data/vendor/brotli/enc/streams.cc +127 -0
  70. data/vendor/brotli/enc/streams.h +129 -0
  71. data/vendor/brotli/enc/transform.h +250 -0
  72. data/vendor/brotli/enc/write_bits.h +91 -0
  73. metadata +171 -0
@@ -0,0 +1,76 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Build per-context histograms of literals, commands and distance codes.
16
+
17
+ #include "./histogram.h"
18
+
19
+ #include <stdint.h>
20
+ #include <cmath>
21
+
22
+ #include "./block_splitter.h"
23
+ #include "./command.h"
24
+ #include "./context.h"
25
+ #include "./prefix.h"
26
+
27
+ namespace brotli {
28
+
29
+ void BuildHistograms(
30
+ const Command* cmds,
31
+ const size_t num_commands,
32
+ const BlockSplit& literal_split,
33
+ const BlockSplit& insert_and_copy_split,
34
+ const BlockSplit& dist_split,
35
+ const uint8_t* ringbuffer,
36
+ size_t start_pos,
37
+ size_t mask,
38
+ uint8_t prev_byte,
39
+ uint8_t prev_byte2,
40
+ const std::vector<int>& context_modes,
41
+ std::vector<HistogramLiteral>* literal_histograms,
42
+ std::vector<HistogramCommand>* insert_and_copy_histograms,
43
+ std::vector<HistogramDistance>* copy_dist_histograms) {
44
+ size_t pos = start_pos;
45
+ BlockSplitIterator literal_it(literal_split);
46
+ BlockSplitIterator insert_and_copy_it(insert_and_copy_split);
47
+ BlockSplitIterator dist_it(dist_split);
48
+ for (int i = 0; i < num_commands; ++i) {
49
+ const Command &cmd = cmds[i];
50
+ insert_and_copy_it.Next();
51
+ (*insert_and_copy_histograms)[insert_and_copy_it.type_].Add(
52
+ cmd.cmd_prefix_);
53
+ for (int j = 0; j < cmd.insert_len_; ++j) {
54
+ literal_it.Next();
55
+ int context = (literal_it.type_ << kLiteralContextBits) +
56
+ Context(prev_byte, prev_byte2, context_modes[literal_it.type_]);
57
+ (*literal_histograms)[context].Add(ringbuffer[pos & mask]);
58
+ prev_byte2 = prev_byte;
59
+ prev_byte = ringbuffer[pos & mask];
60
+ ++pos;
61
+ }
62
+ pos += cmd.copy_len_;
63
+ if (cmd.copy_len_ > 0) {
64
+ prev_byte2 = ringbuffer[(pos - 2) & mask];
65
+ prev_byte = ringbuffer[(pos - 1) & mask];
66
+ if (cmd.cmd_prefix_ >= 128) {
67
+ dist_it.Next();
68
+ int context = (dist_it.type_ << kDistanceContextBits) +
69
+ cmd.DistanceContext();
70
+ (*copy_dist_histograms)[context].Add(cmd.dist_prefix_);
71
+ }
72
+ }
73
+ }
74
+ }
75
+
76
+ } // namespace brotli
@@ -0,0 +1,100 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Models the histograms of literals, commands and distance codes.
16
+
17
+ #ifndef BROTLI_ENC_HISTOGRAM_H_
18
+ #define BROTLI_ENC_HISTOGRAM_H_
19
+
20
+ #include <stdint.h>
21
+ #include <string.h>
22
+ #include <vector>
23
+ #include <utility>
24
+ #include "./command.h"
25
+ #include "./fast_log.h"
26
+ #include "./prefix.h"
27
+
28
+ namespace brotli {
29
+
30
+ class BlockSplit;
31
+
32
// A simple container for histograms of data in blocks.
//
// kDataSize is the alphabet size: data_[s] holds the number of occurrences of
// symbol s and total_count_ the sum over all symbols. bit_cost_ is a scratch
// slot that this struct never computes itself; it starts at 0.0 and is left
// untouched by Clear(), so callers that store a cost in it own its meaning.
template<int kDataSize>
struct Histogram {
  // Creates an empty histogram. bit_cost_ is given a defined value so that
  // copying a freshly constructed histogram never reads an indeterminate
  // double.
  Histogram() : bit_cost_(0.0) {
    Clear();
  }

  // Resets all symbol counts and the total to zero.
  void Clear() {
    memset(data_, 0, sizeof(data_));
    total_count_ = 0;
  }

  // Records one occurrence of symbol val. val must be in [0, kDataSize).
  void Add(int val) {
    ++data_[val];
    ++total_count_;
  }

  // Removes one previously recorded occurrence of symbol val.
  void Remove(int val) {
    --data_[val];
    --total_count_;
  }

  // Records the n symbols stored at p[0 .. n).
  template<typename DataType>
  void Add(const DataType *p, size_t n) {
    total_count_ = static_cast<int>(total_count_ + n);
    for (size_t i = 0; i < n; ++i) {
      ++data_[p[i]];
    }
  }

  // Adds the counts of v to this histogram, symbol by symbol. bit_cost_ is
  // not updated.
  void AddHistogram(const Histogram& v) {
    total_count_ += v.total_count_;
    for (int i = 0; i < kDataSize; ++i) {
      data_[i] += v.data_[i];
    }
  }

  int data_[kDataSize];
  int total_count_;
  double bit_cost_;
};
67
+
68
+ // Literal histogram.
69
+ typedef Histogram<256> HistogramLiteral;
70
+ // Prefix histograms.
71
+ typedef Histogram<kNumCommandPrefixes> HistogramCommand;
72
+ typedef Histogram<kNumDistancePrefixes> HistogramDistance;
73
+ typedef Histogram<kNumBlockLenPrefixes> HistogramBlockLength;
74
+ // Context map histogram, 256 Huffman tree indexes + 16 run length codes.
75
+ typedef Histogram<272> HistogramContextMap;
76
+ // Block type histogram, 256 block types + 2 special symbols.
77
+ typedef Histogram<258> HistogramBlockType;
78
+
79
+ static const int kLiteralContextBits = 6;
80
+ static const int kDistanceContextBits = 2;
81
+
82
+ void BuildHistograms(
83
+ const Command* cmds,
84
+ const size_t num_commands,
85
+ const BlockSplit& literal_split,
86
+ const BlockSplit& insert_and_copy_split,
87
+ const BlockSplit& dist_split,
88
+ const uint8_t* ringbuffer,
89
+ size_t pos,
90
+ size_t mask,
91
+ uint8_t prev_byte,
92
+ uint8_t prev_byte2,
93
+ const std::vector<int>& context_modes,
94
+ std::vector<HistogramLiteral>* literal_histograms,
95
+ std::vector<HistogramCommand>* insert_and_copy_histograms,
96
+ std::vector<HistogramDistance>* copy_dist_histograms);
97
+
98
+ } // namespace brotli
99
+
100
+ #endif // BROTLI_ENC_HISTOGRAM_H_
@@ -0,0 +1,172 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Literal cost model to allow backward reference replacement to be efficient.
16
+
17
+ #include "./literal_cost.h"
18
+
19
+ #include <math.h>
20
+ #include <stdint.h>
21
+ #include <algorithm>
22
+
23
+ #include "./fast_log.h"
24
+
25
+ namespace brotli {
26
+
27
// Classifies the position inside a UTF-8 sequence of the byte that FOLLOWS c,
// given c and the byte before it (last). The result is 0, 1 or 2 and is never
// larger than clamp:
//   0 - the next byte starts a new character ('Byte 1');
//   1 - the next byte is the 'Byte 2' of a multi-byte sequence;
//   2 - the next byte is the 'Byte 3' of a multi-byte sequence.
static int UTF8Position(int last, int c, int clamp) {
  // ASCII: the next one is a 'Byte 1' again.
  if (c < 128) {
    return 0;
  }
  // Lead byte: the next one is the 'Byte 2', unless clamped lower.
  if (c >= 192) {
    return clamp < 1 ? clamp : 1;
  }
  // c is a continuation byte; the previous byte decides whether the
  // sequence is complete (two or three byte coding finished).
  if (last < 0xe0) {
    return 0;
  }
  // Otherwise the next one is the 'Byte 3', unless clamped lower.
  return clamp < 2 ? clamp : 2;
}
41
+
42
+ static int DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
43
+ const uint8_t *data) {
44
+ int counts[3] = { 0 };
45
+ int max_utf8 = 1; // should be 2, but 1 compresses better.
46
+ int last_c = 0;
47
+ int utf8_pos = 0;
48
+ for (int i = 0; i < len; ++i) {
49
+ int c = data[(pos + i) & mask];
50
+ utf8_pos = UTF8Position(last_c, c, 2);
51
+ ++counts[utf8_pos];
52
+ last_c = c;
53
+ }
54
+ if (counts[2] < 500) {
55
+ max_utf8 = 1;
56
+ }
57
+ if (counts[1] + counts[2] < 25) {
58
+ max_utf8 = 0;
59
+ }
60
+ return max_utf8;
61
+ }
62
+
63
+ void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
64
+ size_t cost_mask, const uint8_t *data,
65
+ float *cost) {
66
+
67
+ // max_utf8 is 0 (normal ascii single byte modeling),
68
+ // 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling).
69
+ const int max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data);
70
+ int histogram[3][256] = { { 0 } };
71
+ int window_half = 495;
72
+ int in_window = std::min(static_cast<size_t>(window_half), len);
73
+ int in_window_utf8[3] = { 0 };
74
+
75
+ // Bootstrap histograms.
76
+ int last_c = 0;
77
+ int utf8_pos = 0;
78
+ for (int i = 0; i < in_window; ++i) {
79
+ int c = data[(pos + i) & mask];
80
+ ++histogram[utf8_pos][c];
81
+ ++in_window_utf8[utf8_pos];
82
+ utf8_pos = UTF8Position(last_c, c, max_utf8);
83
+ last_c = c;
84
+ }
85
+
86
+ // Compute bit costs with sliding window.
87
+ for (int i = 0; i < len; ++i) {
88
+ if (i - window_half >= 0) {
89
+ // Remove a byte in the past.
90
+ int c = (i - window_half - 1) < 0 ?
91
+ 0 : data[(pos + i - window_half - 1) & mask];
92
+ int last_c = (i - window_half - 2) < 0 ?
93
+ 0 : data[(pos + i - window_half - 2) & mask];
94
+ int utf8_pos2 = UTF8Position(last_c, c, max_utf8);
95
+ --histogram[utf8_pos2][data[(pos + i - window_half) & mask]];
96
+ --in_window_utf8[utf8_pos2];
97
+ }
98
+ if (i + window_half < len) {
99
+ // Add a byte in the future.
100
+ int c = (i + window_half - 1) < 0 ?
101
+ 0 : data[(pos + i + window_half - 1) & mask];
102
+ int last_c = (i + window_half - 2) < 0 ?
103
+ 0 : data[(pos + i + window_half - 2) & mask];
104
+ int utf8_pos2 = UTF8Position(last_c, c, max_utf8);
105
+ ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]];
106
+ ++in_window_utf8[utf8_pos2];
107
+ }
108
+ int c = i < 1 ? 0 : data[(pos + i - 1) & mask];
109
+ int last_c = i < 2 ? 0 : data[(pos + i - 2) & mask];
110
+ int utf8_pos = UTF8Position(last_c, c, max_utf8);
111
+ int masked_pos = (pos + i) & mask;
112
+ int histo = histogram[utf8_pos][data[masked_pos]];
113
+ if (histo == 0) {
114
+ histo = 1;
115
+ }
116
+ float lit_cost = FastLog2(in_window_utf8[utf8_pos]) - FastLog2(histo);
117
+ lit_cost += 0.02905;
118
+ if (lit_cost < 1.0) {
119
+ lit_cost *= 0.5;
120
+ lit_cost += 0.5;
121
+ }
122
+ // Make the first bytes more expensive -- seems to help, not sure why.
123
+ // Perhaps because the entropy source is changing its properties
124
+ // rapidly in the beginning of the file, perhaps because the beginning
125
+ // of the data is a statistical "anomaly".
126
+ if (i < 2000) {
127
+ lit_cost += 0.7 - ((2000 - i) / 2000.0 * 0.35);
128
+ }
129
+ cost[(pos + i) & cost_mask] = lit_cost;
130
+ }
131
+ }
132
+
133
+ void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
134
+ size_t cost_mask, const uint8_t *data,
135
+ float *cost) {
136
+ int histogram[256] = { 0 };
137
+ int window_half = 2000;
138
+ int in_window = std::min(static_cast<size_t>(window_half), len);
139
+
140
+ // Bootstrap histogram.
141
+ for (int i = 0; i < in_window; ++i) {
142
+ ++histogram[data[(pos + i) & mask]];
143
+ }
144
+
145
+ // Compute bit costs with sliding window.
146
+ for (int i = 0; i < len; ++i) {
147
+ if (i - window_half >= 0) {
148
+ // Remove a byte in the past.
149
+ --histogram[data[(pos + i - window_half) & mask]];
150
+ --in_window;
151
+ }
152
+ if (i + window_half < len) {
153
+ // Add a byte in the future.
154
+ ++histogram[data[(pos + i + window_half) & mask]];
155
+ ++in_window;
156
+ }
157
+ int histo = histogram[data[(pos + i) & mask]];
158
+ if (histo == 0) {
159
+ histo = 1;
160
+ }
161
+ float lit_cost = FastLog2(in_window) - FastLog2(histo);
162
+ lit_cost += 0.029;
163
+ if (lit_cost < 1.0) {
164
+ lit_cost *= 0.5;
165
+ lit_cost += 0.5;
166
+ }
167
+ cost[(pos + i) & cost_mask] = lit_cost;
168
+ }
169
+ }
170
+
171
+
172
+ } // namespace brotli
@@ -0,0 +1,38 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Literal cost model to allow backward reference replacement to be efficient.
16
+
17
+ #ifndef BROTLI_ENC_LITERAL_COST_H_
18
+ #define BROTLI_ENC_LITERAL_COST_H_
19
+
20
+ #include <stddef.h>
21
+ #include <stdint.h>
22
+
23
+ namespace brotli {
24
+
25
+ // Estimates how many bits the literals in the interval [pos, pos + len) in the
26
+ // ringbuffer (data, mask) will take entropy coded and writes these estimates
27
+ // to the ringbuffer (cost, cost_mask).
28
+ void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
29
+ size_t cost_mask, const uint8_t *data,
30
+ float *cost);
31
+
32
+ void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
33
+ size_t cost_mask, const uint8_t *data,
34
+ float *cost);
35
+
36
+ } // namespace brotli
37
+
38
+ #endif // BROTLI_ENC_LITERAL_COST_H_
@@ -0,0 +1,544 @@
1
+ // Copyright 2015 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Algorithms for distributing the literals and commands of a metablock between
16
+ // block types and contexts.
17
+
18
+ #include "./metablock.h"
19
+
20
+ #include "./block_splitter.h"
21
+ #include "./context.h"
22
+ #include "./cluster.h"
23
+ #include "./histogram.h"
24
+
25
+ namespace brotli {
26
+
27
+ void BuildMetaBlock(const uint8_t* ringbuffer,
28
+ const size_t pos,
29
+ const size_t mask,
30
+ uint8_t prev_byte,
31
+ uint8_t prev_byte2,
32
+ const Command* cmds,
33
+ size_t num_commands,
34
+ int literal_context_mode,
35
+ MetaBlockSplit* mb) {
36
+ SplitBlock(cmds, num_commands,
37
+ ringbuffer, pos, mask,
38
+ &mb->literal_split,
39
+ &mb->command_split,
40
+ &mb->distance_split);
41
+
42
+ std::vector<int> literal_context_modes(mb->literal_split.num_types,
43
+ literal_context_mode);
44
+
45
+ int num_literal_contexts =
46
+ mb->literal_split.num_types << kLiteralContextBits;
47
+ int num_distance_contexts =
48
+ mb->distance_split.num_types << kDistanceContextBits;
49
+ std::vector<HistogramLiteral> literal_histograms(num_literal_contexts);
50
+ mb->command_histograms.resize(mb->command_split.num_types);
51
+ std::vector<HistogramDistance> distance_histograms(num_distance_contexts);
52
+ BuildHistograms(cmds, num_commands,
53
+ mb->literal_split,
54
+ mb->command_split,
55
+ mb->distance_split,
56
+ ringbuffer,
57
+ pos,
58
+ mask,
59
+ prev_byte,
60
+ prev_byte2,
61
+ literal_context_modes,
62
+ &literal_histograms,
63
+ &mb->command_histograms,
64
+ &distance_histograms);
65
+
66
+ // Histogram ids need to fit in one byte.
67
+ static const int kMaxNumberOfHistograms = 256;
68
+
69
+ mb->literal_histograms = literal_histograms;
70
+ ClusterHistograms(literal_histograms,
71
+ 1 << kLiteralContextBits,
72
+ mb->literal_split.num_types,
73
+ kMaxNumberOfHistograms,
74
+ &mb->literal_histograms,
75
+ &mb->literal_context_map);
76
+
77
+ mb->distance_histograms = distance_histograms;
78
+ ClusterHistograms(distance_histograms,
79
+ 1 << kDistanceContextBits,
80
+ mb->distance_split.num_types,
81
+ kMaxNumberOfHistograms,
82
+ &mb->distance_histograms,
83
+ &mb->distance_context_map);
84
+ }
85
+
86
+ // Greedy block splitter for one block category (literal, command or distance).
87
+ template<typename HistogramType>
88
+ class BlockSplitter {
89
+ public:
90
+ BlockSplitter(int alphabet_size,
91
+ int min_block_size,
92
+ double split_threshold,
93
+ int num_symbols,
94
+ BlockSplit* split,
95
+ std::vector<HistogramType>* histograms)
96
+ : alphabet_size_(alphabet_size),
97
+ min_block_size_(min_block_size),
98
+ split_threshold_(split_threshold),
99
+ num_blocks_(0),
100
+ split_(split),
101
+ histograms_(histograms),
102
+ target_block_size_(min_block_size),
103
+ block_size_(0),
104
+ curr_histogram_ix_(0),
105
+ merge_last_count_(0) {
106
+ int max_num_blocks = num_symbols / min_block_size + 1;
107
+ // We have to allocate one more histogram than the maximum number of block
108
+ // types for the current histogram when the meta-block is too big.
109
+ int max_num_types = std::min(max_num_blocks, kMaxBlockTypes + 1);
110
+ split_->lengths.resize(max_num_blocks);
111
+ split_->types.resize(max_num_blocks);
112
+ histograms_->resize(max_num_types);
113
+ last_histogram_ix_[0] = last_histogram_ix_[1] = 0;
114
+ }
115
+
116
+ // Adds the next symbol to the current histogram. When the current histogram
117
+ // reaches the target size, decides on merging the block.
118
+ void AddSymbol(int symbol) {
119
+ (*histograms_)[curr_histogram_ix_].Add(symbol);
120
+ ++block_size_;
121
+ if (block_size_ == target_block_size_) {
122
+ FinishBlock(/* is_final = */ false);
123
+ }
124
+ }
125
+
126
+ // Does either of three things:
127
+ // (1) emits the current block with a new block type;
128
+ // (2) emits the current block with the type of the second last block;
129
+ // (3) merges the current block with the last block.
130
+ void FinishBlock(bool is_final) {
131
+ if (block_size_ < min_block_size_) {
132
+ block_size_ = min_block_size_;
133
+ }
134
+ if (num_blocks_ == 0) {
135
+ // Create first block.
136
+ split_->lengths[0] = block_size_;
137
+ split_->types[0] = 0;
138
+ last_entropy_[0] =
139
+ BitsEntropy(&(*histograms_)[0].data_[0], alphabet_size_);
140
+ last_entropy_[1] = last_entropy_[0];
141
+ ++num_blocks_;
142
+ ++split_->num_types;
143
+ ++curr_histogram_ix_;
144
+ block_size_ = 0;
145
+ } else if (block_size_ > 0) {
146
+ double entropy = BitsEntropy(&(*histograms_)[curr_histogram_ix_].data_[0],
147
+ alphabet_size_);
148
+ HistogramType combined_histo[2];
149
+ double combined_entropy[2];
150
+ double diff[2];
151
+ for (int j = 0; j < 2; ++j) {
152
+ int last_histogram_ix = last_histogram_ix_[j];
153
+ combined_histo[j] = (*histograms_)[curr_histogram_ix_];
154
+ combined_histo[j].AddHistogram((*histograms_)[last_histogram_ix]);
155
+ combined_entropy[j] = BitsEntropy(
156
+ &combined_histo[j].data_[0], alphabet_size_);
157
+ diff[j] = combined_entropy[j] - entropy - last_entropy_[j];
158
+ }
159
+
160
+ if (split_->num_types < kMaxBlockTypes &&
161
+ diff[0] > split_threshold_ &&
162
+ diff[1] > split_threshold_) {
163
+ // Create new block.
164
+ split_->lengths[num_blocks_] = block_size_;
165
+ split_->types[num_blocks_] = split_->num_types;
166
+ last_histogram_ix_[1] = last_histogram_ix_[0];
167
+ last_histogram_ix_[0] = split_->num_types;
168
+ last_entropy_[1] = last_entropy_[0];
169
+ last_entropy_[0] = entropy;
170
+ ++num_blocks_;
171
+ ++split_->num_types;
172
+ ++curr_histogram_ix_;
173
+ block_size_ = 0;
174
+ merge_last_count_ = 0;
175
+ target_block_size_ = min_block_size_;
176
+ } else if (diff[1] < diff[0] - 20.0) {
177
+ // Combine this block with second last block.
178
+ split_->lengths[num_blocks_] = block_size_;
179
+ split_->types[num_blocks_] = split_->types[num_blocks_ - 2];
180
+ std::swap(last_histogram_ix_[0], last_histogram_ix_[1]);
181
+ (*histograms_)[last_histogram_ix_[0]] = combined_histo[1];
182
+ last_entropy_[1] = last_entropy_[0];
183
+ last_entropy_[0] = combined_entropy[1];
184
+ ++num_blocks_;
185
+ block_size_ = 0;
186
+ (*histograms_)[curr_histogram_ix_].Clear();
187
+ merge_last_count_ = 0;
188
+ target_block_size_ = min_block_size_;
189
+ } else {
190
+ // Combine this block with last block.
191
+ split_->lengths[num_blocks_ - 1] += block_size_;
192
+ (*histograms_)[last_histogram_ix_[0]] = combined_histo[0];
193
+ last_entropy_[0] = combined_entropy[0];
194
+ if (split_->num_types == 1) {
195
+ last_entropy_[1] = last_entropy_[0];
196
+ }
197
+ block_size_ = 0;
198
+ (*histograms_)[curr_histogram_ix_].Clear();
199
+ if (++merge_last_count_ > 1) {
200
+ target_block_size_ += min_block_size_;
201
+ }
202
+ }
203
+ }
204
+ if (is_final) {
205
+ (*histograms_).resize(split_->num_types);
206
+ split_->types.resize(num_blocks_);
207
+ split_->lengths.resize(num_blocks_);
208
+ }
209
+ }
210
+
211
+ private:
212
+ static const int kMaxBlockTypes = 256;
213
+
214
+ // Alphabet size of particular block category.
215
+ const int alphabet_size_;
216
+ // We collect at least this many symbols for each block.
217
+ const int min_block_size_;
218
+ // We merge histograms A and B if
219
+ // entropy(A+B) < entropy(A) + entropy(B) + split_threshold_,
220
+ // where A is the current histogram and B is the histogram of the last or the
221
+ // second last block type.
222
+ const double split_threshold_;
223
+
224
+ int num_blocks_;
225
+ BlockSplit* split_; // not owned
226
+ std::vector<HistogramType>* histograms_; // not owned
227
+
228
+ // The number of symbols that we want to collect before deciding on whether
229
+ // or not to merge the block with a previous one or emit a new block.
230
+ int target_block_size_;
231
+ // The number of symbols in the current histogram.
232
+ int block_size_;
233
+ // Offset of the current histogram.
234
+ int curr_histogram_ix_;
235
+ // Offset of the histograms of the previous two block types.
236
+ int last_histogram_ix_[2];
237
+ // Entropy of the previous two block types.
238
+ double last_entropy_[2];
239
+ // The number of times we merged the current block with the last one.
240
+ int merge_last_count_;
241
+ };
242
+
243
+ void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
244
+ size_t pos,
245
+ size_t mask,
246
+ const Command *commands,
247
+ size_t n_commands,
248
+ MetaBlockSplit* mb) {
249
+ int num_literals = 0;
250
+ for (int i = 0; i < n_commands; ++i) {
251
+ num_literals += commands[i].insert_len_;
252
+ }
253
+
254
+ BlockSplitter<HistogramLiteral> lit_blocks(
255
+ 256, 512, 400.0, num_literals,
256
+ &mb->literal_split, &mb->literal_histograms);
257
+ BlockSplitter<HistogramCommand> cmd_blocks(
258
+ kNumCommandPrefixes, 1024, 500.0, n_commands,
259
+ &mb->command_split, &mb->command_histograms);
260
+ BlockSplitter<HistogramDistance> dist_blocks(
261
+ 64, 512, 100.0, n_commands,
262
+ &mb->distance_split, &mb->distance_histograms);
263
+
264
+ for (int i = 0; i < n_commands; ++i) {
265
+ const Command cmd = commands[i];
266
+ cmd_blocks.AddSymbol(cmd.cmd_prefix_);
267
+ for (int j = 0; j < cmd.insert_len_; ++j) {
268
+ lit_blocks.AddSymbol(ringbuffer[pos & mask]);
269
+ ++pos;
270
+ }
271
+ pos += cmd.copy_len_;
272
+ if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
273
+ dist_blocks.AddSymbol(cmd.dist_prefix_);
274
+ }
275
+ }
276
+
277
+ lit_blocks.FinishBlock(/* is_final = */ true);
278
+ cmd_blocks.FinishBlock(/* is_final = */ true);
279
+ dist_blocks.FinishBlock(/* is_final = */ true);
280
+ }
281
+
282
+ // Greedy block splitter for one block category (literal, command or distance).
283
+ // Gathers histograms for all context buckets.
284
+ template<typename HistogramType>
285
+ class ContextBlockSplitter {
286
+ public:
287
+ ContextBlockSplitter(int alphabet_size,
288
+ int num_contexts,
289
+ int min_block_size,
290
+ double split_threshold,
291
+ int num_symbols,
292
+ BlockSplit* split,
293
+ std::vector<HistogramType>* histograms)
294
+ : alphabet_size_(alphabet_size),
295
+ num_contexts_(num_contexts),
296
+ max_block_types_(kMaxBlockTypes / num_contexts),
297
+ min_block_size_(min_block_size),
298
+ split_threshold_(split_threshold),
299
+ num_blocks_(0),
300
+ split_(split),
301
+ histograms_(histograms),
302
+ target_block_size_(min_block_size),
303
+ block_size_(0),
304
+ curr_histogram_ix_(0),
305
+ last_entropy_(2 * num_contexts),
306
+ merge_last_count_(0) {
307
+ int max_num_blocks = num_symbols / min_block_size + 1;
308
+ // We have to allocate one more histogram than the maximum number of block
309
+ // types for the current histogram when the meta-block is too big.
310
+ int max_num_types = std::min(max_num_blocks, max_block_types_ + 1);
311
+ split_->lengths.resize(max_num_blocks);
312
+ split_->types.resize(max_num_blocks);
313
+ histograms_->resize(max_num_types * num_contexts);
314
+ last_histogram_ix_[0] = last_histogram_ix_[1] = 0;
315
+ }
316
+
317
+ // Adds the next symbol to the current block type and context. When the
318
+ // current block reaches the target size, decides on merging the block.
319
+ void AddSymbol(int symbol, int context) {
320
+ (*histograms_)[curr_histogram_ix_ + context].Add(symbol);
321
+ ++block_size_;
322
+ if (block_size_ == target_block_size_) {
323
+ FinishBlock(/* is_final = */ false);
324
+ }
325
+ }
326
+
327
+ // Does either of three things:
328
+ // (1) emits the current block with a new block type;
329
+ // (2) emits the current block with the type of the second last block;
330
+ // (3) merges the current block with the last block.
331
+ void FinishBlock(bool is_final) {
332
+ if (block_size_ < min_block_size_) {
333
+ block_size_ = min_block_size_;
334
+ }
335
+ if (num_blocks_ == 0) {
336
+ // Create first block.
337
+ split_->lengths[0] = block_size_;
338
+ split_->types[0] = 0;
339
+ for (int i = 0; i < num_contexts_; ++i) {
340
+ last_entropy_[i] =
341
+ BitsEntropy(&(*histograms_)[i].data_[0], alphabet_size_);
342
+ last_entropy_[num_contexts_ + i] = last_entropy_[i];
343
+ }
344
+ ++num_blocks_;
345
+ ++split_->num_types;
346
+ curr_histogram_ix_ += num_contexts_;
347
+ block_size_ = 0;
348
+ } else if (block_size_ > 0) {
349
+ // Try merging the set of histograms for the current block type with the
350
+ // respective set of histograms for the last and second last block types.
351
+ // Decide over the split based on the total reduction of entropy across
352
+ // all contexts.
353
+ std::vector<double> entropy(num_contexts_);
354
+ std::vector<HistogramType> combined_histo(2 * num_contexts_);
355
+ std::vector<double> combined_entropy(2 * num_contexts_);
356
+ double diff[2] = { 0.0 };
357
+ for (int i = 0; i < num_contexts_; ++i) {
358
+ int curr_histo_ix = curr_histogram_ix_ + i;
359
+ entropy[i] = BitsEntropy(&(*histograms_)[curr_histo_ix].data_[0],
360
+ alphabet_size_);
361
+ for (int j = 0; j < 2; ++j) {
362
+ int jx = j * num_contexts_ + i;
363
+ int last_histogram_ix = last_histogram_ix_[j] + i;
364
+ combined_histo[jx] = (*histograms_)[curr_histo_ix];
365
+ combined_histo[jx].AddHistogram((*histograms_)[last_histogram_ix]);
366
+ combined_entropy[jx] = BitsEntropy(
367
+ &combined_histo[jx].data_[0], alphabet_size_);
368
+ diff[j] += combined_entropy[jx] - entropy[i] - last_entropy_[jx];
369
+ }
370
+ }
371
+
372
+ if (split_->num_types < max_block_types_ &&
373
+ diff[0] > split_threshold_ &&
374
+ diff[1] > split_threshold_) {
375
+ // Create new block.
376
+ split_->lengths[num_blocks_] = block_size_;
377
+ split_->types[num_blocks_] = split_->num_types;
378
+ last_histogram_ix_[1] = last_histogram_ix_[0];
379
+ last_histogram_ix_[0] = split_->num_types * num_contexts_;
380
+ for (int i = 0; i < num_contexts_; ++i) {
381
+ last_entropy_[num_contexts_ + i] = last_entropy_[i];
382
+ last_entropy_[i] = entropy[i];
383
+ }
384
+ ++num_blocks_;
385
+ ++split_->num_types;
386
+ curr_histogram_ix_ += num_contexts_;
387
+ block_size_ = 0;
388
+ merge_last_count_ = 0;
389
+ target_block_size_ = min_block_size_;
390
+ } else if (diff[1] < diff[0] - 20.0) {
391
+ // Combine this block with second last block.
392
+ split_->lengths[num_blocks_] = block_size_;
393
+ split_->types[num_blocks_] = split_->types[num_blocks_ - 2];
394
+ std::swap(last_histogram_ix_[0], last_histogram_ix_[1]);
395
+ for (int i = 0; i < num_contexts_; ++i) {
396
+ (*histograms_)[last_histogram_ix_[0] + i] =
397
+ combined_histo[num_contexts_ + i];
398
+ last_entropy_[num_contexts_ + i] = last_entropy_[i];
399
+ last_entropy_[i] = combined_entropy[num_contexts_ + i];
400
+ (*histograms_)[curr_histogram_ix_ + i].Clear();
401
+ }
402
+ ++num_blocks_;
403
+ block_size_ = 0;
404
+ merge_last_count_ = 0;
405
+ target_block_size_ = min_block_size_;
406
+ } else {
407
+ // Combine this block with last block.
408
+ split_->lengths[num_blocks_ - 1] += block_size_;
409
+ for (int i = 0; i < num_contexts_; ++i) {
410
+ (*histograms_)[last_histogram_ix_[0] + i] = combined_histo[i];
411
+ last_entropy_[i] = combined_entropy[i];
412
+ if (split_->num_types == 1) {
413
+ last_entropy_[num_contexts_ + i] = last_entropy_[i];
414
+ }
415
+ (*histograms_)[curr_histogram_ix_ + i].Clear();
416
+ }
417
+ block_size_ = 0;
418
+ if (++merge_last_count_ > 1) {
419
+ target_block_size_ += min_block_size_;
420
+ }
421
+ }
422
+ }
423
+ if (is_final) {
424
+ (*histograms_).resize(split_->num_types * num_contexts_);
425
+ split_->types.resize(num_blocks_);
426
+ split_->lengths.resize(num_blocks_);
427
+ }
428
+ }
429
+
430
+ private:
431
+ static const int kMaxBlockTypes = 256;
432
+
433
+ // Alphabet size of particular block category.
434
+ const int alphabet_size_;
435
+ const int num_contexts_;
436
+ const int max_block_types_;
437
+ // We collect at least this many symbols for each block.
438
+ const int min_block_size_;
439
+ // We merge histograms A and B if
440
+ // entropy(A+B) < entropy(A) + entropy(B) + split_threshold_,
441
+ // where A is the current histogram and B is the histogram of the last or the
442
+ // second last block type.
443
+ const double split_threshold_;
444
+
445
+ int num_blocks_;
446
+ BlockSplit* split_; // not owned
447
+ std::vector<HistogramType>* histograms_; // not owned
448
+
449
+ // The number of symbols that we want to collect before deciding on whether
450
+ // or not to merge the block with a previous one or emit a new block.
451
+ int target_block_size_;
452
+ // The number of symbols in the current histogram.
453
+ int block_size_;
454
+ // Offset of the current histogram.
455
+ int curr_histogram_ix_;
456
+ // Offset of the histograms of the previous two block types.
457
+ int last_histogram_ix_[2];
458
+ // Entropy of the previous two block types.
459
+ std::vector<double> last_entropy_;
460
+ // The number of times we merged the current block with the last one.
461
+ int merge_last_count_;
462
+ };
463
+
464
+ void BuildMetaBlockGreedyWithContexts(const uint8_t* ringbuffer,
465
+ size_t pos,
466
+ size_t mask,
467
+ uint8_t prev_byte,
468
+ uint8_t prev_byte2,
469
+ int literal_context_mode,
470
+ int num_contexts,
471
+ const int* static_context_map,
472
+ const Command *commands,
473
+ size_t n_commands,
474
+ MetaBlockSplit* mb) {
475
+ int num_literals = 0;
476
+ for (int i = 0; i < n_commands; ++i) {
477
+ num_literals += commands[i].insert_len_;
478
+ }
479
+
480
+ ContextBlockSplitter<HistogramLiteral> lit_blocks(
481
+ 256, num_contexts, 512, 400.0, num_literals,
482
+ &mb->literal_split, &mb->literal_histograms);
483
+ BlockSplitter<HistogramCommand> cmd_blocks(
484
+ kNumCommandPrefixes, 1024, 500.0, n_commands,
485
+ &mb->command_split, &mb->command_histograms);
486
+ BlockSplitter<HistogramDistance> dist_blocks(
487
+ 64, 512, 100.0, n_commands,
488
+ &mb->distance_split, &mb->distance_histograms);
489
+
490
+ for (int i = 0; i < n_commands; ++i) {
491
+ const Command cmd = commands[i];
492
+ cmd_blocks.AddSymbol(cmd.cmd_prefix_);
493
+ for (int j = 0; j < cmd.insert_len_; ++j) {
494
+ int context = Context(prev_byte, prev_byte2, literal_context_mode);
495
+ uint8_t literal = ringbuffer[pos & mask];
496
+ lit_blocks.AddSymbol(literal, static_context_map[context]);
497
+ prev_byte2 = prev_byte;
498
+ prev_byte = literal;
499
+ ++pos;
500
+ }
501
+ pos += cmd.copy_len_;
502
+ if (cmd.copy_len_ > 0) {
503
+ prev_byte2 = ringbuffer[(pos - 2) & mask];
504
+ prev_byte = ringbuffer[(pos - 1) & mask];
505
+ if (cmd.cmd_prefix_ >= 128) {
506
+ dist_blocks.AddSymbol(cmd.dist_prefix_);
507
+ }
508
+ }
509
+ }
510
+
511
+ lit_blocks.FinishBlock(/* is_final = */ true);
512
+ cmd_blocks.FinishBlock(/* is_final = */ true);
513
+ dist_blocks.FinishBlock(/* is_final = */ true);
514
+
515
+ mb->literal_context_map.resize(
516
+ mb->literal_split.num_types << kLiteralContextBits);
517
+ for (int i = 0; i < mb->literal_split.num_types; ++i) {
518
+ for (int j = 0; j < (1 << kLiteralContextBits); ++j) {
519
+ mb->literal_context_map[(i << kLiteralContextBits) + j] =
520
+ i * num_contexts + static_context_map[j];
521
+ }
522
+ }
523
+ }
524
+
525
+ void OptimizeHistograms(int num_direct_distance_codes,
526
+ int distance_postfix_bits,
527
+ MetaBlockSplit* mb) {
528
+ for (int i = 0; i < mb->literal_histograms.size(); ++i) {
529
+ OptimizeHuffmanCountsForRle(256, &mb->literal_histograms[i].data_[0]);
530
+ }
531
+ for (int i = 0; i < mb->command_histograms.size(); ++i) {
532
+ OptimizeHuffmanCountsForRle(kNumCommandPrefixes,
533
+ &mb->command_histograms[i].data_[0]);
534
+ }
535
+ int num_distance_codes =
536
+ kNumDistanceShortCodes + num_direct_distance_codes +
537
+ (48 << distance_postfix_bits);
538
+ for (int i = 0; i < mb->distance_histograms.size(); ++i) {
539
+ OptimizeHuffmanCountsForRle(num_distance_codes,
540
+ &mb->distance_histograms[i].data_[0]);
541
+ }
542
+ }
543
+
544
+ } // namespace brotli