isomorfeus-ferret 0.12.6 → 0.12.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -4
- data/ext/isomorfeus_ferret_ext/brotli_common_constants.c +15 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_constants.h +200 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_context.c +156 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_context.h +113 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +5914 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.h +64 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_platform.c +22 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_platform.h +594 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_transform.c +291 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_transform.h +85 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_version.h +26 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.c +76 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.h +351 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +2608 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.c +339 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.h +121 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_prefix.h +732 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_state.c +159 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_state.h +365 -0
- data/ext/isomorfeus_ferret_ext/brotli_decode.h +344 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.c +145 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.h +39 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.c +843 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.h +95 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_inc.h +163 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.c +35 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.h +63 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost_inc.h +127 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_encoder_inc.h +34 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.c +194 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.h +51 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter_inc.h +440 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.c +1314 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.h +84 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.c +56 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.h +48 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_cluster_inc.h +320 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_command.c +28 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_command.h +190 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.c +790 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.h +61 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.c +645 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.h +54 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.c +1846 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.h +25 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_encode.c +1927 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.c +33 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.h +43 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.c +503 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.h +122 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode_static.h +539 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.c +105 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.h +66 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_find_match_length.h +79 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash.h +488 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_composite_inc.h +125 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_forgetful_chain_inc.h +293 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match64_inc.h +267 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_inc.h +262 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_quickly_inc.h +266 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_rolling_inc.h +212 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_to_binary_tree_inc.h +329 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.c +100 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.h +63 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_histogram_inc.h +51 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.c +175 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.h +30 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_memory.c +170 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_memory.h +114 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.c +663 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.h +105 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_metablock_inc.h +183 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_params.h +46 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_prefix.h +53 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_quality.h +165 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_ringbuffer.h +167 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.c +486 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.h +40 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict_lut.h +5864 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.c +85 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.h +32 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_write_bits.h +87 -0
- data/ext/isomorfeus_ferret_ext/brotli_encode.h +448 -0
- data/ext/isomorfeus_ferret_ext/brotli_port.h +288 -0
- data/ext/isomorfeus_ferret_ext/brotli_types.h +83 -0
- data/ext/isomorfeus_ferret_ext/frb_index.c +35 -4
- data/ext/isomorfeus_ferret_ext/frt_document.h +1 -0
- data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_index.c +174 -25
- data/ext/isomorfeus_ferret_ext/frt_index.h +6 -3
- data/ext/isomorfeus_ferret_ext/frt_ram_store.c +1 -0
- data/ext/isomorfeus_ferret_ext/test_fields.c +57 -45
- data/ext/isomorfeus_ferret_ext/test_index.c +4 -1
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +88 -3
@@ -0,0 +1,194 @@
|
|
1
|
+
/* Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Distributed under MIT license.
|
4
|
+
See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
|
5
|
+
*/
|
6
|
+
|
7
|
+
/* Block split point selection utilities. */
|
8
|
+
|
9
|
+
#include "brotli_enc_block_splitter.h"
|
10
|
+
|
11
|
+
#include <string.h> /* memcpy, memset */
|
12
|
+
|
13
|
+
#include "brotli_common_platform.h"
|
14
|
+
#include "brotli_enc_bit_cost.h"
|
15
|
+
#include "brotli_enc_cluster.h"
|
16
|
+
#include "brotli_enc_command.h"
|
17
|
+
#include "brotli_enc_fast_log.h"
|
18
|
+
#include "brotli_enc_histogram.h"
|
19
|
+
#include "brotli_enc_memory.h"
|
20
|
+
#include "brotli_enc_quality.h"
|
21
|
+
|
22
|
+
#if defined(__cplusplus) || defined(c_plusplus)
|
23
|
+
extern "C" {
|
24
|
+
#endif
|
25
|
+
|
26
|
+
static const size_t kMaxLiteralHistograms = 100;
|
27
|
+
static const size_t kMaxCommandHistograms = 50;
|
28
|
+
static const double kLiteralBlockSwitchCost = 28.1;
|
29
|
+
static const double kCommandBlockSwitchCost = 13.5;
|
30
|
+
static const double kDistanceBlockSwitchCost = 14.6;
|
31
|
+
static const size_t kLiteralStrideLength = 70;
|
32
|
+
static const size_t kCommandStrideLength = 40;
|
33
|
+
static const size_t kSymbolsPerLiteralHistogram = 544;
|
34
|
+
static const size_t kSymbolsPerCommandHistogram = 530;
|
35
|
+
static const size_t kSymbolsPerDistanceHistogram = 544;
|
36
|
+
static const size_t kMinLengthForBlockSplitting = 128;
|
37
|
+
static const size_t kIterMulForRefining = 2;
|
38
|
+
static const size_t kMinItersForRefining = 100;
|
39
|
+
|
40
|
+
static size_t CountLiterals(const Command* cmds, const size_t num_commands) {
|
41
|
+
/* Count how many we have. */
|
42
|
+
size_t total_length = 0;
|
43
|
+
size_t i;
|
44
|
+
for (i = 0; i < num_commands; ++i) {
|
45
|
+
total_length += cmds[i].insert_len_;
|
46
|
+
}
|
47
|
+
return total_length;
|
48
|
+
}
|
49
|
+
|
50
|
+
static void CopyLiteralsToByteArray(const Command* cmds,
|
51
|
+
const size_t num_commands,
|
52
|
+
const uint8_t* data,
|
53
|
+
const size_t offset,
|
54
|
+
const size_t mask,
|
55
|
+
uint8_t* literals) {
|
56
|
+
size_t pos = 0;
|
57
|
+
size_t from_pos = offset & mask;
|
58
|
+
size_t i;
|
59
|
+
for (i = 0; i < num_commands; ++i) {
|
60
|
+
size_t insert_len = cmds[i].insert_len_;
|
61
|
+
if (from_pos + insert_len > mask) {
|
62
|
+
size_t head_size = mask + 1 - from_pos;
|
63
|
+
memcpy(literals + pos, data + from_pos, head_size);
|
64
|
+
from_pos = 0;
|
65
|
+
pos += head_size;
|
66
|
+
insert_len -= head_size;
|
67
|
+
}
|
68
|
+
if (insert_len > 0) {
|
69
|
+
memcpy(literals + pos, data + from_pos, insert_len);
|
70
|
+
pos += insert_len;
|
71
|
+
}
|
72
|
+
from_pos = (from_pos + insert_len + CommandCopyLen(&cmds[i])) & mask;
|
73
|
+
}
|
74
|
+
}
|
75
|
+
|
76
|
+
static BROTLI_INLINE uint32_t MyRand(uint32_t* seed) {
|
77
|
+
/* Initial seed should be 7. In this case, loop length is (1 << 29). */
|
78
|
+
*seed *= 16807U;
|
79
|
+
return *seed;
|
80
|
+
}
|
81
|
+
|
82
|
+
static BROTLI_INLINE double BitCost(size_t count) {
|
83
|
+
return count == 0 ? -2.0 : FastLog2(count);
|
84
|
+
}
|
85
|
+
|
86
|
+
#define HISTOGRAMS_PER_BATCH 64
|
87
|
+
#define CLUSTERS_PER_BATCH 16
|
88
|
+
|
89
|
+
#define FN(X) X ## Literal
|
90
|
+
#define DataType uint8_t
|
91
|
+
/* NOLINTNEXTLINE(build/include) */
|
92
|
+
#include "brotli_enc_block_splitter_inc.h"
|
93
|
+
#undef DataType
|
94
|
+
#undef FN
|
95
|
+
|
96
|
+
#define FN(X) X ## Command
|
97
|
+
#define DataType uint16_t
|
98
|
+
/* NOLINTNEXTLINE(build/include) */
|
99
|
+
#include "brotli_enc_block_splitter_inc.h"
|
100
|
+
#undef FN
|
101
|
+
|
102
|
+
#define FN(X) X ## Distance
|
103
|
+
/* NOLINTNEXTLINE(build/include) */
|
104
|
+
#include "brotli_enc_block_splitter_inc.h"
|
105
|
+
#undef DataType
|
106
|
+
#undef FN
|
107
|
+
|
108
|
+
void BrotliInitBlockSplit(BlockSplit* self) {
|
109
|
+
self->num_types = 0;
|
110
|
+
self->num_blocks = 0;
|
111
|
+
self->types = 0;
|
112
|
+
self->lengths = 0;
|
113
|
+
self->types_alloc_size = 0;
|
114
|
+
self->lengths_alloc_size = 0;
|
115
|
+
}
|
116
|
+
|
117
|
+
void BrotliDestroyBlockSplit(MemoryManager* m, BlockSplit* self) {
|
118
|
+
BROTLI_FREE(m, self->types);
|
119
|
+
BROTLI_FREE(m, self->lengths);
|
120
|
+
}
|
121
|
+
|
122
|
+
void BrotliSplitBlock(MemoryManager* m,
|
123
|
+
const Command* cmds,
|
124
|
+
const size_t num_commands,
|
125
|
+
const uint8_t* data,
|
126
|
+
const size_t pos,
|
127
|
+
const size_t mask,
|
128
|
+
const BrotliEncoderParams* params,
|
129
|
+
BlockSplit* literal_split,
|
130
|
+
BlockSplit* insert_and_copy_split,
|
131
|
+
BlockSplit* dist_split) {
|
132
|
+
{
|
133
|
+
size_t literals_count = CountLiterals(cmds, num_commands);
|
134
|
+
uint8_t* literals = BROTLI_ALLOC(m, uint8_t, literals_count);
|
135
|
+
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(literals)) return;
|
136
|
+
/* Create a continuous array of literals. */
|
137
|
+
CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, literals);
|
138
|
+
/* Create the block split on the array of literals.
|
139
|
+
Literal histograms have alphabet size 256. */
|
140
|
+
SplitByteVectorLiteral(
|
141
|
+
m, literals, literals_count,
|
142
|
+
kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
|
143
|
+
kLiteralStrideLength, kLiteralBlockSwitchCost, params,
|
144
|
+
literal_split);
|
145
|
+
if (BROTLI_IS_OOM(m)) return;
|
146
|
+
BROTLI_FREE(m, literals);
|
147
|
+
}
|
148
|
+
|
149
|
+
{
|
150
|
+
/* Compute prefix codes for commands. */
|
151
|
+
uint16_t* insert_and_copy_codes = BROTLI_ALLOC(m, uint16_t, num_commands);
|
152
|
+
size_t i;
|
153
|
+
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(insert_and_copy_codes)) return;
|
154
|
+
for (i = 0; i < num_commands; ++i) {
|
155
|
+
insert_and_copy_codes[i] = cmds[i].cmd_prefix_;
|
156
|
+
}
|
157
|
+
/* Create the block split on the array of command prefixes. */
|
158
|
+
SplitByteVectorCommand(
|
159
|
+
m, insert_and_copy_codes, num_commands,
|
160
|
+
kSymbolsPerCommandHistogram, kMaxCommandHistograms,
|
161
|
+
kCommandStrideLength, kCommandBlockSwitchCost, params,
|
162
|
+
insert_and_copy_split);
|
163
|
+
if (BROTLI_IS_OOM(m)) return;
|
164
|
+
/* TODO: reuse for distances? */
|
165
|
+
BROTLI_FREE(m, insert_and_copy_codes);
|
166
|
+
}
|
167
|
+
|
168
|
+
{
|
169
|
+
/* Create a continuous array of distance prefixes. */
|
170
|
+
uint16_t* distance_prefixes = BROTLI_ALLOC(m, uint16_t, num_commands);
|
171
|
+
size_t j = 0;
|
172
|
+
size_t i;
|
173
|
+
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(distance_prefixes)) return;
|
174
|
+
for (i = 0; i < num_commands; ++i) {
|
175
|
+
const Command* cmd = &cmds[i];
|
176
|
+
if (CommandCopyLen(cmd) && cmd->cmd_prefix_ >= 128) {
|
177
|
+
distance_prefixes[j++] = cmd->dist_prefix_ & 0x3FF;
|
178
|
+
}
|
179
|
+
}
|
180
|
+
/* Create the block split on the array of distance prefixes. */
|
181
|
+
SplitByteVectorDistance(
|
182
|
+
m, distance_prefixes, j,
|
183
|
+
kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
|
184
|
+
kCommandStrideLength, kDistanceBlockSwitchCost, params,
|
185
|
+
dist_split);
|
186
|
+
if (BROTLI_IS_OOM(m)) return;
|
187
|
+
BROTLI_FREE(m, distance_prefixes);
|
188
|
+
}
|
189
|
+
}
|
190
|
+
|
191
|
+
|
192
|
+
#if defined(__cplusplus) || defined(c_plusplus)
|
193
|
+
} /* extern "C" */
|
194
|
+
#endif
|
@@ -0,0 +1,51 @@
|
|
1
|
+
/* Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Distributed under MIT license.
|
4
|
+
See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
|
5
|
+
*/
|
6
|
+
|
7
|
+
/* Block split point selection utilities. */
|
8
|
+
|
9
|
+
#ifndef BROTLI_ENC_BLOCK_SPLITTER_H_
|
10
|
+
#define BROTLI_ENC_BLOCK_SPLITTER_H_
|
11
|
+
|
12
|
+
#include "brotli_common_platform.h"
|
13
|
+
#include "brotli_types.h"
|
14
|
+
#include "brotli_enc_command.h"
|
15
|
+
#include "brotli_enc_memory.h"
|
16
|
+
#include "brotli_enc_quality.h"
|
17
|
+
|
18
|
+
#if defined(__cplusplus) || defined(c_plusplus)
|
19
|
+
extern "C" {
|
20
|
+
#endif
|
21
|
+
|
22
|
+
typedef struct BlockSplit {
|
23
|
+
size_t num_types; /* Amount of distinct types */
|
24
|
+
size_t num_blocks; /* Amount of values in types and length */
|
25
|
+
uint8_t* types;
|
26
|
+
uint32_t* lengths;
|
27
|
+
|
28
|
+
size_t types_alloc_size;
|
29
|
+
size_t lengths_alloc_size;
|
30
|
+
} BlockSplit;
|
31
|
+
|
32
|
+
BROTLI_INTERNAL void BrotliInitBlockSplit(BlockSplit* self);
|
33
|
+
BROTLI_INTERNAL void BrotliDestroyBlockSplit(MemoryManager* m,
|
34
|
+
BlockSplit* self);
|
35
|
+
|
36
|
+
BROTLI_INTERNAL void BrotliSplitBlock(MemoryManager* m,
|
37
|
+
const Command* cmds,
|
38
|
+
const size_t num_commands,
|
39
|
+
const uint8_t* data,
|
40
|
+
const size_t offset,
|
41
|
+
const size_t mask,
|
42
|
+
const BrotliEncoderParams* params,
|
43
|
+
BlockSplit* literal_split,
|
44
|
+
BlockSplit* insert_and_copy_split,
|
45
|
+
BlockSplit* dist_split);
|
46
|
+
|
47
|
+
#if defined(__cplusplus) || defined(c_plusplus)
|
48
|
+
} /* extern "C" */
|
49
|
+
#endif
|
50
|
+
|
51
|
+
#endif /* BROTLI_ENC_BLOCK_SPLITTER_H_ */
|
@@ -0,0 +1,440 @@
|
|
1
|
+
/* NOLINT(build/header_guard) */
|
2
|
+
/* Copyright 2013 Google Inc. All Rights Reserved.
|
3
|
+
|
4
|
+
Distributed under MIT license.
|
5
|
+
See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
|
6
|
+
*/
|
7
|
+
|
8
|
+
/* template parameters: FN, DataType */
|
9
|
+
|
10
|
+
#define HistogramType FN(Histogram)
|
11
|
+
|
12
|
+
static void FN(InitialEntropyCodes)(const DataType* data, size_t length,
|
13
|
+
size_t stride,
|
14
|
+
size_t num_histograms,
|
15
|
+
HistogramType* histograms) {
|
16
|
+
uint32_t seed = 7;
|
17
|
+
size_t block_length = length / num_histograms;
|
18
|
+
size_t i;
|
19
|
+
FN(ClearHistograms)(histograms, num_histograms);
|
20
|
+
for (i = 0; i < num_histograms; ++i) {
|
21
|
+
size_t pos = length * i / num_histograms;
|
22
|
+
if (i != 0) {
|
23
|
+
pos += MyRand(&seed) % block_length;
|
24
|
+
}
|
25
|
+
if (pos + stride >= length) {
|
26
|
+
pos = length - stride - 1;
|
27
|
+
}
|
28
|
+
FN(HistogramAddVector)(&histograms[i], data + pos, stride);
|
29
|
+
}
|
30
|
+
}
|
31
|
+
|
32
|
+
static void FN(RandomSample)(uint32_t* seed,
|
33
|
+
const DataType* data,
|
34
|
+
size_t length,
|
35
|
+
size_t stride,
|
36
|
+
HistogramType* sample) {
|
37
|
+
size_t pos = 0;
|
38
|
+
if (stride >= length) {
|
39
|
+
stride = length;
|
40
|
+
} else {
|
41
|
+
pos = MyRand(seed) % (length - stride + 1);
|
42
|
+
}
|
43
|
+
FN(HistogramAddVector)(sample, data + pos, stride);
|
44
|
+
}
|
45
|
+
|
46
|
+
static void FN(RefineEntropyCodes)(const DataType* data, size_t length,
|
47
|
+
size_t stride,
|
48
|
+
size_t num_histograms,
|
49
|
+
HistogramType* histograms) {
|
50
|
+
size_t iters =
|
51
|
+
kIterMulForRefining * length / stride + kMinItersForRefining;
|
52
|
+
uint32_t seed = 7;
|
53
|
+
size_t iter;
|
54
|
+
iters = ((iters + num_histograms - 1) / num_histograms) * num_histograms;
|
55
|
+
for (iter = 0; iter < iters; ++iter) {
|
56
|
+
HistogramType sample;
|
57
|
+
FN(HistogramClear)(&sample);
|
58
|
+
FN(RandomSample)(&seed, data, length, stride, &sample);
|
59
|
+
FN(HistogramAddHistogram)(&histograms[iter % num_histograms], &sample);
|
60
|
+
}
|
61
|
+
}
|
62
|
+
|
63
|
+
/* Assigns a block id from the range [0, num_histograms) to each data element
|
64
|
+
in data[0..length) and fills in block_id[0..length) with the assigned values.
|
65
|
+
Returns the number of blocks, i.e. one plus the number of block switches. */
|
66
|
+
static size_t FN(FindBlocks)(const DataType* data, const size_t length,
|
67
|
+
const double block_switch_bitcost,
|
68
|
+
const size_t num_histograms,
|
69
|
+
const HistogramType* histograms,
|
70
|
+
double* insert_cost,
|
71
|
+
double* cost,
|
72
|
+
uint8_t* switch_signal,
|
73
|
+
uint8_t* block_id) {
|
74
|
+
const size_t data_size = FN(HistogramDataSize)();
|
75
|
+
const size_t bitmaplen = (num_histograms + 7) >> 3;
|
76
|
+
size_t num_blocks = 1;
|
77
|
+
size_t i;
|
78
|
+
size_t j;
|
79
|
+
BROTLI_DCHECK(num_histograms <= 256);
|
80
|
+
if (num_histograms <= 1) {
|
81
|
+
for (i = 0; i < length; ++i) {
|
82
|
+
block_id[i] = 0;
|
83
|
+
}
|
84
|
+
return 1;
|
85
|
+
}
|
86
|
+
memset(insert_cost, 0, sizeof(insert_cost[0]) * data_size * num_histograms);
|
87
|
+
for (i = 0; i < num_histograms; ++i) {
|
88
|
+
insert_cost[i] = FastLog2((uint32_t)histograms[i].total_count_);
|
89
|
+
}
|
90
|
+
for (i = data_size; i != 0;) {
|
91
|
+
--i;
|
92
|
+
for (j = 0; j < num_histograms; ++j) {
|
93
|
+
insert_cost[i * num_histograms + j] =
|
94
|
+
insert_cost[j] - BitCost(histograms[j].data_[i]);
|
95
|
+
}
|
96
|
+
}
|
97
|
+
memset(cost, 0, sizeof(cost[0]) * num_histograms);
|
98
|
+
memset(switch_signal, 0, sizeof(switch_signal[0]) * length * bitmaplen);
|
99
|
+
/* After each iteration of this loop, cost[k] will contain the difference
|
100
|
+
between the minimum cost of arriving at the current byte position using
|
101
|
+
entropy code k, and the minimum cost of arriving at the current byte
|
102
|
+
position. This difference is capped at the block switch cost, and if it
|
103
|
+
reaches block switch cost, it means that when we trace back from the last
|
104
|
+
position, we need to switch here. */
|
105
|
+
for (i = 0; i < length; ++i) {
|
106
|
+
const size_t byte_ix = i;
|
107
|
+
size_t ix = byte_ix * bitmaplen;
|
108
|
+
size_t insert_cost_ix = data[byte_ix] * num_histograms;
|
109
|
+
double min_cost = 1e99;
|
110
|
+
double block_switch_cost = block_switch_bitcost;
|
111
|
+
size_t k;
|
112
|
+
for (k = 0; k < num_histograms; ++k) {
|
113
|
+
/* We are coding the symbol in data[byte_ix] with entropy code k. */
|
114
|
+
cost[k] += insert_cost[insert_cost_ix + k];
|
115
|
+
if (cost[k] < min_cost) {
|
116
|
+
min_cost = cost[k];
|
117
|
+
block_id[byte_ix] = (uint8_t)k;
|
118
|
+
}
|
119
|
+
}
|
120
|
+
/* More blocks for the beginning. */
|
121
|
+
if (byte_ix < 2000) {
|
122
|
+
block_switch_cost *= 0.77 + 0.07 * (double)byte_ix / 2000;
|
123
|
+
}
|
124
|
+
for (k = 0; k < num_histograms; ++k) {
|
125
|
+
cost[k] -= min_cost;
|
126
|
+
if (cost[k] >= block_switch_cost) {
|
127
|
+
const uint8_t mask = (uint8_t)(1u << (k & 7));
|
128
|
+
cost[k] = block_switch_cost;
|
129
|
+
BROTLI_DCHECK((k >> 3) < bitmaplen);
|
130
|
+
switch_signal[ix + (k >> 3)] |= mask;
|
131
|
+
}
|
132
|
+
}
|
133
|
+
}
|
134
|
+
{ /* Trace back from the last position and switch at the marked places. */
|
135
|
+
size_t byte_ix = length - 1;
|
136
|
+
size_t ix = byte_ix * bitmaplen;
|
137
|
+
uint8_t cur_id = block_id[byte_ix];
|
138
|
+
while (byte_ix > 0) {
|
139
|
+
const uint8_t mask = (uint8_t)(1u << (cur_id & 7));
|
140
|
+
BROTLI_DCHECK(((size_t)cur_id >> 3) < bitmaplen);
|
141
|
+
--byte_ix;
|
142
|
+
ix -= bitmaplen;
|
143
|
+
if (switch_signal[ix + (cur_id >> 3)] & mask) {
|
144
|
+
if (cur_id != block_id[byte_ix]) {
|
145
|
+
cur_id = block_id[byte_ix];
|
146
|
+
++num_blocks;
|
147
|
+
}
|
148
|
+
}
|
149
|
+
block_id[byte_ix] = cur_id;
|
150
|
+
}
|
151
|
+
}
|
152
|
+
return num_blocks;
|
153
|
+
}
|
154
|
+
|
155
|
+
static size_t FN(RemapBlockIds)(uint8_t* block_ids, const size_t length,
|
156
|
+
uint16_t* new_id, const size_t num_histograms) {
|
157
|
+
static const uint16_t kInvalidId = 256;
|
158
|
+
uint16_t next_id = 0;
|
159
|
+
size_t i;
|
160
|
+
for (i = 0; i < num_histograms; ++i) {
|
161
|
+
new_id[i] = kInvalidId;
|
162
|
+
}
|
163
|
+
for (i = 0; i < length; ++i) {
|
164
|
+
BROTLI_DCHECK(block_ids[i] < num_histograms);
|
165
|
+
if (new_id[block_ids[i]] == kInvalidId) {
|
166
|
+
new_id[block_ids[i]] = next_id++;
|
167
|
+
}
|
168
|
+
}
|
169
|
+
for (i = 0; i < length; ++i) {
|
170
|
+
block_ids[i] = (uint8_t)new_id[block_ids[i]];
|
171
|
+
BROTLI_DCHECK(block_ids[i] < num_histograms);
|
172
|
+
}
|
173
|
+
BROTLI_DCHECK(next_id <= num_histograms);
|
174
|
+
return next_id;
|
175
|
+
}
|
176
|
+
|
177
|
+
static void FN(BuildBlockHistograms)(const DataType* data, const size_t length,
|
178
|
+
const uint8_t* block_ids,
|
179
|
+
const size_t num_histograms,
|
180
|
+
HistogramType* histograms) {
|
181
|
+
size_t i;
|
182
|
+
FN(ClearHistograms)(histograms, num_histograms);
|
183
|
+
for (i = 0; i < length; ++i) {
|
184
|
+
FN(HistogramAdd)(&histograms[block_ids[i]], data[i]);
|
185
|
+
}
|
186
|
+
}
|
187
|
+
|
188
|
+
static void FN(ClusterBlocks)(MemoryManager* m,
|
189
|
+
const DataType* data, const size_t length,
|
190
|
+
const size_t num_blocks,
|
191
|
+
uint8_t* block_ids,
|
192
|
+
BlockSplit* split) {
|
193
|
+
uint32_t* histogram_symbols = BROTLI_ALLOC(m, uint32_t, num_blocks);
|
194
|
+
uint32_t* block_lengths = BROTLI_ALLOC(m, uint32_t, num_blocks);
|
195
|
+
const size_t expected_num_clusters = CLUSTERS_PER_BATCH *
|
196
|
+
(num_blocks + HISTOGRAMS_PER_BATCH - 1) / HISTOGRAMS_PER_BATCH;
|
197
|
+
size_t all_histograms_size = 0;
|
198
|
+
size_t all_histograms_capacity = expected_num_clusters;
|
199
|
+
HistogramType* all_histograms =
|
200
|
+
BROTLI_ALLOC(m, HistogramType, all_histograms_capacity);
|
201
|
+
size_t cluster_size_size = 0;
|
202
|
+
size_t cluster_size_capacity = expected_num_clusters;
|
203
|
+
uint32_t* cluster_size = BROTLI_ALLOC(m, uint32_t, cluster_size_capacity);
|
204
|
+
size_t num_clusters = 0;
|
205
|
+
HistogramType* histograms = BROTLI_ALLOC(m, HistogramType,
|
206
|
+
BROTLI_MIN(size_t, num_blocks, HISTOGRAMS_PER_BATCH));
|
207
|
+
size_t max_num_pairs =
|
208
|
+
HISTOGRAMS_PER_BATCH * HISTOGRAMS_PER_BATCH / 2;
|
209
|
+
size_t pairs_capacity = max_num_pairs + 1;
|
210
|
+
HistogramPair* pairs = BROTLI_ALLOC(m, HistogramPair, pairs_capacity);
|
211
|
+
size_t pos = 0;
|
212
|
+
uint32_t* clusters;
|
213
|
+
size_t num_final_clusters;
|
214
|
+
static const uint32_t kInvalidIndex = BROTLI_UINT32_MAX;
|
215
|
+
uint32_t* new_index;
|
216
|
+
size_t i;
|
217
|
+
uint32_t sizes[HISTOGRAMS_PER_BATCH] = { 0 };
|
218
|
+
uint32_t new_clusters[HISTOGRAMS_PER_BATCH] = { 0 };
|
219
|
+
uint32_t symbols[HISTOGRAMS_PER_BATCH] = { 0 };
|
220
|
+
uint32_t remap[HISTOGRAMS_PER_BATCH] = { 0 };
|
221
|
+
|
222
|
+
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(histogram_symbols) ||
|
223
|
+
BROTLI_IS_NULL(block_lengths) || BROTLI_IS_NULL(all_histograms) ||
|
224
|
+
BROTLI_IS_NULL(cluster_size) || BROTLI_IS_NULL(histograms) ||
|
225
|
+
BROTLI_IS_NULL(pairs)) {
|
226
|
+
return;
|
227
|
+
}
|
228
|
+
|
229
|
+
memset(block_lengths, 0, num_blocks * sizeof(uint32_t));
|
230
|
+
|
231
|
+
{
|
232
|
+
size_t block_idx = 0;
|
233
|
+
for (i = 0; i < length; ++i) {
|
234
|
+
BROTLI_DCHECK(block_idx < num_blocks);
|
235
|
+
++block_lengths[block_idx];
|
236
|
+
if (i + 1 == length || block_ids[i] != block_ids[i + 1]) {
|
237
|
+
++block_idx;
|
238
|
+
}
|
239
|
+
}
|
240
|
+
BROTLI_DCHECK(block_idx == num_blocks);
|
241
|
+
}
|
242
|
+
|
243
|
+
for (i = 0; i < num_blocks; i += HISTOGRAMS_PER_BATCH) {
|
244
|
+
const size_t num_to_combine =
|
245
|
+
BROTLI_MIN(size_t, num_blocks - i, HISTOGRAMS_PER_BATCH);
|
246
|
+
size_t num_new_clusters;
|
247
|
+
size_t j;
|
248
|
+
for (j = 0; j < num_to_combine; ++j) {
|
249
|
+
size_t k;
|
250
|
+
FN(HistogramClear)(&histograms[j]);
|
251
|
+
for (k = 0; k < block_lengths[i + j]; ++k) {
|
252
|
+
FN(HistogramAdd)(&histograms[j], data[pos++]);
|
253
|
+
}
|
254
|
+
histograms[j].bit_cost_ = FN(BrotliPopulationCost)(&histograms[j]);
|
255
|
+
new_clusters[j] = (uint32_t)j;
|
256
|
+
symbols[j] = (uint32_t)j;
|
257
|
+
sizes[j] = 1;
|
258
|
+
}
|
259
|
+
num_new_clusters = FN(BrotliHistogramCombine)(
|
260
|
+
histograms, sizes, symbols, new_clusters, pairs, num_to_combine,
|
261
|
+
num_to_combine, HISTOGRAMS_PER_BATCH, max_num_pairs);
|
262
|
+
BROTLI_ENSURE_CAPACITY(m, HistogramType, all_histograms,
|
263
|
+
all_histograms_capacity, all_histograms_size + num_new_clusters);
|
264
|
+
BROTLI_ENSURE_CAPACITY(m, uint32_t, cluster_size,
|
265
|
+
cluster_size_capacity, cluster_size_size + num_new_clusters);
|
266
|
+
if (BROTLI_IS_OOM(m)) return;
|
267
|
+
for (j = 0; j < num_new_clusters; ++j) {
|
268
|
+
all_histograms[all_histograms_size++] = histograms[new_clusters[j]];
|
269
|
+
cluster_size[cluster_size_size++] = sizes[new_clusters[j]];
|
270
|
+
remap[new_clusters[j]] = (uint32_t)j;
|
271
|
+
}
|
272
|
+
for (j = 0; j < num_to_combine; ++j) {
|
273
|
+
histogram_symbols[i + j] = (uint32_t)num_clusters + remap[symbols[j]];
|
274
|
+
}
|
275
|
+
num_clusters += num_new_clusters;
|
276
|
+
BROTLI_DCHECK(num_clusters == cluster_size_size);
|
277
|
+
BROTLI_DCHECK(num_clusters == all_histograms_size);
|
278
|
+
}
|
279
|
+
BROTLI_FREE(m, histograms);
|
280
|
+
|
281
|
+
max_num_pairs =
|
282
|
+
BROTLI_MIN(size_t, 64 * num_clusters, (num_clusters / 2) * num_clusters);
|
283
|
+
if (pairs_capacity < max_num_pairs + 1) {
|
284
|
+
BROTLI_FREE(m, pairs);
|
285
|
+
pairs = BROTLI_ALLOC(m, HistogramPair, max_num_pairs + 1);
|
286
|
+
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(pairs)) return;
|
287
|
+
}
|
288
|
+
|
289
|
+
clusters = BROTLI_ALLOC(m, uint32_t, num_clusters);
|
290
|
+
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(clusters)) return;
|
291
|
+
for (i = 0; i < num_clusters; ++i) {
|
292
|
+
clusters[i] = (uint32_t)i;
|
293
|
+
}
|
294
|
+
num_final_clusters = FN(BrotliHistogramCombine)(
|
295
|
+
all_histograms, cluster_size, histogram_symbols, clusters, pairs,
|
296
|
+
num_clusters, num_blocks, BROTLI_MAX_NUMBER_OF_BLOCK_TYPES,
|
297
|
+
max_num_pairs);
|
298
|
+
BROTLI_FREE(m, pairs);
|
299
|
+
BROTLI_FREE(m, cluster_size);
|
300
|
+
|
301
|
+
new_index = BROTLI_ALLOC(m, uint32_t, num_clusters);
|
302
|
+
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(new_index)) return;
|
303
|
+
for (i = 0; i < num_clusters; ++i) new_index[i] = kInvalidIndex;
|
304
|
+
pos = 0;
|
305
|
+
{
|
306
|
+
uint32_t next_index = 0;
|
307
|
+
for (i = 0; i < num_blocks; ++i) {
|
308
|
+
HistogramType histo;
|
309
|
+
size_t j;
|
310
|
+
uint32_t best_out;
|
311
|
+
double best_bits;
|
312
|
+
FN(HistogramClear)(&histo);
|
313
|
+
for (j = 0; j < block_lengths[i]; ++j) {
|
314
|
+
FN(HistogramAdd)(&histo, data[pos++]);
|
315
|
+
}
|
316
|
+
best_out = (i == 0) ? histogram_symbols[0] : histogram_symbols[i - 1];
|
317
|
+
best_bits =
|
318
|
+
FN(BrotliHistogramBitCostDistance)(&histo, &all_histograms[best_out]);
|
319
|
+
for (j = 0; j < num_final_clusters; ++j) {
|
320
|
+
const double cur_bits = FN(BrotliHistogramBitCostDistance)(
|
321
|
+
&histo, &all_histograms[clusters[j]]);
|
322
|
+
if (cur_bits < best_bits) {
|
323
|
+
best_bits = cur_bits;
|
324
|
+
best_out = clusters[j];
|
325
|
+
}
|
326
|
+
}
|
327
|
+
histogram_symbols[i] = best_out;
|
328
|
+
if (new_index[best_out] == kInvalidIndex) {
|
329
|
+
new_index[best_out] = next_index++;
|
330
|
+
}
|
331
|
+
}
|
332
|
+
}
|
333
|
+
BROTLI_FREE(m, clusters);
|
334
|
+
BROTLI_FREE(m, all_histograms);
|
335
|
+
BROTLI_ENSURE_CAPACITY(
|
336
|
+
m, uint8_t, split->types, split->types_alloc_size, num_blocks);
|
337
|
+
BROTLI_ENSURE_CAPACITY(
|
338
|
+
m, uint32_t, split->lengths, split->lengths_alloc_size, num_blocks);
|
339
|
+
if (BROTLI_IS_OOM(m)) return;
|
340
|
+
{
|
341
|
+
uint32_t cur_length = 0;
|
342
|
+
size_t block_idx = 0;
|
343
|
+
uint8_t max_type = 0;
|
344
|
+
for (i = 0; i < num_blocks; ++i) {
|
345
|
+
cur_length += block_lengths[i];
|
346
|
+
if (i + 1 == num_blocks ||
|
347
|
+
histogram_symbols[i] != histogram_symbols[i + 1]) {
|
348
|
+
const uint8_t id = (uint8_t)new_index[histogram_symbols[i]];
|
349
|
+
split->types[block_idx] = id;
|
350
|
+
split->lengths[block_idx] = cur_length;
|
351
|
+
max_type = BROTLI_MAX(uint8_t, max_type, id);
|
352
|
+
cur_length = 0;
|
353
|
+
++block_idx;
|
354
|
+
}
|
355
|
+
}
|
356
|
+
split->num_blocks = block_idx;
|
357
|
+
split->num_types = (size_t)max_type + 1;
|
358
|
+
}
|
359
|
+
BROTLI_FREE(m, new_index);
|
360
|
+
BROTLI_FREE(m, block_lengths);
|
361
|
+
BROTLI_FREE(m, histogram_symbols);
|
362
|
+
}
|
363
|
+
|
364
|
+
static void FN(SplitByteVector)(MemoryManager* m,
|
365
|
+
const DataType* data, const size_t length,
|
366
|
+
const size_t literals_per_histogram,
|
367
|
+
const size_t max_histograms,
|
368
|
+
const size_t sampling_stride_length,
|
369
|
+
const double block_switch_cost,
|
370
|
+
const BrotliEncoderParams* params,
|
371
|
+
BlockSplit* split) {
|
372
|
+
const size_t data_size = FN(HistogramDataSize)();
|
373
|
+
size_t num_histograms = length / literals_per_histogram + 1;
|
374
|
+
HistogramType* histograms;
|
375
|
+
if (num_histograms > max_histograms) {
|
376
|
+
num_histograms = max_histograms;
|
377
|
+
}
|
378
|
+
if (length == 0) {
|
379
|
+
split->num_types = 1;
|
380
|
+
return;
|
381
|
+
} else if (length < kMinLengthForBlockSplitting) {
|
382
|
+
BROTLI_ENSURE_CAPACITY(m, uint8_t,
|
383
|
+
split->types, split->types_alloc_size, split->num_blocks + 1);
|
384
|
+
BROTLI_ENSURE_CAPACITY(m, uint32_t,
|
385
|
+
split->lengths, split->lengths_alloc_size, split->num_blocks + 1);
|
386
|
+
if (BROTLI_IS_OOM(m)) return;
|
387
|
+
split->num_types = 1;
|
388
|
+
split->types[split->num_blocks] = 0;
|
389
|
+
split->lengths[split->num_blocks] = (uint32_t)length;
|
390
|
+
split->num_blocks++;
|
391
|
+
return;
|
392
|
+
}
|
393
|
+
histograms = BROTLI_ALLOC(m, HistogramType, num_histograms);
|
394
|
+
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(histograms)) return;
|
395
|
+
/* Find good entropy codes. */
|
396
|
+
FN(InitialEntropyCodes)(data, length,
|
397
|
+
sampling_stride_length,
|
398
|
+
num_histograms, histograms);
|
399
|
+
FN(RefineEntropyCodes)(data, length,
|
400
|
+
sampling_stride_length,
|
401
|
+
num_histograms, histograms);
|
402
|
+
{
|
403
|
+
/* Find a good path through literals with the good entropy codes. */
|
404
|
+
uint8_t* block_ids = BROTLI_ALLOC(m, uint8_t, length);
|
405
|
+
size_t num_blocks = 0;
|
406
|
+
const size_t bitmaplen = (num_histograms + 7) >> 3;
|
407
|
+
double* insert_cost = BROTLI_ALLOC(m, double, data_size * num_histograms);
|
408
|
+
double* cost = BROTLI_ALLOC(m, double, num_histograms);
|
409
|
+
uint8_t* switch_signal = BROTLI_ALLOC(m, uint8_t, length * bitmaplen);
|
410
|
+
uint16_t* new_id = BROTLI_ALLOC(m, uint16_t, num_histograms);
|
411
|
+
const size_t iters = params->quality < HQ_ZOPFLIFICATION_QUALITY ? 3 : 10;
|
412
|
+
size_t i;
|
413
|
+
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(block_ids) ||
|
414
|
+
BROTLI_IS_NULL(insert_cost) || BROTLI_IS_NULL(cost) ||
|
415
|
+
BROTLI_IS_NULL(switch_signal) || BROTLI_IS_NULL(new_id)) {
|
416
|
+
return;
|
417
|
+
}
|
418
|
+
for (i = 0; i < iters; ++i) {
|
419
|
+
num_blocks = FN(FindBlocks)(data, length,
|
420
|
+
block_switch_cost,
|
421
|
+
num_histograms, histograms,
|
422
|
+
insert_cost, cost, switch_signal,
|
423
|
+
block_ids);
|
424
|
+
num_histograms = FN(RemapBlockIds)(block_ids, length,
|
425
|
+
new_id, num_histograms);
|
426
|
+
FN(BuildBlockHistograms)(data, length, block_ids,
|
427
|
+
num_histograms, histograms);
|
428
|
+
}
|
429
|
+
BROTLI_FREE(m, insert_cost);
|
430
|
+
BROTLI_FREE(m, cost);
|
431
|
+
BROTLI_FREE(m, switch_signal);
|
432
|
+
BROTLI_FREE(m, new_id);
|
433
|
+
BROTLI_FREE(m, histograms);
|
434
|
+
FN(ClusterBlocks)(m, data, length, num_blocks, block_ids, split);
|
435
|
+
if (BROTLI_IS_OOM(m)) return;
|
436
|
+
BROTLI_FREE(m, block_ids);
|
437
|
+
}
|
438
|
+
}
|
439
|
+
|
440
|
+
#undef HistogramType
|