google-protobuf 3.25.3-aarch64-linux → 4.26.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of google-protobuf might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/ext/google/protobuf_c/convert.c +7 -4
- data/ext/google/protobuf_c/defs.c +40 -27
- data/ext/google/protobuf_c/extconf.rb +1 -1
- data/ext/google/protobuf_c/map.c +12 -19
- data/ext/google/protobuf_c/map.h +1 -1
- data/ext/google/protobuf_c/message.c +41 -77
- data/ext/google/protobuf_c/message.h +1 -1
- data/ext/google/protobuf_c/protobuf.c +19 -6
- data/ext/google/protobuf_c/repeated_field.c +6 -15
- data/ext/google/protobuf_c/repeated_field.h +1 -1
- data/ext/google/protobuf_c/ruby-upb.c +11788 -10795
- data/ext/google/protobuf_c/ruby-upb.h +5164 -4242
- data/ext/google/protobuf_c/shared_convert.c +5 -3
- data/ext/google/protobuf_c/shared_convert.h +2 -2
- data/ext/google/protobuf_c/shared_message.c +8 -6
- data/ext/google/protobuf_c/third_party/utf8_range/utf8_range.c +467 -0
- data/ext/google/protobuf_c/third_party/utf8_range/utf8_range.h +9 -8
- data/lib/google/2.7/protobuf_c.so +0 -0
- data/lib/google/3.0/protobuf_c.so +0 -0
- data/lib/google/3.1/protobuf_c.so +0 -0
- data/lib/google/3.2/protobuf_c.so +0 -0
- data/lib/google/3.3/protobuf_c.so +0 -0
- data/lib/google/protobuf/any_pb.rb +1 -22
- data/lib/google/protobuf/api_pb.rb +1 -24
- data/lib/google/protobuf/descriptor_pb.rb +2 -23
- data/lib/google/protobuf/duration_pb.rb +1 -22
- data/lib/google/protobuf/empty_pb.rb +1 -22
- data/lib/google/protobuf/ffi/descriptor.rb +2 -3
- data/lib/google/protobuf/ffi/enum_descriptor.rb +1 -1
- data/lib/google/protobuf/ffi/ffi.rb +3 -1
- data/lib/google/protobuf/ffi/field_descriptor.rb +10 -1
- data/lib/google/protobuf/ffi/file_descriptor.rb +1 -13
- data/lib/google/protobuf/ffi/internal/convert.rb +7 -23
- data/lib/google/protobuf/ffi/map.rb +13 -11
- data/lib/google/protobuf/ffi/message.rb +10 -13
- data/lib/google/protobuf/ffi/object_cache.rb +3 -3
- data/lib/google/protobuf/ffi/oneof_descriptor.rb +1 -1
- data/lib/google/protobuf/ffi/repeated_field.rb +12 -10
- data/lib/google/protobuf/field_mask_pb.rb +1 -22
- data/lib/google/protobuf/internal/object_cache.rb +99 -0
- data/lib/google/protobuf/plugin_pb.rb +2 -24
- data/lib/google/protobuf/repeated_field.rb +1 -2
- data/lib/google/protobuf/source_context_pb.rb +1 -22
- data/lib/google/protobuf/struct_pb.rb +1 -22
- data/lib/google/protobuf/timestamp_pb.rb +1 -22
- data/lib/google/protobuf/type_pb.rb +1 -24
- data/lib/google/protobuf/wrappers_pb.rb +1 -22
- data/lib/google/protobuf.rb +1 -1
- data/lib/google/protobuf_ffi.rb +1 -2
- data/lib/google/protobuf_native.rb +0 -1
- data/lib/google/tasks/ffi.rake +1 -3
- metadata +8 -11
- data/ext/google/protobuf_c/third_party/utf8_range/naive.c +0 -92
- data/ext/google/protobuf_c/third_party/utf8_range/range2-neon.c +0 -157
- data/ext/google/protobuf_c/third_party/utf8_range/range2-sse.c +0 -170
- data/lib/google/protobuf/descriptor_dsl.rb +0 -465
- data/lib/google/protobuf/object_cache.rb +0 -97
@@ -12,7 +12,7 @@
|
|
12
12
|
#include "shared_convert.h"
|
13
13
|
|
14
14
|
bool shared_Msgval_IsEqual(upb_MessageValue val1, upb_MessageValue val2,
|
15
|
-
upb_CType type, upb_MessageDef* msgdef,
|
15
|
+
upb_CType type, const upb_MessageDef* msgdef,
|
16
16
|
upb_Status* status) {
|
17
17
|
switch (type) {
|
18
18
|
case kUpb_CType_Bool:
|
@@ -35,11 +35,12 @@ bool shared_Msgval_IsEqual(upb_MessageValue val1, upb_MessageValue val2,
|
|
35
35
|
return shared_Message_Equal(val1.msg_val, val2.msg_val, msgdef, status);
|
36
36
|
default:
|
37
37
|
upb_Status_SetErrorMessage(status, "Internal error, unexpected type");
|
38
|
+
return false;
|
38
39
|
}
|
39
40
|
}
|
40
41
|
|
41
42
|
uint64_t shared_Msgval_GetHash(upb_MessageValue val, upb_CType type,
|
42
|
-
upb_MessageDef* msgdef, uint64_t seed,
|
43
|
+
const upb_MessageDef* msgdef, uint64_t seed,
|
43
44
|
upb_Status* status) {
|
44
45
|
switch (type) {
|
45
46
|
case kUpb_CType_Bool:
|
@@ -60,5 +61,6 @@ uint64_t shared_Msgval_GetHash(upb_MessageValue val, upb_CType type,
|
|
60
61
|
return shared_Message_Hash(val.msg_val, msgdef, seed, status);
|
61
62
|
default:
|
62
63
|
upb_Status_SetErrorMessage(status, "Internal error, unexpected type");
|
64
|
+
return 0;
|
63
65
|
}
|
64
|
-
}
|
66
|
+
}
|
@@ -16,11 +16,11 @@
|
|
16
16
|
#include "shared_message.h"
|
17
17
|
|
18
18
|
bool shared_Msgval_IsEqual(upb_MessageValue val1, upb_MessageValue val2,
|
19
|
-
upb_CType type, upb_MessageDef* msgdef,
|
19
|
+
upb_CType type, const upb_MessageDef* msgdef,
|
20
20
|
upb_Status* status);
|
21
21
|
|
22
22
|
uint64_t shared_Msgval_GetHash(upb_MessageValue val, upb_CType type,
|
23
|
-
upb_MessageDef* msgdef, uint64_t seed,
|
23
|
+
const upb_MessageDef* msgdef, uint64_t seed,
|
24
24
|
upb_Status* status);
|
25
25
|
|
26
26
|
#endif // RUBY_PROTOBUF_SHARED_CONVERT_H_
|
@@ -29,10 +29,11 @@ uint64_t shared_Message_Hash(const upb_Message* msg, const upb_MessageDef* m,
|
|
29
29
|
uint64_t ret = _upb_Hash(data, size, seed);
|
30
30
|
upb_Arena_Free(arena);
|
31
31
|
return ret;
|
32
|
-
} else {
|
33
|
-
upb_Arena_Free(arena);
|
34
|
-
upb_Status_SetErrorMessage(status, "Error calculating hash");
|
35
32
|
}
|
33
|
+
|
34
|
+
upb_Arena_Free(arena);
|
35
|
+
upb_Status_SetErrorMessage(status, "Error calculating hash");
|
36
|
+
return 0;
|
36
37
|
}
|
37
38
|
|
38
39
|
// Support function for Message_Equal
|
@@ -58,8 +59,9 @@ bool shared_Message_Equal(const upb_Message* m1, const upb_Message* m2,
|
|
58
59
|
bool ret = (size1 == size2) && (memcmp(data1, data2, size1) == 0);
|
59
60
|
upb_Arena_Free(arena_tmp);
|
60
61
|
return ret;
|
61
|
-
} else {
|
62
|
-
upb_Arena_Free(arena_tmp);
|
63
|
-
upb_Status_SetErrorMessage(status, "Error comparing messages");
|
64
62
|
}
|
63
|
+
|
64
|
+
upb_Arena_Free(arena_tmp);
|
65
|
+
upb_Status_SetErrorMessage(status, "Error comparing messages");
|
66
|
+
return 0;
|
65
67
|
}
|
@@ -0,0 +1,467 @@
|
|
1
|
+
// Copyright 2023 Google LLC
|
2
|
+
//
|
3
|
+
// Use of this source code is governed by an MIT-style
|
4
|
+
// license that can be found in the LICENSE file or at
|
5
|
+
// https://opensource.org/licenses/MIT.
|
6
|
+
|
7
|
+
/* This is a wrapper for the Google range-sse.cc algorithm which checks whether
|
8
|
+
* a sequence of bytes is a valid UTF-8 sequence and finds the longest valid
|
9
|
+
* prefix of the UTF-8 sequence.
|
10
|
+
*
|
11
|
+
* The key difference is that it checks for as many ASCII symbols as possible
|
12
|
+
* and then falls back to the range-sse.cc algorithm. The changes to the
|
13
|
+
* algorithm are cosmetic, mostly to trick the clang compiler to produce optimal
|
14
|
+
* code.
|
15
|
+
*
|
16
|
+
* For the API see the utf8_range.h header.
|
17
|
+
*/
|
18
|
+
#include "utf8_range.h"
|
19
|
+
|
20
|
+
#include <stddef.h>
|
21
|
+
#include <stdint.h>
|
22
|
+
#include <string.h>
|
23
|
+
|
24
|
+
#ifdef __SSE4_1__
|
25
|
+
#include <emmintrin.h>
|
26
|
+
#include <smmintrin.h>
|
27
|
+
#include <tmmintrin.h>
|
28
|
+
#endif
|
29
|
+
|
30
|
+
#if defined(__GNUC__)
|
31
|
+
#define FORCE_INLINE_ATTR __attribute__((always_inline))
|
32
|
+
#elif defined(_MSC_VER)
|
33
|
+
#define FORCE_INLINE_ATTR __forceinline
|
34
|
+
#else
|
35
|
+
#define FORCE_INLINE_ATTR
|
36
|
+
#endif
|
37
|
+
|
38
|
+
static FORCE_INLINE_ATTR inline uint64_t utf8_range_UnalignedLoad64(
|
39
|
+
const void* p) {
|
40
|
+
uint64_t t;
|
41
|
+
memcpy(&t, p, sizeof t);
|
42
|
+
return t;
|
43
|
+
}
|
44
|
+
|
45
|
+
static FORCE_INLINE_ATTR inline int utf8_range_AsciiIsAscii(unsigned char c) {
|
46
|
+
return c < 128;
|
47
|
+
}
|
48
|
+
|
49
|
+
static FORCE_INLINE_ATTR inline int utf8_range_IsTrailByteOk(const char c) {
|
50
|
+
return (int8_t)(c) <= (int8_t)(0xBF);
|
51
|
+
}
|
52
|
+
|
53
|
+
/* If return_position is false then it returns 1 if |data| is a valid utf8
|
54
|
+
* sequence, otherwise returns 0.
|
55
|
+
* If return_position is set to true, returns the length in bytes of the prefix
|
56
|
+
of |data| that is all structurally valid UTF-8.
|
57
|
+
*/
|
58
|
+
static size_t utf8_range_ValidateUTF8Naive(const char* data, const char* end,
|
59
|
+
int return_position) {
|
60
|
+
/* We return err_pos in the loop which is always 0 if !return_position */
|
61
|
+
size_t err_pos = 0;
|
62
|
+
size_t codepoint_bytes = 0;
|
63
|
+
/* The early check is done because of early continue's on codepoints of all
|
64
|
+
* sizes, i.e. we first check for ascii and if it is, we call continue, then
|
65
|
+
* for 2 byte codepoints, etc. This is done in order to reduce indentation and
|
66
|
+
* improve readability of the codepoint validity check.
|
67
|
+
*/
|
68
|
+
while (data + codepoint_bytes < end) {
|
69
|
+
if (return_position) {
|
70
|
+
err_pos += codepoint_bytes;
|
71
|
+
}
|
72
|
+
data += codepoint_bytes;
|
73
|
+
const size_t len = end - data;
|
74
|
+
const unsigned char byte1 = data[0];
|
75
|
+
|
76
|
+
/* We do not skip many ascii bytes at the same time as this function is
|
77
|
+
used for tail checking (< 16 bytes) and for non x86 platforms. We also
|
78
|
+
don't think that cases where non-ASCII codepoints are followed by ascii
|
79
|
+
happen often. For small strings it also introduces some penalty. For
|
80
|
+
purely ascii UTF8 strings (which is the overwhelming case) we call
|
81
|
+
SkipAscii function which is multiplatform and extremely fast.
|
82
|
+
*/
|
83
|
+
/* [00..7F] ASCII -> 1 byte */
|
84
|
+
if (utf8_range_AsciiIsAscii(byte1)) {
|
85
|
+
codepoint_bytes = 1;
|
86
|
+
continue;
|
87
|
+
}
|
88
|
+
/* [C2..DF], [80..BF] -> 2 bytes */
|
89
|
+
if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF &&
|
90
|
+
utf8_range_IsTrailByteOk(data[1])) {
|
91
|
+
codepoint_bytes = 2;
|
92
|
+
continue;
|
93
|
+
}
|
94
|
+
if (len >= 3) {
|
95
|
+
const unsigned char byte2 = data[1];
|
96
|
+
const unsigned char byte3 = data[2];
|
97
|
+
|
98
|
+
/* Is byte2, byte3 between [0x80, 0xBF]
|
99
|
+
* Check for 0x80 was done above.
|
100
|
+
*/
|
101
|
+
if (!utf8_range_IsTrailByteOk(byte2) ||
|
102
|
+
!utf8_range_IsTrailByteOk(byte3)) {
|
103
|
+
return err_pos;
|
104
|
+
}
|
105
|
+
|
106
|
+
if (/* E0, A0..BF, 80..BF */
|
107
|
+
((byte1 == 0xE0 && byte2 >= 0xA0) ||
|
108
|
+
/* E1..EC, 80..BF, 80..BF */
|
109
|
+
(byte1 >= 0xE1 && byte1 <= 0xEC) ||
|
110
|
+
/* ED, 80..9F, 80..BF */
|
111
|
+
(byte1 == 0xED && byte2 <= 0x9F) ||
|
112
|
+
/* EE..EF, 80..BF, 80..BF */
|
113
|
+
(byte1 >= 0xEE && byte1 <= 0xEF))) {
|
114
|
+
codepoint_bytes = 3;
|
115
|
+
continue;
|
116
|
+
}
|
117
|
+
if (len >= 4) {
|
118
|
+
const unsigned char byte4 = data[3];
|
119
|
+
/* Is byte4 between 0x80 ~ 0xBF */
|
120
|
+
if (!utf8_range_IsTrailByteOk(byte4)) {
|
121
|
+
return err_pos;
|
122
|
+
}
|
123
|
+
|
124
|
+
if (/* F0, 90..BF, 80..BF, 80..BF */
|
125
|
+
((byte1 == 0xF0 && byte2 >= 0x90) ||
|
126
|
+
/* F1..F3, 80..BF, 80..BF, 80..BF */
|
127
|
+
(byte1 >= 0xF1 && byte1 <= 0xF3) ||
|
128
|
+
/* F4, 80..8F, 80..BF, 80..BF */
|
129
|
+
(byte1 == 0xF4 && byte2 <= 0x8F))) {
|
130
|
+
codepoint_bytes = 4;
|
131
|
+
continue;
|
132
|
+
}
|
133
|
+
}
|
134
|
+
}
|
135
|
+
return err_pos;
|
136
|
+
}
|
137
|
+
if (return_position) {
|
138
|
+
err_pos += codepoint_bytes;
|
139
|
+
}
|
140
|
+
/* if return_position is false, this returns 1.
|
141
|
+
* if return_position is true, this returns err_pos.
|
142
|
+
*/
|
143
|
+
return err_pos + (1 - return_position);
|
144
|
+
}
|
145
|
+
|
146
|
+
#ifdef __SSE4_1__
|
147
|
+
/* Returns the number of bytes needed to skip backwards to get to the first
|
148
|
+
byte of codepoint.
|
149
|
+
*/
|
150
|
+
static inline int utf8_range_CodepointSkipBackwards(int32_t codepoint_word) {
|
151
|
+
const int8_t* const codepoint = (const int8_t*)(&codepoint_word);
|
152
|
+
if (!utf8_range_IsTrailByteOk(codepoint[3])) {
|
153
|
+
return 1;
|
154
|
+
} else if (!utf8_range_IsTrailByteOk(codepoint[2])) {
|
155
|
+
return 2;
|
156
|
+
} else if (!utf8_range_IsTrailByteOk(codepoint[1])) {
|
157
|
+
return 3;
|
158
|
+
}
|
159
|
+
return 0;
|
160
|
+
}
|
161
|
+
#endif // __SSE4_1__
|
162
|
+
|
163
|
+
/* Skipping over ASCII as much as possible, per 8 bytes. It is intentional
|
164
|
+
as most strings to check for validity consist only of 1 byte codepoints.
|
165
|
+
*/
|
166
|
+
static inline const char* utf8_range_SkipAscii(const char* data,
|
167
|
+
const char* end) {
|
168
|
+
while (8 <= end - data &&
|
169
|
+
(utf8_range_UnalignedLoad64(data) & 0x8080808080808080) == 0) {
|
170
|
+
data += 8;
|
171
|
+
}
|
172
|
+
while (data < end && utf8_range_AsciiIsAscii(*data)) {
|
173
|
+
++data;
|
174
|
+
}
|
175
|
+
return data;
|
176
|
+
}
|
177
|
+
|
178
|
+
static FORCE_INLINE_ATTR inline size_t utf8_range_Validate(
|
179
|
+
const char* data, size_t len, int return_position) {
|
180
|
+
if (len == 0) return 1 - return_position;
|
181
|
+
const char* const end = data + len;
|
182
|
+
data = utf8_range_SkipAscii(data, end);
|
183
|
+
/* SIMD algorithm always outperforms the naive version for any data of
|
184
|
+
length >=16.
|
185
|
+
*/
|
186
|
+
if (end - data < 16) {
|
187
|
+
return (return_position ? (data - (end - len)) : 0) +
|
188
|
+
utf8_range_ValidateUTF8Naive(data, end, return_position);
|
189
|
+
}
|
190
|
+
#ifndef __SSE4_1__
|
191
|
+
return (return_position ? (data - (end - len)) : 0) +
|
192
|
+
utf8_range_ValidateUTF8Naive(data, end, return_position);
|
193
|
+
#else
|
194
|
+
/* This code checks that utf-8 ranges are structurally valid 16 bytes at once
|
195
|
+
* using superscalar instructions.
|
196
|
+
* The mapping between ranges of codepoint and their corresponding utf-8
|
197
|
+
* sequences is below.
|
198
|
+
*/
|
199
|
+
|
200
|
+
/*
|
201
|
+
* U+0000...U+007F 00...7F
|
202
|
+
* U+0080...U+07FF C2...DF 80...BF
|
203
|
+
* U+0800...U+0FFF E0 A0...BF 80...BF
|
204
|
+
* U+1000...U+CFFF E1...EC 80...BF 80...BF
|
205
|
+
* U+D000...U+D7FF ED 80...9F 80...BF
|
206
|
+
* U+E000...U+FFFF EE...EF 80...BF 80...BF
|
207
|
+
* U+10000...U+3FFFF F0 90...BF 80...BF 80...BF
|
208
|
+
* U+40000...U+FFFFF F1...F3 80...BF 80...BF 80...BF
|
209
|
+
* U+100000...U+10FFFF F4 80...8F 80...BF 80...BF
|
210
|
+
*/
|
211
|
+
|
212
|
+
/* First we compute the type for each byte, as given by the table below.
|
213
|
+
* This type will be used as an index later on.
|
214
|
+
*/
|
215
|
+
|
216
|
+
/*
|
217
|
+
* Index Min Max Byte Type
|
218
|
+
* 0 00 7F Single byte sequence
|
219
|
+
* 1,2,3 80 BF Second, third and fourth byte for many of the sequences.
|
220
|
+
* 4 A0 BF Second byte after E0
|
221
|
+
* 5 80 9F Second byte after ED
|
222
|
+
* 6 90 BF Second byte after F0
|
223
|
+
* 7 80 8F Second byte after F4
|
224
|
+
* 8 C2 F4 First non ASCII byte
|
225
|
+
* 9..15 7F 80 Invalid byte
|
226
|
+
*/
|
227
|
+
|
228
|
+
/* After the first step we compute the index for all bytes, then we permute
|
229
|
+
the bytes according to their indices to check the ranges from the range
|
230
|
+
table.
|
231
|
+
* The range for a given type can be found in the range_min_table and
|
232
|
+
range_max_table, the range for type/index X is in range_min_table[X] ...
|
233
|
+
range_max_table[X].
|
234
|
+
*/
|
235
|
+
|
236
|
+
/* Algorithm:
|
237
|
+
* Put index zero to all bytes.
|
238
|
+
* Find all non ASCII characters, give them index 8.
|
239
|
+
* For each tail byte in a codepoint sequence, give it an index corresponding
|
240
|
+
to the 1 based index from the end.
|
241
|
+
* If the first byte of the codepoint is in the [C0...DF] range, we write
|
242
|
+
index 1 in the following byte.
|
243
|
+
* If the first byte of the codepoint is in the range [E0...EF], we write
|
244
|
+
indices 2 and 1 in the next two bytes.
|
245
|
+
* If the first byte of the codepoint is in the range [F0...FF] we write
|
246
|
+
indices 3,2,1 into the next three bytes.
|
247
|
+
* For finding the number of bytes we need to look at high nibbles (4 bits)
|
248
|
+
and do the lookup from the table, it can be done with shift by 4 + shuffle
|
249
|
+
instructions. We call it `first_len`.
|
250
|
+
* Then we shift first_len by 8 bits to get the indices of the 2nd bytes.
|
251
|
+
* Saturating sub 1 and shift by 8 bits to get the indices of the 3rd bytes.
|
252
|
+
* Again to get the indices of the 4th bytes.
|
253
|
+
* Take the OR of all 4 values and check within range.
|
254
|
+
*/
|
255
|
+
/* For example:
|
256
|
+
* input C3 80 68 E2 80 20 A6 F0 A0 80 AC 20 F0 93 80 80
|
257
|
+
* first_len 1 0 0 2 0 0 0 3 0 0 0 0 3 0 0 0
|
258
|
+
* 1st byte 8 0 0 8 0 0 0 8 0 0 0 0 8 0 0 0
|
259
|
+
* 2nd byte 0 1 0 0 2 0 0 0 3 0 0 0 0 3 0 0 // Shift + sub
|
260
|
+
* 3rd byte 0 0 0 0 0 1 0 0 0 2 0 0 0 0 2 0 // Shift + sub
|
261
|
+
* 4th byte 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 // Shift + sub
|
262
|
+
* Index 8 1 0 8 2 1 0 8 3 2 1 0 8 3 2 1 // OR of results
|
263
|
+
*/
|
264
|
+
|
265
|
+
/* Checking for errors:
|
266
|
+
* Error checking is done by looking up the high nibble (4 bits) of each byte
|
267
|
+
against an error checking table.
|
268
|
+
* Because the lookup value for the second byte depends on the value of the
|
269
|
+
first byte in codepoint, we use saturated operations to adjust the index.
|
270
|
+
* Specifically we need to add 2 for E0, 3 for ED, 3 for F0 and 4 for F4 to
|
271
|
+
match the correct index.
|
272
|
+
* If we subtract EF from all bytes then E0 -> 241, ED -> 254, F0 -> 1,
|
273
|
+
F4 -> 5
|
274
|
+
* Do saturating sub 240, then E0 -> 1, ED -> 14 and we can do lookup to
|
275
|
+
match the adjustment
|
276
|
+
* Add saturating 112, then F0 -> 113, F4 -> 117, all that were >= 16 will
|
277
|
+
be at least 128, so the lookup in ef_fe_table returns 0 for them, but for F0
|
278
|
+
and F4 it returns 3 and 4 respectively (table entries 1 and 5)
|
279
|
+
*/
|
280
|
+
/*
|
281
|
+
* Then just check the appropriate ranges with greater/smaller equal
|
282
|
+
instructions. Check tail with a naive algorithm.
|
283
|
+
* To save from previous 16 byte checks we just align previous_first_len to
|
284
|
+
get correct continuations of the codepoints.
|
285
|
+
*/
|
286
|
+
|
287
|
+
/*
|
288
|
+
* Map high nibble of "First Byte" to legal character length minus 1
|
289
|
+
* 0x00 ~ 0xBF --> 0
|
290
|
+
* 0xC0 ~ 0xDF --> 1
|
291
|
+
* 0xE0 ~ 0xEF --> 2
|
292
|
+
* 0xF0 ~ 0xFF --> 3
|
293
|
+
*/
|
294
|
+
const __m128i first_len_table =
|
295
|
+
_mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3);
|
296
|
+
|
297
|
+
/* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
|
298
|
+
const __m128i first_range_table =
|
299
|
+
_mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8);
|
300
|
+
|
301
|
+
/*
|
302
|
+
* Range table, map range index to min and max values
|
303
|
+
*/
|
304
|
+
const __m128i range_min_table =
|
305
|
+
_mm_setr_epi8(0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, 0xC2, 0x7F,
|
306
|
+
0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);
|
307
|
+
|
308
|
+
const __m128i range_max_table =
|
309
|
+
_mm_setr_epi8(0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, 0xF4, 0x80,
|
310
|
+
0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
|
311
|
+
|
312
|
+
/*
|
313
|
+
* Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
|
314
|
+
* which the Second Byte range is not 80~BF. They contain the "range index adjustment".
|
315
|
+
* +------------+---------------+------------------+----------------+
|
316
|
+
* | First Byte | original range| range adjustment | adjusted range |
|
317
|
+
* +------------+---------------+------------------+----------------+
|
318
|
+
* | E0 | 2 | 2 | 4 |
|
319
|
+
* +------------+---------------+------------------+----------------+
|
320
|
+
* | ED | 2 | 3 | 5 |
|
321
|
+
* +------------+---------------+------------------+----------------+
|
322
|
+
* | F0 | 3 | 3 | 6 |
|
323
|
+
* +------------+---------------+------------------+----------------+
|
324
|
+
* | F4 | 3 | 4 | 7 |
|
325
|
+
* +------------+---------------+------------------+----------------+
|
326
|
+
*/
|
327
|
+
|
328
|
+
/* df_ee_table[1] -> E0, df_ee_table[14] -> ED as ED - E0 = 13 */
|
329
|
+
// The values represent the adjustment in the Range Index table for a correct
|
330
|
+
// index.
|
331
|
+
const __m128i df_ee_table =
|
332
|
+
_mm_setr_epi8(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0);
|
333
|
+
|
334
|
+
/* ef_fe_table[1] -> F0, ef_fe_table[5] -> F4, F4 - F0 = 4 */
|
335
|
+
// The values represent the adjustment in the Range Index table for a correct
|
336
|
+
// index.
|
337
|
+
const __m128i ef_fe_table =
|
338
|
+
_mm_setr_epi8(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
339
|
+
|
340
|
+
__m128i prev_input = _mm_set1_epi8(0);
|
341
|
+
__m128i prev_first_len = _mm_set1_epi8(0);
|
342
|
+
__m128i error = _mm_set1_epi8(0);
|
343
|
+
while (end - data >= 16) {
|
344
|
+
const __m128i input =
|
345
|
+
_mm_loadu_si128((const __m128i*)(data));
|
346
|
+
|
347
|
+
/* high_nibbles = input >> 4 */
|
348
|
+
const __m128i high_nibbles =
|
349
|
+
_mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F));
|
350
|
+
|
351
|
+
/* first_len = legal character length minus 1 */
|
352
|
+
/* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
|
353
|
+
/* first_len = first_len_table[high_nibbles] */
|
354
|
+
__m128i first_len = _mm_shuffle_epi8(first_len_table, high_nibbles);
|
355
|
+
|
356
|
+
/* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
|
357
|
+
/* range = first_range_table[high_nibbles] */
|
358
|
+
__m128i range = _mm_shuffle_epi8(first_range_table, high_nibbles);
|
359
|
+
|
360
|
+
/* Second Byte: set range index to first_len */
|
361
|
+
/* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
|
362
|
+
/* range |= (first_len, prev_first_len) << 1 byte */
|
363
|
+
range = _mm_or_si128(range, _mm_alignr_epi8(first_len, prev_first_len, 15));
|
364
|
+
|
365
|
+
/* Third Byte: set range index to saturate_sub(first_len, 1) */
|
366
|
+
/* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
|
367
|
+
__m128i tmp1;
|
368
|
+
__m128i tmp2;
|
369
|
+
/* tmp1 = saturate_sub(first_len, 1) */
|
370
|
+
tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(1));
|
371
|
+
/* tmp2 = saturate_sub(prev_first_len, 1) */
|
372
|
+
tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(1));
|
373
|
+
/* range |= (tmp1, tmp2) << 2 bytes */
|
374
|
+
range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 14));
|
375
|
+
|
376
|
+
/* Fourth Byte: set range index to saturate_sub(first_len, 2) */
|
377
|
+
/* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
|
378
|
+
/* tmp1 = saturate_sub(first_len, 2) */
|
379
|
+
tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(2));
|
380
|
+
/* tmp2 = saturate_sub(prev_first_len, 2) */
|
381
|
+
tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(2));
|
382
|
+
/* range |= (tmp1, tmp2) << 3 bytes */
|
383
|
+
range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 13));
|
384
|
+
|
385
|
+
/*
|
386
|
+
* Now we have below range indices calculated
|
387
|
+
* Correct cases:
|
388
|
+
* - 8 for C0~FF
|
389
|
+
* - 3 for 1st byte after F0~FF
|
390
|
+
* - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
|
391
|
+
* - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
|
392
|
+
* 3rd byte after F0~FF
|
393
|
+
* - 0 for others
|
394
|
+
* Error cases:
|
395
|
+
* >=9 for non ascii First Byte overlapping
|
396
|
+
* E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
|
397
|
+
*/
|
398
|
+
|
399
|
+
/* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
|
400
|
+
/* Overlaps lead to index 9~15, which are illegal in range table */
|
401
|
+
__m128i shift1;
|
402
|
+
__m128i pos;
|
403
|
+
__m128i range2;
|
404
|
+
/* shift1 = (input, prev_input) << 1 byte */
|
405
|
+
shift1 = _mm_alignr_epi8(input, prev_input, 15);
|
406
|
+
pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
|
407
|
+
/*
|
408
|
+
* shift1: | EF F0 ... FE | FF 00 ... ... DE | DF E0 ... EE |
|
409
|
+
* pos: | 0 1 15 | 16 17 239| 240 241 255|
|
410
|
+
* pos-240: | 0 0 0 | 0 0 0 | 0 1 15 |
|
411
|
+
* pos+112: | 112 113 127| >= 128 | >= 128 |
|
412
|
+
*/
|
413
|
+
tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(-16));
|
414
|
+
range2 = _mm_shuffle_epi8(df_ee_table, tmp1);
|
415
|
+
tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112));
|
416
|
+
range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_table, tmp2));
|
417
|
+
|
418
|
+
range = _mm_add_epi8(range, range2);
|
419
|
+
|
420
|
+
/* Load min and max values per calculated range index */
|
421
|
+
__m128i min_range = _mm_shuffle_epi8(range_min_table, range);
|
422
|
+
__m128i max_range = _mm_shuffle_epi8(range_max_table, range);
|
423
|
+
|
424
|
+
/* Check value range */
|
425
|
+
if (return_position) {
|
426
|
+
error = _mm_cmplt_epi8(input, min_range);
|
427
|
+
error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
|
428
|
+
/* 5% performance drop from this conditional branch */
|
429
|
+
if (!_mm_testz_si128(error, error)) {
|
430
|
+
break;
|
431
|
+
}
|
432
|
+
} else {
|
433
|
+
error = _mm_or_si128(error, _mm_cmplt_epi8(input, min_range));
|
434
|
+
error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
|
435
|
+
}
|
436
|
+
|
437
|
+
prev_input = input;
|
438
|
+
prev_first_len = first_len;
|
439
|
+
|
440
|
+
data += 16;
|
441
|
+
}
|
442
|
+
/* If data is still at the start of the buffer, no 16-byte block was consumed, so there are no previous bytes to skip backwards over */
|
443
|
+
if (return_position && (data - (end - len)) == 0) {
|
444
|
+
return utf8_range_ValidateUTF8Naive(data, end, return_position);
|
445
|
+
}
|
446
|
+
/* Find previous codepoint (not 80~BF) */
|
447
|
+
data -= utf8_range_CodepointSkipBackwards(_mm_extract_epi32(prev_input, 3));
|
448
|
+
if (return_position) {
|
449
|
+
return (data - (end - len)) +
|
450
|
+
utf8_range_ValidateUTF8Naive(data, end, return_position);
|
451
|
+
}
|
452
|
+
/* Test if there was any error */
|
453
|
+
if (!_mm_testz_si128(error, error)) {
|
454
|
+
return 0;
|
455
|
+
}
|
456
|
+
/* Check the tail */
|
457
|
+
return utf8_range_ValidateUTF8Naive(data, end, return_position);
|
458
|
+
#endif
|
459
|
+
}
|
460
|
+
|
461
|
+
int utf8_range_IsValid(const char* data, size_t len) {
|
462
|
+
return utf8_range_Validate(data, len, /*return_position=*/0) != 0;
|
463
|
+
}
|
464
|
+
|
465
|
+
size_t utf8_range_ValidPrefix(const char* data, size_t len) {
|
466
|
+
return utf8_range_Validate(data, len, /*return_position=*/1);
|
467
|
+
}
|
@@ -1,18 +1,19 @@
|
|
1
1
|
#ifndef THIRD_PARTY_UTF8_RANGE_UTF8_RANGE_H_
|
2
2
|
#define THIRD_PARTY_UTF8_RANGE_UTF8_RANGE_H_
|
3
3
|
|
4
|
+
#include <stddef.h>
|
5
|
+
|
4
6
|
#ifdef __cplusplus
|
5
7
|
extern "C" {
|
6
8
|
#endif
|
7
9
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
#endif
|
10
|
+
// Returns 1 if the sequence of characters is a valid UTF-8 sequence, otherwise
|
11
|
+
// 0.
|
12
|
+
int utf8_range_IsValid(const char* data, size_t len);
|
13
|
+
|
14
|
+
// Returns the length in bytes of the prefix of data that is all
|
15
|
+
// structurally valid UTF-8.
|
16
|
+
size_t utf8_range_ValidPrefix(const char* data, size_t len);
|
16
17
|
|
17
18
|
#ifdef __cplusplus
|
18
19
|
} // extern "C"
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -8,28 +8,7 @@ require 'google/protobuf'
|
|
8
8
|
descriptor_data = "\n\x19google/protobuf/any.proto\x12\x0fgoogle.protobuf\"&\n\x03\x41ny\x12\x10\n\x08type_url\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x42v\n\x13\x63om.google.protobufB\x08\x41nyProtoP\x01Z,google.golang.org/protobuf/types/known/anypb\xa2\x02\x03GPB\xaa\x02\x1eGoogle.Protobuf.WellKnownTypesb\x06proto3"
|
9
9
|
|
10
10
|
pool = Google::Protobuf::DescriptorPool.generated_pool
|
11
|
-
|
12
|
-
begin
|
13
|
-
pool.add_serialized_file(descriptor_data)
|
14
|
-
rescue TypeError
|
15
|
-
# Compatibility code: will be removed in the next major version.
|
16
|
-
require 'google/protobuf/descriptor_pb'
|
17
|
-
parsed = Google::Protobuf::FileDescriptorProto.decode(descriptor_data)
|
18
|
-
parsed.clear_dependency
|
19
|
-
serialized = parsed.class.encode(parsed)
|
20
|
-
file = pool.add_serialized_file(serialized)
|
21
|
-
warn "Warning: Protobuf detected an import path issue while loading generated file #{__FILE__}"
|
22
|
-
imports = [
|
23
|
-
]
|
24
|
-
imports.each do |type_name, expected_filename|
|
25
|
-
import_file = pool.lookup(type_name).file_descriptor
|
26
|
-
if import_file.name != expected_filename
|
27
|
-
warn "- #{file.name} imports #{expected_filename}, but that import was loaded as #{import_file.name}"
|
28
|
-
end
|
29
|
-
end
|
30
|
-
warn "Each proto file must use a consistent fully-qualified name."
|
31
|
-
warn "This will become an error in the next major version."
|
32
|
-
end
|
11
|
+
pool.add_serialized_file(descriptor_data)
|
33
12
|
|
34
13
|
module Google
|
35
14
|
module Protobuf
|
@@ -11,30 +11,7 @@ require 'google/protobuf/type_pb'
|
|
11
11
|
descriptor_data = "\n\x19google/protobuf/api.proto\x12\x0fgoogle.protobuf\x1a$google/protobuf/source_context.proto\x1a\x1agoogle/protobuf/type.proto\"\x81\x02\n\x03\x41pi\x12\x0c\n\x04name\x18\x01 \x01(\t\x12(\n\x07methods\x18\x02 \x03(\x0b\x32\x17.google.protobuf.Method\x12(\n\x07options\x18\x03 \x03(\x0b\x32\x17.google.protobuf.Option\x12\x0f\n\x07version\x18\x04 \x01(\t\x12\x36\n\x0esource_context\x18\x05 \x01(\x0b\x32\x1e.google.protobuf.SourceContext\x12&\n\x06mixins\x18\x06 \x03(\x0b\x32\x16.google.protobuf.Mixin\x12\'\n\x06syntax\x18\x07 \x01(\x0e\x32\x17.google.protobuf.Syntax\"\xd5\x01\n\x06Method\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x18\n\x10request_type_url\x18\x02 \x01(\t\x12\x19\n\x11request_streaming\x18\x03 \x01(\x08\x12\x19\n\x11response_type_url\x18\x04 \x01(\t\x12\x1a\n\x12response_streaming\x18\x05 \x01(\x08\x12(\n\x07options\x18\x06 \x03(\x0b\x32\x17.google.protobuf.Option\x12\'\n\x06syntax\x18\x07 \x01(\x0e\x32\x17.google.protobuf.Syntax\"#\n\x05Mixin\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04root\x18\x02 \x01(\tBv\n\x13\x63om.google.protobufB\x08\x41piProtoP\x01Z,google.golang.org/protobuf/types/known/apipb\xa2\x02\x03GPB\xaa\x02\x1eGoogle.Protobuf.WellKnownTypesb\x06proto3"
|
12
12
|
|
13
13
|
pool = Google::Protobuf::DescriptorPool.generated_pool
|
14
|
-
|
15
|
-
begin
|
16
|
-
pool.add_serialized_file(descriptor_data)
|
17
|
-
rescue TypeError
|
18
|
-
# Compatibility code: will be removed in the next major version.
|
19
|
-
require 'google/protobuf/descriptor_pb'
|
20
|
-
parsed = Google::Protobuf::FileDescriptorProto.decode(descriptor_data)
|
21
|
-
parsed.clear_dependency
|
22
|
-
serialized = parsed.class.encode(parsed)
|
23
|
-
file = pool.add_serialized_file(serialized)
|
24
|
-
warn "Warning: Protobuf detected an import path issue while loading generated file #{__FILE__}"
|
25
|
-
imports = [
|
26
|
-
["google.protobuf.Option", "google/protobuf/type.proto"],
|
27
|
-
["google.protobuf.SourceContext", "google/protobuf/source_context.proto"],
|
28
|
-
]
|
29
|
-
imports.each do |type_name, expected_filename|
|
30
|
-
import_file = pool.lookup(type_name).file_descriptor
|
31
|
-
if import_file.name != expected_filename
|
32
|
-
warn "- #{file.name} imports #{expected_filename}, but that import was loaded as #{import_file.name}"
|
33
|
-
end
|
34
|
-
end
|
35
|
-
warn "Each proto file must use a consistent fully-qualified name."
|
36
|
-
warn "This will become an error in the next major version."
|
37
|
-
end
|
14
|
+
pool.add_serialized_file(descriptor_data)
|
38
15
|
|
39
16
|
module Google
|
40
17
|
module Protobuf
|