google-protobuf 3.20.0.rc.1-x86_64-linux → 3.20.0.rc.2-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of google-protobuf might be problematic. Click here for more details.

@@ -0,0 +1,92 @@
1
+ #include <stdio.h>
2
+
3
+ /*
4
+ * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
5
+ *
6
+ * Table 3-7. Well-Formed UTF-8 Byte Sequences
7
+ *
8
+ * +--------------------+------------+-------------+------------+-------------+
9
+ * | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
10
+ * +--------------------+------------+-------------+------------+-------------+
11
+ * | U+0000..U+007F | 00..7F | | | |
12
+ * +--------------------+------------+-------------+------------+-------------+
13
+ * | U+0080..U+07FF | C2..DF | 80..BF | | |
14
+ * +--------------------+------------+-------------+------------+-------------+
15
+ * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
16
+ * +--------------------+------------+-------------+------------+-------------+
17
+ * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
18
+ * +--------------------+------------+-------------+------------+-------------+
19
+ * | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
20
+ * +--------------------+------------+-------------+------------+-------------+
21
+ * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
22
+ * +--------------------+------------+-------------+------------+-------------+
23
+ * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
24
+ * +--------------------+------------+-------------+------------+-------------+
25
+ * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
26
+ * +--------------------+------------+-------------+------------+-------------+
27
+ * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
28
+ * +--------------------+------------+-------------+------------+-------------+
29
+ */
30
+
31
+ /* Return 0 on success, or the 1-based index of the first byte of the first invalid sequence */
32
+ int utf8_naive(const unsigned char *data, int len)
33
+ {
34
+ int err_pos = 1;
35
+
36
+ while (len) {
37
+ int bytes;
38
+ const unsigned char byte1 = data[0];
39
+
40
+ /* 00..7F */
41
+ if (byte1 <= 0x7F) {
42
+ bytes = 1;
43
+ /* C2..DF, 80..BF */
44
+ } else if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF &&
45
+ (signed char)data[1] <= (signed char)0xBF) {
46
+ bytes = 2;
47
+ } else if (len >= 3) {
48
+ const unsigned char byte2 = data[1];
49
+
50
+ /* Is byte2, byte3 between 0x80 ~ 0xBF */
51
+ const int byte2_ok = (signed char)byte2 <= (signed char)0xBF;
52
+ const int byte3_ok = (signed char)data[2] <= (signed char)0xBF;
53
+
54
+ if (byte2_ok && byte3_ok &&
55
+ /* E0, A0..BF, 80..BF */
56
+ ((byte1 == 0xE0 && byte2 >= 0xA0) ||
57
+ /* E1..EC, 80..BF, 80..BF */
58
+ (byte1 >= 0xE1 && byte1 <= 0xEC) ||
59
+ /* ED, 80..9F, 80..BF */
60
+ (byte1 == 0xED && byte2 <= 0x9F) ||
61
+ /* EE..EF, 80..BF, 80..BF */
62
+ (byte1 >= 0xEE && byte1 <= 0xEF))) {
63
+ bytes = 3;
64
+ } else if (len >= 4) {
65
+ /* Is byte4 between 0x80 ~ 0xBF */
66
+ const int byte4_ok = (signed char)data[3] <= (signed char)0xBF;
67
+
68
+ if (byte2_ok && byte3_ok && byte4_ok &&
69
+ /* F0, 90..BF, 80..BF, 80..BF */
70
+ ((byte1 == 0xF0 && byte2 >= 0x90) ||
71
+ /* F1..F3, 80..BF, 80..BF, 80..BF */
72
+ (byte1 >= 0xF1 && byte1 <= 0xF3) ||
73
+ /* F4, 80..8F, 80..BF, 80..BF */
74
+ (byte1 == 0xF4 && byte2 <= 0x8F))) {
75
+ bytes = 4;
76
+ } else {
77
+ return err_pos;
78
+ }
79
+ } else {
80
+ return err_pos;
81
+ }
82
+ } else {
83
+ return err_pos;
84
+ }
85
+
86
+ len -= bytes;
87
+ err_pos += bytes;
88
+ data += bytes;
89
+ }
90
+
91
+ return 0;
92
+ }
@@ -0,0 +1,157 @@
1
+ /*
2
+ * Process 2x16 bytes in each iteration.
3
+ * Comments removed for brevity. See range-neon.c for details.
4
+ */
5
+ #if defined(__aarch64__) && defined(__ARM_NEON)
6
+
7
+ #include <stdio.h>
8
+ #include <stdint.h>
9
+ #include <arm_neon.h>
10
+
11
+ int utf8_naive(const unsigned char *data, int len);
12
+
13
+ static const uint8_t _first_len_tbl[] = {
14
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
15
+ };
16
+
17
+ static const uint8_t _first_range_tbl[] = {
18
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
19
+ };
20
+
21
+ static const uint8_t _range_min_tbl[] = {
22
+ 0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
23
+ 0xC2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
24
+ };
25
+ static const uint8_t _range_max_tbl[] = {
26
+ 0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
27
+ 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
28
+ };
29
+
30
+ static const uint8_t _range_adjust_tbl[] = {
31
+ 2, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0,
32
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0,
33
+ };
34
+
35
+ /* Return 0 on success, non-zero on error: -1 from the vectorized loop, or the positive result of utf8_naive() on the remaining tail */
36
+ int utf8_range2(const unsigned char *data, int len)
37
+ {
38
+ if (len >= 32) {
39
+ uint8x16_t prev_input = vdupq_n_u8(0);
40
+ uint8x16_t prev_first_len = vdupq_n_u8(0);
41
+
42
+ const uint8x16_t first_len_tbl = vld1q_u8(_first_len_tbl);
43
+ const uint8x16_t first_range_tbl = vld1q_u8(_first_range_tbl);
44
+ const uint8x16_t range_min_tbl = vld1q_u8(_range_min_tbl);
45
+ const uint8x16_t range_max_tbl = vld1q_u8(_range_max_tbl);
46
+ const uint8x16x2_t range_adjust_tbl = vld2q_u8(_range_adjust_tbl);
47
+
48
+ const uint8x16_t const_1 = vdupq_n_u8(1);
49
+ const uint8x16_t const_2 = vdupq_n_u8(2);
50
+ const uint8x16_t const_e0 = vdupq_n_u8(0xE0);
51
+
52
+ uint8x16_t error1 = vdupq_n_u8(0);
53
+ uint8x16_t error2 = vdupq_n_u8(0);
54
+ uint8x16_t error3 = vdupq_n_u8(0);
55
+ uint8x16_t error4 = vdupq_n_u8(0);
56
+
57
+ while (len >= 32) {
58
+ /******************* two blocks interleaved **********************/
59
+
60
+ #if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 8)
61
+ /* gcc doesn't support vld1q_u8_x2 until version 8 */
62
+ const uint8x16_t input_a = vld1q_u8(data);
63
+ const uint8x16_t input_b = vld1q_u8(data + 16);
64
+ #else
65
+ /* Forces a double load on Clang */
66
+ const uint8x16x2_t input_pair = vld1q_u8_x2(data);
67
+ const uint8x16_t input_a = input_pair.val[0];
68
+ const uint8x16_t input_b = input_pair.val[1];
69
+ #endif
70
+
71
+ const uint8x16_t high_nibbles_a = vshrq_n_u8(input_a, 4);
72
+ const uint8x16_t high_nibbles_b = vshrq_n_u8(input_b, 4);
73
+
74
+ const uint8x16_t first_len_a =
75
+ vqtbl1q_u8(first_len_tbl, high_nibbles_a);
76
+ const uint8x16_t first_len_b =
77
+ vqtbl1q_u8(first_len_tbl, high_nibbles_b);
78
+
79
+ uint8x16_t range_a = vqtbl1q_u8(first_range_tbl, high_nibbles_a);
80
+ uint8x16_t range_b = vqtbl1q_u8(first_range_tbl, high_nibbles_b);
81
+
82
+ range_a =
83
+ vorrq_u8(range_a, vextq_u8(prev_first_len, first_len_a, 15));
84
+ range_b =
85
+ vorrq_u8(range_b, vextq_u8(first_len_a, first_len_b, 15));
86
+
87
+ uint8x16_t tmp1_a, tmp2_a, tmp1_b, tmp2_b;
88
+ tmp1_a = vextq_u8(prev_first_len, first_len_a, 14);
89
+ tmp1_a = vqsubq_u8(tmp1_a, const_1);
90
+ range_a = vorrq_u8(range_a, tmp1_a);
91
+
92
+ tmp1_b = vextq_u8(first_len_a, first_len_b, 14);
93
+ tmp1_b = vqsubq_u8(tmp1_b, const_1);
94
+ range_b = vorrq_u8(range_b, tmp1_b);
95
+
96
+ tmp2_a = vextq_u8(prev_first_len, first_len_a, 13);
97
+ tmp2_a = vqsubq_u8(tmp2_a, const_2);
98
+ range_a = vorrq_u8(range_a, tmp2_a);
99
+
100
+ tmp2_b = vextq_u8(first_len_a, first_len_b, 13);
101
+ tmp2_b = vqsubq_u8(tmp2_b, const_2);
102
+ range_b = vorrq_u8(range_b, tmp2_b);
103
+
104
+ uint8x16_t shift1_a = vextq_u8(prev_input, input_a, 15);
105
+ uint8x16_t pos_a = vsubq_u8(shift1_a, const_e0);
106
+ range_a = vaddq_u8(range_a, vqtbl2q_u8(range_adjust_tbl, pos_a));
107
+
108
+ uint8x16_t shift1_b = vextq_u8(input_a, input_b, 15);
109
+ uint8x16_t pos_b = vsubq_u8(shift1_b, const_e0);
110
+ range_b = vaddq_u8(range_b, vqtbl2q_u8(range_adjust_tbl, pos_b));
111
+
112
+ uint8x16_t minv_a = vqtbl1q_u8(range_min_tbl, range_a);
113
+ uint8x16_t maxv_a = vqtbl1q_u8(range_max_tbl, range_a);
114
+
115
+ uint8x16_t minv_b = vqtbl1q_u8(range_min_tbl, range_b);
116
+ uint8x16_t maxv_b = vqtbl1q_u8(range_max_tbl, range_b);
117
+
118
+ error1 = vorrq_u8(error1, vcltq_u8(input_a, minv_a));
119
+ error2 = vorrq_u8(error2, vcgtq_u8(input_a, maxv_a));
120
+
121
+ error3 = vorrq_u8(error3, vcltq_u8(input_b, minv_b));
122
+ error4 = vorrq_u8(error4, vcgtq_u8(input_b, maxv_b));
123
+
124
+ /************************ next iteration *************************/
125
+ prev_input = input_b;
126
+ prev_first_len = first_len_b;
127
+
128
+ data += 32;
129
+ len -= 32;
130
+ }
131
+ error1 = vorrq_u8(error1, error2);
132
+ error1 = vorrq_u8(error1, error3);
133
+ error1 = vorrq_u8(error1, error4);
134
+
135
+ if (vmaxvq_u8(error1))
136
+ return -1;
137
+
138
+ uint32_t token4;
139
+ vst1q_lane_u32(&token4, vreinterpretq_u32_u8(prev_input), 3);
140
+
141
+ const int8_t *token = (const int8_t *)&token4;
142
+ int lookahead = 0;
143
+ if (token[3] > (int8_t)0xBF)
144
+ lookahead = 1;
145
+ else if (token[2] > (int8_t)0xBF)
146
+ lookahead = 2;
147
+ else if (token[1] > (int8_t)0xBF)
148
+ lookahead = 3;
149
+
150
+ data -= lookahead;
151
+ len += lookahead;
152
+ }
153
+
154
+ return utf8_naive(data, len);
155
+ }
156
+
157
+ #endif
@@ -0,0 +1,170 @@
1
+ /*
2
+ * Process 2x16 bytes in each iteration.
3
+ * Comments removed for brevity. See range-sse.c for details.
4
+ */
5
+ #ifdef __SSE4_1__
6
+
7
+ #include <stdio.h>
8
+ #include <stdint.h>
9
+ #include <x86intrin.h>
10
+
11
+ int utf8_naive(const unsigned char *data, int len);
12
+
13
+ static const int8_t _first_len_tbl[] = {
14
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
15
+ };
16
+
17
+ static const int8_t _first_range_tbl[] = {
18
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
19
+ };
20
+
21
+ static const int8_t _range_min_tbl[] = {
22
+ 0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
23
+ 0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
24
+ };
25
+ static const int8_t _range_max_tbl[] = {
26
+ 0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
27
+ 0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
28
+ };
29
+
30
+ static const int8_t _df_ee_tbl[] = {
31
+ 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
32
+ };
33
+ static const int8_t _ef_fe_tbl[] = {
34
+ 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
35
+ };
36
+
37
+ /* Return 0 on success, non-zero on error: -1 from the vectorized loop, or the positive result of utf8_naive() on the remaining tail */
38
+ int utf8_range2(const unsigned char *data, int len)
39
+ {
40
+ if (len >= 32) {
41
+ __m128i prev_input = _mm_set1_epi8(0);
42
+ __m128i prev_first_len = _mm_set1_epi8(0);
43
+
44
+ const __m128i first_len_tbl =
45
+ _mm_loadu_si128((const __m128i *)_first_len_tbl);
46
+ const __m128i first_range_tbl =
47
+ _mm_loadu_si128((const __m128i *)_first_range_tbl);
48
+ const __m128i range_min_tbl =
49
+ _mm_loadu_si128((const __m128i *)_range_min_tbl);
50
+ const __m128i range_max_tbl =
51
+ _mm_loadu_si128((const __m128i *)_range_max_tbl);
52
+ const __m128i df_ee_tbl =
53
+ _mm_loadu_si128((const __m128i *)_df_ee_tbl);
54
+ const __m128i ef_fe_tbl =
55
+ _mm_loadu_si128((const __m128i *)_ef_fe_tbl);
56
+
57
+ __m128i error = _mm_set1_epi8(0);
58
+
59
+ while (len >= 32) {
60
+ /***************************** block 1 ****************************/
61
+ const __m128i input_a = _mm_loadu_si128((const __m128i *)data);
62
+
63
+ __m128i high_nibbles =
64
+ _mm_and_si128(_mm_srli_epi16(input_a, 4), _mm_set1_epi8(0x0F));
65
+
66
+ __m128i first_len_a = _mm_shuffle_epi8(first_len_tbl, high_nibbles);
67
+
68
+ __m128i range_a = _mm_shuffle_epi8(first_range_tbl, high_nibbles);
69
+
70
+ range_a = _mm_or_si128(
71
+ range_a, _mm_alignr_epi8(first_len_a, prev_first_len, 15));
72
+
73
+ __m128i tmp;
74
+ tmp = _mm_alignr_epi8(first_len_a, prev_first_len, 14);
75
+ tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(1));
76
+ range_a = _mm_or_si128(range_a, tmp);
77
+
78
+ tmp = _mm_alignr_epi8(first_len_a, prev_first_len, 13);
79
+ tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(2));
80
+ range_a = _mm_or_si128(range_a, tmp);
81
+
82
+ __m128i shift1, pos, range2;
83
+ shift1 = _mm_alignr_epi8(input_a, prev_input, 15);
84
+ pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
85
+ tmp = _mm_subs_epu8(pos, _mm_set1_epi8(0xF0));
86
+ range2 = _mm_shuffle_epi8(df_ee_tbl, tmp);
87
+ tmp = _mm_adds_epu8(pos, _mm_set1_epi8(0x70));
88
+ range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp));
89
+
90
+ range_a = _mm_add_epi8(range_a, range2);
91
+
92
+ __m128i minv = _mm_shuffle_epi8(range_min_tbl, range_a);
93
+ __m128i maxv = _mm_shuffle_epi8(range_max_tbl, range_a);
94
+
95
+ tmp = _mm_or_si128(
96
+ _mm_cmplt_epi8(input_a, minv),
97
+ _mm_cmpgt_epi8(input_a, maxv)
98
+ );
99
+ error = _mm_or_si128(error, tmp);
100
+
101
+ /***************************** block 2 ****************************/
102
+ const __m128i input_b = _mm_loadu_si128((const __m128i *)(data+16));
103
+
104
+ high_nibbles =
105
+ _mm_and_si128(_mm_srli_epi16(input_b, 4), _mm_set1_epi8(0x0F));
106
+
107
+ __m128i first_len_b = _mm_shuffle_epi8(first_len_tbl, high_nibbles);
108
+
109
+ __m128i range_b = _mm_shuffle_epi8(first_range_tbl, high_nibbles);
110
+
111
+ range_b = _mm_or_si128(
112
+ range_b, _mm_alignr_epi8(first_len_b, first_len_a, 15));
113
+
114
+
115
+ tmp = _mm_alignr_epi8(first_len_b, first_len_a, 14);
116
+ tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(1));
117
+ range_b = _mm_or_si128(range_b, tmp);
118
+
119
+ tmp = _mm_alignr_epi8(first_len_b, first_len_a, 13);
120
+ tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(2));
121
+ range_b = _mm_or_si128(range_b, tmp);
122
+
123
+ shift1 = _mm_alignr_epi8(input_b, input_a, 15);
124
+ pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
125
+ tmp = _mm_subs_epu8(pos, _mm_set1_epi8(0xF0));
126
+ range2 = _mm_shuffle_epi8(df_ee_tbl, tmp);
127
+ tmp = _mm_adds_epu8(pos, _mm_set1_epi8(0x70));
128
+ range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp));
129
+
130
+ range_b = _mm_add_epi8(range_b, range2);
131
+
132
+ minv = _mm_shuffle_epi8(range_min_tbl, range_b);
133
+ maxv = _mm_shuffle_epi8(range_max_tbl, range_b);
134
+
135
+
136
+ tmp = _mm_or_si128(
137
+ _mm_cmplt_epi8(input_b, minv),
138
+ _mm_cmpgt_epi8(input_b, maxv)
139
+ );
140
+ error = _mm_or_si128(error, tmp);
141
+
142
+ /************************ next iteration **************************/
143
+ prev_input = input_b;
144
+ prev_first_len = first_len_b;
145
+
146
+ data += 32;
147
+ len -= 32;
148
+ }
149
+
150
+ if (!_mm_testz_si128(error, error))
151
+ return -1;
152
+
153
+ int32_t token4 = _mm_extract_epi32(prev_input, 3);
154
+ const int8_t *token = (const int8_t *)&token4;
155
+ int lookahead = 0;
156
+ if (token[3] > (int8_t)0xBF)
157
+ lookahead = 1;
158
+ else if (token[2] > (int8_t)0xBF)
159
+ lookahead = 2;
160
+ else if (token[1] > (int8_t)0xBF)
161
+ lookahead = 3;
162
+
163
+ data -= lookahead;
164
+ len += lookahead;
165
+ }
166
+
167
+ return utf8_naive(data, len);
168
+ }
169
+
170
+ #endif
@@ -1,5 +1,5 @@
1
1
 
2
- #if defined(__ARM_NEON) || defined(__SSE4_1__)
2
+ #if (defined(__ARM_NEON) && defined(__aarch64__)) || defined(__SSE4_1__)
3
3
  int utf8_range2(const unsigned char* data, int len);
4
4
  #else
5
5
  int utf8_naive(const unsigned char* data, int len);
Binary file
Binary file
Binary file
Binary file
data/tests/basic.rb CHANGED
@@ -658,5 +658,13 @@ module BasicTest
658
658
  assert_equal str, m.optional_string
659
659
  assert_equal str, m.optional_bytes
660
660
  end
661
+
662
+ def test_utf8
663
+ m = proto_module::TestMessage.new(
664
+ optional_string: "µpb",
665
+ )
666
+ m2 = proto_module::TestMessage.decode(proto_module::TestMessage.encode(m))
667
+ assert_equal m2, m
668
+ end
661
669
  end
662
670
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: google-protobuf
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.20.0.rc.1
4
+ version: 3.20.0.rc.2
5
5
  platform: x86_64-linux
6
6
  authors:
7
7
  - Protobuf Authors
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-03-04 00:00:00.000000000 Z
11
+ date: 2022-03-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake-compiler-dock
@@ -80,7 +80,9 @@ files:
80
80
  - ext/google/protobuf_c/ruby-upb.c
81
81
  - ext/google/protobuf_c/ruby-upb.h
82
82
  - ext/google/protobuf_c/third_party/utf8_range/LICENSE
83
- - ext/google/protobuf_c/third_party/utf8_range/utf8_range.c
83
+ - ext/google/protobuf_c/third_party/utf8_range/naive.c
84
+ - ext/google/protobuf_c/third_party/utf8_range/range2-neon.c
85
+ - ext/google/protobuf_c/third_party/utf8_range/range2-sse.c
84
86
  - ext/google/protobuf_c/third_party/utf8_range/utf8_range.h
85
87
  - ext/google/protobuf_c/wrap_memcpy.c
86
88
  - lib/google/2.5/protobuf_c.so
@@ -110,7 +112,7 @@ homepage: https://developers.google.com/protocol-buffers
110
112
  licenses:
111
113
  - BSD-3-Clause
112
114
  metadata:
113
- source_code_uri: https://github.com/protocolbuffers/protobuf/tree/v3.20.0-rc1/ruby
115
+ source_code_uri: https://github.com/protocolbuffers/protobuf/tree/v3.20.0-rc2/ruby
114
116
  post_install_message:
115
117
  rdoc_options: []
116
118
  require_paths: