bson 4.5.0 → 4.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/README.md +7 -5
- data/ext/bson/{native-endian.h → bson-endian.h} +5 -99
- data/ext/bson/bson-native.h +112 -0
- data/ext/bson/bytebuf.c +133 -0
- data/ext/bson/endian.c +116 -0
- data/ext/bson/init.c +289 -0
- data/ext/bson/libbson-utf8.c +230 -0
- data/ext/bson/read.c +294 -0
- data/ext/bson/util.c +55 -0
- data/ext/bson/write.c +637 -0
- data/lib/bson/document.rb +43 -1
- data/lib/bson/hash.rb +11 -2
- data/lib/bson/int32.rb +19 -13
- data/lib/bson/int64.rb +19 -13
- data/lib/bson/version.rb +1 -1
- data/spec/bson/byte_buffer_read_spec.rb +141 -0
- data/spec/bson/byte_buffer_spec.rb +14 -451
- data/spec/bson/byte_buffer_write_spec.rb +758 -0
- data/spec/bson/corpus_spec.rb +8 -5
- data/spec/bson/document_spec.rb +29 -29
- data/spec/bson/hash_spec.rb +65 -0
- data/spec/bson/int32_spec.rb +21 -3
- data/spec/bson/int64_spec.rb +22 -3
- data/spec/bson/string_spec.rb +18 -0
- data/spec/support/corpus-tests/array.json +8 -2
- data/spec/support/shared_examples.rb +2 -4
- data/spec/support/utils.rb +10 -0
- metadata +74 -55
- metadata.gz.sig +0 -0
- data/ext/bson/bson_native.c +0 -1344
data/ext/bson/init.c
ADDED
@@ -0,0 +1,289 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (C) 2009-2019 MongoDB Inc.
|
3
|
+
*
|
4
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
* you may not use this file except in compliance with the License.
|
6
|
+
* You may obtain a copy of the License at
|
7
|
+
*
|
8
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
*
|
10
|
+
* Unless required by applicable law or agreed to in writing, software
|
11
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
* See the License for the specific language governing permissions and
|
14
|
+
* limitations under the License.
|
15
|
+
*/
|
16
|
+
#include "bson-native.h"
|
17
|
+
|
18
|
+
/**
|
19
|
+
* The counter for incrementing object ids.
|
20
|
+
*/
|
21
|
+
uint32_t rb_bson_object_id_counter;
|
22
|
+
|
23
|
+
|
24
|
+
VALUE rb_bson_registry;
|
25
|
+
|
26
|
+
VALUE rb_bson_illegal_key;
|
27
|
+
|
28
|
+
const rb_data_type_t rb_byte_buffer_data_type = {
|
29
|
+
"bson/byte_buffer",
|
30
|
+
{ NULL, rb_bson_byte_buffer_free, rb_bson_byte_buffer_memsize }
|
31
|
+
};
|
32
|
+
|
33
|
+
/**
|
34
|
+
* Initialize the bson_native extension.
|
35
|
+
*/
|
36
|
+
void Init_bson_native()
|
37
|
+
{
|
38
|
+
char rb_bson_machine_id[256];
|
39
|
+
|
40
|
+
VALUE rb_bson_module = rb_define_module("BSON");
|
41
|
+
|
42
|
+
/* Document-class: BSON::ByteBuffer
|
43
|
+
*
|
44
|
+
* Stores BSON-serialized data and provides efficient serialization and
|
45
|
+
* deserialization of common Ruby classes using native code.
|
46
|
+
*/
|
47
|
+
VALUE rb_byte_buffer_class = rb_define_class_under(rb_bson_module, "ByteBuffer", rb_cObject);
|
48
|
+
|
49
|
+
VALUE rb_bson_object_id_class = rb_const_get(rb_bson_module, rb_intern("ObjectId"));
|
50
|
+
VALUE rb_bson_object_id_generator_class = rb_const_get(rb_bson_object_id_class, rb_intern("Generator"));
|
51
|
+
VALUE rb_digest_class = rb_const_get(rb_cObject, rb_intern("Digest"));
|
52
|
+
VALUE rb_md5_class = rb_const_get(rb_digest_class, rb_intern("MD5"));
|
53
|
+
|
54
|
+
rb_bson_illegal_key = rb_const_get(rb_const_get(rb_bson_module, rb_intern("String")),rb_intern("IllegalKey"));
|
55
|
+
|
56
|
+
rb_define_alloc_func(rb_byte_buffer_class, rb_bson_byte_buffer_allocate);
|
57
|
+
rb_define_method(rb_byte_buffer_class, "initialize", rb_bson_byte_buffer_initialize, -1);
|
58
|
+
rb_define_method(rb_byte_buffer_class, "length", rb_bson_byte_buffer_length, 0);
|
59
|
+
|
60
|
+
/*
|
61
|
+
* call-seq:
|
62
|
+
* buffer.read_position -> Fixnum
|
63
|
+
*
|
64
|
+
* Returns the read position in the buffer.
|
65
|
+
*/
|
66
|
+
rb_define_method(rb_byte_buffer_class, "read_position", rb_bson_byte_buffer_read_position, 0);
|
67
|
+
|
68
|
+
rb_define_method(rb_byte_buffer_class, "get_byte", rb_bson_byte_buffer_get_byte, 0);
|
69
|
+
rb_define_method(rb_byte_buffer_class, "get_bytes", rb_bson_byte_buffer_get_bytes, 1);
|
70
|
+
rb_define_method(rb_byte_buffer_class, "get_cstring", rb_bson_byte_buffer_get_cstring, 0);
|
71
|
+
rb_define_method(rb_byte_buffer_class, "get_decimal128_bytes", rb_bson_byte_buffer_get_decimal128_bytes, 0);
|
72
|
+
rb_define_method(rb_byte_buffer_class, "get_double", rb_bson_byte_buffer_get_double, 0);
|
73
|
+
rb_define_method(rb_byte_buffer_class, "get_hash", rb_bson_byte_buffer_get_hash, 0);
|
74
|
+
rb_define_method(rb_byte_buffer_class, "get_array", rb_bson_byte_buffer_get_array, 0);
|
75
|
+
rb_define_method(rb_byte_buffer_class, "get_int32", rb_bson_byte_buffer_get_int32, 0);
|
76
|
+
rb_define_method(rb_byte_buffer_class, "get_int64", rb_bson_byte_buffer_get_int64, 0);
|
77
|
+
rb_define_method(rb_byte_buffer_class, "get_string", rb_bson_byte_buffer_get_string, 0);
|
78
|
+
|
79
|
+
/*
|
80
|
+
* call-seq:
|
81
|
+
* buffer.write_position -> Fixnum
|
82
|
+
*
|
83
|
+
* Returns the write position in the buffer.
|
84
|
+
*/
|
85
|
+
rb_define_method(rb_byte_buffer_class, "write_position", rb_bson_byte_buffer_write_position, 0);
|
86
|
+
|
87
|
+
/*
|
88
|
+
* call-seq:
|
89
|
+
* buffer.put_byte(binary_str) -> ByteBuffer
|
90
|
+
*
|
91
|
+
* Writes the specified byte string, which must be of length 1,
|
92
|
+
* to the byte buffer.
|
93
|
+
*
|
94
|
+
* Returns the modified +self+.
|
95
|
+
*/
|
96
|
+
rb_define_method(rb_byte_buffer_class, "put_byte", rb_bson_byte_buffer_put_byte, 1);
|
97
|
+
|
98
|
+
/*
|
99
|
+
* call-seq:
|
100
|
+
* buffer.put_bytes(binary_str) -> ByteBuffer
|
101
|
+
*
|
102
|
+
* Writes the specified byte string to the byte buffer.
|
103
|
+
*
|
104
|
+
* This method writes exactly the provided byte string - in particular, it
|
105
|
+
* does not prepend the length, and does not append a null byte at the end.
|
106
|
+
*
|
107
|
+
* Returns the modified +self+.
|
108
|
+
*/
|
109
|
+
rb_define_method(rb_byte_buffer_class, "put_bytes", rb_bson_byte_buffer_put_bytes, 1);
|
110
|
+
|
111
|
+
/*
|
112
|
+
* call-seq:
|
113
|
+
* buffer.put_string(str) -> ByteBuffer
|
114
|
+
*
|
115
|
+
* Writes the specified string to the byte buffer as a BSON string.
|
116
|
+
*
|
117
|
+
* Unlike #put_bytes, this method writes the provided byte string as
|
118
|
+
* a "BSON string" - the string is prefixed with its length and suffixed
|
119
|
+
* with a null byte. The byte string may contain null bytes itself thus
|
120
|
+
* the null terminator is redundant, but it is required by the BSON
|
121
|
+
* specification.
|
122
|
+
*
|
123
|
+
* +str+ must either already be in UTF-8 encoding or be a string encodable
|
124
|
+
* to UTF-8. In particular, a string in BINARY/ASCII-8BIT encoding is
|
125
|
+
* generally not suitable for this method. +EncodingError+ will be raised
|
126
|
+
* if +str+ cannot be encoded in UTF-8, or if +str+ claims to be encoded in
|
127
|
+
* UTF-8 but contains bytes/byte sequences which are not valid in UTF-8.
|
128
|
+
* Use #put_bytes to write arbitrary byte strings to the buffer.
|
129
|
+
*
|
130
|
+
* Returns the modified +self+.
|
131
|
+
*/
|
132
|
+
rb_define_method(rb_byte_buffer_class, "put_string", rb_bson_byte_buffer_put_string, 1);
|
133
|
+
|
134
|
+
/**
|
135
|
+
* call-seq:
|
136
|
+
* buffer.put_cstring(obj) -> ByteBuffer
|
137
|
+
*
|
138
|
+
* Converts +obj+ to a string, which must not contain any null bytes, and
|
139
|
+
* which must be valid UTF-8, and writes the string to the buffer as a
|
140
|
+
* BSON cstring. +obj+ can be an instance of String, Symbol or Fixnum.
|
141
|
+
*
|
142
|
+
* If the string serialization of +obj+ contains null bytes, this method
|
143
|
+
* raises +ArgumentError+. If +obj+ is of an unsupported type, this method
|
144
|
+
* raises +TypeError+.
|
145
|
+
*
|
146
|
+
* BSON cstring serialization contains no length of the string (relying
|
147
|
+
* instead on the null terminator), unlike the BSON string serialization.
|
148
|
+
*/
|
149
|
+
rb_define_method(rb_byte_buffer_class, "put_cstring", rb_bson_byte_buffer_put_cstring, 1);
|
150
|
+
|
151
|
+
/**
|
152
|
+
* call-seq:
|
153
|
+
* buffer.put_symbol(sym) -> ByteBuffer
|
154
|
+
*
|
155
|
+
* Converts +sym+ to a string and writes the resulting string to the byte
|
156
|
+
* buffer.
|
157
|
+
*
|
158
|
+
* The symbol may contain null bytes.
|
159
|
+
*
|
160
|
+
* The symbol value is assumed to be encoded in UTF-8. If the symbol value
|
161
|
+
* contains bytes or byte sequences that are not valid in UTF-8, this method
|
162
|
+
* raises +EncodingError+.
|
163
|
+
*
|
164
|
+
* Note: due to the string conversion, a symbol written to the buffer becomes
|
165
|
+
* indistinguishable from a string with the same value written to the buffer.
|
166
|
+
*/
|
167
|
+
rb_define_method(rb_byte_buffer_class, "put_symbol", rb_bson_byte_buffer_put_symbol, 1);
|
168
|
+
|
169
|
+
/*
|
170
|
+
* call-seq:
|
171
|
+
* buffer.put_int32(fixnum) -> ByteBuffer
|
172
|
+
*
|
173
|
+
* Writes a 32-bit integer value to the buffer.
|
174
|
+
*
|
175
|
+
* If the argument cannot be represented in 32 bits, raises RangeError.
|
176
|
+
*
|
177
|
+
* Returns the modified +self+.
|
178
|
+
*/
|
179
|
+
rb_define_method(rb_byte_buffer_class, "put_int32", rb_bson_byte_buffer_put_int32, 1);
|
180
|
+
|
181
|
+
/*
|
182
|
+
* call-seq:
|
183
|
+
* buffer.put_int64(fixnum) -> ByteBuffer
|
184
|
+
*
|
185
|
+
* Writes a 64-bit integer value to the buffer.
|
186
|
+
*
|
187
|
+
* If the argument cannot be represented in 64 bits, raises RangeError.
|
188
|
+
*
|
189
|
+
* Returns the modified +self+.
|
190
|
+
*/
|
191
|
+
rb_define_method(rb_byte_buffer_class, "put_int64", rb_bson_byte_buffer_put_int64, 1);
|
192
|
+
|
193
|
+
/*
|
194
|
+
* call-seq:
|
195
|
+
* buffer.put_double(double) -> ByteBuffer
|
196
|
+
*
|
197
|
+
* Writes a 64-bit floating point value to the buffer.
|
198
|
+
*
|
199
|
+
* Returns the modified +self+.
|
200
|
+
*/
|
201
|
+
rb_define_method(rb_byte_buffer_class, "put_double", rb_bson_byte_buffer_put_double, 1);
|
202
|
+
|
203
|
+
/*
|
204
|
+
* call-seq:
|
205
|
+
* buffer.put_decimal128(low_64bit, high_64bit) -> ByteBuffer
|
206
|
+
*
|
207
|
+
* Writes a 128-bit Decimal128 value to the buffer.
|
208
|
+
*
|
209
|
+
* +low_64bit+ and +high_64bit+ are Fixnum objects containing the low and
|
210
|
+
* the high parts of the 128-bit Decimal128 value, respectively.
|
211
|
+
*
|
212
|
+
* Returns the modified +self+.
|
213
|
+
*/
|
214
|
+
rb_define_method(rb_byte_buffer_class, "put_decimal128", rb_bson_byte_buffer_put_decimal128, 2);
|
215
|
+
|
216
|
+
/*
|
217
|
+
* call-seq:
|
218
|
+
* buffer.put_hash(hash) -> ByteBuffer
|
219
|
+
*
|
220
|
+
* Writes a Hash into the byte buffer.
|
221
|
+
*
|
222
|
+
* Returns the modified +self+.
|
223
|
+
*/
|
224
|
+
rb_define_method(rb_byte_buffer_class, "put_hash", rb_bson_byte_buffer_put_hash, 2);
|
225
|
+
|
226
|
+
/*
|
227
|
+
* call-seq:
|
228
|
+
* buffer.put_array(array) -> ByteBuffer
|
229
|
+
*
|
230
|
+
* Writes an Array into the byte buffer.
|
231
|
+
*
|
232
|
+
* Returns the modified +self+.
|
233
|
+
*/
|
234
|
+
rb_define_method(rb_byte_buffer_class, "put_array", rb_bson_byte_buffer_put_array, 2);
|
235
|
+
|
236
|
+
/*
|
237
|
+
* call-seq:
|
238
|
+
* buffer.replace_int32(position, fixnum) -> ByteBuffer
|
239
|
+
*
|
240
|
+
* Replaces a 32-bit integer value at the specified position in the buffer.
|
241
|
+
*
|
242
|
+
* The position must be a non-negative integer, and must be completely
|
243
|
+
* contained within the data already written. For example, if the buffer has
|
244
|
+
* the write position of 12, the acceptable range of positions for this
|
245
|
+
* method is 0..8.
|
246
|
+
*
|
247
|
+
* If the argument cannot be represented in 32 bits, raises RangeError.
|
248
|
+
*
|
249
|
+
* Returns the modified +self+.
|
250
|
+
*/
|
251
|
+
rb_define_method(rb_byte_buffer_class, "replace_int32", rb_bson_byte_buffer_replace_int32, 2);
|
252
|
+
|
253
|
+
/*
|
254
|
+
* call-seq:
|
255
|
+
* buffer.rewind! -> ByteBuffer
|
256
|
+
*
|
257
|
+
* Resets the read position to the beginning of the byte buffer.
|
258
|
+
*
|
259
|
+
* Note: +rewind!+ does not change the buffer's write position.
|
260
|
+
*
|
261
|
+
* Returns the modified +self+.
|
262
|
+
*/
|
263
|
+
rb_define_method(rb_byte_buffer_class, "rewind!", rb_bson_byte_buffer_rewind, 0);
|
264
|
+
|
265
|
+
/*
|
266
|
+
* call-seq:
|
267
|
+
* buffer.to_s -> String
|
268
|
+
*
|
269
|
+
* Returns the contents of the buffer as a binary string.
|
270
|
+
*
|
271
|
+
* Note: this method copies the buffer's contents into a newly allocated
|
272
|
+
* +String+ instance. It does not return a reference to the data stored in
|
273
|
+
* the buffer itself.
|
274
|
+
*/
|
275
|
+
rb_define_method(rb_byte_buffer_class, "to_s", rb_bson_byte_buffer_to_s, 0);
|
276
|
+
|
277
|
+
rb_define_method(rb_bson_object_id_generator_class, "next_object_id", rb_bson_object_id_generator_next, -1);
|
278
|
+
|
279
|
+
// Get the object id machine id and hash it.
|
280
|
+
rb_require("digest/md5");
|
281
|
+
gethostname(rb_bson_machine_id, sizeof(rb_bson_machine_id));
|
282
|
+
rb_bson_machine_id[255] = '\0';
|
283
|
+
rb_bson_generate_machine_id(rb_md5_class, rb_bson_machine_id);
|
284
|
+
|
285
|
+
// Set the object id counter to a random number
|
286
|
+
rb_bson_object_id_counter = FIX2INT(rb_funcall(rb_mKernel, rb_intern("rand"), 1, INT2FIX(0x1000000)));
|
287
|
+
|
288
|
+
rb_bson_registry = rb_const_get(rb_bson_module, rb_intern("Registry"));
|
289
|
+
}
|
data/ext/bson/libbson-utf8.c
ADDED
@@ -0,0 +1,230 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <ruby/encoding.h>
|
3
|
+
#include <stdbool.h>
|
4
|
+
#include <unistd.h>
|
5
|
+
#include <assert.h>
|
6
|
+
#include "bson-native.h"
|
7
|
+
|
8
|
+
/**
|
9
|
+
* Taken from libbson.
|
10
|
+
*/
|
11
|
+
|
12
|
+
#define BSON_ASSERT assert
|
13
|
+
#define BSON_INLINE
|
14
|
+
|
15
|
+
|
16
|
+
/*
 *--------------------------------------------------------------------------
 *
 * _bson_utf8_get_sequence --
 *
 *       Classify the lead byte of the UTF-8 sequence starting at @utf8.
 *
 *       The high bits of the lead byte select the sequence length:
 *
 *          0xxxxxxx -> 1 byte,  payload mask 0x7F
 *          110xxxxx -> 2 bytes, payload mask 0x1F
 *          1110xxxx -> 3 bytes, payload mask 0x0F
 *          11110xxx -> 4 bytes, payload mask 0x07
 *
 *       Any other lead byte (a continuation byte, or a lead byte for a
 *       sequence longer than 4 bytes) is reported as length 0 with mask 0.
 *       Only the first byte of @utf8 is read.
 *
 * Returns:
 *       None.
 *
 * Side effects:
 *       @seq_length receives the sequence length in bytes, or 0.
 *       @first_mask receives the mask selecting the payload bits of the
 *       lead byte, or 0.
 *
 *--------------------------------------------------------------------------
 */

static BSON_INLINE void
_bson_utf8_get_sequence (const char *utf8,    /* IN */
                         uint8_t *seq_length, /* OUT */
                         uint8_t *first_mask) /* OUT */
{
   /* One row per accepted lead-byte form, matched in order. */
   static const struct {
      uint8_t prefix_mask;  /* bits of the lead byte that form the prefix */
      uint8_t prefix_bits;  /* value those bits must have */
      uint8_t length;       /* total sequence length in bytes */
      uint8_t payload_mask; /* payload bits of the lead byte */
   } lead_forms[] = {
      { 0x80, 0x00, 1, 0x7F },
      { 0xE0, 0xC0, 2, 0x1F },
      { 0xF0, 0xE0, 3, 0x0F },
      { 0xF8, 0xF0, 4, 0x07 },
   };
   const unsigned char lead = *(const unsigned char *) utf8;
   uint8_t length = 0;
   uint8_t payload_mask = 0;
   unsigned int row;

   for (row = 0; row < sizeof (lead_forms) / sizeof (lead_forms[0]); row++) {
      if ((lead & lead_forms[row].prefix_mask) == lead_forms[row].prefix_bits) {
         length = lead_forms[row].length;
         payload_mask = lead_forms[row].payload_mask;
         break;
      }
   }

   *seq_length = length;
   *first_mask = payload_mask;
}
|
73
|
+
|
74
|
+
|
75
|
+
/*
|
76
|
+
*--------------------------------------------------------------------------
|
77
|
+
*
|
78
|
+
* bson_utf8_validate --
|
79
|
+
*
|
80
|
+
* Validates that @utf8 is a valid UTF-8 string. Note that we only
|
81
|
+
* support UTF-8 characters which have sequence length less than or equal
|
82
|
+
* to 4 bytes (RFC 3629).
|
83
|
+
*
|
84
|
+
* If @allow_null is true, then \0 is allowed within @utf8_len bytes
|
85
|
+
* of @utf8. Generally, this is bad practice since the main point of
|
86
|
+
* UTF-8 strings is that they can be used with strlen() and friends.
|
87
|
+
* However, some languages such as Python can send UTF-8 encoded
|
88
|
+
* strings with NUL's in them.
|
89
|
+
*
|
90
|
+
* Parameters:
|
91
|
+
* @utf8: A UTF-8 encoded string.
|
92
|
+
* @utf8_len: The length of @utf8 in bytes.
|
93
|
+
* @allow_null: If \0 is allowed within @utf8, exclusing trailing \0.
|
94
|
+
* @data_type: The data type being serialized.
|
95
|
+
*
|
96
|
+
* Returns:
|
97
|
+
* true if @utf8 is valid UTF-8. otherwise false.
|
98
|
+
*
|
99
|
+
* Side effects:
|
100
|
+
* None.
|
101
|
+
*
|
102
|
+
*--------------------------------------------------------------------------
|
103
|
+
*/
|
104
|
+
|
105
|
+
void
|
106
|
+
rb_bson_utf8_validate (const char *utf8, /* IN */
|
107
|
+
size_t utf8_len, /* IN */
|
108
|
+
bool allow_null, /* IN */
|
109
|
+
const char *data_type) /* IN */
|
110
|
+
{
|
111
|
+
uint32_t c;
|
112
|
+
uint8_t first_mask;
|
113
|
+
uint8_t seq_length;
|
114
|
+
unsigned i;
|
115
|
+
unsigned j;
|
116
|
+
bool not_shortest_form;
|
117
|
+
|
118
|
+
BSON_ASSERT (utf8);
|
119
|
+
|
120
|
+
for (i = 0; i < utf8_len; i += seq_length) {
|
121
|
+
_bson_utf8_get_sequence (&utf8[i], &seq_length, &first_mask);
|
122
|
+
|
123
|
+
/*
|
124
|
+
* Ensure we have a valid multi-byte sequence length.
|
125
|
+
*/
|
126
|
+
if (!seq_length) {
|
127
|
+
rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: bogus initial bits", data_type, utf8);
|
128
|
+
}
|
129
|
+
|
130
|
+
/*
|
131
|
+
* Ensure we have enough bytes left.
|
132
|
+
*/
|
133
|
+
if ((utf8_len - i) < seq_length) {
|
134
|
+
rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: truncated multi-byte sequence", data_type, utf8);
|
135
|
+
}
|
136
|
+
|
137
|
+
/*
|
138
|
+
* Also calculate the next char as a unichar so we can
|
139
|
+
* check code ranges for non-shortest form.
|
140
|
+
*/
|
141
|
+
c = utf8[i] & first_mask;
|
142
|
+
|
143
|
+
/*
|
144
|
+
* Check the high-bits for each additional sequence byte.
|
145
|
+
*/
|
146
|
+
for (j = i + 1; j < (i + seq_length); j++) {
|
147
|
+
c = (c << 6) | (utf8[j] & 0x3F);
|
148
|
+
if ((utf8[j] & 0xC0) != 0x80) {
|
149
|
+
rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: bogus high bits for continuation byte", data_type, utf8);
|
150
|
+
}
|
151
|
+
}
|
152
|
+
|
153
|
+
/*
|
154
|
+
* Check for NULL bytes afterwards.
|
155
|
+
*
|
156
|
+
* Hint: if you want to optimize this function, starting here to do
|
157
|
+
* this in the same pass as the data above would probably be a good
|
158
|
+
* idea. You would add a branch into the inner loop, but save possibly
|
159
|
+
* on cache-line bouncing on larger strings. Just a thought.
|
160
|
+
*/
|
161
|
+
if (!allow_null) {
|
162
|
+
for (j = 0; j < seq_length; j++) {
|
163
|
+
if (((i + j) > utf8_len) || !utf8[i + j]) {
|
164
|
+
rb_raise(rb_eArgError, "%s %s contains null bytes", data_type, utf8);
|
165
|
+
}
|
166
|
+
}
|
167
|
+
}
|
168
|
+
|
169
|
+
/*
|
170
|
+
* Code point won't fit in utf-16, not allowed.
|
171
|
+
*/
|
172
|
+
if (c > 0x0010FFFF) {
|
173
|
+
rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: code point %"PRIu32" does not fit in UTF-16", data_type, utf8, c);
|
174
|
+
}
|
175
|
+
|
176
|
+
/*
|
177
|
+
* Byte is in reserved range for UTF-16 high-marks
|
178
|
+
* for surrogate pairs.
|
179
|
+
*/
|
180
|
+
if ((c & 0xFFFFF800) == 0xD800) {
|
181
|
+
rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: byte is in surrogate pair reserved range", data_type, utf8);
|
182
|
+
}
|
183
|
+
|
184
|
+
/*
|
185
|
+
* Check non-shortest form unicode.
|
186
|
+
*/
|
187
|
+
not_shortest_form = false;
|
188
|
+
switch (seq_length) {
|
189
|
+
case 1:
|
190
|
+
if (c <= 0x007F) {
|
191
|
+
continue;
|
192
|
+
}
|
193
|
+
not_shortest_form = true;
|
194
|
+
|
195
|
+
case 2:
|
196
|
+
if ((c >= 0x0080) && (c <= 0x07FF)) {
|
197
|
+
continue;
|
198
|
+
} else if (c == 0) {
|
199
|
+
/* Two-byte representation for NULL. */
|
200
|
+
if (!allow_null) {
|
201
|
+
rb_raise(rb_eArgError, "%s %s contains null bytes", data_type, utf8);
|
202
|
+
}
|
203
|
+
continue;
|
204
|
+
}
|
205
|
+
not_shortest_form = true;
|
206
|
+
|
207
|
+
case 3:
|
208
|
+
if (((c >= 0x0800) && (c <= 0x0FFF)) ||
|
209
|
+
((c >= 0x1000) && (c <= 0xFFFF))) {
|
210
|
+
continue;
|
211
|
+
}
|
212
|
+
not_shortest_form = true;
|
213
|
+
|
214
|
+
case 4:
|
215
|
+
if (((c >= 0x10000) && (c <= 0x3FFFF)) ||
|
216
|
+
((c >= 0x40000) && (c <= 0xFFFFF)) ||
|
217
|
+
((c >= 0x100000) && (c <= 0x10FFFF))) {
|
218
|
+
continue;
|
219
|
+
}
|
220
|
+
not_shortest_form = true;
|
221
|
+
|
222
|
+
default:
|
223
|
+
not_shortest_form = true;
|
224
|
+
}
|
225
|
+
|
226
|
+
if (not_shortest_form) {
|
227
|
+
rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: not in shortest form", data_type, utf8);
|
228
|
+
}
|
229
|
+
}
|
230
|
+
}
|