bson 4.5.0 → 4.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,289 @@
1
+ /*
2
+ * Copyright (C) 2009-2019 MongoDB Inc.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+ #include "bson-native.h"
17
+
18
+ /**
19
+ * The counter for incrementing object ids.
20
+ */
21
+ uint32_t rb_bson_object_id_counter;
22
+
23
+
24
+ VALUE rb_bson_registry;
25
+
26
+ VALUE rb_bson_illegal_key;
27
+
28
+ const rb_data_type_t rb_byte_buffer_data_type = {
29
+ "bson/byte_buffer",
30
+ { NULL, rb_bson_byte_buffer_free, rb_bson_byte_buffer_memsize }
31
+ };
32
+
33
+ /**
34
+ * Initialize the bson_native extension.
35
+ */
36
+ void Init_bson_native()
37
+ {
38
+ char rb_bson_machine_id[256];
39
+
40
+ VALUE rb_bson_module = rb_define_module("BSON");
41
+
42
+ /* Document-class: BSON::ByteBuffer
43
+ *
44
+ * Stores BSON-serialized data and provides efficient serialization and
45
+ * deserialization of common Ruby classes using native code.
46
+ */
47
+ VALUE rb_byte_buffer_class = rb_define_class_under(rb_bson_module, "ByteBuffer", rb_cObject);
48
+
49
+ VALUE rb_bson_object_id_class = rb_const_get(rb_bson_module, rb_intern("ObjectId"));
50
+ VALUE rb_bson_object_id_generator_class = rb_const_get(rb_bson_object_id_class, rb_intern("Generator"));
51
+ VALUE rb_digest_class = rb_const_get(rb_cObject, rb_intern("Digest"));
52
+ VALUE rb_md5_class = rb_const_get(rb_digest_class, rb_intern("MD5"));
53
+
54
+ rb_bson_illegal_key = rb_const_get(rb_const_get(rb_bson_module, rb_intern("String")),rb_intern("IllegalKey"));
55
+
56
+ rb_define_alloc_func(rb_byte_buffer_class, rb_bson_byte_buffer_allocate);
57
+ rb_define_method(rb_byte_buffer_class, "initialize", rb_bson_byte_buffer_initialize, -1);
58
+ rb_define_method(rb_byte_buffer_class, "length", rb_bson_byte_buffer_length, 0);
59
+
60
+ /*
61
+ * call-seq:
62
+ * buffer.read_position -> Fixnum
63
+ *
64
+ * Returns the read position in the buffer.
65
+ */
66
+ rb_define_method(rb_byte_buffer_class, "read_position", rb_bson_byte_buffer_read_position, 0);
67
+
68
+ rb_define_method(rb_byte_buffer_class, "get_byte", rb_bson_byte_buffer_get_byte, 0);
69
+ rb_define_method(rb_byte_buffer_class, "get_bytes", rb_bson_byte_buffer_get_bytes, 1);
70
+ rb_define_method(rb_byte_buffer_class, "get_cstring", rb_bson_byte_buffer_get_cstring, 0);
71
+ rb_define_method(rb_byte_buffer_class, "get_decimal128_bytes", rb_bson_byte_buffer_get_decimal128_bytes, 0);
72
+ rb_define_method(rb_byte_buffer_class, "get_double", rb_bson_byte_buffer_get_double, 0);
73
+ rb_define_method(rb_byte_buffer_class, "get_hash", rb_bson_byte_buffer_get_hash, 0);
74
+ rb_define_method(rb_byte_buffer_class, "get_array", rb_bson_byte_buffer_get_array, 0);
75
+ rb_define_method(rb_byte_buffer_class, "get_int32", rb_bson_byte_buffer_get_int32, 0);
76
+ rb_define_method(rb_byte_buffer_class, "get_int64", rb_bson_byte_buffer_get_int64, 0);
77
+ rb_define_method(rb_byte_buffer_class, "get_string", rb_bson_byte_buffer_get_string, 0);
78
+
79
+ /*
80
+ * call-seq:
81
+ * buffer.write_position -> Fixnum
82
+ *
83
+ * Returns the write position in the buffer.
84
+ */
85
+ rb_define_method(rb_byte_buffer_class, "write_position", rb_bson_byte_buffer_write_position, 0);
86
+
87
+ /*
88
+ * call-seq:
89
+ * buffer.put_byte(binary_str) -> ByteBuffer
90
+ *
91
+ * Writes the specified byte string, which must be of length 1,
92
+ * to the byte buffer.
93
+ *
94
+ * Returns the modified +self+.
95
+ */
96
+ rb_define_method(rb_byte_buffer_class, "put_byte", rb_bson_byte_buffer_put_byte, 1);
97
+
98
+ /*
99
+ * call-seq:
100
+ * buffer.put_bytes(binary_str) -> ByteBuffer
101
+ *
102
+ * Writes the specified byte string to the byte buffer.
103
+ *
104
+ * This method writes exactly the provided byte string - in particular, it
105
+ * does not prepend the length, and does not append a null byte at the end.
106
+ *
107
+ * Returns the modified +self+.
108
+ */
109
+ rb_define_method(rb_byte_buffer_class, "put_bytes", rb_bson_byte_buffer_put_bytes, 1);
110
+
111
+ /*
112
+ * call-seq:
113
+ * buffer.put_string(str) -> ByteBuffer
114
+ *
115
+ * Writes the specified string to the byte buffer as a BSON string.
116
+ *
117
+ * Unlike #put_bytes, this method writes the provided byte string as
118
+ * a "BSON string" - the string is prefixed with its length and suffixed
119
+ * with a null byte. The byte string may contain null bytes itself thus
120
+ * the null terminator is redundant, but it is required by the BSON
121
+ * specification.
122
+ *
123
+ * +str+ must either already be in UTF-8 encoding or be a string encodable
124
+ * to UTF-8. In particular, a string in BINARY/ASCII-8BIT encoding is
125
+ * generally not suitable for this method. +EncodingError+ will be raised
126
+ * if +str+ cannot be encoded in UTF-8, or if +str+ claims to be encoded in
127
+ * UTF-8 but contains bytes/byte sequences which are not valid in UTF-8.
128
+ * Use #put_bytes to write arbitrary byte strings to the buffer.
129
+ *
130
+ * Returns the modified +self+.
131
+ */
132
+ rb_define_method(rb_byte_buffer_class, "put_string", rb_bson_byte_buffer_put_string, 1);
133
+
134
+ /**
135
+ * call-seq:
136
+ * buffer.put_cstring(obj) -> ByteBuffer
137
+ *
138
+ * Converts +obj+ to a string, which must not contain any null bytes, and
139
+ * which must be valid UTF-8, and writes the string to the buffer as a
140
+ * BSON cstring. +obj+ can be an instance of String, Symbol or Fixnum.
141
+ *
142
+ * If the string serialization of +obj+ contains null bytes, this method
143
+ * raises +ArgumentError+. If +obj+ is of an unsupported type, this method
144
+ * raises +TypeError+.
145
+ *
146
+ * BSON cstring serialization contains no length of the string (relying
147
+ * instead on the null terminator), unlike the BSON string serialization.
148
+ */
149
+ rb_define_method(rb_byte_buffer_class, "put_cstring", rb_bson_byte_buffer_put_cstring, 1);
150
+
151
+ /**
152
+ * call-seq:
153
+ * buffer.put_symbol(sym) -> ByteBuffer
154
+ *
155
+ * Converts +sym+ to a string and writes the resulting string to the byte
156
+ * buffer.
157
+ *
158
+ * The symbol may contain null bytes.
159
+ *
160
+ * The symbol value is assumed to be encoded in UTF-8. If the symbol value
161
+ * contains bytes or byte sequences that are not valid in UTF-8, this method
162
+ * raises +EncodingError+.
163
+ *
164
+ * Note: due to the string conversion, a symbol written to the buffer becomes
165
+ * indistinguishable from a string with the same value written to the buffer.
166
+ */
167
+ rb_define_method(rb_byte_buffer_class, "put_symbol", rb_bson_byte_buffer_put_symbol, 1);
168
+
169
+ /*
170
+ * call-seq:
171
+ * buffer.put_int32(fixnum) -> ByteBuffer
172
+ *
173
+ * Writes a 32-bit integer value to the buffer.
174
+ *
175
+ * If the argument cannot be represented in 32 bits, raises RangeError.
176
+ *
177
+ * Returns the modified +self+.
178
+ */
179
+ rb_define_method(rb_byte_buffer_class, "put_int32", rb_bson_byte_buffer_put_int32, 1);
180
+
181
+ /*
182
+ * call-seq:
183
+ * buffer.put_int64(fixnum) -> ByteBuffer
184
+ *
185
+ * Writes a 64-bit integer value to the buffer.
186
+ *
187
+ * If the argument cannot be represented in 64 bits, raises RangeError.
188
+ *
189
+ * Returns the modified +self+.
190
+ */
191
+ rb_define_method(rb_byte_buffer_class, "put_int64", rb_bson_byte_buffer_put_int64, 1);
192
+
193
+ /*
194
+ * call-seq:
195
+ * buffer.put_double(double) -> ByteBuffer
196
+ *
197
+ * Writes a 64-bit floating point value to the buffer.
198
+ *
199
+ * Returns the modified +self+.
200
+ */
201
+ rb_define_method(rb_byte_buffer_class, "put_double", rb_bson_byte_buffer_put_double, 1);
202
+
203
+ /*
204
+ * call-seq:
205
+ * buffer.put_decimal128(low_64bit, high_64bit) -> ByteBuffer
206
+ *
207
+ * Writes a 128-bit Decimal128 value to the buffer.
208
+ *
209
+ * +low_64bit+ and +high_64bit+ are Fixnum objects containing the low and
210
+ * the high parts of the 128-bit Decimal128 value, respectively.
211
+ *
212
+ * Returns the modified +self+.
213
+ */
214
+ rb_define_method(rb_byte_buffer_class, "put_decimal128", rb_bson_byte_buffer_put_decimal128, 2);
215
+
216
+ /*
217
+ * call-seq:
218
+ * buffer.put_hash(hash) -> ByteBuffer
219
+ *
220
+ * Writes a Hash into the byte buffer.
221
+ *
222
+ * Returns the modified +self+.
223
+ */
224
+ rb_define_method(rb_byte_buffer_class, "put_hash", rb_bson_byte_buffer_put_hash, 2);
225
+
226
+ /*
227
+ * call-seq:
228
+ * buffer.put_array(array) -> ByteBuffer
229
+ *
230
+ * Writes an Array into the byte buffer.
231
+ *
232
+ * Returns the modified +self+.
233
+ */
234
+ rb_define_method(rb_byte_buffer_class, "put_array", rb_bson_byte_buffer_put_array, 2);
235
+
236
+ /*
237
+ * call-seq:
238
+ * buffer.replace_int32(position, fixnum) -> ByteBuffer
239
+ *
240
+ * Replaces a 32-bit integer value at the specified position in the buffer.
241
+ *
242
+ * The position must be a non-negative integer, and must be completely
243
+ * contained within the data already written. For example, if the buffer has
244
+ * the write position of 12, the acceptable range of positions for this
245
+ * method is 0..8.
246
+ *
247
+ * If the argument cannot be represented in 32 bits, raises RangeError.
248
+ *
249
+ * Returns the modified +self+.
250
+ */
251
+ rb_define_method(rb_byte_buffer_class, "replace_int32", rb_bson_byte_buffer_replace_int32, 2);
252
+
253
+ /*
254
+ * call-seq:
255
+ * buffer.rewind! -> ByteBuffer
256
+ *
257
+ * Resets the read position to the beginning of the byte buffer.
258
+ *
259
+ * Note: +rewind!+ does not change the buffer's write position.
260
+ *
261
+ * Returns the modified +self+.
262
+ */
263
+ rb_define_method(rb_byte_buffer_class, "rewind!", rb_bson_byte_buffer_rewind, 0);
264
+
265
+ /*
266
+ * call-seq:
267
+ * buffer.to_s -> String
268
+ *
269
+ * Returns the contents of the buffer as a binary string.
270
+ *
271
+ * Note: this method copies the buffer's contents into a newly allocated
272
+ * +String+ instance. It does not return a reference to the data stored in
273
+ * the buffer itself.
274
+ */
275
+ rb_define_method(rb_byte_buffer_class, "to_s", rb_bson_byte_buffer_to_s, 0);
276
+
277
+ rb_define_method(rb_bson_object_id_generator_class, "next_object_id", rb_bson_object_id_generator_next, -1);
278
+
279
+ // Get the object id machine id and hash it.
280
+ rb_require("digest/md5");
281
+ gethostname(rb_bson_machine_id, sizeof(rb_bson_machine_id));
282
+ rb_bson_machine_id[255] = '\0';
283
+ rb_bson_generate_machine_id(rb_md5_class, rb_bson_machine_id);
284
+
285
+ // Set the object id counter to a random number
286
+ rb_bson_object_id_counter = FIX2INT(rb_funcall(rb_mKernel, rb_intern("rand"), 1, INT2FIX(0x1000000)));
287
+
288
+ rb_bson_registry = rb_const_get(rb_bson_module, rb_intern("Registry"));
289
+ }
@@ -0,0 +1,230 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include <stdbool.h>
4
+ #include <unistd.h>
5
+ #include <assert.h>
6
+ #include "bson-native.h"
7
+
8
/**
 * The UTF-8 helpers in this file are taken from libbson. The two macros
 * below stand in for libbson's own definitions: assertions map onto the
 * standard assert() and the inline marker expands to nothing.
 */

#define BSON_ASSERT assert
#define BSON_INLINE


/*
 *--------------------------------------------------------------------------
 *
 * _bson_utf8_get_sequence --
 *
 *       Classify the lead byte of the UTF-8 sequence starting at @utf8.
 *
 *       Only the first byte of @utf8 is read. Depending on its high bits
 *       the outputs are:
 *
 *          0xxxxxxx  ->  length 1, mask 0x7F
 *          110xxxxx  ->  length 2, mask 0x1F
 *          1110xxxx  ->  length 3, mask 0x0F
 *          11110xxx  ->  length 4, mask 0x07
 *          other     ->  length 0, mask 0 (not a valid lead byte)
 *
 *       The mask selects the payload bits of the lead byte.
 *
 * Returns:
 *       None.
 *
 * Side effects:
 *       @seq_length is set.
 *       @first_mask is set.
 *
 *--------------------------------------------------------------------------
 */

static BSON_INLINE void
_bson_utf8_get_sequence (const char *utf8,    /* IN */
                         uint8_t *seq_length, /* OUT */
                         uint8_t *first_mask) /* OUT */
{
  const unsigned char lead = *(const unsigned char *) utf8;

  /* Single byte (ASCII): top bit clear. */
  if (lead < 0x80) {
    *seq_length = 1;
    *first_mask = 0x7F;
    return;
  }

  /* Two-byte lead: top three bits are 110. */
  if ((lead >> 5) == 0x06) {
    *seq_length = 2;
    *first_mask = 0x1F;
    return;
  }

  /* Three-byte lead: top four bits are 1110. */
  if ((lead >> 4) == 0x0E) {
    *seq_length = 3;
    *first_mask = 0x0F;
    return;
  }

  /* Four-byte lead: top five bits are 11110. */
  if ((lead >> 3) == 0x1E) {
    *seq_length = 4;
    *first_mask = 0x07;
    return;
  }

  /* Continuation byte or a lead byte for a sequence longer than 4 bytes. */
  *seq_length = 0;
  *first_mask = 0;
}
73
+
74
+
75
+ /*
76
+ *--------------------------------------------------------------------------
77
+ *
78
+ * bson_utf8_validate --
79
+ *
80
+ * Validates that @utf8 is a valid UTF-8 string. Note that we only
81
+ * support UTF-8 characters which have sequence length less than or equal
82
+ * to 4 bytes (RFC 3629).
83
+ *
84
+ * If @allow_null is true, then \0 is allowed within @utf8_len bytes
85
+ * of @utf8. Generally, this is bad practice since the main point of
86
+ * UTF-8 strings is that they can be used with strlen() and friends.
87
+ * However, some languages such as Python can send UTF-8 encoded
88
+ * strings with NUL's in them.
89
+ *
90
+ * Parameters:
91
+ * @utf8: A UTF-8 encoded string.
92
+ * @utf8_len: The length of @utf8 in bytes.
93
+ * @allow_null: If \0 is allowed within @utf8, exclusing trailing \0.
94
+ * @data_type: The data type being serialized.
95
+ *
96
+ * Returns:
97
+ * true if @utf8 is valid UTF-8. otherwise false.
98
+ *
99
+ * Side effects:
100
+ * None.
101
+ *
102
+ *--------------------------------------------------------------------------
103
+ */
104
+
105
+ void
106
+ rb_bson_utf8_validate (const char *utf8, /* IN */
107
+ size_t utf8_len, /* IN */
108
+ bool allow_null, /* IN */
109
+ const char *data_type) /* IN */
110
+ {
111
+ uint32_t c;
112
+ uint8_t first_mask;
113
+ uint8_t seq_length;
114
+ unsigned i;
115
+ unsigned j;
116
+ bool not_shortest_form;
117
+
118
+ BSON_ASSERT (utf8);
119
+
120
+ for (i = 0; i < utf8_len; i += seq_length) {
121
+ _bson_utf8_get_sequence (&utf8[i], &seq_length, &first_mask);
122
+
123
+ /*
124
+ * Ensure we have a valid multi-byte sequence length.
125
+ */
126
+ if (!seq_length) {
127
+ rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: bogus initial bits", data_type, utf8);
128
+ }
129
+
130
+ /*
131
+ * Ensure we have enough bytes left.
132
+ */
133
+ if ((utf8_len - i) < seq_length) {
134
+ rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: truncated multi-byte sequence", data_type, utf8);
135
+ }
136
+
137
+ /*
138
+ * Also calculate the next char as a unichar so we can
139
+ * check code ranges for non-shortest form.
140
+ */
141
+ c = utf8[i] & first_mask;
142
+
143
+ /*
144
+ * Check the high-bits for each additional sequence byte.
145
+ */
146
+ for (j = i + 1; j < (i + seq_length); j++) {
147
+ c = (c << 6) | (utf8[j] & 0x3F);
148
+ if ((utf8[j] & 0xC0) != 0x80) {
149
+ rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: bogus high bits for continuation byte", data_type, utf8);
150
+ }
151
+ }
152
+
153
+ /*
154
+ * Check for NULL bytes afterwards.
155
+ *
156
+ * Hint: if you want to optimize this function, starting here to do
157
+ * this in the same pass as the data above would probably be a good
158
+ * idea. You would add a branch into the inner loop, but save possibly
159
+ * on cache-line bouncing on larger strings. Just a thought.
160
+ */
161
+ if (!allow_null) {
162
+ for (j = 0; j < seq_length; j++) {
163
+ if (((i + j) > utf8_len) || !utf8[i + j]) {
164
+ rb_raise(rb_eArgError, "%s %s contains null bytes", data_type, utf8);
165
+ }
166
+ }
167
+ }
168
+
169
+ /*
170
+ * Code point won't fit in utf-16, not allowed.
171
+ */
172
+ if (c > 0x0010FFFF) {
173
+ rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: code point %"PRIu32" does not fit in UTF-16", data_type, utf8, c);
174
+ }
175
+
176
+ /*
177
+ * Byte is in reserved range for UTF-16 high-marks
178
+ * for surrogate pairs.
179
+ */
180
+ if ((c & 0xFFFFF800) == 0xD800) {
181
+ rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: byte is in surrogate pair reserved range", data_type, utf8);
182
+ }
183
+
184
+ /*
185
+ * Check non-shortest form unicode.
186
+ */
187
+ not_shortest_form = false;
188
+ switch (seq_length) {
189
+ case 1:
190
+ if (c <= 0x007F) {
191
+ continue;
192
+ }
193
+ not_shortest_form = true;
194
+
195
+ case 2:
196
+ if ((c >= 0x0080) && (c <= 0x07FF)) {
197
+ continue;
198
+ } else if (c == 0) {
199
+ /* Two-byte representation for NULL. */
200
+ if (!allow_null) {
201
+ rb_raise(rb_eArgError, "%s %s contains null bytes", data_type, utf8);
202
+ }
203
+ continue;
204
+ }
205
+ not_shortest_form = true;
206
+
207
+ case 3:
208
+ if (((c >= 0x0800) && (c <= 0x0FFF)) ||
209
+ ((c >= 0x1000) && (c <= 0xFFFF))) {
210
+ continue;
211
+ }
212
+ not_shortest_form = true;
213
+
214
+ case 4:
215
+ if (((c >= 0x10000) && (c <= 0x3FFFF)) ||
216
+ ((c >= 0x40000) && (c <= 0xFFFFF)) ||
217
+ ((c >= 0x100000) && (c <= 0x10FFFF))) {
218
+ continue;
219
+ }
220
+ not_shortest_form = true;
221
+
222
+ default:
223
+ not_shortest_form = true;
224
+ }
225
+
226
+ if (not_shortest_form) {
227
+ rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: not in shortest form", data_type, utf8);
228
+ }
229
+ }
230
+ }