bson 4.5.0 → 4.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,289 @@
1
+ /*
2
+ * Copyright (C) 2009-2019 MongoDB Inc.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+ #include "bson-native.h"
17
+
18
+ /**
19
+ * The counter for incrementing object ids.
20
+ */
21
+ uint32_t rb_bson_object_id_counter;
22
+
23
+
24
+ VALUE rb_bson_registry;
25
+
26
+ VALUE rb_bson_illegal_key;
27
+
28
+ const rb_data_type_t rb_byte_buffer_data_type = {
29
+ "bson/byte_buffer",
30
+ { NULL, rb_bson_byte_buffer_free, rb_bson_byte_buffer_memsize }
31
+ };
32
+
33
+ /**
34
+ * Initialize the bson_native extension.
35
+ */
36
+ void Init_bson_native()
37
+ {
38
+ char rb_bson_machine_id[256];
39
+
40
+ VALUE rb_bson_module = rb_define_module("BSON");
41
+
42
+ /* Document-class: BSON::ByteBuffer
43
+ *
44
+ * Stores BSON-serialized data and provides efficient serialization and
45
+ * deserialization of common Ruby classes using native code.
46
+ */
47
+ VALUE rb_byte_buffer_class = rb_define_class_under(rb_bson_module, "ByteBuffer", rb_cObject);
48
+
49
+ VALUE rb_bson_object_id_class = rb_const_get(rb_bson_module, rb_intern("ObjectId"));
50
+ VALUE rb_bson_object_id_generator_class = rb_const_get(rb_bson_object_id_class, rb_intern("Generator"));
51
+ VALUE rb_digest_class = rb_const_get(rb_cObject, rb_intern("Digest"));
52
+ VALUE rb_md5_class = rb_const_get(rb_digest_class, rb_intern("MD5"));
53
+
54
+ rb_bson_illegal_key = rb_const_get(rb_const_get(rb_bson_module, rb_intern("String")),rb_intern("IllegalKey"));
55
+
56
+ rb_define_alloc_func(rb_byte_buffer_class, rb_bson_byte_buffer_allocate);
57
+ rb_define_method(rb_byte_buffer_class, "initialize", rb_bson_byte_buffer_initialize, -1);
58
+ rb_define_method(rb_byte_buffer_class, "length", rb_bson_byte_buffer_length, 0);
59
+
60
+ /*
61
+ * call-seq:
62
+ * buffer.read_position -> Fixnum
63
+ *
64
+ * Returns the read position in the buffer.
65
+ */
66
+ rb_define_method(rb_byte_buffer_class, "read_position", rb_bson_byte_buffer_read_position, 0);
67
+
68
+ rb_define_method(rb_byte_buffer_class, "get_byte", rb_bson_byte_buffer_get_byte, 0);
69
+ rb_define_method(rb_byte_buffer_class, "get_bytes", rb_bson_byte_buffer_get_bytes, 1);
70
+ rb_define_method(rb_byte_buffer_class, "get_cstring", rb_bson_byte_buffer_get_cstring, 0);
71
+ rb_define_method(rb_byte_buffer_class, "get_decimal128_bytes", rb_bson_byte_buffer_get_decimal128_bytes, 0);
72
+ rb_define_method(rb_byte_buffer_class, "get_double", rb_bson_byte_buffer_get_double, 0);
73
+ rb_define_method(rb_byte_buffer_class, "get_hash", rb_bson_byte_buffer_get_hash, 0);
74
+ rb_define_method(rb_byte_buffer_class, "get_array", rb_bson_byte_buffer_get_array, 0);
75
+ rb_define_method(rb_byte_buffer_class, "get_int32", rb_bson_byte_buffer_get_int32, 0);
76
+ rb_define_method(rb_byte_buffer_class, "get_int64", rb_bson_byte_buffer_get_int64, 0);
77
+ rb_define_method(rb_byte_buffer_class, "get_string", rb_bson_byte_buffer_get_string, 0);
78
+
79
+ /*
80
+ * call-seq:
81
+ * buffer.write_position -> Fixnum
82
+ *
83
+ * Returns the write position in the buffer.
84
+ */
85
+ rb_define_method(rb_byte_buffer_class, "write_position", rb_bson_byte_buffer_write_position, 0);
86
+
87
+ /*
88
+ * call-seq:
89
+ * buffer.put_byte(binary_str) -> ByteBuffer
90
+ *
91
+ * Writes the specified byte string, which must be of length 1,
92
+ * to the byte buffer.
93
+ *
94
+ * Returns the modified +self+.
95
+ */
96
+ rb_define_method(rb_byte_buffer_class, "put_byte", rb_bson_byte_buffer_put_byte, 1);
97
+
98
+ /*
99
+ * call-seq:
100
+ * buffer.put_bytes(binary_str) -> ByteBuffer
101
+ *
102
+ * Writes the specified byte string to the byte buffer.
103
+ *
104
+ * This method writes exactly the provided byte string - in particular, it
105
+ * does not prepend the length, and does not append a null byte at the end.
106
+ *
107
+ * Returns the modified +self+.
108
+ */
109
+ rb_define_method(rb_byte_buffer_class, "put_bytes", rb_bson_byte_buffer_put_bytes, 1);
110
+
111
+ /*
112
+ * call-seq:
113
+ * buffer.put_string(str) -> ByteBuffer
114
+ *
115
+ * Writes the specified string to the byte buffer as a BSON string.
116
+ *
117
+ * Unlike #put_bytes, this method writes the provided byte string as
118
+ * a "BSON string" - the string is prefixed with its length and suffixed
119
+ * with a null byte. The byte string may contain null bytes itself thus
120
+ * the null terminator is redundant, but it is required by the BSON
121
+ * specification.
122
+ *
123
+ * +str+ must either already be in UTF-8 encoding or be a string encodable
124
+ * to UTF-8. In particular, a string in BINARY/ASCII-8BIT encoding is
125
+ * generally not suitable for this method. +EncodingError+ will be raised
126
+ * if +str+ cannot be encoded in UTF-8, or if +str+ claims to be encoded in
127
+ * UTF-8 but contains bytes/byte sequences which are not valid in UTF-8.
128
+ * Use #put_bytes to write arbitrary byte strings to the buffer.
129
+ *
130
+ * Returns the modified +self+.
131
+ */
132
+ rb_define_method(rb_byte_buffer_class, "put_string", rb_bson_byte_buffer_put_string, 1);
133
+
134
+ /**
135
+ * call-seq:
136
+ * buffer.put_cstring(obj) -> ByteBuffer
137
+ *
138
+ * Converts +obj+ to a string, which must not contain any null bytes, and
139
+ * which must be valid UTF-8, and writes the string to the buffer as a
140
+ * BSON cstring. +obj+ can be an instance of String, Symbol or Fixnum.
141
+ *
142
+ * If the string serialization of +obj+ contains null bytes, this method
143
+ * raises +ArgumentError+. If +obj+ is of an unsupported type, this method
144
+ * raises +TypeError+.
145
+ *
146
+ * BSON cstring serialization contains no length of the string (relying
147
+ * instead on the null terminator), unlike the BSON string serialization.
148
+ */
149
+ rb_define_method(rb_byte_buffer_class, "put_cstring", rb_bson_byte_buffer_put_cstring, 1);
150
+
151
+ /**
152
+ * call-seq:
153
+ * buffer.put_symbol(sym) -> ByteBuffer
154
+ *
155
+ * Converts +sym+ to a string and writes the resulting string to the byte
156
+ * buffer.
157
+ *
158
+ * The symbol may contain null bytes.
159
+ *
160
+ * The symbol value is assumed to be encoded in UTF-8. If the symbol value
161
+ * contains bytes or byte sequences that are not valid in UTF-8, this method
162
+ * raises +EncodingError+.
163
+ *
164
+ * Note: due to the string conversion, a symbol written to the buffer becomes
165
+ * indistinguishable from a string with the same value written to the buffer.
166
+ */
167
+ rb_define_method(rb_byte_buffer_class, "put_symbol", rb_bson_byte_buffer_put_symbol, 1);
168
+
169
+ /*
170
+ * call-seq:
171
+ * buffer.put_int32(fixnum) -> ByteBuffer
172
+ *
173
+ * Writes a 32-bit integer value to the buffer.
174
+ *
175
+ * If the argument cannot be represented in 32 bits, raises RangeError.
176
+ *
177
+ * Returns the modified +self+.
178
+ */
179
+ rb_define_method(rb_byte_buffer_class, "put_int32", rb_bson_byte_buffer_put_int32, 1);
180
+
181
+ /*
182
+ * call-seq:
183
+ * buffer.put_int64(fixnum) -> ByteBuffer
184
+ *
185
+ * Writes a 64-bit integer value to the buffer.
186
+ *
187
+ * If the argument cannot be represented in 64 bits, raises RangeError.
188
+ *
189
+ * Returns the modified +self+.
190
+ */
191
+ rb_define_method(rb_byte_buffer_class, "put_int64", rb_bson_byte_buffer_put_int64, 1);
192
+
193
+ /*
194
+ * call-seq:
195
+ * buffer.put_double(double) -> ByteBuffer
196
+ *
197
+ * Writes a 64-bit floating point value to the buffer.
198
+ *
199
+ * Returns the modified +self+.
200
+ */
201
+ rb_define_method(rb_byte_buffer_class, "put_double", rb_bson_byte_buffer_put_double, 1);
202
+
203
+ /*
204
+ * call-seq:
205
+ * buffer.put_decimal128(low_64bit, high_64bit) -> ByteBuffer
206
+ *
207
+ * Writes a 128-bit Decimal128 value to the buffer.
208
+ *
209
+ * +low_64bit+ and +high_64bit+ are Fixnum objects containing the low and
210
+ * the high parts of the 128-bit Decimal128 value, respectively.
211
+ *
212
+ * Returns the modified +self+.
213
+ */
214
+ rb_define_method(rb_byte_buffer_class, "put_decimal128", rb_bson_byte_buffer_put_decimal128, 2);
215
+
216
+ /*
217
+ * call-seq:
218
+ * buffer.put_hash(hash) -> ByteBuffer
219
+ *
220
+ * Writes a Hash into the byte buffer.
221
+ *
222
+ * Returns the modified +self+.
223
+ */
224
+ rb_define_method(rb_byte_buffer_class, "put_hash", rb_bson_byte_buffer_put_hash, 2);
225
+
226
+ /*
227
+ * call-seq:
228
+ * buffer.put_array(array) -> ByteBuffer
229
+ *
230
+ * Writes an Array into the byte buffer.
231
+ *
232
+ * Returns the modified +self+.
233
+ */
234
+ rb_define_method(rb_byte_buffer_class, "put_array", rb_bson_byte_buffer_put_array, 2);
235
+
236
+ /*
237
+ * call-seq:
238
+ * buffer.replace_int32(position, fixnum) -> ByteBuffer
239
+ *
240
+ * Replaces a 32-bit integer value at the specified position in the buffer.
241
+ *
242
+ * The position must be a non-negative integer, and must be completely
243
+ * contained within the data already written. For example, if the buffer has
244
+ * the write position of 12, the acceptable range of positions for this
245
+ * method is 0..8.
246
+ *
247
+ * If the argument cannot be represented in 32 bits, raises RangeError.
248
+ *
249
+ * Returns the modified +self+.
250
+ */
251
+ rb_define_method(rb_byte_buffer_class, "replace_int32", rb_bson_byte_buffer_replace_int32, 2);
252
+
253
+ /*
254
+ * call-seq:
255
+ * buffer.rewind! -> ByteBuffer
256
+ *
257
+ * Resets the read position to the beginning of the byte buffer.
258
+ *
259
+ * Note: +rewind!+ does not change the buffer's write position.
260
+ *
261
+ * Returns the modified +self+.
262
+ */
263
+ rb_define_method(rb_byte_buffer_class, "rewind!", rb_bson_byte_buffer_rewind, 0);
264
+
265
+ /*
266
+ * call-seq:
267
+ * buffer.to_s -> String
268
+ *
269
+ * Returns the contents of the buffer as a binary string.
270
+ *
271
+ * Note: this method copies the buffer's contents into a newly allocated
272
+ * +String+ instance. It does not return a reference to the data stored in
273
+ * the buffer itself.
274
+ */
275
+ rb_define_method(rb_byte_buffer_class, "to_s", rb_bson_byte_buffer_to_s, 0);
276
+
277
+ rb_define_method(rb_bson_object_id_generator_class, "next_object_id", rb_bson_object_id_generator_next, -1);
278
+
279
+ // Get the object id machine id and hash it.
280
+ rb_require("digest/md5");
281
+ gethostname(rb_bson_machine_id, sizeof(rb_bson_machine_id));
282
+ rb_bson_machine_id[255] = '\0';
283
+ rb_bson_generate_machine_id(rb_md5_class, rb_bson_machine_id);
284
+
285
+ // Set the object id counter to a random number
286
+ rb_bson_object_id_counter = FIX2INT(rb_funcall(rb_mKernel, rb_intern("rand"), 1, INT2FIX(0x1000000)));
287
+
288
+ rb_bson_registry = rb_const_get(rb_bson_module, rb_intern("Registry"));
289
+ }
@@ -0,0 +1,230 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include <stdbool.h>
4
+ #include <unistd.h>
5
+ #include <assert.h>
6
+ #include "bson-native.h"
7
+
8
+ /**
9
+ * Taken from libbson.
10
+ */
11
+
12
#define BSON_ASSERT assert
#define BSON_INLINE


/*
 *--------------------------------------------------------------------------
 *
 * _bson_utf8_get_sequence --
 *
 *       Classify the lead byte found at @utf8: report how many bytes the
 *       UTF-8 sequence it starts occupies, and which bits of the lead
 *       byte carry code point data.
 *
 *       Lead byte pattern    length    mask
 *       0xxxxxxx             1         0x7F
 *       110xxxxx             2         0x1F
 *       1110xxxx             3         0x0F
 *       11110xxx             4         0x07
 *       anything else        0         0x00
 *
 *       All subsequent bytes of a sequence are masked against 0x3F by
 *       the caller.
 *
 * Returns:
 *       None.
 *
 * Side effects:
 *       @seq_length is set (0 when the byte cannot start a sequence).
 *       @first_mask is set.
 *
 *--------------------------------------------------------------------------
 */

static BSON_INLINE void
_bson_utf8_get_sequence (const char *utf8,    /* IN */
                         uint8_t *seq_length, /* OUT */
                         uint8_t *first_mask) /* OUT */
{
   /* One row per recognised lead byte class: a byte belongs to the class
    * when (byte & select) == expect. */
   static const struct {
      uint8_t select;
      uint8_t expect;
      uint8_t length;
      uint8_t mask;
   } lead_classes[] = {
      {0x80, 0x00, 1, 0x7F},
      {0xE0, 0xC0, 2, 0x1F},
      {0xF0, 0xE0, 3, 0x0F},
      {0xF8, 0xF0, 4, 0x07},
   };
   const unsigned char lead = *(const unsigned char *) utf8;
   uint8_t length = 0;
   uint8_t mask = 0;
   size_t k;

   for (k = 0; k < sizeof lead_classes / sizeof lead_classes[0]; k++) {
      if ((lead & lead_classes[k].select) == lead_classes[k].expect) {
         length = lead_classes[k].length;
         mask = lead_classes[k].mask;
         break;
      }
   }

   *seq_length = length;
   *first_mask = mask;
}
73
+
74
+
75
+ /*
76
+ *--------------------------------------------------------------------------
77
+ *
78
+ * bson_utf8_validate --
79
+ *
80
+ * Validates that @utf8 is a valid UTF-8 string. Note that we only
81
+ * support UTF-8 characters which have sequence length less than or equal
82
+ * to 4 bytes (RFC 3629).
83
+ *
84
+ * If @allow_null is true, then \0 is allowed within @utf8_len bytes
85
+ * of @utf8. Generally, this is bad practice since the main point of
86
+ * UTF-8 strings is that they can be used with strlen() and friends.
87
+ * However, some languages such as Python can send UTF-8 encoded
88
+ * strings with NUL's in them.
89
+ *
90
+ * Parameters:
91
+ * @utf8: A UTF-8 encoded string.
92
+ * @utf8_len: The length of @utf8 in bytes.
93
+ * @allow_null: If \0 is allowed within @utf8, exclusing trailing \0.
94
+ * @data_type: The data type being serialized.
95
+ *
96
+ * Returns:
97
+ * true if @utf8 is valid UTF-8. otherwise false.
98
+ *
99
+ * Side effects:
100
+ * None.
101
+ *
102
+ *--------------------------------------------------------------------------
103
+ */
104
+
105
+ void
106
+ rb_bson_utf8_validate (const char *utf8, /* IN */
107
+ size_t utf8_len, /* IN */
108
+ bool allow_null, /* IN */
109
+ const char *data_type) /* IN */
110
+ {
111
+ uint32_t c;
112
+ uint8_t first_mask;
113
+ uint8_t seq_length;
114
+ unsigned i;
115
+ unsigned j;
116
+ bool not_shortest_form;
117
+
118
+ BSON_ASSERT (utf8);
119
+
120
+ for (i = 0; i < utf8_len; i += seq_length) {
121
+ _bson_utf8_get_sequence (&utf8[i], &seq_length, &first_mask);
122
+
123
+ /*
124
+ * Ensure we have a valid multi-byte sequence length.
125
+ */
126
+ if (!seq_length) {
127
+ rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: bogus initial bits", data_type, utf8);
128
+ }
129
+
130
+ /*
131
+ * Ensure we have enough bytes left.
132
+ */
133
+ if ((utf8_len - i) < seq_length) {
134
+ rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: truncated multi-byte sequence", data_type, utf8);
135
+ }
136
+
137
+ /*
138
+ * Also calculate the next char as a unichar so we can
139
+ * check code ranges for non-shortest form.
140
+ */
141
+ c = utf8[i] & first_mask;
142
+
143
+ /*
144
+ * Check the high-bits for each additional sequence byte.
145
+ */
146
+ for (j = i + 1; j < (i + seq_length); j++) {
147
+ c = (c << 6) | (utf8[j] & 0x3F);
148
+ if ((utf8[j] & 0xC0) != 0x80) {
149
+ rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: bogus high bits for continuation byte", data_type, utf8);
150
+ }
151
+ }
152
+
153
+ /*
154
+ * Check for NULL bytes afterwards.
155
+ *
156
+ * Hint: if you want to optimize this function, starting here to do
157
+ * this in the same pass as the data above would probably be a good
158
+ * idea. You would add a branch into the inner loop, but save possibly
159
+ * on cache-line bouncing on larger strings. Just a thought.
160
+ */
161
+ if (!allow_null) {
162
+ for (j = 0; j < seq_length; j++) {
163
+ if (((i + j) > utf8_len) || !utf8[i + j]) {
164
+ rb_raise(rb_eArgError, "%s %s contains null bytes", data_type, utf8);
165
+ }
166
+ }
167
+ }
168
+
169
+ /*
170
+ * Code point won't fit in utf-16, not allowed.
171
+ */
172
+ if (c > 0x0010FFFF) {
173
+ rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: code point %"PRIu32" does not fit in UTF-16", data_type, utf8, c);
174
+ }
175
+
176
+ /*
177
+ * Byte is in reserved range for UTF-16 high-marks
178
+ * for surrogate pairs.
179
+ */
180
+ if ((c & 0xFFFFF800) == 0xD800) {
181
+ rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: byte is in surrogate pair reserved range", data_type, utf8);
182
+ }
183
+
184
+ /*
185
+ * Check non-shortest form unicode.
186
+ */
187
+ not_shortest_form = false;
188
+ switch (seq_length) {
189
+ case 1:
190
+ if (c <= 0x007F) {
191
+ continue;
192
+ }
193
+ not_shortest_form = true;
194
+
195
+ case 2:
196
+ if ((c >= 0x0080) && (c <= 0x07FF)) {
197
+ continue;
198
+ } else if (c == 0) {
199
+ /* Two-byte representation for NULL. */
200
+ if (!allow_null) {
201
+ rb_raise(rb_eArgError, "%s %s contains null bytes", data_type, utf8);
202
+ }
203
+ continue;
204
+ }
205
+ not_shortest_form = true;
206
+
207
+ case 3:
208
+ if (((c >= 0x0800) && (c <= 0x0FFF)) ||
209
+ ((c >= 0x1000) && (c <= 0xFFFF))) {
210
+ continue;
211
+ }
212
+ not_shortest_form = true;
213
+
214
+ case 4:
215
+ if (((c >= 0x10000) && (c <= 0x3FFFF)) ||
216
+ ((c >= 0x40000) && (c <= 0xFFFFF)) ||
217
+ ((c >= 0x100000) && (c <= 0x10FFFF))) {
218
+ continue;
219
+ }
220
+ not_shortest_form = true;
221
+
222
+ default:
223
+ not_shortest_form = true;
224
+ }
225
+
226
+ if (not_shortest_form) {
227
+ rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: not in shortest form", data_type, utf8);
228
+ }
229
+ }
230
+ }