RubyGems - bson - Versions diffs - 4.5.0 → 4.6.0 - Mend

bson 4.5.0 → 4.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

checksums.yaml +4 -4
checksums.yaml.gz.sig +0 -0
data.tar.gz.sig +0 -0
data/README.md +7 -5
data/ext/bson/{native-endian.h → bson-endian.h} +5 -99
data/ext/bson/bson-native.h +112 -0
data/ext/bson/bytebuf.c +133 -0
data/ext/bson/endian.c +116 -0
data/ext/bson/init.c +289 -0
data/ext/bson/libbson-utf8.c +230 -0
data/ext/bson/read.c +294 -0
data/ext/bson/util.c +55 -0
data/ext/bson/write.c +637 -0
data/lib/bson/document.rb +43 -1
data/lib/bson/hash.rb +11 -2
data/lib/bson/int32.rb +19 -13
data/lib/bson/int64.rb +19 -13
data/lib/bson/version.rb +1 -1
data/spec/bson/byte_buffer_read_spec.rb +141 -0
data/spec/bson/byte_buffer_spec.rb +14 -451
data/spec/bson/byte_buffer_write_spec.rb +758 -0
data/spec/bson/corpus_spec.rb +8 -5
data/spec/bson/document_spec.rb +29 -29
data/spec/bson/hash_spec.rb +65 -0
data/spec/bson/int32_spec.rb +21 -3
data/spec/bson/int64_spec.rb +22 -3
data/spec/bson/string_spec.rb +18 -0
data/spec/support/corpus-tests/array.json +8 -2
data/spec/support/shared_examples.rb +2 -4
data/spec/support/utils.rb +10 -0
metadata +74 -55
metadata.gz.sig +0 -0
data/ext/bson/bson_native.c +0 -1344

data/ext/bson/init.c ADDED

@@ -0,0 +1,289 @@
+/*
+ * Copyright (C) 2009-2019 MongoDB Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "bson-native.h"
+/**
+ * The counter for incrementing object ids.
+ *
+ * Init_bson_native seeds it with a random integer below 0x1000000.
+ */
+uint32_t rb_bson_object_id_counter;
+VALUE rb_bson_registry; /* BSON::Registry, looked up once in Init_bson_native. */
+VALUE rb_bson_illegal_key; /* BSON::String::IllegalKey, looked up once in Init_bson_native. */
+const rb_data_type_t rb_byte_buffer_data_type = { /* Typed-data descriptor; presumably wraps BSON::ByteBuffer instances - confirm against the allocator. */
+  "bson/byte_buffer",
+  { NULL, rb_bson_byte_buffer_free, rb_bson_byte_buffer_memsize } /* No mark callback; free and memsize callbacks are supplied. */
+};
+/**
+ * Initialize the bson_native extension.
+ *
+ * Defines BSON::ByteBuffer with its native methods, installs
+ * BSON::ObjectId::Generator#next_object_id, hands the host name and
+ * Digest::MD5 to rb_bson_generate_machine_id, seeds the object id counter
+ * with a random value and caches the BSON::Registry and
+ * BSON::String::IllegalKey constants in file-scope globals.
+ *
+ * BSON::ObjectId, BSON::ObjectId::Generator, BSON::String::IllegalKey and
+ * BSON::Registry are fetched with rb_const_get, so they must be resolvable
+ * when this function runs.
+ */
+void Init_bson_native(void)
+{
+  /* Zero-filled so that no indeterminate bytes are ever hashed. */
+  char rb_bson_machine_id[256] = { 0 };
+  VALUE rb_bson_module = rb_define_module("BSON");
+  /* Document-class: BSON::ByteBuffer
+   *
+   * Stores BSON-serialized data and provides efficient serialization and
+   * deserialization of common Ruby classes using native code.
+   */
+  VALUE rb_byte_buffer_class = rb_define_class_under(rb_bson_module, "ByteBuffer", rb_cObject);
+  VALUE rb_bson_object_id_class = rb_const_get(rb_bson_module, rb_intern("ObjectId"));
+  VALUE rb_bson_object_id_generator_class = rb_const_get(rb_bson_object_id_class, rb_intern("Generator"));
+  VALUE rb_digest_class;
+  VALUE rb_md5_class;
+  /* Load digest/md5 before resolving Digest::MD5, so the lookup below does
+   * not depend on some other file having required it first. */
+  rb_require("digest/md5");
+  rb_digest_class = rb_const_get(rb_cObject, rb_intern("Digest"));
+  rb_md5_class = rb_const_get(rb_digest_class, rb_intern("MD5"));
+  rb_bson_illegal_key = rb_const_get(rb_const_get(rb_bson_module, rb_intern("String")),rb_intern("IllegalKey"));
+  rb_define_alloc_func(rb_byte_buffer_class, rb_bson_byte_buffer_allocate);
+  /* Arity -1: the C function receives its arguments as (argc, argv, self). */
+  rb_define_method(rb_byte_buffer_class, "initialize", rb_bson_byte_buffer_initialize, -1);
+  rb_define_method(rb_byte_buffer_class, "length", rb_bson_byte_buffer_length, 0);
+  /*
+   * call-seq:
+   *   buffer.read_position -> Fixnum
+   *
+   * Returns the read position in the buffer.
+   */
+  rb_define_method(rb_byte_buffer_class, "read_position", rb_bson_byte_buffer_read_position, 0);
+  rb_define_method(rb_byte_buffer_class, "get_byte", rb_bson_byte_buffer_get_byte, 0);
+  rb_define_method(rb_byte_buffer_class, "get_bytes", rb_bson_byte_buffer_get_bytes, 1);
+  rb_define_method(rb_byte_buffer_class, "get_cstring", rb_bson_byte_buffer_get_cstring, 0);
+  rb_define_method(rb_byte_buffer_class, "get_decimal128_bytes", rb_bson_byte_buffer_get_decimal128_bytes, 0);
+  rb_define_method(rb_byte_buffer_class, "get_double", rb_bson_byte_buffer_get_double, 0);
+  rb_define_method(rb_byte_buffer_class, "get_hash", rb_bson_byte_buffer_get_hash, 0);
+  rb_define_method(rb_byte_buffer_class, "get_array", rb_bson_byte_buffer_get_array, 0);
+  rb_define_method(rb_byte_buffer_class, "get_int32", rb_bson_byte_buffer_get_int32, 0);
+  rb_define_method(rb_byte_buffer_class, "get_int64", rb_bson_byte_buffer_get_int64, 0);
+  rb_define_method(rb_byte_buffer_class, "get_string", rb_bson_byte_buffer_get_string, 0);
+  /*
+   * call-seq:
+   *   buffer.write_position -> Fixnum
+   *
+   * Returns the write position in the buffer.
+   */
+  rb_define_method(rb_byte_buffer_class, "write_position", rb_bson_byte_buffer_write_position, 0);
+  /*
+   * call-seq:
+   *   buffer.put_byte(binary_str) -> ByteBuffer
+   *
+   * Writes the specified byte string, which must be of length 1,
+   * to the byte buffer.
+   *
+   * Returns the modified +self+.
+   */
+  rb_define_method(rb_byte_buffer_class, "put_byte", rb_bson_byte_buffer_put_byte, 1);
+  /*
+   * call-seq:
+   *   buffer.put_bytes(binary_str) -> ByteBuffer
+   *
+   * Writes the specified byte string to the byte buffer.
+   *
+   * This method writes exactly the provided byte string - in particular, it
+   * does not prepend the length, and does not append a null byte at the end.
+   *
+   * Returns the modified +self+.
+   */
+  rb_define_method(rb_byte_buffer_class, "put_bytes", rb_bson_byte_buffer_put_bytes, 1);
+  /*
+   * call-seq:
+   *   buffer.put_string(str) -> ByteBuffer
+   *
+   * Writes the specified string to the byte buffer as a BSON string.
+   *
+   * Unlike #put_bytes, this method writes the provided byte string as
+   * a "BSON string" - the string is prefixed with its length and suffixed
+   * with a null byte. The byte string may contain null bytes itself thus
+   * the null terminator is redundant, but it is required by the BSON
+   * specification.
+   *
+   * +str+ must either already be in UTF-8 encoding or be a string encodable
+   * to UTF-8. In particular, a string in BINARY/ASCII-8BIT encoding is
+   * generally not suitable for this method. +EncodingError+ will be raised
+   * if +str+ cannot be encoded in UTF-8, or if +str+ claims to be encoded in
+   * UTF-8 but contains bytes/byte sequences which are not valid in UTF-8.
+   * Use #put_bytes to write arbitrary byte strings to the buffer.
+   *
+   * Returns the modified +self+.
+   */
+  rb_define_method(rb_byte_buffer_class, "put_string", rb_bson_byte_buffer_put_string, 1);
+  /*
+   * call-seq:
+   *   buffer.put_cstring(obj) -> ByteBuffer
+   *
+   * Converts +obj+ to a string, which must not contain any null bytes, and
+   * which must be valid UTF-8, and writes the string to the buffer as a
+   * BSON cstring. +obj+ can be an instance of String, Symbol or Fixnum.
+   *
+   * If the string serialization of +obj+ contains null bytes, this method
+   * raises +ArgumentError+. If +obj+ is of an unsupported type, this method
+   * raises +TypeError+.
+   *
+   * BSON cstring serialization contains no length of the string (relying
+   * instead on the null terminator), unlike the BSON string serialization.
+   */
+  rb_define_method(rb_byte_buffer_class, "put_cstring", rb_bson_byte_buffer_put_cstring, 1);
+  /*
+   * call-seq:
+   *   buffer.put_symbol(sym) -> ByteBuffer
+   *
+   * Converts +sym+ to a string and writes the resulting string to the byte
+   * buffer.
+   *
+   * The symbol may contain null bytes.
+   *
+   * The symbol value is assumed to be encoded in UTF-8. If the symbol value
+   * contains bytes or byte sequences that are not valid in UTF-8, this method
+   * raises +EncodingError+.
+   *
+   * Note: due to the string conversion, a symbol written to the buffer becomes
+   * indistinguishable from a string with the same value written to the buffer.
+   */
+  rb_define_method(rb_byte_buffer_class, "put_symbol", rb_bson_byte_buffer_put_symbol, 1);
+  /*
+   * call-seq:
+   *   buffer.put_int32(fixnum) -> ByteBuffer
+   *
+   * Writes a 32-bit integer value to the buffer.
+   *
+   * If the argument cannot be represented in 32 bits, raises RangeError.
+   *
+   * Returns the modified +self+.
+   */
+  rb_define_method(rb_byte_buffer_class, "put_int32", rb_bson_byte_buffer_put_int32, 1);
+  /*
+   * call-seq:
+   *   buffer.put_int64(fixnum) -> ByteBuffer
+   *
+   * Writes a 64-bit integer value to the buffer.
+   *
+   * If the argument cannot be represented in 64 bits, raises RangeError.
+   *
+   * Returns the modified +self+.
+   */
+  rb_define_method(rb_byte_buffer_class, "put_int64", rb_bson_byte_buffer_put_int64, 1);
+  /*
+   * call-seq:
+   *   buffer.put_double(double) -> ByteBuffer
+   *
+   * Writes a 64-bit floating point value to the buffer.
+   *
+   * Returns the modified +self+.
+   */
+  rb_define_method(rb_byte_buffer_class, "put_double", rb_bson_byte_buffer_put_double, 1);
+  /*
+   * call-seq:
+   *   buffer.put_decimal128(low_64bit, high_64bit) -> ByteBuffer
+   *
+   * Writes a 128-bit Decimal128 value to the buffer.
+   *
+   * +low_64bit+ and +high_64bit+ are Fixnum objects containing the low and
+   * the high parts of the 128-bit Decimal128 value, respectively.
+   *
+   * Returns the modified +self+.
+   */
+  rb_define_method(rb_byte_buffer_class, "put_decimal128", rb_bson_byte_buffer_put_decimal128, 2);
+  /*
+   * call-seq:
+   *   buffer.put_hash(hash, validating_keys) -> ByteBuffer
+   *
+   * Writes a Hash into the byte buffer.
+   *
+   * NOTE(review): the method is registered with two arguments; the second
+   * is presumably a key-validation flag - confirm its name and meaning
+   * against rb_bson_byte_buffer_put_hash.
+   *
+   * Returns the modified +self+.
+   */
+  rb_define_method(rb_byte_buffer_class, "put_hash", rb_bson_byte_buffer_put_hash, 2);
+  /*
+   * call-seq:
+   *   buffer.put_array(array, validating_keys) -> ByteBuffer
+   *
+   * Writes an Array into the byte buffer.
+   *
+   * NOTE(review): the method is registered with two arguments; the second
+   * is presumably a key-validation flag - confirm its name and meaning
+   * against rb_bson_byte_buffer_put_array.
+   *
+   * Returns the modified +self+.
+   */
+  rb_define_method(rb_byte_buffer_class, "put_array", rb_bson_byte_buffer_put_array, 2);
+  /*
+   * call-seq:
+   *   buffer.replace_int32(position, fixnum) -> ByteBuffer
+   *
+   * Replaces a 32-bit integer value at the specified position in the buffer.
+   *
+   * The position must be a non-negative integer, and must be completely
+   * contained within the data already written. For example, if the buffer has
+   * the write position of 12, the acceptable range of positions for this
+   * method is 0..8.
+   *
+   * If the argument cannot be represented in 32 bits, raises RangeError.
+   *
+   * Returns the modified +self+.
+   */
+  rb_define_method(rb_byte_buffer_class, "replace_int32", rb_bson_byte_buffer_replace_int32, 2);
+  /*
+   * call-seq:
+   *   buffer.rewind! -> ByteBuffer
+   *
+   * Resets the read position to the beginning of the byte buffer.
+   *
+   * Note: +rewind!+ does not change the buffer's write position.
+   *
+   * Returns the modified +self+.
+   */
+  rb_define_method(rb_byte_buffer_class, "rewind!", rb_bson_byte_buffer_rewind, 0);
+  /*
+   * call-seq:
+   *   buffer.to_s -> String
+   *
+   * Returns the contents of the buffer as a binary string.
+   *
+   * Note: this method copies the buffer's contents into a newly allocated
+   * +String+ instance. It does not return a reference to the data stored in
+   * the buffer itself.
+   */
+  rb_define_method(rb_byte_buffer_class, "to_s", rb_bson_byte_buffer_to_s, 0);
+  rb_define_method(rb_bson_object_id_generator_class, "next_object_id", rb_bson_object_id_generator_next, -1);
+  // Get the object id machine id and hash it.
+  if (gethostname(rb_bson_machine_id, sizeof(rb_bson_machine_id)) != 0) {
+    /* The buffer contents are unspecified after a failure; use an empty
+     * host name rather than hashing whatever was left behind. */
+    rb_bson_machine_id[0] = '\0';
+  }
+  /* gethostname is not required to null-terminate a truncated name. */
+  rb_bson_machine_id[sizeof(rb_bson_machine_id) - 1] = '\0';
+  rb_bson_generate_machine_id(rb_md5_class, rb_bson_machine_id);
+  // Set the object id counter to a random number
+  rb_bson_object_id_counter = FIX2INT(rb_funcall(rb_mKernel, rb_intern("rand"), 1, INT2FIX(0x1000000)));
+  rb_bson_registry = rb_const_get(rb_bson_module, rb_intern("Registry"));
+}

data/ext/bson/libbson-utf8.c ADDED

@@ -0,0 +1,230 @@
+#include <ruby.h>
+#include <ruby/encoding.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <assert.h>
+#include "bson-native.h"
+/**
+ * Taken from libbson.
+ */
+#define BSON_ASSERT assert
+#define BSON_INLINE
+/*
+ *--------------------------------------------------------------------------
+ *
+ * _bson_utf8_get_sequence --
+ *
+ *       Classify the lead byte of the UTF-8 character at @utf8. The number
+ *       of bytes the character occupies is written to @seq_length and the
+ *       mask that extracts the payload bits of the lead byte is written
+ *       to @first_mask. Both are set to 0 when the byte is not a valid
+ *       lead byte.
+ *
+ * Returns:
+ *       None.
+ *
+ * Side effects:
+ *       @seq_length is set.
+ *       @first_mask is set.
+ *
+ *--------------------------------------------------------------------------
+ */
+static BSON_INLINE void
+_bson_utf8_get_sequence (const char *utf8,    /* IN */
+                         uint8_t *seq_length, /* OUT */
+                         uint8_t *first_mask) /* OUT */
+{
+   const unsigned char lead = (unsigned char) *utf8;
+   /* Defaults describe an invalid lead byte. */
+   uint8_t length = 0;
+   uint8_t mask = 0;
+   /*
+    * The high bits of the lead byte give the sequence length:
+    *   0xxxxxxx -> 1 byte,  110xxxxx -> 2 bytes,
+    *   1110xxxx -> 3 bytes, 11110xxx -> 4 bytes.
+    * The mask keeps only the payload (x) bits of the lead byte; every
+    * following byte is masked against 0x3F by the caller.
+    *
+    * See http://www.joelonsoftware.com/articles/Unicode.html
+    */
+   if (lead < 0x80) {
+      length = 1;
+      mask = 0x7F;
+   } else if ((lead >> 5) == 0x06) {
+      length = 2;
+      mask = 0x1F;
+   } else if ((lead >> 4) == 0x0E) {
+      length = 3;
+      mask = 0x0F;
+   } else if ((lead >> 3) == 0x1E) {
+      length = 4;
+      mask = 0x07;
+   }
+   *seq_length = length;
+   *first_mask = mask;
+}
+/*
+ *--------------------------------------------------------------------------
+ *
+ * rb_bson_utf8_validate --
+ *
+ *       Validates that @utf8 is a valid UTF-8 string and raises a Ruby
+ *       exception at the first violation found. Note that we only
+ *       support UTF-8 characters which have sequence length less than or equal
+ *       to 4 bytes (RFC 3629).
+ *
+ *       If @allow_null is true, then \0 is allowed within @utf8_len bytes
+ *       of @utf8.  Generally, this is bad practice since the main point of
+ *       UTF-8 strings is that they can be used with strlen() and friends.
+ *       However, some languages such as Python can send UTF-8 encoded
+ *       strings with NUL's in them.
+ *
+ * Parameters:
+ *       @utf8: A UTF-8 encoded string.
+ *       @utf8_len: The length of @utf8 in bytes.
+ *       @allow_null: If \0 is allowed within @utf8, excluding trailing \0.
+ *       @data_type: The data type being serialized; it is the first word
+ *                   of every error message.
+ *
+ * Returns:
+ *       None. Returning normally means @utf8 passed validation.
+ *
+ * Side effects:
+ *       Raises EncodingError if @utf8 is not valid UTF-8.
+ *       Raises ArgumentError if @utf8 contains a null byte (including the
+ *       two-byte encoding of NULL) and @allow_null is false.
+ *
+ * NOTE(review): the error messages format @utf8 with %s, which reads up to
+ * the first \0 rather than @utf8_len bytes - confirm that callers always
+ * pass a null-terminated buffer.
+ *
+ *--------------------------------------------------------------------------
+ */
+void
+rb_bson_utf8_validate (const char *utf8, /* IN */
+                    size_t utf8_len,  /* IN */
+                    bool allow_null, /* IN */
+                    const char *data_type)  /* IN */
+{
+   /* Unsigned view of the input so byte values do not depend on the
+    * signedness of plain char. */
+   const unsigned char *bytes = (const unsigned char *) utf8;
+   uint32_t c;
+   uint8_t first_mask;
+   uint8_t seq_length;
+   /* size_t indices: an unsigned int would wrap before reaching the end of
+    * a buffer longer than UINT_MAX bytes. */
+   size_t i;
+   size_t j;
+   bool shortest_form;
+   BSON_ASSERT (utf8);
+   for (i = 0; i < utf8_len; i += seq_length) {
+      _bson_utf8_get_sequence (&utf8[i], &seq_length, &first_mask);
+      /*
+       * Ensure we have a valid multi-byte sequence length.
+       */
+      if (!seq_length) {
+         rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: bogus initial bits", data_type, utf8);
+      }
+      /*
+       * Ensure we have enough bytes left. Past this point every index in
+       * [i, i + seq_length) is inside the buffer.
+       */
+      if ((utf8_len - i) < seq_length) {
+         rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: truncated multi-byte sequence", data_type, utf8);
+      }
+      /*
+       * Also calculate the next char as a unichar so we can
+       * check code ranges for non-shortest form.
+       */
+      c = bytes[i] & first_mask;
+      /*
+       * Check the high-bits for each additional sequence byte.
+       */
+      for (j = i + 1; j < (i + seq_length); j++) {
+         if ((bytes[j] & 0xC0) != 0x80) {
+            rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: bogus high bits for continuation byte", data_type, utf8);
+         }
+         c = (c << 6) | (bytes[j] & 0x3F);
+      }
+      /*
+       * Check for NULL bytes afterwards.
+       *
+       * Hint: if you want to optimize this function, starting here to do
+       * this in the same pass as the data above would probably be a good
+       * idea. You would add a branch into the inner loop, but save possibly
+       * on cache-line bouncing on larger strings. Just a thought.
+       */
+      if (!allow_null) {
+         for (j = i; j < (i + seq_length); j++) {
+            if (!bytes[j]) {
+               rb_raise(rb_eArgError, "%s %s contains null bytes", data_type, utf8);
+            }
+         }
+      }
+      /*
+       * Code point won't fit in utf-16, not allowed.
+       */
+      if (c > 0x0010FFFF) {
+         rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: code point %"PRIu32" does not fit in UTF-16", data_type, utf8, c);
+      }
+      /*
+       * Byte is in reserved range for UTF-16 high-marks
+       * for surrogate pairs.
+       */
+      if ((c & 0xFFFFF800) == 0xD800) {
+         rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: byte is in surrogate pair reserved range", data_type, utf8);
+      }
+      /*
+       * Check non-shortest form unicode: each sequence length may only
+       * encode the code points that do not fit in a shorter sequence.
+       */
+      switch (seq_length) {
+      case 1:
+         shortest_form = (c <= 0x007F);
+         break;
+      case 2:
+         if (c == 0) {
+            /* Two-byte representation for NULL. */
+            if (!allow_null) {
+               rb_raise(rb_eArgError, "%s %s contains null bytes", data_type, utf8);
+            }
+            shortest_form = true;
+         } else {
+            shortest_form = (c >= 0x0080) && (c <= 0x07FF);
+         }
+         break;
+      case 3:
+         shortest_form = (c >= 0x0800) && (c <= 0xFFFF);
+         break;
+      case 4:
+         shortest_form = (c >= 0x10000) && (c <= 0x10FFFF);
+         break;
+      default:
+         shortest_form = false;
+         break;
+      }
+      if (!shortest_form) {
+         rb_raise(rb_eEncodingError, "%s %s is not valid UTF-8: not in shortest form", data_type, utf8);
+      }
+   }
+}