mochilo 1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ OTMzOGM2MTMxMWI0YjZlNjZlMzhjNGU2ZDUyOWJmMWQyNGQ1OWJmZg==
5
+ data.tar.gz: !binary |-
6
+ YjdjODY4NDYyNWRhNGE2ODBiMTU4N2YwM2MwN2I0YWEyNzc0MjE0MA==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ MTQ5OTExMDNiMTM1MDVkODVlYTA3MDgzNGUwYzU5MGQ5OWE4MDBmY2E1Y2I5
10
+ NWNhOGU3MzVjOTVhYzIwZGZkMTBmYTNkZDk5Y2Y4NWI3YWE4NWEwZjljMGNk
11
+ ODhiY2MxNTY2ZjdlMjhmNmI2MTE5M2E1ZmY1MTExNjQ0M2Q5YTg=
12
+ data.tar.gz: !binary |-
13
+ NWNjMzNmMDU1YTM2ZGMwZGRhZWFhZDk0NzQ5NmYxNzkzMTcwNWNjOGY5OWUy
14
+ MTliZmJkYWU1ZGYxOThlNzJhMjc4NDAzNDY2M2E0ZGNhZmM1Y2YzMzA5ODFi
15
+ OGYwZDA4YTQzNmNiNzMxNmY1YjZmYjNmOTczYmU4MzdhOTBmNDk=
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ *.dSYM
2
+ *.so
3
+ *.bundle
4
+ /tmp
5
+ /bin
6
+ /vendor/gems
7
+ /.bundle
8
+ /.rbenv-version
9
+ /vendor/cache/*.gem
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
4
+
5
+ gem "pry"
data/Gemfile.lock ADDED
@@ -0,0 +1,30 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ mochilo (1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ coderay (1.0.9)
10
+ method_source (0.8.1)
11
+ minitest (5.0.6)
12
+ msgpack (0.5.5)
13
+ pry (0.9.12.2)
14
+ coderay (~> 1.0.5)
15
+ method_source (~> 0.8)
16
+ slop (~> 3.4)
17
+ rake (10.1.0)
18
+ rake-compiler (0.8.3)
19
+ rake
20
+ slop (3.4.5)
21
+
22
+ PLATFORMS
23
+ ruby
24
+
25
+ DEPENDENCIES
26
+ minitest (>= 4.1.0)
27
+ mochilo!
28
+ msgpack
29
+ pry
30
+ rake-compiler (>= 0.8.1)
data/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # Mochilo
2
+
3
+ Mochilo is a Ruby library implementing the BananaPack protocol. BananaPack is a superset of MessagePack. It adds three new types to the protocol: Symbol and String in 16 and 32 bit lengths.
4
+
5
+ The Symbol type is a String of text composed of ASCII characters with a maximum length of (2^8)-1 bytes.
6
+
7
+ The String16 and String32 types are exactly the same as the [Raw16](http://wiki.msgpack.org/display/MSGPACK/Format+specification#Formatspecification-raw16) and [Raw32](http://wiki.msgpack.org/display/MSGPACK/Format+specification#Formatspecification-raw32) types from MessagePack except there is an encoding flag stored along with the bytes. This allows the format to differentiate binary from text.
8
+
9
+ Check out docs/format-spec.md for more detailed information on the differences between BananaPack and MessagePack.
10
+
11
+ ## Usage
12
+
13
+ ``` ruby
14
+ require 'mochilo'
15
+ obj = {key: "value"}
16
+ bpack = Mochilo.pack(obj)
17
+ #=> "\x81\xD8\x00\x03\x01key\xD8\x00\x05\x00value"
18
+
19
+ hash = Mochilo.unpack(bpack)
20
+ #=> {"key"=>"value"}
21
+ ```
22
+
23
+ Notice how `key` came back into Ruby as a String instead of a Symbol? This is because the `pack` method of Mochilo only generates "safe" bpack.
24
+
25
+ bpack without Symbols is considered "safe" for Ruby because in Ruby, Symbols aren't garbage collected. So if you unpack arbitrary bpack from an untrusted or malicious source, you could end up potentially exhausting the memory of your server.
26
+
27
+ To generate "unsafe" bpack, use `pack_unsafe` and `unpack_unsafe` methods instead:
28
+
29
+ ``` ruby
30
+ require 'mochilo'
31
+ obj = {key: "value"}
32
+ bpack = Mochilo.pack_unsafe(obj)
33
+ #=> "\x81\xD4\x03key\xD8\x00\x05\x00value"
34
+
35
+ hash = Mochilo.unpack_unsafe(bpack)
36
+ #=> {:key=>"value"}
37
+ ```
38
+
39
+ If you attempt to unpack "unsafe" bpack without using `unpack_unsafe`, an exception is raised.
40
+
41
+ ## Supported Ruby Types
42
+
43
+ The following Ruby types are supported, meaning they will be deserialized into the same Ruby type they had before serialization.
44
+
45
+ If any other object type is encountered during serialization, an exception is raised. This is to ensure you have explicit control over what is being serialized.
46
+
47
+ * Fixnum
48
+ * Bignum
49
+ * Float
50
+ * Symbol
51
+ * String (with encoding)
52
+ * nil
53
+ * true
54
+ * false
55
+ * Array
56
+ * Hash
data/Rakefile ADDED
@@ -0,0 +1,33 @@
1
require 'date'
require 'rake/clean'
require 'rake/extensiontask'
require 'digest/md5'

task :default => :compile

# ==========================================================
# Ruby Extension
# ==========================================================
# Builds the `mochilo` C extension into lib/mochilo, with cross-compilation
# enabled for the listed Windows platforms.
Rake::ExtensionTask.new('mochilo') do |ext|
  ext.cross_compile = true
  ext.cross_platform = ['x86-mingw32', 'x86-mswin32-60']

  ext.lib_dir = File.join 'lib', 'mochilo'
end

# NOTE: the former `-rubygems` switch is gone on purpose. It loads
# `ubygems.rb`, which no longer ships with Ruby 2.5+, and RubyGems is
# loaded by default on every Ruby since 1.9.
desc "Open an irb session preloaded with Mochilo"
task :console do
  sh "irb -I lib -r ./lib/mochilo"
end

require 'rake/testtask'
Rake::TestTask.new('test') do |t|
  t.test_files = FileList['test/*_test.rb']
end
# The tests exercise the native extension, so compile it first.
task 'test' => [:compile]


desc "Regenerate ext/mochilo/encodings.h from genperf.rb via gperf"
task :encodings do
  sh "ruby genperf.rb | gperf > ./ext/mochilo/encodings.h"
end
@@ -0,0 +1,46 @@
1
+ ## String
2
+
3
+ The String type is nearly identical to the Raw type from MessagePack but is specifically
4
+ aimed at marking a set of bytes as text.
5
+
6
+ ### Format Specification
7
+
8
+ #### Symbol
9
+
10
+ For storing symbol names as ASCII text up to (2^8)-1 bytes.
11
+ Length is stored in unsigned 8-bit integer.
12
+
13
+ ```
14
+ +--------+--------+----------
15
+ | 0xd4 |XXXXXXXX|...N bytes
16
+ +--------+--------+----------
17
+ => XXXXXXXX (=N) bytes of raw bytes.
18
+ ```
19
+
20
+ #### String16
21
+
22
+ For storing text up to (2^16)-1 bytes.
23
+ Length is stored in unsigned 16-bit big-endian integer.
24
+ Encoding is stored as an int8
25
+
26
+ ```
27
+ +--------+--------+--------+--------+----------
28
+ | 0xd8 |XXXXXXXX|XXXXXXXX|YYYYYYYY|...N bytes
29
+ +--------+--------+--------+--------+----------
30
+ => XXXXXXXX_XXXXXXXX (=N) bytes of raw bytes.
31
+ => YYYYYYYY encoding flag
32
+ ```
33
+
34
+ #### String32
35
+
36
+ For storing text up to (2^32)-1 bytes.
37
+ Length is stored in unsigned 32-bit big-endian integer.
38
+ Encoding is stored as an int8
39
+
40
+ ```
41
+ +--------+--------+--------+--------+--------+--------+----------
42
+ | 0xd9 |XXXXXXXX|XXXXXXXX|XXXXXXXX|XXXXXXXX|YYYYYYYY|...N bytes
43
+ +--------+--------+--------+--------+--------+--------+----------
44
+ => XXXXXXXX_XXXXXXXX_XXXXXXXX_XXXXXXXX (=N) bytes of raw bytes.
45
+ => YYYYYYYY encoding flag
46
+ ```
@@ -0,0 +1,25 @@
1
+ # Why BananaPack?
2
+
3
+ It's actually quite simple. We need to be able to serialize data while being able to differentiate between text and binary.
4
+
5
+ There are many, many other serialization formats out there. Some require a schema be defined up-front, while others can be parsed with no prior knowledge of the structure of the serialized data. Some are extremely verbose and harder to parse while others are compact and efficient to parse.
6
+
7
+ A couple of them that are comparable to BananaPack are MessagePack and JSON. To help explain why we created BananaPack, let's answer a few questions about those two first.
8
+
9
+ ## Why not JSON?
10
+
11
+ JSON has the basic principles of a serialized format that we want. It's only capable of representing primitive data types in common with almost every language. That makes it great for cross-language communication.
12
+
13
+ The problem is that it's just text. Unicode text to be precise. That means you can't serialize arbitrary binary data without encoding it first. Common practice is to Base64 encode the data first. But then the deserialization side *has* to know to decode it after parsing. There's no need for hacks like this.
14
+
15
+ ## Why not MessagePack?
16
+
17
+ MessagePack is nearly perfect for what we want. But text and binary share a single type in the spec, "Raw". That means when deserializing the data you could be reading the contents of a Markdown file, or the contents of a JPEG. There are countless hacks to get around this but there shouldn't be a need for them either.
18
+
19
+ ## BananaPack
20
+
21
+ We wanted something compact and easy to parse. But it also needed to have native support for differentiating between text and binary data.
22
+
23
+ Knowing what encoding text is in is just as important as knowing the timezone of a timestamp. This has been true since the birth of the Internet.
24
+
25
+ So we used a couple of the "reserved" data type slots in the MessagePack spec to add a String16, String32 and Symbol type. String16 and String32 are exactly the same as the Raw16 and Raw32 types from the MessagePack spec except they have an encoding flag as well. This means that it's possible to serialize text data without hacks. This also means it's possible to serialize the contents of a JPEG without first needing to encode or escape it.
@@ -0,0 +1,141 @@
1
+ /*
2
+ * Copyright (C) 2012 GitHub, Inc
3
+ */
4
+ #include <stdarg.h>
5
+ #include <ctype.h>
6
+ #include "mochilo.h"
7
+
8
+ #define MOCHILO_CHUNK_SIZE 1024
9
+ #define MOCHILO_CHUNK_INIT 8
10
+
11
+ static mochilo_buf_chunk *init_cur_chunk(mochilo_buf *buf, size_t chunk_size)
12
+ {
13
+ mochilo_buf_chunk *chunk = &buf->chunks[buf->cur_chunk];
14
+
15
+ if (chunk_size <= MOCHILO_CHUNK_SIZE)
16
+ chunk_size = MOCHILO_CHUNK_SIZE;
17
+ else
18
+ chunk_size += MOCHILO_CHUNK_SIZE;
19
+
20
+ buf->last_alloc = chunk->ptr = malloc(chunk_size);
21
+ if (!chunk->ptr)
22
+ return NULL;
23
+
24
+ chunk->end = chunk->ptr + chunk_size;
25
+ return chunk;
26
+ }
27
+
28
+ static void skip_last_chunk(mochilo_buf *buf)
29
+ {
30
+ mochilo_buf_chunk *chunk = &buf->chunks[buf->cur_chunk];
31
+
32
+ buf->total_size += (chunk->ptr - buf->last_alloc);
33
+ buf->cur_chunk++;
34
+
35
+ chunk->end = chunk->ptr;
36
+ chunk->ptr = buf->last_alloc;
37
+ }
38
+
39
+ static void free_buf(mochilo_buf *buf)
40
+ {
41
+ uint16_t i;
42
+
43
+ for (i = 0; i < buf->cur_chunk; ++i)
44
+ free(buf->chunks[i].ptr);
45
+
46
+ free(buf->chunks);
47
+ }
48
+
49
+
50
+ void mochilo_buf_init(mochilo_buf *buf)
51
+ {
52
+ buf->chunks = malloc(MOCHILO_CHUNK_INIT * sizeof(mochilo_buf_chunk));
53
+ buf->total_size = 0;
54
+ buf->cur_chunk = 0;
55
+ buf->chunk_count = MOCHILO_CHUNK_INIT;
56
+
57
+ init_cur_chunk(buf, MOCHILO_CHUNK_SIZE);
58
+ }
59
+
60
+ VALUE mochilo_buf_flush(mochilo_buf *buf)
61
+ {
62
+ VALUE rb_str;
63
+ char *ptr;
64
+ uint16_t i;
65
+
66
+ skip_last_chunk(buf);
67
+
68
+ #ifdef RUBINIUS
69
+ char *alloc;
70
+ alloc = ptr = malloc(buf->total_size);
71
+ if (alloc == NULL)
72
+ rb_raise(rb_eNoMemError, "Failed to alloc temp buffer");
73
+ #else
74
+ rb_str = rb_str_new(NULL, buf->total_size);
75
+ ptr = RSTRING_PTR(rb_str);
76
+ #endif
77
+
78
+ for (i = 0; i < buf->cur_chunk; ++i) {
79
+ mochilo_buf_chunk *chunk = &buf->chunks[i];
80
+ size_t chunk_len = chunk->end - chunk->ptr;
81
+
82
+ memcpy(ptr, chunk->ptr, chunk_len);
83
+ ptr += chunk_len;
84
+ free(chunk->ptr);
85
+ }
86
+
87
+ free(buf->chunks);
88
+
89
+ #ifdef RUBINIUS
90
+ rb_str = rb_str_new(alloc, buf->total_size);
91
+ #endif
92
+
93
+ return rb_str;
94
+ }
95
+
96
+ mochilo_buf_chunk *mochilo_buf_rechunk2(mochilo_buf *buf, size_t chunk_size)
97
+ {
98
+ skip_last_chunk(buf);
99
+
100
+ if (buf->cur_chunk == buf->chunk_count) {
101
+ buf->chunk_count *= 2;
102
+
103
+ buf->chunks = realloc(buf->chunks, buf->chunk_count * sizeof(mochilo_buf_chunk));
104
+ if (!buf->chunks)
105
+ return NULL;
106
+
107
+ }
108
+
109
+ return init_cur_chunk(buf, chunk_size);
110
+ }
111
+
112
+ mochilo_buf_chunk *mochilo_buf_rechunk(mochilo_buf *buf)
113
+ {
114
+ return mochilo_buf_rechunk2(buf, MOCHILO_CHUNK_SIZE);
115
+ }
116
+
117
+ void mochilo_buf_put(mochilo_buf *buf, const char *data, size_t len)
118
+ {
119
+ mochilo_buf_chunk *chunk = &buf->chunks[buf->cur_chunk];
120
+
121
+ if (unlikely(chunk->ptr + len > chunk->end)) {
122
+ if (!(chunk = mochilo_buf_rechunk2(buf, len)))
123
+ return;
124
+ }
125
+
126
+ memmove(chunk->ptr, data, len);
127
+ chunk->ptr += len;
128
+ }
129
+
130
+ const char *mochilo_src_peek(mochilo_src *buf, size_t need)
131
+ {
132
+ const char *ptr;
133
+
134
+ if (unlikely(buf->ptr + need > buf->end))
135
+ return NULL;
136
+
137
+ ptr = buf->ptr;
138
+ buf->ptr += need;
139
+ return ptr;
140
+ }
141
+
@@ -0,0 +1,146 @@
1
+ #ifndef INCLUDE_buffer_h__
2
+ #define INCLUDE_buffer_h__
3
+
4
+ #include <string.h>
5
+ #include <stdlib.h>
6
+ #include <assert.h>
7
+ #include <stdarg.h>
8
+ #include <stdint.h>
9
+
10
/*
 * Branch-prediction hint: marks `x` as expected to be false.
 * Evaluates to 1 when `x` is non-zero and 0 otherwise. `!!` normalises the
 * argument so any scalar can be passed; compilers without
 * __builtin_expect simply get the plain truth value.
 */
#if defined(__GNUC__) || defined(__clang__)
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define unlikely(x) (!!(x))
#endif
11
+
12
+ /**
13
+ * Byteswap code
14
+ */
15
/* Copy the single byte at `buffer` to `out` (nothing to reorder). */
static inline void swap8(const uint8_t *buffer, void *out)
{
	*(unsigned char *)out = buffer[0];
}
20
+
21
/*
 * Write the 2 bytes at `buffer` to `out` in reverse order.
 * Both source bytes are read before anything is stored, so the result is
 * also correct when `out` and `buffer` refer to the same memory.
 */
static inline void swap16(const uint8_t *buffer, void *out)
{
	unsigned char *ptr = (unsigned char *)out;
	const uint8_t first = buffer[0];
	const uint8_t second = buffer[1];

	ptr[0] = second;
	ptr[1] = first;
}
27
+
28
/*
 * Write the 4 bytes at `buffer` to `out` in reverse order.
 * All source bytes are read before anything is stored, so the result is
 * also correct when `out` and `buffer` refer to the same memory.
 */
static inline void swap32(const uint8_t *buffer, void *out)
{
	unsigned char *ptr = (unsigned char *)out;
	const uint8_t b0 = buffer[0];
	const uint8_t b1 = buffer[1];
	const uint8_t b2 = buffer[2];
	const uint8_t b3 = buffer[3];

	ptr[0] = b3;
	ptr[1] = b2;
	ptr[2] = b1;
	ptr[3] = b0;
}
36
+
37
/*
 * Write the 8 bytes at `buffer` to `out` in reverse order.
 * The reversed bytes are staged in a local array first, so the result is
 * also correct when `out` and `buffer` refer to the same memory.
 */
static inline void swap64(const uint8_t *buffer, void *out)
{
	unsigned char *ptr = (unsigned char *)out;
	uint8_t reversed[8];
	unsigned int i;

	for (i = 0; i < 8; i++)
		reversed[i] = buffer[7 - i];

	for (i = 0; i < 8; i++)
		ptr[i] = reversed[i];
}
49
+
50
+ /**
51
+ * Buffer code
52
+ */
53
/*
 * One contiguous piece of an output buffer.
 *
 * While the chunk is being filled, `ptr` is the write cursor and `end` is
 * one past the last byte of the allocation. Once the chunk has been closed
 * (see skip_last_chunk), `ptr` is the start of the allocation and `end` is
 * one past the last byte written.
 */
typedef struct {
	char *ptr; /* write cursor, or start of data once closed */
	char *end; /* capacity limit, or end of data once closed */
} mochilo_buf_chunk;
57
+
58
+ typedef struct {
59
+ mochilo_buf_chunk *chunks;
60
+ char *last_alloc;
61
+ size_t total_size;
62
+ uint16_t chunk_count, cur_chunk;
63
+ } mochilo_buf;
64
+
65
/*
 * Read-only input being parsed.
 */
typedef struct {
	const char *ptr; /* read cursor: next byte to consume */
	const char *end; /* one past the last readable byte */
	int trusted;     /* NOTE(review): presumably set for the "unsafe" unpack path; confirm against the unpacker */
} mochilo_src;
70
+
71
+ void mochilo_buf_init(mochilo_buf *buf);
72
+ VALUE mochilo_buf_flush(mochilo_buf *buf);
73
+
74
+ mochilo_buf_chunk *mochilo_buf_rechunk(mochilo_buf *buf);
75
+ mochilo_buf_chunk *mochilo_buf_rechunk2(mochilo_buf *buf, size_t chunk_size);
76
+
77
+ void mochilo_buf_put(mochilo_buf *buf, const char *data, size_t len);
78
+
79
+ const char *mochilo_src_peek(mochilo_src *buf, size_t need);
80
+
81
/*
 * Declares a local `chunk` pointing at the current chunk of buffer `b` and
 * makes sure it has room for `d` more bytes, starting a new default-sized
 * chunk when it does not. If that fails, the ENCLOSING function returns
 * (it must therefore return void).
 *
 * Because it introduces a declaration, use it at most once per scope.
 * Arguments are parenthesised so any pointer expression can be passed, and
 * the capacity test compares sizes instead of forming `ptr + d`.
 */
#define BUF_ENSURE_AVAIL(b, d) \
	mochilo_buf_chunk *chunk = &(b)->chunks[(b)->cur_chunk]; \
	if (unlikely((size_t)(d) > (size_t)(chunk->end - chunk->ptr))) { \
		if ((chunk = mochilo_buf_rechunk((b))) == NULL) return; };
85
+
86
/*
 * Non-zero when at least `bytes` bytes remain between src->ptr and
 * src->end. `bytes` is compared with the remaining length (as size_t)
 * rather than added to the cursor, so a huge length cannot overflow the
 * pointer and pass the check.
 */
#define SRC_CHECK_AVAIL(src, bytes) \
	((size_t)(bytes) <= (size_t)((src)->end - (src)->ptr))
87
+
88
/*
 * Makes the ENCLOSING function return -1 when fewer than `bytes` bytes
 * remain between src->ptr and src->end. The length is compared with the
 * remaining size (as size_t) so a huge value cannot overflow the pointer.
 */
#define SRC_ENSURE_AVAIL(src, bytes) \
	if (unlikely((size_t)(bytes) > (size_t)((src)->end - (src)->ptr))) \
		return -1;
91
+
92
+ static inline void mochilo_buf_putc(mochilo_buf *buf, uint8_t c)
93
+ {
94
+ BUF_ENSURE_AVAIL(buf, 1);
95
+ *chunk->ptr = c;
96
+ chunk->ptr++;
97
+ }
98
+
99
+ static inline void mochilo_buf_put16be(mochilo_buf *buf, void *src16)
100
+ {
101
+ BUF_ENSURE_AVAIL(buf, 2);
102
+ swap16(src16, chunk->ptr);
103
+ chunk->ptr += 2;
104
+ }
105
+
106
+ static inline void mochilo_buf_put32be(mochilo_buf *buf, void *src32)
107
+ {
108
+ BUF_ENSURE_AVAIL(buf, 4);
109
+ swap32(src32, chunk->ptr);
110
+ chunk->ptr += 4;
111
+ }
112
+
113
+ static inline void mochilo_buf_put64be(mochilo_buf *buf, void *src64)
114
+ {
115
+ BUF_ENSURE_AVAIL(buf, 8);
116
+ swap64(src64, chunk->ptr);
117
+ chunk->ptr += 8;
118
+ }
119
+
120
+
121
+
122
+ static inline void mochilo_src_get8be(mochilo_src *buf, uint8_t *dst8)
123
+ {
124
+ *dst8 = *buf->ptr;
125
+ buf->ptr += 1;
126
+ }
127
+
128
+ static inline void mochilo_src_get16be(mochilo_src *buf, void *dst16)
129
+ {
130
+ swap16(buf->ptr, dst16);
131
+ buf->ptr += 2;
132
+ }
133
+
134
+ static inline void mochilo_src_get32be(mochilo_src *buf, void *dst32)
135
+ {
136
+ swap32(buf->ptr, dst32);
137
+ buf->ptr += 4;
138
+ }
139
+
140
+ static inline void mochilo_src_get64be(mochilo_src *buf, void *dst64)
141
+ {
142
+ swap64(buf->ptr, dst64);
143
+ buf->ptr += 8;
144
+ }
145
+
146
+ #endif