rejectu 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0b6567f2dd67415c2a817e5907895310439af685
4
+ data.tar.gz: 0dc65aa88bb81341421d8eabf548e05b6352a882
5
+ SHA512:
6
+ metadata.gz: af3e3a94fc5650dfe55669461efa16b42fc0bf617b3e08c8165152aaaebfe2345184a570fdd6e8b9278063b2d82f870a3893a54966d026640779154aed108849
7
+ data.tar.gz: a987555733a255e82adc2ff41fa6f4fa712b4db0e11f6ad0693bc7780c9c50da5e91ad409fa5c1ef8cdba3052dd5febd8bf565bac185b2a50a0eff1cc7487ba6
@@ -0,0 +1,5 @@
1
+ /.bundle/
2
+ /lib/rejectu/*.so
3
+ /lib/rejectu/*.bundle
4
+ /tmp/*
5
+ *.gem
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org' # resolve gems from the public RubyGems registry
2
+ gemspec # take the dependency list from the .gemspec file in this directory
@@ -0,0 +1,18 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ rejectu (0.0.1)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ rake (10.3.2)
10
+ rake-compiler (0.9.2)
11
+ rake
12
+
13
+ PLATFORMS
14
+ ruby
15
+
16
+ DEPENDENCIES
17
+ rake-compiler (~> 0.9)
18
+ rejectu!
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Scott Francis <scott.francis@shopify.com>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,23 @@
1
+ ## Rejectu
2
+
3
+ A simple Ruby extension that verifies that a UTF-8 string does not contain any characters from supplementary planes (code points >= `U+10000`).
4
+
5
+ ### Features
6
+
7
+ - C extension that uses SSE2 for webscale
8
+
9
+ ### Usage
10
+
11
+ ```ruby
12
+ require 'rejectu/rejectu'
13
+
14
+ Rejectu.valid? "happy! \xf2\xa4\xb7\xa4" # false
15
+ Rejectu.valid? "really happy!" # true
16
+ Rejectu.valid? "this should be good too \xe2\x84\xa2" # true
17
+
18
+ Rejectu.scrub "happy! \xf2\xa4\xb7\xa4" # => "happy! ?"
19
+ ```
20
+
21
+ ### Limitations
22
+
23
+ - The replacement character for `scrub` currently cannot be configured
@@ -0,0 +1,32 @@
1
+ task :default => :test
2
+
3
+ # ==========================================================
4
+ # Packaging
5
+ # ==========================================================
6
+
7
+ GEMSPEC = eval(File.read('rejectu.gemspec'))
8
+
9
+ require 'rubygems/package_task'
10
+ Gem::PackageTask.new(GEMSPEC) do |pkg|
11
+ end
12
+
13
+ # ==========================================================
14
+ # Ruby Extension
15
+ # ==========================================================
16
+
17
+ require 'rake/extensiontask'
18
+ Rake::ExtensionTask.new('rejectu', GEMSPEC) do |ext|
19
+ ext.ext_dir = 'ext/rejectu'
20
+ ext.lib_dir = 'lib/rejectu'
21
+ end
22
+ task :build => :compile
23
+
24
+ # ==========================================================
25
+ # Testing
26
+ # ==========================================================
27
+
28
+ require 'rake/testtask'
29
+ Rake::TestTask.new 'test' do |t|
30
+ t.test_files = FileList['test/test_*.rb']
31
+ end
32
+ task :test => :build
@@ -0,0 +1,5 @@
1
+ require 'mkmf'
2
+
3
+ $CFLAGS << ' -O3' # append: plain assignment would discard the flags mkmf took from the Ruby build
4
+
5
+ create_makefile('rejectu/rejectu')
@@ -0,0 +1,197 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #ifdef __SSE2__
4
+ #include <emmintrin.h>
5
+ #endif
6
+
7
+ static VALUE mRejectu = Qnil; /* the Rejectu module, assigned in Init_rejectu */
8
+ static ID idEncoding, idTo_s; /* rb_intern results for "encoding" and "to_s"; these are IDs, not VALUEs */
9
+
10
+ static inline int
11
+ has_utf8_supplementary_planes(__m128i v)
12
+ {
13
+ v = _mm_srli_epi16(v, 4);
14
+ v = _mm_cmpeq_epi16(v, _mm_set1_epi16(0x0f));
15
+ return _mm_movemask_epi8(v) == 0 ? 0 : 1;
16
+ }
17
+
18
+ /* Raises unless str is a String whose encoding name (str.encoding.to_s) is exactly "UTF-8". */
19
+ static inline void
20
+ validate_utf8_input(VALUE str)
21
+ {
22
+   VALUE enc_name;
23
+
24
+   Check_Type(str, T_STRING);
25
+   enc_name = rb_funcall(str, idEncoding, 0);
26
+   enc_name = rb_funcall(enc_name, idTo_s, 0);
27
+   if (TYPE(enc_name) == T_STRING && strcmp(RSTRING_PTR(enc_name), "UTF-8") == 0) return;
28
+   rb_raise(rb_eArgError, "input string is not UTF-8");
29
+ }
30
+
31
+ /*
32
+  * Rejectu.valid?(str): Qfalse as soon as a byte in 0xf0..0xff is found, else Qtrue.
33
+  *
34
+  * In UTF-8, 0xf0..0xf4 lead the four-byte sequences that encode code points >= U+10000;
35
+  * larger byte values never occur in well-formed UTF-8 and are rejected too. Raises TypeError
36
+  * for non-strings and ArgumentError for strings not tagged UTF-8 (see validate_utf8_input).
37
+  */
38
+ static VALUE
39
+ is_valid(VALUE self, VALUE str)
40
+ {
41
+   unsigned char *p, *end;
42
+ #ifdef __SSE2__
43
+   __m128i chunk;
44
+ #endif
45
+
46
+   validate_utf8_input(str);
47
+
48
+   p = (unsigned char *) RSTRING_PTR(str);
49
+   end = (unsigned char *) RSTRING_END(str);
50
+
51
+ #ifdef __SSE2__
52
+   /* scan byte by byte until p is 16 byte aligned, as _mm_load_si128 requires */
53
+   while (((uintptr_t) p & 0xf) != 0 && p < end) {
54
+     if ((*p & 0xf0) == 0xf0) {
55
+       return Qfalse;
56
+     }
57
+     p++;
58
+   }
59
+
60
+   /* scan whole 16 byte chunks; a shorter tail is left to the scalar loop below */
61
+   while (end - p >= 16) {
62
+     chunk = _mm_load_si128((__m128i *) p);
63
+     /* the top bit of a byte is set only when it is part of a multi-byte character */
64
+     if (_mm_movemask_epi8(chunk)) {
65
+       /*
66
+        * If there's a multi-byte character somewhere in this chunk, we need to check if it's a codepoint
67
+        * from the supplementary plane (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx).
68
+        *
69
+        * 1) Unpack the chunk into two halves (16-bit integers)
70
+        * 2) Shift each 16-bit integer 4 bits to the right
71
+        * 3) Check if the value is 0xf (first four bits set to 1)
72
+        * 4) Check the high bit of each 8-bit integer
73
+        *
74
+        * If the result of step 4 is non-zero, the part has a supplementary plane character.
75
+        *
76
+        * Example: the string "hello test! \xf0\x9f\x98\x80" (13 characters, 16 bytes)
77
+        * (the 128-bit values below list their 16-bit lanes in memory order, first lane on the left)
78
+        * UTF-8 representation:
79
+        * h e l l o <space> t e
80
+        * 01101000 01100101 01101100 01101100 01101111 00100000 01110100 01100101
81
+        *
82
+        * s t ! <space> 😀 GRINNING FACE (1F600)
83
+        * 01110011 01110100 00100001 00100000 11110000 10011111 10011000 10000000
84
+        *
85
+        * Low part:
86
+        *
87
+        * 1) Unpack the low part into 16 bit values = 0x00680065006c006c006f002000740065
88
+        * 2) Shift each 16 bit value to the right by 4 = 0x00060006000600060006000200070006
89
+        * 3) Compare each 16 bit value to 0xf = 0x00000000000000000000000000000000
90
+        * 4) Check the high bit of each 8-bit value = 0
91
+        *
92
+        * No supplementary plane characters in this part
93
+        *
94
+        * High part:
95
+        *
96
+        * 1) Unpack the high part into 16 bit values = 0x007300740021002000f0009f00980080
97
+        * 2) Shift each 16 bit value to the right by 4 = 0x0007000700020002000f000900090008
98
+        * 3) Compare each 16 bit value to 0xf = 0x0000000000000000ffff000000000000
99
+        * 4) Check the high bit of each 8-bit value = 0x300 (0b0000001100000000, bytes 8 and 9)
100
+        *
101
+        * The result is non-zero, so this part has a supplementary plane character.
102
+        *
103
+        */
104
+       if (has_utf8_supplementary_planes(_mm_unpacklo_epi8(chunk, _mm_setzero_si128())) ||
105
+           has_utf8_supplementary_planes(_mm_unpackhi_epi8(chunk, _mm_setzero_si128()))) {
106
+         return Qfalse;
107
+       }
108
+     }
109
+
110
+     p += 16;
111
+   }
112
+ #endif
113
+
114
+   /* scalar scan of the bytes the SSE2 loop did not consume (all of them without SSE2) */
115
+   while (p < end) {
116
+     if ((*p & 0xf0) == 0xf0) {
117
+       return Qfalse;
118
+     }
119
+     p++;
120
+   }
121
+
122
+   return Qtrue;
123
+ }
124
+
125
+ /*
126
+  * Returns a new UTF-8 string in which every sequence introduced by a byte in 0xf0..0xff is
127
+  * replaced by a single '?'. The lead byte alone decides how many bytes are dropped (0xfc and
128
+  * up: 6, 0xf8 and up: 5, otherwise 4); the bytes that follow it are not inspected.
129
+  */
130
+ static VALUE
131
+ do_scrub(VALUE str)
132
+ {
133
+   VALUE out_str;
134
+   unsigned char *p, *end, *out_start, *out;
135
+   long skip;
136
+
137
+   validate_utf8_input(str);
138
+
139
+   /*
140
+    * Each step consumes at least one input byte and emits exactly one, so the output never
141
+    * exceeds the input length: write straight into a Ruby string of that size and shrink it
142
+    * afterwards. No NUL terminator is written by hand and nothing malloc'ed can leak.
143
+    */
144
+   out_str = rb_str_new(NULL, RSTRING_LEN(str));
145
+   rb_enc_associate(out_str, rb_utf8_encoding());
146
+   out_start = out = (unsigned char *) RSTRING_PTR(out_str);
147
+   p = (unsigned char *) RSTRING_PTR(str);
148
+   end = (unsigned char *) RSTRING_END(str);
149
+
150
+   while (p < end) {
151
+     if ((*p & 0xf0) != 0xf0) {
152
+       *out++ = *p++;
153
+       continue;
154
+     }
155
+     skip = ((*p & 0xfc) == 0xfc) ? 6 : ((*p & 0xf8) == 0xf8) ? 5 : 4;
156
+     /* a sequence truncated by the end of the input must not move p past end */
157
+     p = (end - p < skip) ? end : p + skip;
158
+     *out++ = '?';
159
+   }
160
+
161
+   rb_str_set_len(out_str, out - out_start); /* keep only the bytes actually written */
162
+   return out_str;
163
+ }
164
+
165
+ /* Rejectu.scrub(str): a fresh UTF-8 copy of str when is_valid accepts it, else do_scrub(str). */
166
+ static VALUE
167
+ scrub(VALUE self, VALUE str)
168
+ {
169
+   if (is_valid(self, str) != Qtrue)
170
+     return do_scrub(str);
171
+   return rb_enc_str_new(RSTRING_PTR(str), RSTRING_LEN(str), rb_utf8_encoding());
172
+ }
173
+
174
+ /* Rejectu.scrub!(str): overwrites str with its scrubbed form unless is_valid accepts it; returns str. */
175
+ static VALUE
176
+ scrub_bang(VALUE self, VALUE str)
177
+ {
178
+   VALUE scrubbed;
179
+   if (is_valid(self, str) != Qtrue) {
180
+     scrubbed = do_scrub(str);
181
+     if (!NIL_P(scrubbed)) rb_str_replace(str, scrubbed);
182
+   }
183
+   return str;
184
+ }
185
+
186
+ /* Extension entry point: interns the ids used by validate_utf8_input and defines module Rejectu. */
187
+ void
188
+ Init_rejectu()
189
+ {
190
+   idEncoding = rb_intern("encoding");
191
+   idTo_s = rb_intern("to_s");
192
+
193
+   mRejectu = rb_define_module("Rejectu");
194
+   rb_define_singleton_method(mRejectu, "scrub!", scrub_bang, 1);
195
+   rb_define_singleton_method(mRejectu, "scrub", scrub, 1);
196
+   rb_define_singleton_method(mRejectu, "valid?", is_valid, 1);
197
+ }
@@ -0,0 +1,17 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = 'rejectu'
3
+ s.version = '0.0.1'
4
+ s.summary = 'Detects if a UTF-8 string supplementary plane code points'
5
+ s.description = <<-DOC
6
+ This gem detects if a UTF-8 encoded string contains characters from the UTF-8 supplementary
7
+ plane (code points >= U+10000).
8
+ DOC
9
+ s.homepage = 'https://github.com/csfrancis/rejectu'
10
+ s.authors = 'Scott Francis'
11
+ s.email = 'scott.francis@shopify.com'
12
+ s.license = 'MIT'
13
+
14
+ s.files = `git ls-files`.split("\n")
15
+ s.extensions = ['ext/rejectu/extconf.rb']
16
+ s.add_development_dependency 'rake-compiler', '~> 0.9'
17
+ end
@@ -0,0 +1,86 @@
1
+ require 'test/unit'
2
+ require 'rejectu/rejectu'
3
+
4
+ class TestRejectu < Test::Unit::TestCase # exercises Rejectu.valid?, Rejectu.scrub and Rejectu.scrub!
5
+ def test_valid_only_accepts_string # a non-String argument must raise TypeError
6
+ assert_raises TypeError do
7
+ Rejectu.valid?(Object.new)
8
+ end
9
+ end
10
+ # Plain ASCII input is reported as valid.
11
+ def test_valid_returns_boolean
12
+ assert Rejectu.valid? "test string"
13
+ end
14
+ # A 4-byte ASCII string is valid (presumably covers inputs shorter than one 16-byte chunk).
15
+ def test_valid_short_string
16
+ assert Rejectu.valid? "test"
17
+ end
18
+ # A 4-byte sequence (lead byte 0xf2) at the very start makes the string invalid.
19
+ def test_invalid_string
20
+ refute Rejectu.valid? "\xf2\xa4\xb7\xa4 test string"
21
+ end
22
+ # The same sequence after 8 ASCII bytes (12 bytes in total).
23
+ def test_invalid_string2
24
+ refute Rejectu.valid? "teststri\xf2\xa4\xb7\xa4"
25
+ end
26
+ # The same sequence after 10 ASCII bytes (14 bytes in total).
27
+ def test_invalid_string3
28
+ refute Rejectu.valid? "teststri12\xf2\xa4\xb7\xa4"
29
+ end
30
+ # A string tagged with an encoding other than UTF-8 must raise ArgumentError.
31
+ def test_non_utf8_string
32
+ assert_raises ArgumentError do
33
+ Rejectu.valid? "hello world".encode("ISO-8859-1")
34
+ end
35
+ end
36
+ # Multi-line ASCII text, much longer than 16 bytes, is valid.
37
+ def test_longer_valid_utf8_string
38
+ s = <<-END
39
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt
40
+ ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco
41
+ laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in
42
+ voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
43
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
44
+ END
45
+ assert Rejectu.valid? s
46
+ end
47
+ # The same text with a 4-byte sequence appended to its last line is invalid.
48
+ def test_longer_utf8_string
49
+ s = <<-END
50
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt
51
+ ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco
52
+ laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in
53
+ voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
54
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. \xf2\xa4\xb7\xa4
55
+ END
56
+ refute Rejectu.valid? s
57
+ end
58
+ # A 4-byte sequence on its own line between two copies of the text is invalid as well.
59
+ def test_longer_utf8_string2
60
+ s = <<-END
61
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt
62
+ ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco
63
+ laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in
64
+ voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
65
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
66
+ \xf2\xa4\xb7\xa4
67
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt
68
+ ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco
69
+ laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in
70
+ voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat
71
+ non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
72
+ END
73
+ refute Rejectu.valid? s
74
+ end
75
+ # scrub returns the input with the 4-byte sequence replaced by a single '?'.
76
+ def test_scrub
77
+ assert_equal "? test string", Rejectu.scrub("\xf2\xa4\xb7\xa4 test string")
78
+ end
79
+ # scrub! returns the scrubbed text and also changes the string that was passed in.
80
+ def test_scrub!
81
+ s = "\xf2\xa4\xb7\xa4 test string"
82
+ assert_equal "? test string", Rejectu.scrub!(s)
83
+ assert_equal "? test string", s
84
+ end
85
+
86
+ end
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rejectu
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Scott Francis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake-compiler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.9'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.9'
27
+ description: |2
28
+ This gem detects if a UTF-8 encoded string contains characters from the UTF-8 supplementary
29
+ plane (code points >= U+10000).
30
+ email: scott.francis@shopify.com
31
+ executables: []
32
+ extensions:
33
+ - ext/rejectu/extconf.rb
34
+ extra_rdoc_files: []
35
+ files:
36
+ - ".gitignore"
37
+ - Gemfile
38
+ - Gemfile.lock
39
+ - LICENSE.md
40
+ - README.md
41
+ - Rakefile
42
+ - ext/rejectu/extconf.rb
43
+ - ext/rejectu/rejectu.c
44
+ - rejectu.gemspec
45
+ - test/test_rejectu.rb
46
+ homepage: https://github.com/csfrancis/rejectu
47
+ licenses:
48
+ - MIT
49
+ metadata: {}
50
+ post_install_message:
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: '0'
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 2.2.2
67
+ signing_key:
68
+ specification_version: 4
69
+ summary: Detects if a UTF-8 string contains supplementary plane code points
70
+ test_files: []