escape_utils 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/.gitignore +2 -1
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +206 -0
  4. data/benchmark/html_escape.rb +1 -0
  5. data/benchmark/html_unescape.rb +1 -0
  6. data/benchmark/javascript_escape.rb +1 -0
  7. data/benchmark/javascript_unescape.rb +1 -0
  8. data/benchmark/url_escape.rb +1 -0
  9. data/benchmark/url_unescape.rb +1 -0
  10. data/escape_utils.gemspec +0 -3
  11. data/ext/escape_utils/buffer.c +228 -0
  12. data/ext/escape_utils/buffer.h +91 -0
  13. data/ext/escape_utils/escape_utils.c +111 -531
  14. data/ext/escape_utils/houdini.h +15 -0
  15. data/ext/escape_utils/houdini_html.c +214 -0
  16. data/ext/escape_utils/houdini_js.c +148 -0
  17. data/ext/escape_utils/houdini_uri.c +130 -0
  18. data/ext/escape_utils/html_unescape.h +754 -0
  19. data/ext/escape_utils/uri_escape.h +35 -0
  20. data/lib/escape_utils.rb +2 -2
  21. data/lib/escape_utils/html/cgi.rb +0 -2
  22. data/lib/escape_utils/html/erb.rb +0 -2
  23. data/lib/escape_utils/html/haml.rb +0 -2
  24. data/lib/escape_utils/html/rack.rb +0 -2
  25. data/lib/escape_utils/html_safety.rb +0 -2
  26. data/lib/escape_utils/javascript/action_view.rb +0 -2
  27. data/lib/escape_utils/url/cgi.rb +0 -2
  28. data/lib/escape_utils/url/erb.rb +0 -2
  29. data/lib/escape_utils/url/rack.rb +0 -2
  30. data/lib/escape_utils/url/uri.rb +0 -2
  31. data/lib/escape_utils/version.rb +1 -1
  32. data/spec/html/escape_spec.rb +0 -1
  33. data/spec/html/unescape_spec.rb +0 -1
  34. data/spec/html_safety_spec.rb +0 -1
  35. data/spec/javascript/escape_spec.rb +0 -1
  36. data/spec/javascript/unescape_spec.rb +0 -1
  37. data/spec/query/escape_spec.rb +0 -1
  38. data/spec/query/unescape_spec.rb +1 -0
  39. data/spec/spec_helper.rb +0 -1
  40. data/spec/uri/escape_spec.rb +0 -1
  41. data/spec/uri/unescape_spec.rb +1 -0
  42. data/spec/url/escape_spec.rb +0 -1
  43. data/spec/url/unescape_spec.rb +1 -0
  44. metadata +16 -8
  45. data/README.rdoc +0 -146
data/.gitignore CHANGED
@@ -5,4 +5,5 @@ pkg/*
5
5
  doc/*
6
6
  *.rbc
7
7
  tmp/
8
- Gemfile.lock
8
+ Gemfile.lock
9
+ vendor/*
@@ -1,5 +1,9 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.2.4 (September 7th, 2011)
4
+ * swap out custom escaping routines for houdini - https://github.com/tanoku/houdini
5
+ * add RSTRING_NOT_MODIFIED define for a Rubinius speedup
6
+
3
7
  ## 0.2.3 (March 9th, 2011)
4
8
  * change encoding strategy to simply return strings in the encoding the input string was in, not taking into account Encoding.default_internal
5
9
 
@@ -0,0 +1,206 @@
1
+ # escape_utils
2
+
3
+ Seeing as we're all HTML-escaping everything these days, why not make it faster?
4
+
5
+ For character encoding in 1.9, the output string's encoding is copied from the input string.
6
+
7
+ It has monkey-patches for Rack::Utils, CGI, URI, ERB::Util, Haml and ActionView, so you can drop this in and have your app start escaping fast as balls in no time.
8
+
9
+ It supports HTML, URL, URI and Javascript escaping/unescaping.
10
+
11
+ ## Installing
12
+
13
+ ``` sh
14
+ gem install escape_utils
15
+ ```
16
+
17
+ ## Warning: UTF-8 only
18
+
19
+ escape_utils assumes all input is encoded as valid UTF-8. If you are dealing with other encodings do your best to transcode the string into a UTF-8 byte stream before handing it to escape_utils.
20
+
21
+ On Ruby 1.9 this is as easy as:
22
+
23
+ ``` ruby
24
+ utf8_string = non_utf8_string.encode('UTF-8')
25
+ ```
26
+
27
+ If you're on Ruby 1.8 you can use [charlock_holmes](https://github.com/brianmario/charlock_holmes) to transcode like so:
28
+
29
+ ``` ruby
30
+ # NOTE: we're assuming you know the encoding of `non_utf8_string` here.
31
+ # if you don't, you can use the detection API of charlock_holmes
32
+ utf8_string = CharlockHolmes::Converter.convert(non_utf8_string, other_encoding, 'UTF-8')
33
+ ```
34
+
35
+ ## Usage
36
+
37
+ ### HTML
38
+
39
+ #### Escaping
40
+
41
+ ``` ruby
42
+ html = `curl -s http://maps.google.com`
43
+ escaped_html = EscapeUtils.escape_html(html)
44
+ ```
45
+
46
+ By default escape_utils will escape `/` characters with `&#47;`, but you can disable that by setting `EscapeUtils.html_secure = false`
47
+ or per-call by passing `false` as the second parameter to `escape_html` like `EscapeUtils.escape_html(html, false)`
48
+
49
+ For more information check out: http://www.owasp.org/index.php/XSS_(Cross_Site_Scripting)_Prevention_Cheat_Sheet#RULE_.231_-_HTML_Escape_Before_Inserting_Untrusted_Data_into_HTML_Element_Content
50
+
51
+ #### Unescaping
52
+
53
+ ``` ruby
54
+ html = `curl -s http://maps.google.com`
55
+ escaped_html = EscapeUtils.escape_html(html)
56
+ html = EscapeUtils.unescape_html(escaped_html)
57
+ ```
58
+
59
+ #### Monkey Patches
60
+
61
+ ``` ruby
62
+ require 'escape_utils/html/rack' # to patch Rack::Utils
63
+ require 'escape_utils/html/erb' # to patch ERB::Util
64
+ require 'escape_utils/html/cgi' # to patch CGI
65
+ require 'escape_utils/html/haml' # to patch Haml::Helpers
66
+ ```
67
+
68
+ ### URL
69
+
70
+ Use (un)escape_uri to get RFC-compliant escaping (like PHP rawurlencode).
71
+
72
+ Use (un)escape_url to get CGI escaping (where space is +).
73
+
74
+ #### Escaping
75
+
76
+ ``` ruby
77
+ url = "https://www.yourmom.com/cgi-bin/session.cgi?sess_args=mcEA~!!#*YH*>@!U"
78
+ escaped_url = EscapeUtils.escape_url(url)
79
+ ```
80
+
81
+ #### Unescaping
82
+
83
+ ``` ruby
84
+ url = "https://www.yourmom.com/cgi-bin/session.cgi?sess_args=mcEA~!!#*YH*>@!U"
85
+ escaped_url = EscapeUtils.escape_url(url)
86
+ EscapeUtils.unescape_url(escaped_url) == url # => true
87
+ ```
88
+
89
+ #### Monkey Patches
90
+
91
+ ``` ruby
92
+ require 'escape_utils/url/cgi' # to patch CGI
93
+ require 'escape_utils/url/erb' # to patch ERB::Util
94
+ require 'escape_utils/url/rack' # to patch Rack::Utils
95
+ require 'escape_utils/url/uri' # to patch URI
96
+ ```
97
+
98
+ ### Javascript
99
+
100
+ #### Escaping
101
+
102
+ ``` ruby
103
+ javascript = `curl -s http://code.jquery.com/jquery-1.4.2.js`
104
+ escaped_javascript = EscapeUtils.escape_javascript(javascript)
105
+ ```
106
+
107
+ #### Unescaping
108
+
109
+ ``` ruby
110
+ javascript = `curl -s http://code.jquery.com/jquery-1.4.2.js`
111
+ escaped_javascript = EscapeUtils.escape_javascript(javascript)
112
+ EscapeUtils.unescape_javascript(escaped_javascript) == javascript # => true
113
+ ```
114
+
115
+ #### Monkey Patches
116
+
117
+ ``` ruby
118
+ require 'escape_utils/javascript/action_view' # to patch ActionView::Helpers::JavaScriptHelper
119
+ ```
120
+
121
+ ## Benchmarks
122
+
123
+ In my testing, escaping html is around 10-30x faster than the pure ruby implementations in wide use today.
124
+ Unescaping html is around 40-100x faster than CGI.unescapeHTML, which is also pure ruby.
125
+ Escaping Javascript is around 16-30x faster.
126
+
127
+ This output is from my laptop using the benchmark scripts in the benchmark folder.
128
+
129
+ ### HTML
130
+
131
+ #### Escaping
132
+
133
+ ```
134
+ Rack::Utils.escape_html
135
+ 9.650000 0.090000 9.740000 ( 9.750756)
136
+ Haml::Helpers.html_escape
137
+ 9.310000 0.110000 9.420000 ( 9.417317)
138
+ ERB::Util.html_escape
139
+ 5.330000 0.390000 5.720000 ( 5.748394)
140
+ CGI.escapeHTML
141
+ 5.370000 0.380000 5.750000 ( 5.791344)
142
+ FasterHTMLEscape.html_escape
143
+ 0.520000 0.010000 0.530000 ( 0.539485)
144
+ fast_xs_extra#fast_xs_html
145
+ 0.310000 0.030000 0.340000 ( 0.336734)
146
+ EscapeUtils.escape_html
147
+ 0.200000 0.050000 0.250000 ( 0.258839)
148
+ ```
149
+
150
+ #### Unescaping
151
+
152
+ ```
153
+ CGI.unescapeHTML
154
+ 16.520000 0.080000 16.600000 ( 16.853888)
155
+ EscapeUtils.unescape_html
156
+ 0.120000 0.040000 0.160000 ( 0.162696)
157
+ ```
158
+
159
+ ### Javascript
160
+
161
+ #### Escaping
162
+
163
+ ```
164
+ ActionView::Helpers::JavaScriptHelper#escape_javascript
165
+ 3.810000 0.100000 3.910000 ( 3.925557)
166
+ EscapeUtils.escape_javascript
167
+ 0.200000 0.040000 0.240000 ( 0.236692)
168
+ ```
169
+
170
+ #### Unescaping
171
+
172
+ I didn't look that hard, but I'm not aware of another ruby library that does Javascript unescaping to benchmark against. Anyone know of any?
173
+
174
+ ### URL
175
+
176
+ #### Escaping
177
+
178
+ ```
179
+ ERB::Util.url_encode
180
+ 0.520000 0.010000 0.530000 ( 0.529277)
181
+ Rack::Utils.escape
182
+ 0.460000 0.010000 0.470000 ( 0.466962)
183
+ CGI.escape
184
+ 0.440000 0.000000 0.440000 ( 0.443017)
185
+ URLEscape#escape
186
+ 0.040000 0.000000 0.040000 ( 0.045661)
187
+ fast_xs_extra#fast_xs_url
188
+ 0.010000 0.000000 0.010000 ( 0.015429)
189
+ EscapeUtils.escape_url
190
+ 0.010000 0.000000 0.010000 ( 0.010843)
191
+ ```
192
+
193
+ #### Unescaping
194
+
195
+ ```
196
+ Rack::Utils.unescape
197
+ 0.250000 0.010000 0.260000 ( 0.257558)
198
+ CGI.unescape
199
+ 0.250000 0.000000 0.250000 ( 0.257837)
200
+ URLEscape#unescape
201
+ 0.040000 0.000000 0.040000 ( 0.031548)
202
+ fast_xs_extra#fast_uxs_cgi
203
+ 0.010000 0.000000 0.010000 ( 0.006062)
204
+ EscapeUtils.unescape_url
205
+ 0.000000 0.000000 0.000000 ( 0.005679)
206
+ ```
@@ -19,6 +19,7 @@ end
19
19
  times = 100
20
20
  url = "http://en.wikipedia.org/wiki/Line_of_succession_to_the_British_throne"
21
21
  html = `curl -s #{url}`
22
+ html = html.force_encoding('binary') if html.respond_to?(:force_encoding)
22
23
  puts "Escaping #{html.bytesize} bytes of html #{times} times, from #{url}"
23
24
 
24
25
  Benchmark.bmbm do |x|
@@ -16,6 +16,7 @@ end
16
16
  times = 100
17
17
  url = "http://en.wikipedia.org/wiki/Line_of_succession_to_the_British_throne"
18
18
  html = `curl -s #{url}`
19
+ html = html.force_encoding('binary') if html.respond_to?(:force_encoding)
19
20
  escaped_html = EscapeUtils.escape_html(html)
20
21
  puts "Unescaping #{escaped_html.bytesize} bytes of escaped html #{times} times, from #{url}"
21
22
 
@@ -15,6 +15,7 @@ end
15
15
  times = 100
16
16
  url = "http://ajax.googleapis.com/ajax/libs/dojo/1.4.3/dojo/dojo.xd.js.uncompressed.js"
17
17
  javascript = `curl -s #{url}`
18
+ javascript = javascript.force_encoding('utf-8') if javascript.respond_to?(:force_encoding)
18
19
  puts "Escaping #{javascript.bytesize} bytes of javascript #{times} times, from #{url}"
19
20
 
20
21
  Benchmark.bmbm do |x|
@@ -10,6 +10,7 @@ require 'escape_utils'
10
10
  times = 100
11
11
  url = "http://ajax.googleapis.com/ajax/libs/dojo/1.4.3/dojo/dojo.xd.js.uncompressed.js"
12
12
  javascript = `curl -s #{url}`
13
+ javascript = javascript.force_encoding('utf-8') if javascript.respond_to?(:force_encoding)
13
14
  escaped_javascript = EscapeUtils.escape_javascript(javascript)
14
15
  puts "Escaping #{escaped_javascript.bytesize} bytes of javascript #{times} times, from #{url}"
15
16
 
@@ -14,6 +14,7 @@ require 'escape_utils'
14
14
 
15
15
  times = 10_000
16
16
  url = "https://www.yourmom.com/cgi-bin/session.cgi?sess_args=mYHcEA dh435dqUs0moGHeeAJTSLLbdbcbd9ef----,574b95600e9ab7d27eb0bf524ac68c27----"
17
+ url = url.force_encoding('us-ascii') if url.respond_to?(:force_encoding)
17
18
  puts "Escaping a #{url.bytesize} byte URL #{times} times"
18
19
 
19
20
  Benchmark.bmbm do |x|
@@ -13,6 +13,7 @@ require 'escape_utils'
13
13
 
14
14
  times = 10_000
15
15
  url = "https://www.yourmom.com/cgi-bin/session.cgi?sess_args=mYHcEA dh435dqUs0moGHeeAJTSLLbdbcbd9ef----,574b95600e9ab7d27eb0bf524ac68c27----"
16
+ url = url.force_encoding('us-ascii') if url.respond_to?(:force_encoding)
16
17
  escaped_url = EscapeUtils.escape_url(url)
17
18
  puts "Escaping a #{url.bytesize} byte URL #{times} times"
18
19
 
@@ -7,9 +7,6 @@ Gem::Specification.new do |s|
7
7
  s.date = Time.now.utc.strftime("%Y-%m-%d")
8
8
  s.email = %q{seniorlopez@gmail.com}
9
9
  s.extensions = ["ext/escape_utils/extconf.rb"]
10
- s.extra_rdoc_files = [
11
- "README.rdoc"
12
- ]
13
10
  s.files = `git ls-files`.split("\n")
14
11
  s.homepage = %q{http://github.com/brianmario/escape_utils}
15
12
  s.rdoc_options = ["--charset=UTF-8"]
@@ -0,0 +1,228 @@
1
+ /*
2
+ * Copyright (c) 2008, Natacha Porté
3
+ * Copyright (c) 2011, Vicent Martí
4
+ *
5
+ * Permission to use, copy, modify, and distribute this software for any
6
+ * purpose with or without fee is hereby granted, provided that the above
7
+ * copyright notice and this permission notice appear in all copies.
8
+ *
9
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
+ */
17
+
18
+ #define BUFFER_MAX_ALLOC_SIZE (1024 * 1024 * 16) //16mb
19
+
20
+ #include "buffer.h"
21
+
22
+ #include <stdio.h>
23
+ #include <stdlib.h>
24
+ #include <string.h>
25
+
26
+ /* MSVC compat */
27
+ #if defined(_MSC_VER)
28
+ # define _buf_vsnprintf _vsnprintf
29
+ #else
30
+ # define _buf_vsnprintf vsnprintf
31
+ #endif
32
+
33
+ int
34
+ bufprefix(const struct buf *buf, const char *prefix)
35
+ {
36
+ size_t i;
37
+
38
+ for (i = 0; i < buf->size; ++i) {
39
+ if (prefix[i] == 0)
40
+ return 0;
41
+
42
+ if (buf->data[i] != prefix[i])
43
+ return buf->data[i] - prefix[i];
44
+ }
45
+
46
+ return 0;
47
+ }
48
+
49
+ /* bufgrow: increasing the allocated size to the given value */
50
+ int
51
+ bufgrow(struct buf *buf, size_t neosz)
52
+ {
53
+ size_t neoasz;
54
+ void *neodata;
55
+ if (!buf || !buf->unit || neosz > BUFFER_MAX_ALLOC_SIZE)
56
+ return BUF_ENOMEM;
57
+
58
+ if (buf->asize >= neosz)
59
+ return BUF_OK;
60
+
61
+ neoasz = buf->asize + buf->unit;
62
+ while (neoasz < neosz)
63
+ neoasz += buf->unit;
64
+
65
+ neodata = realloc(buf->data, neoasz);
66
+ if (!neodata)
67
+ return BUF_ENOMEM;
68
+
69
+ buf->data = neodata;
70
+ buf->asize = neoasz;
71
+ return BUF_OK;
72
+ }
73
+
74
+
75
+ /* bufnew: allocation of a new buffer */
76
+ struct buf *
77
+ bufnew(size_t unit)
78
+ {
79
+ struct buf *ret;
80
+ ret = malloc(sizeof (struct buf));
81
+
82
+ if (ret) {
83
+ ret->data = 0;
84
+ ret->size = ret->asize = 0;
85
+ ret->unit = unit;
86
+ }
87
+ return ret;
88
+ }
89
+
90
+ /* bufnullterm: NULL-termination of the string array */
91
+ const char *
92
+ bufcstr(struct buf *buf)
93
+ {
94
+ if (!buf || !buf->unit)
95
+ return NULL;
96
+
97
+ if (buf->size < buf->asize && buf->data[buf->size] == 0)
98
+ return (char *)buf->data;
99
+
100
+ if (buf->size + 1 <= buf->asize || bufgrow(buf, buf->size + 1) == 0) {
101
+ buf->data[buf->size] = 0;
102
+ return (char *)buf->data;
103
+ }
104
+
105
+ return NULL;
106
+ }
107
+
108
+ /* bufprintf: formatted printing to a buffer */
109
+ void
110
+ bufprintf(struct buf *buf, const char *fmt, ...)
111
+ {
112
+ va_list ap;
113
+ if (!buf || !buf->unit)
114
+ return;
115
+
116
+ va_start(ap, fmt);
117
+ vbufprintf(buf, fmt, ap);
118
+ va_end(ap);
119
+ }
120
+
121
+ /* bufput: appends raw data to a buffer */
122
+ void
123
+ bufput(struct buf *buf, const void *data, size_t len)
124
+ {
125
+ if (!buf)
126
+ return;
127
+
128
+ if (buf->size + len > buf->asize && bufgrow(buf, buf->size + len) < 0)
129
+ return;
130
+
131
+ memcpy(buf->data + buf->size, data, len);
132
+ buf->size += len;
133
+ }
134
+
135
/*
 * bufputs: appends a NUL-terminated string to a buffer
 *
 * The string's length is measured with strlen() and the bytes are handed to
 * bufput(); the terminating NUL itself is not appended.
 */
void
bufputs(struct buf *buf, const char *str)
{
	size_t length = strlen(str);

	bufput(buf, str, length);
}
141
+
142
+
143
+ /* bufputc: appends a single uint8_t to a buffer */
144
+ void
145
+ bufputc(struct buf *buf, int c)
146
+ {
147
+ if (!buf)
148
+ return;
149
+
150
+ if (buf->size + 1 > buf->asize && bufgrow(buf, buf->size + 1) < 0)
151
+ return;
152
+
153
+ buf->data[buf->size] = c;
154
+ buf->size += 1;
155
+ }
156
+
157
+ /* bufrelease: decrease the reference count and free the buffer if needed */
158
+ void
159
+ bufrelease(struct buf *buf)
160
+ {
161
+ if (!buf)
162
+ return;
163
+
164
+ free(buf->data);
165
+ free(buf);
166
+ }
167
+
168
+
169
+ /* bufreset: frees internal data of the buffer */
170
+ void
171
+ bufreset(struct buf *buf)
172
+ {
173
+ if (!buf)
174
+ return;
175
+
176
+ free(buf->data);
177
+ buf->data = NULL;
178
+ buf->size = buf->asize = 0;
179
+ }
180
+
181
+ /* bufslurp: removes a given number of bytes from the head of the array */
182
+ void
183
+ bufslurp(struct buf *buf, size_t len)
184
+ {
185
+ if (!buf || !buf->unit || len <= 0)
186
+ return;
187
+
188
+ if (len >= buf->size) {
189
+ buf->size = 0;
190
+ return;
191
+ }
192
+
193
+ buf->size -= len;
194
+ memmove(buf->data, buf->data + len, buf->size);
195
+ }
196
+
197
+ /* vbufprintf: stdarg variant of formatted printing into a buffer */
198
+ void
199
+ vbufprintf(struct buf *buf, const char *fmt, va_list ap)
200
+ {
201
+ int n;
202
+
203
+ if (buf == 0 || (buf->size >= buf->asize && bufgrow(buf, buf->size + 1)) < 0)
204
+ return;
205
+
206
+ n = _buf_vsnprintf((char *)buf->data + buf->size, buf->asize - buf->size, fmt, ap);
207
+
208
+ if (n < 0) {
209
+ #ifdef _MSC_VER
210
+ n = _vscprintf(fmt, ap);
211
+ #else
212
+ return;
213
+ #endif
214
+ }
215
+
216
+ if ((size_t)n >= buf->asize - buf->size) {
217
+ if (bufgrow(buf, buf->size + n + 1) < 0)
218
+ return;
219
+
220
+ n = _buf_vsnprintf((char *)buf->data + buf->size, buf->asize - buf->size, fmt, ap);
221
+ }
222
+
223
+ if (n < 0)
224
+ return;
225
+
226
+ buf->size += n;
227
+ }
228
+