escape_utils 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. data/.gitignore +2 -1
  2. data/.travis.yml +13 -0
  3. data/CHANGELOG.md +7 -0
  4. data/MIT-LICENSE +1 -1
  5. data/Rakefile +5 -18
  6. data/benchmark/html_escape.rb +9 -2
  7. data/benchmark/xml_escape.rb +29 -0
  8. data/escape_utils.gemspec +2 -3
  9. data/ext/escape_utils/buffer.c +181 -160
  10. data/ext/escape_utils/buffer.h +90 -68
  11. data/ext/escape_utils/escape_utils.c +77 -39
  12. data/ext/escape_utils/extconf.rb +1 -1
  13. data/ext/escape_utils/houdini.h +37 -8
  14. data/ext/escape_utils/houdini_href_e.c +115 -0
  15. data/ext/escape_utils/houdini_html_e.c +90 -0
  16. data/ext/escape_utils/houdini_html_u.c +122 -0
  17. data/ext/escape_utils/{houdini_js.c → houdini_js_e.c} +17 -75
  18. data/ext/escape_utils/houdini_js_u.c +60 -0
  19. data/ext/escape_utils/{uri_escape.h → houdini_uri_e.c} +68 -2
  20. data/ext/escape_utils/houdini_uri_u.c +65 -0
  21. data/ext/escape_utils/houdini_xml_e.c +136 -0
  22. data/lib/escape_utils/version.rb +1 -1
  23. data/lib/escape_utils/xml/builder.rb +8 -0
  24. data/test/helper.rb +14 -0
  25. data/test/html/escape_test.rb +61 -0
  26. data/test/html/unescape_test.rb +48 -0
  27. data/test/html_safety_test.rb +46 -0
  28. data/test/javascript/escape_test.rb +42 -0
  29. data/test/javascript/unescape_test.rb +46 -0
  30. data/test/query/escape_test.rb +50 -0
  31. data/test/query/unescape_test.rb +52 -0
  32. data/test/uri/escape_test.rb +50 -0
  33. data/test/uri/unescape_test.rb +55 -0
  34. data/test/url/escape_test.rb +58 -0
  35. data/test/url/unescape_test.rb +60 -0
  36. data/test/xml/escape_test.rb +67 -0
  37. metadata +136 -152
  38. data/.rspec +0 -2
  39. data/ext/escape_utils/houdini_html.c +0 -214
  40. data/ext/escape_utils/houdini_uri.c +0 -130
  41. data/spec/html/escape_spec.rb +0 -42
  42. data/spec/html/unescape_spec.rb +0 -37
  43. data/spec/html_safety_spec.rb +0 -48
  44. data/spec/javascript/escape_spec.rb +0 -34
  45. data/spec/javascript/unescape_spec.rb +0 -37
  46. data/spec/query/escape_spec.rb +0 -44
  47. data/spec/query/unescape_spec.rb +0 -46
  48. data/spec/rcov.opts +0 -3
  49. data/spec/spec_helper.rb +0 -5
  50. data/spec/uri/escape_spec.rb +0 -43
  51. data/spec/uri/unescape_spec.rb +0 -57
  52. data/spec/url/escape_spec.rb +0 -52
  53. data/spec/url/unescape_spec.rb +0 -57
@@ -0,0 +1,60 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+
7
+ int
8
+ houdini_unescape_js(gh_buf *ob, const uint8_t *src, size_t size)
9
+ {
10
+ size_t i = 0, org, ch;
11
+
12
+ while (i < size) {
13
+ org = i;
14
+ while (i < size && src[i] != '\\')
15
+ i++;
16
+
17
+ if (likely(i > org)) {
18
+ if (unlikely(org == 0)) {
19
+ if (i >= size)
20
+ return 0;
21
+
22
+ gh_buf_grow(ob, HOUDINI_UNESCAPED_SIZE(size));
23
+ }
24
+
25
+ gh_buf_put(ob, src + org, i - org);
26
+ }
27
+
28
+ /* escaping */
29
+ if (i == size)
30
+ break;
31
+
32
+ if (++i == size) {
33
+ gh_buf_putc(ob, '\\');
34
+ break;
35
+ }
36
+
37
+ ch = src[i];
38
+
39
+ switch (ch) {
40
+ case 'n':
41
+ ch = '\n';
42
+ /* pass through */
43
+
44
+ case '\\':
45
+ case '\'':
46
+ case '\"':
47
+ case '/':
48
+ gh_buf_putc(ob, ch);
49
+ i++;
50
+ break;
51
+
52
+ default:
53
+ gh_buf_putc(ob, '\\');
54
+ break;
55
+ }
56
+ }
57
+
58
+ return 1;
59
+ }
60
+
@@ -1,3 +1,9 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+
1
7
  static const char URL_SAFE[] = {
2
8
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3
9
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -14,7 +20,8 @@ static const char URL_SAFE[] = {
14
20
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
15
21
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16
22
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
17
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, };
23
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
24
+ };
18
25
 
19
26
  static const char URI_SAFE[] = {
20
27
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -32,4 +39,63 @@ static const char URI_SAFE[] = {
32
39
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
33
40
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
34
41
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
35
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, };
42
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
43
+ };
44
+
45
+ static int
46
+ escape(gh_buf *ob, const uint8_t *src, size_t size, int is_url)
47
+ {
48
+ static const uint8_t hex_chars[] = "0123456789ABCDEF";
49
+ const char *safe_table = is_url ? URL_SAFE : URI_SAFE;
50
+
51
+ size_t i = 0, org;
52
+ uint8_t hex_str[3];
53
+
54
+ hex_str[0] = '%';
55
+
56
+ while (i < size) {
57
+ org = i;
58
+ while (i < size && safe_table[src[i]] != 0)
59
+ i++;
60
+
61
+ if (likely(i > org)) {
62
+ if (unlikely(org == 0)) {
63
+ if (i >= size)
64
+ return 0;
65
+
66
+ gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));
67
+ }
68
+
69
+ gh_buf_put(ob, src + org, i - org);
70
+ }
71
+
72
+ /* escaping */
73
+ if (i >= size)
74
+ break;
75
+
76
+ if (src[i] == ' ' && is_url) {
77
+ gh_buf_putc(ob, '+');
78
+ } else {
79
+ hex_str[1] = hex_chars[(src[i] >> 4) & 0xF];
80
+ hex_str[2] = hex_chars[src[i] & 0xF];
81
+ gh_buf_put(ob, hex_str, 3);
82
+ }
83
+
84
+ i++;
85
+ }
86
+
87
+ return 1;
88
+ }
89
+
90
+ int
91
+ houdini_escape_uri(gh_buf *ob, const uint8_t *src, size_t size)
92
+ {
93
+ return escape(ob, src, size, 0);
94
+ }
95
+
96
+ int
97
+ houdini_escape_url(gh_buf *ob, const uint8_t *src, size_t size)
98
+ {
99
+ return escape(ob, src, size, 1);
100
+ }
101
+
@@ -0,0 +1,65 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+
7
+ #define hex2c(c) ((c | 32) % 39 - 9)
8
+
9
+ static int
10
+ unescape(gh_buf *ob, const uint8_t *src, size_t size, int is_url)
11
+ {
12
+ size_t i = 0, org;
13
+
14
+ while (i < size) {
15
+ org = i;
16
+ while (i < size && src[i] != '%')
17
+ i++;
18
+
19
+ if (likely(i > org)) {
20
+ if (unlikely(org == 0)) {
21
+ if (i >= size && !is_url)
22
+ return 0;
23
+
24
+ gh_buf_grow(ob, HOUDINI_UNESCAPED_SIZE(size));
25
+ }
26
+
27
+ gh_buf_put(ob, src + org, i - org);
28
+ }
29
+
30
+ /* escaping */
31
+ if (i >= size)
32
+ break;
33
+
34
+ i++;
35
+
36
+ if (i + 1 < size && _isxdigit(src[i]) && _isxdigit(src[i + 1])) {
37
+ unsigned char new_char = (hex2c(src[i]) << 4) + hex2c(src[i + 1]);
38
+ gh_buf_putc(ob, new_char);
39
+ i += 2;
40
+ } else {
41
+ gh_buf_putc(ob, '%');
42
+ }
43
+ }
44
+
45
+ if (is_url) {
46
+ char *find = (char *)gh_buf_cstr(ob);
47
+ while ((find = strchr(find, '+')) != NULL)
48
+ *find = ' ';
49
+ }
50
+
51
+ return 1;
52
+ }
53
+
54
+ int
55
+ houdini_unescape_uri(gh_buf *ob, const uint8_t *src, size_t size)
56
+ {
57
+ return unescape(ob, src, size, 0);
58
+ }
59
+
60
+ int
61
+ houdini_unescape_url(gh_buf *ob, const uint8_t *src, size_t size)
62
+ {
63
+ return unescape(ob, src, size, 1);
64
+ }
65
+
@@ -0,0 +1,136 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+
7
+ /**
+ * Replacement strings, indexed by the codes stored in XML_LOOKUP_TABLE.
+ * Entries 0-4 are empty placeholders (those codes describe how a byte is
+ * consumed rather than what to print); entry 5 is the "?" written for
+ * invalid UTF-8; entries 6-10 are the XML entities:
+ *
8
+ * & --> &amp;
9
+ * < --> &lt;
10
+ * > --> &gt;
11
+ * " --> &quot;
12
+ * ' --> &apos;
13
+ */
14
+ static const char *LOOKUP_CODES[] = {
15
+ "", /* reserved: use literal single character */
16
+ "", /* unused */
17
+ "", /* reserved: 2 character UTF-8 */
18
+ "", /* reserved: 3 character UTF-8 */
19
+ "", /* reserved: 4 character UTF-8 */
20
+ "?", /* invalid UTF-8 character */
21
+ "&quot;", /* code 6: double quote */
22
+ "&amp;", /* code 7: ampersand */
23
+ "&apos;", /* code 8: single quote */
24
+ "&lt;", /* code 9: less-than */
25
+ "&gt;" /* code 10: greater-than */
26
+ };
27
+
28
+ static const char CODE_INVALID = 5; /* index of the "?" entry in LOOKUP_CODES; also the lowest code that ends a literal run in houdini_escape_xml */
29
+ /*
+  * Per-byte classification, indexed by the byte value (256 entries):
+  *   0     copy the byte literally
+  *   2-4   lead byte of a 2/3/4-byte UTF-8 sequence
+  *   5     invalid here, replaced with "?" (CODE_INVALID)
+  *   6-10  replaced with the matching LOOKUP_CODES entity
+  */
30
+ static const char XML_LOOKUP_TABLE[] = {
31
+ /* ASCII: 0xxxxxxx -- control bytes other than TAB, LF and CR are marked invalid */
32
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 0, 5, 5,
33
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
34
+ 0, 0, 6, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0,
35
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0,10, 0,
36
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
37
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
38
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
39
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
40
+
41
+ /* Invalid UTF-8 char start: 10xxxxxx (a continuation byte with no lead byte before it) */
42
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
43
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
44
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
45
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
46
+
47
+ /* Multibyte UTF-8: the value is the total length of the sequence in bytes */
48
+
49
+ /* 2 bytes: 110xxxxx */
50
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
51
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
52
+
53
+ /* 3 bytes: 1110xxxx */
54
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
55
+
56
+ /* 4 bytes: 11110xxx */
57
+ 4, 4, 4, 4, 4, 4, 4, 4,
58
+
59
+ /* Invalid UTF-8: 11111xxx */
60
+ 5, 5, 5, 5, 5, 5, 5, 5,
61
+ };
62
+ /*
+  * Escape the `size` bytes at `src` for XML, appending the result to `ob`.
+  *
+  * Each byte is classified through XML_LOOKUP_TABLE. Bytes with code 0 and
+  * multi-byte UTF-8 sequences that pass the checks below are copied
+  * unchanged; anything else is replaced with the LOOKUP_CODES string for its
+  * code (an entity, or "?" for invalid or truncated UTF-8).
+  *
+  * Always returns 1, even when nothing had to be replaced.
+  */
63
+ int
64
+ houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size)
65
+ {
66
+ size_t i = 0;
67
+ unsigned char code = 0;
68
+
69
+ gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size)); /* HOUDINI_ESCAPED_SIZE is defined outside this hunk; presumably an output-size estimate -- TODO confirm */
70
+
71
+ while (i < size) {
72
+ size_t start, end;
73
+
74
+ start = end = i; /* [start, end) is the run of bytes accepted for verbatim copy */
75
+
76
+ while (i < size) {
77
+ unsigned int byte;
78
+
79
+ byte = src[i++];
80
+ code = XML_LOOKUP_TABLE[byte];
81
+
82
+ if (!code) {
83
+ /* single character used literally */
84
+ } else if (code >= CODE_INVALID) {
85
+ break; /* insert lookup code string */
86
+ } else if (code > size - end) { /* here end is the index of the lead byte, so size - end counts the bytes left including it */
87
+ code = CODE_INVALID; /* truncated UTF-8 character */
88
+ break;
89
+ } else {
90
+ unsigned int chr = byte & (0xff >> code); /* drop the top `code` bits of the lead byte, keeping its payload */
91
+
92
+ while (--code) { /* read the remaining code - 1 bytes of the sequence */
93
+ byte = src[i++];
94
+ if ((byte & 0xc0) != 0x80) { /* continuation bytes must be 10xxxxxx */
95
+ code = CODE_INVALID; /* NOTE(review): the offending byte was already consumed by src[i++], so it is replaced together with the lead byte instead of being re-scanned -- confirm this is intended */
96
+ break;
97
+ }
98
+ chr = (chr << 6) + (byte & 0x3f); /* append the 6 payload bits */
99
+ }
100
+
101
+ switch (i - end) { /* number of bytes consumed for this character */
102
+ case 2:
103
+ if (chr < 0x80) /* value would have fit in a single byte */
104
+ code = CODE_INVALID;
105
+ break;
106
+ case 3:
107
+ if (chr < 0x800 ||
108
+ (chr > 0xd7ff && chr < 0xe000) ||
109
+ chr > 0xfffd) /* too small for 3 bytes, inside 0xd800-0xdfff, or 0xfffe/0xffff */
110
+ code = CODE_INVALID;
111
+ break;
112
+ case 4:
113
+ if (chr < 0x10000 || chr > 0x10ffff) /* too small for 4 bytes, or above 0x10ffff */
114
+ code = CODE_INVALID;
115
+ break;
116
+ default:
117
+ break;
118
+ }
119
+ if (code == CODE_INVALID)
120
+ break;
121
+ }
122
+ end = i; /* character accepted: extend the verbatim run past it */
123
+ }
124
+
125
+ if (end > start)
126
+ gh_buf_put(ob, src + start, end - start);
127
+
128
+ /* end reaches size only when the scan ran off the input without hitting a byte to replace */
129
+ if (end >= size)
130
+ break;
131
+
132
+ gh_buf_puts(ob, LOOKUP_CODES[code]); /* code is >= CODE_INVALID here: an entity or "?" */
133
+ }
134
+
135
+ return 1;
136
+ }
@@ -1,3 +1,3 @@
1
1
  module EscapeUtils
2
- VERSION = "0.2.4"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -0,0 +1,8 @@
1
# Reopens Builder::XmlBase so that its private text-escaping hook goes
# through EscapeUtils.
module Builder
  class XmlBase < BlankSlate
    private

    # Returns +text+, converted to a String, escaped by EscapeUtils.escape_xml.
    def _escape(text)
      string = text.to_s
      EscapeUtils.escape_xml(string)
    end
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,14 @@
1
# Basic test environment.

# Load RubyGems when it is not already loaded, then let Bundler set up the
# gem load paths.
require 'rubygems' if !defined?(Gem)
require 'bundler/setup'

# Put the lib and test dirs directly on the load path. This has to happen
# before escape_utils is required, otherwise the require below is resolved
# without these entries and they have no effect on which copy gets loaded.
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
$LOAD_PATH.unshift File.expand_path('..', __FILE__)

require 'escape_utils'

# bring in minitest
require 'minitest/autorun'
@@ -0,0 +1,61 @@
1
+ require File.expand_path("../../helper", __FILE__)
2
+
3
# Tests for EscapeUtils.escape_html.
class HtmlEscapeTest < MiniTest::Unit::TestCase
  def test_escape_basic_html_with_secure
    assert_equal "&lt;some_tag&#47;&gt;", EscapeUtils.escape_html("<some_tag/>")

    with_html_secure(true) do
      assert_equal "&lt;some_tag&#47;&gt;", EscapeUtils.escape_html("<some_tag/>")
    end
  end

  def test_escape_basic_html_without_secure
    assert_equal "&lt;some_tag/&gt;", EscapeUtils.escape_html("<some_tag/>", false)

    with_html_secure(false) do
      assert_equal "&lt;some_tag/&gt;", EscapeUtils.escape_html("<some_tag/>")
    end
  end

  def test_escape_double_quotes
    assert_equal "&lt;some_tag some_attr=&quot;some value&quot;&#47;&gt;", EscapeUtils.escape_html("<some_tag some_attr=\"some value\"/>")
  end

  def test_escape_single_quotes
    assert_equal "&lt;some_tag some_attr=&#39;some value&#39;&#47;&gt;", EscapeUtils.escape_html("<some_tag some_attr='some value'/>")
  end

  def test_escape_ampersand
    assert_equal "&lt;b&gt;Bourbon &amp; Branch&lt;&#47;b&gt;", EscapeUtils.escape_html("<b>Bourbon & Branch</b>")
  end

  def test_returns_original_if_not_escaped
    str = 'foobar'
    assert_equal str.object_id, EscapeUtils.escape_html(str).object_id
  end

  # Encoding-aware tests. Guarded by feature detection rather than by matching
  # RUBY_VERSION against /^1.9/, which silently skipped them on Ruby 2.0 and
  # later (and whose unescaped dot matched any character).
  if ''.respond_to?(:force_encoding)
    def test_utf8_or_ascii_input_only
      str = "<b>Bourbon & Branch</b>"

      str.force_encoding 'ISO-8859-1'
      assert_raises Encoding::CompatibilityError do
        EscapeUtils.escape_html(str)
      end

      str.force_encoding 'UTF-8'
      begin
        EscapeUtils.escape_html(str)
      rescue Encoding::CompatibilityError => e
        assert_nil e, "#{e.class.name} raised, expected not to"
      end
    end

    def test_return_value_is_tagged_as_utf8
      str = "<b>Bourbon & Branch</b>".encode('utf-8')
      assert_equal Encoding.find('UTF-8'), EscapeUtils.escape_html(str).encoding
    end
  end

  private

  # Runs the block with EscapeUtils.html_secure set to +value+ and restores
  # the previous setting afterwards, even when an assertion inside the block
  # fails, so one failing test cannot change the global default for the rest.
  def with_html_secure(value)
    secure_before = EscapeUtils.html_secure
    EscapeUtils.html_secure = value
    yield
  ensure
    EscapeUtils.html_secure = secure_before
  end
end
@@ -0,0 +1,48 @@
1
+ # encoding: utf-8
2
+ require File.expand_path("../../helper", __FILE__)
3
+
4
# Tests for EscapeUtils.unescape_html.
class HtmlUnescapeTest < MiniTest::Unit::TestCase
  def test_basic_html
    assert_equal "<some_tag/>", EscapeUtils.unescape_html("&lt;some_tag&#47;&gt;")
  end

  def test_double_quotes
    assert_equal "<some_tag some_attr=\"some value\"/>", EscapeUtils.unescape_html("&lt;some_tag some_attr=&quot;some value&quot;&#47;&gt;")
  end

  def test_single_quotes
    assert_equal "<some_tag some_attr='some value'/>", EscapeUtils.unescape_html("&lt;some_tag some_attr=&#39;some value&#39;&#47;&gt;")
  end

  def test_ampersand
    assert_equal "<b>Bourbon & Branch</b>", EscapeUtils.unescape_html("&lt;b&gt;Bourbon &amp; Branch&lt;&#47;b&gt;")
  end

  def test_passes_through_incompletely_escaped_tags
    assert_equal "&", EscapeUtils.unescape_html("&")
    assert_equal "&lt", EscapeUtils.unescape_html("&lt")
  end

  # Encoding-aware tests. Guarded by feature detection rather than by matching
  # RUBY_VERSION against /^1.9/, which silently skipped them on Ruby 2.0 and
  # later (and whose unescaped dot matched any character).
  if ''.respond_to?(:force_encoding)
    def test_input_must_be_utf8_or_ascii
      escaped = EscapeUtils.escape_html("<b>Bourbon & Branch</b>")

      escaped.force_encoding 'ISO-8859-1'
      assert_raises Encoding::CompatibilityError do
        EscapeUtils.unescape_html(escaped)
      end

      escaped.force_encoding 'UTF-8'
      begin
        EscapeUtils.unescape_html(escaped)
      rescue Encoding::CompatibilityError => e
        assert_nil e, "#{e.class.name} raised, expected not to"
      end
    end

    def test_return_value_is_tagged_as_utf8
      escaped = EscapeUtils.escape_html("<b>Bourbon & Branch</b>")
      assert_equal Encoding.find('UTF-8'), EscapeUtils.unescape_html(escaped).encoding
    end
  end
end
+ end