escape_utils 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. data/.gitignore +2 -1
  2. data/.travis.yml +13 -0
  3. data/CHANGELOG.md +7 -0
  4. data/MIT-LICENSE +1 -1
  5. data/Rakefile +5 -18
  6. data/benchmark/html_escape.rb +9 -2
  7. data/benchmark/xml_escape.rb +29 -0
  8. data/escape_utils.gemspec +2 -3
  9. data/ext/escape_utils/buffer.c +181 -160
  10. data/ext/escape_utils/buffer.h +90 -68
  11. data/ext/escape_utils/escape_utils.c +77 -39
  12. data/ext/escape_utils/extconf.rb +1 -1
  13. data/ext/escape_utils/houdini.h +37 -8
  14. data/ext/escape_utils/houdini_href_e.c +115 -0
  15. data/ext/escape_utils/houdini_html_e.c +90 -0
  16. data/ext/escape_utils/houdini_html_u.c +122 -0
  17. data/ext/escape_utils/{houdini_js.c → houdini_js_e.c} +17 -75
  18. data/ext/escape_utils/houdini_js_u.c +60 -0
  19. data/ext/escape_utils/{uri_escape.h → houdini_uri_e.c} +68 -2
  20. data/ext/escape_utils/houdini_uri_u.c +65 -0
  21. data/ext/escape_utils/houdini_xml_e.c +136 -0
  22. data/lib/escape_utils/version.rb +1 -1
  23. data/lib/escape_utils/xml/builder.rb +8 -0
  24. data/test/helper.rb +14 -0
  25. data/test/html/escape_test.rb +61 -0
  26. data/test/html/unescape_test.rb +48 -0
  27. data/test/html_safety_test.rb +46 -0
  28. data/test/javascript/escape_test.rb +42 -0
  29. data/test/javascript/unescape_test.rb +46 -0
  30. data/test/query/escape_test.rb +50 -0
  31. data/test/query/unescape_test.rb +52 -0
  32. data/test/uri/escape_test.rb +50 -0
  33. data/test/uri/unescape_test.rb +55 -0
  34. data/test/url/escape_test.rb +58 -0
  35. data/test/url/unescape_test.rb +60 -0
  36. data/test/xml/escape_test.rb +67 -0
  37. metadata +136 -152
  38. data/.rspec +0 -2
  39. data/ext/escape_utils/houdini_html.c +0 -214
  40. data/ext/escape_utils/houdini_uri.c +0 -130
  41. data/spec/html/escape_spec.rb +0 -42
  42. data/spec/html/unescape_spec.rb +0 -37
  43. data/spec/html_safety_spec.rb +0 -48
  44. data/spec/javascript/escape_spec.rb +0 -34
  45. data/spec/javascript/unescape_spec.rb +0 -37
  46. data/spec/query/escape_spec.rb +0 -44
  47. data/spec/query/unescape_spec.rb +0 -46
  48. data/spec/rcov.opts +0 -3
  49. data/spec/spec_helper.rb +0 -5
  50. data/spec/uri/escape_spec.rb +0 -43
  51. data/spec/uri/unescape_spec.rb +0 -57
  52. data/spec/url/escape_spec.rb +0 -52
  53. data/spec/url/unescape_spec.rb +0 -57
@@ -0,0 +1,60 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+
7
+ int
8
+ houdini_unescape_js(gh_buf *ob, const uint8_t *src, size_t size)
9
+ {
10
+ size_t i = 0, org, ch;
11
+
12
+ while (i < size) {
13
+ org = i;
14
+ while (i < size && src[i] != '\\')
15
+ i++;
16
+
17
+ if (likely(i > org)) {
18
+ if (unlikely(org == 0)) {
19
+ if (i >= size)
20
+ return 0;
21
+
22
+ gh_buf_grow(ob, HOUDINI_UNESCAPED_SIZE(size));
23
+ }
24
+
25
+ gh_buf_put(ob, src + org, i - org);
26
+ }
27
+
28
+ /* escaping */
29
+ if (i == size)
30
+ break;
31
+
32
+ if (++i == size) {
33
+ gh_buf_putc(ob, '\\');
34
+ break;
35
+ }
36
+
37
+ ch = src[i];
38
+
39
+ switch (ch) {
40
+ case 'n':
41
+ ch = '\n';
42
+ /* pass through */
43
+
44
+ case '\\':
45
+ case '\'':
46
+ case '\"':
47
+ case '/':
48
+ gh_buf_putc(ob, ch);
49
+ i++;
50
+ break;
51
+
52
+ default:
53
+ gh_buf_putc(ob, '\\');
54
+ break;
55
+ }
56
+ }
57
+
58
+ return 1;
59
+ }
60
+
@@ -1,3 +1,9 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+
1
7
  static const char URL_SAFE[] = {
2
8
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3
9
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -14,7 +20,8 @@ static const char URL_SAFE[] = {
14
20
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
15
21
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16
22
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
17
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, };
23
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
24
+ };
18
25
 
19
26
  static const char URI_SAFE[] = {
20
27
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -32,4 +39,63 @@ static const char URI_SAFE[] = {
32
39
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
33
40
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
34
41
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
35
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, };
42
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
43
+ };
44
+
45
+ static int
46
+ escape(gh_buf *ob, const uint8_t *src, size_t size, int is_url)
47
+ {
48
+ static const uint8_t hex_chars[] = "0123456789ABCDEF";
49
+ const char *safe_table = is_url ? URL_SAFE : URI_SAFE;
50
+
51
+ size_t i = 0, org;
52
+ uint8_t hex_str[3];
53
+
54
+ hex_str[0] = '%';
55
+
56
+ while (i < size) {
57
+ org = i;
58
+ while (i < size && safe_table[src[i]] != 0)
59
+ i++;
60
+
61
+ if (likely(i > org)) {
62
+ if (unlikely(org == 0)) {
63
+ if (i >= size)
64
+ return 0;
65
+
66
+ gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));
67
+ }
68
+
69
+ gh_buf_put(ob, src + org, i - org);
70
+ }
71
+
72
+ /* escaping */
73
+ if (i >= size)
74
+ break;
75
+
76
+ if (src[i] == ' ' && is_url) {
77
+ gh_buf_putc(ob, '+');
78
+ } else {
79
+ hex_str[1] = hex_chars[(src[i] >> 4) & 0xF];
80
+ hex_str[2] = hex_chars[src[i] & 0xF];
81
+ gh_buf_put(ob, hex_str, 3);
82
+ }
83
+
84
+ i++;
85
+ }
86
+
87
+ return 1;
88
+ }
89
+
90
+ int
91
+ houdini_escape_uri(gh_buf *ob, const uint8_t *src, size_t size)
92
+ {
93
+ return escape(ob, src, size, 0);
94
+ }
95
+
96
+ int
97
+ houdini_escape_url(gh_buf *ob, const uint8_t *src, size_t size)
98
+ {
99
+ return escape(ob, src, size, 1);
100
+ }
101
+
@@ -0,0 +1,65 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+
7
/*
 * Convert one ASCII hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
 * value 0-15. OR-ing with 32 folds upper-case letters onto lower case (the
 * bit is already set in the digits); `% 39 - 9` then maps '0'..'9' to 0..9
 * and 'a'..'f' to 10..15. The result is meaningless for any other input, so
 * callers must check that the character is a hex digit first.
 *
 * The argument is parenthesized so that expressions with operators of lower
 * precedence than `|` (e.g. a conditional expression) expand correctly.
 */
#define hex2c(c) ((((c) | 32) % 39) - 9)
8
+
9
+ static int
10
+ unescape(gh_buf *ob, const uint8_t *src, size_t size, int is_url)
11
+ {
12
+ size_t i = 0, org;
13
+
14
+ while (i < size) {
15
+ org = i;
16
+ while (i < size && src[i] != '%')
17
+ i++;
18
+
19
+ if (likely(i > org)) {
20
+ if (unlikely(org == 0)) {
21
+ if (i >= size && !is_url)
22
+ return 0;
23
+
24
+ gh_buf_grow(ob, HOUDINI_UNESCAPED_SIZE(size));
25
+ }
26
+
27
+ gh_buf_put(ob, src + org, i - org);
28
+ }
29
+
30
+ /* escaping */
31
+ if (i >= size)
32
+ break;
33
+
34
+ i++;
35
+
36
+ if (i + 1 < size && _isxdigit(src[i]) && _isxdigit(src[i + 1])) {
37
+ unsigned char new_char = (hex2c(src[i]) << 4) + hex2c(src[i + 1]);
38
+ gh_buf_putc(ob, new_char);
39
+ i += 2;
40
+ } else {
41
+ gh_buf_putc(ob, '%');
42
+ }
43
+ }
44
+
45
+ if (is_url) {
46
+ char *find = (char *)gh_buf_cstr(ob);
47
+ while ((find = strchr(find, '+')) != NULL)
48
+ *find = ' ';
49
+ }
50
+
51
+ return 1;
52
+ }
53
+
54
+ int
55
+ houdini_unescape_uri(gh_buf *ob, const uint8_t *src, size_t size)
56
+ {
57
+ return unescape(ob, src, size, 0);
58
+ }
59
+
60
+ int
61
+ houdini_unescape_url(gh_buf *ob, const uint8_t *src, size_t size)
62
+ {
63
+ return unescape(ob, src, size, 1);
64
+ }
65
+
@@ -0,0 +1,136 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+
7
/**
 * Replacement strings, indexed by the codes stored in XML_LOOKUP_TABLE.
 *
 * & --> &amp;
 * < --> &lt;
 * > --> &gt;
 * " --> &quot;
 * ' --> &apos;
 *
 * Both the strings and the pointer array itself are read-only.
 */
static const char *const LOOKUP_CODES[] = {
	"", /* reserved: use literal single character */
	"", /* unused */
	"", /* reserved: 2 character UTF-8 */
	"", /* reserved: 3 character UTF-8 */
	"", /* reserved: 4 character UTF-8 */
	"?", /* invalid UTF-8 character */
	"&quot;",
	"&amp;",
	"&apos;",
	"&lt;",
	"&gt;"
};

/* Index of the "?" replacement; codes at or above it select a LOOKUP_CODES string. */
static const char CODE_INVALID = 5;

/*
 * Per-byte classification, indexed by the byte value (0-255):
 *   0      copy the byte literally
 *   2..4   lead byte of a 2/3/4-byte UTF-8 sequence
 *   5      invalid here (CODE_INVALID), replaced by "?"
 *   6..10  replaced by the matching LOOKUP_CODES entity
 */
static const char XML_LOOKUP_TABLE[] = {
	/* ASCII: 0xxxxxxx */
	5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 0, 5, 5,
	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
	0, 0, 6, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0,10, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* Invalid UTF-8 char start: 10xxxxxx */
	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,

	/* Multibyte UTF-8 */

	/* 2 bytes: 110xxxxx */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	/* 3 bytes: 1110xxxx */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

	/* 4 bytes: 11110xxx */
	4, 4, 4, 4, 4, 4, 4, 4,

	/* Invalid UTF-8: 11111xxx */
	5, 5, 5, 5, 5, 5, 5, 5,
};
62
+
63
+ int
64
+ houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size)
65
+ {
66
+ size_t i = 0;
67
+ unsigned char code = 0;
68
+
69
+ gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));
70
+
71
+ while (i < size) {
72
+ size_t start, end;
73
+
74
+ start = end = i;
75
+
76
+ while (i < size) {
77
+ unsigned int byte;
78
+
79
+ byte = src[i++];
80
+ code = XML_LOOKUP_TABLE[byte];
81
+
82
+ if (!code) {
83
+ /* single character used literally */
84
+ } else if (code >= CODE_INVALID) {
85
+ break; /* insert lookup code string */
86
+ } else if (code > size - end) {
87
+ code = CODE_INVALID; /* truncated UTF-8 character */
88
+ break;
89
+ } else {
90
+ unsigned int chr = byte & (0xff >> code);
91
+
92
+ while (--code) {
93
+ byte = src[i++];
94
+ if ((byte & 0xc0) != 0x80) {
95
+ code = CODE_INVALID;
96
+ break;
97
+ }
98
+ chr = (chr << 6) + (byte & 0x3f);
99
+ }
100
+
101
+ switch (i - end) {
102
+ case 2:
103
+ if (chr < 0x80)
104
+ code = CODE_INVALID;
105
+ break;
106
+ case 3:
107
+ if (chr < 0x800 ||
108
+ (chr > 0xd7ff && chr < 0xe000) ||
109
+ chr > 0xfffd)
110
+ code = CODE_INVALID;
111
+ break;
112
+ case 4:
113
+ if (chr < 0x10000 || chr > 0x10ffff)
114
+ code = CODE_INVALID;
115
+ break;
116
+ default:
117
+ break;
118
+ }
119
+ if (code == CODE_INVALID)
120
+ break;
121
+ }
122
+ end = i;
123
+ }
124
+
125
+ if (end > start)
126
+ gh_buf_put(ob, src + start, end - start);
127
+
128
+ /* escaping */
129
+ if (end >= size)
130
+ break;
131
+
132
+ gh_buf_puts(ob, LOOKUP_CODES[code]);
133
+ }
134
+
135
+ return 1;
136
+ }
@@ -1,3 +1,3 @@
1
1
  module EscapeUtils
2
- VERSION = "0.2.4"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -0,0 +1,8 @@
1
module Builder
  # Defines a private +_escape+ on Builder::XmlBase that routes text through
  # EscapeUtils.escape_xml.
  class XmlBase < BlankSlate
    private

    # Returns +text+, converted with +to_s+, escaped by EscapeUtils.escape_xml.
    def _escape(text)
      string = text.to_s
      EscapeUtils.escape_xml(string)
    end
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,14 @@
1
# Basic test environment.

# load rubygems unless it has already been loaded
require 'rubygems' unless defined?(Gem)
require 'bundler/setup'

require 'escape_utils'

# bring in minitest
require 'minitest/autorun'

# put lib and test dirs directly on load path
$LOAD_PATH.unshift(File.expand_path('../../lib', __FILE__))
$LOAD_PATH.unshift(File.expand_path('..', __FILE__))
@@ -0,0 +1,61 @@
1
+ require File.expand_path("../../helper", __FILE__)
2
+
3
# Tests for EscapeUtils.escape_html.
class HtmlEscapeTest < MiniTest::Unit::TestCase
  def test_escape_basic_html_with_secure
    assert_equal "&lt;some_tag&#47;&gt;", EscapeUtils.escape_html("<some_tag/>")

    secure_before = EscapeUtils.html_secure
    begin
      EscapeUtils.html_secure = true
      assert_equal "&lt;some_tag&#47;&gt;", EscapeUtils.escape_html("<some_tag/>")
    ensure
      # html_secure is global state: restore it even when the assertion
      # fails so that other tests are not affected
      EscapeUtils.html_secure = secure_before
    end
  end

  def test_escape_basic_html_without_secure
    assert_equal "&lt;some_tag/&gt;", EscapeUtils.escape_html("<some_tag/>", false)

    secure_before = EscapeUtils.html_secure
    begin
      EscapeUtils.html_secure = false
      assert_equal "&lt;some_tag/&gt;", EscapeUtils.escape_html("<some_tag/>")
    ensure
      # see test_escape_basic_html_with_secure
      EscapeUtils.html_secure = secure_before
    end
  end

  def test_escape_double_quotes
    assert_equal "&lt;some_tag some_attr=&quot;some value&quot;&#47;&gt;", EscapeUtils.escape_html("<some_tag some_attr=\"some value\"/>")
  end

  def test_escape_single_quotes
    assert_equal "&lt;some_tag some_attr=&#39;some value&#39;&#47;&gt;", EscapeUtils.escape_html("<some_tag some_attr='some value'/>")
  end

  def test_escape_ampersand
    assert_equal "&lt;b&gt;Bourbon &amp; Branch&lt;&#47;b&gt;", EscapeUtils.escape_html("<b>Bourbon & Branch</b>")
  end

  def test_returns_original_if_not_escaped
    str = 'foobar'
    assert_equal str.object_id, EscapeUtils.escape_html(str).object_id
  end

  # Encoding-aware tests: run on every Ruby that has the Encoding class,
  # not only on the 1.9 series.
  if defined?(Encoding)
    def test_utf8_or_ascii_input_only
      str = "<b>Bourbon & Branch</b>"

      str.force_encoding 'ISO-8859-1'
      assert_raises Encoding::CompatibilityError do
        EscapeUtils.escape_html(str)
      end

      str.force_encoding 'UTF-8'
      begin
        EscapeUtils.escape_html(str)
      rescue Encoding::CompatibilityError => e
        assert_nil e, "#{e.class.name} raised, expected not to"
      end
    end

    def test_return_value_is_tagged_as_utf8
      str = "<b>Bourbon & Branch</b>".encode('utf-8')
      assert_equal Encoding.find('UTF-8'), EscapeUtils.escape_html(str).encoding
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # encoding: utf-8
2
+ require File.expand_path("../../helper", __FILE__)
3
+
4
# Tests for EscapeUtils.unescape_html.
class HtmlUnescapeTest < MiniTest::Unit::TestCase
  def test_basic_html
    assert_equal "<some_tag/>", EscapeUtils.unescape_html("&lt;some_tag&#47;&gt;")
  end

  def test_double_quotes
    assert_equal "<some_tag some_attr=\"some value\"/>", EscapeUtils.unescape_html("&lt;some_tag some_attr=&quot;some value&quot;&#47;&gt;")
  end

  def test_single_quotes
    assert_equal "<some_tag some_attr='some value'/>", EscapeUtils.unescape_html("&lt;some_tag some_attr=&#39;some value&#39;&#47;&gt;")
  end

  def test_ampersand
    assert_equal "<b>Bourbon & Branch</b>", EscapeUtils.unescape_html("&lt;b&gt;Bourbon &amp; Branch&lt;&#47;b&gt;")
  end

  def test_passes_through_incompletely_escaped_tags
    assert_equal "&", EscapeUtils.unescape_html("&")
    assert_equal "&lt", EscapeUtils.unescape_html("&lt")
  end

  # Encoding-aware tests: run on every Ruby that has the Encoding class,
  # not only on the 1.9 series.
  if defined?(Encoding)
    def test_input_must_be_utf8_or_ascii
      escaped = EscapeUtils.escape_html("<b>Bourbon & Branch</b>")

      escaped.force_encoding 'ISO-8859-1'
      assert_raises Encoding::CompatibilityError do
        EscapeUtils.unescape_html(escaped)
      end

      escaped.force_encoding 'UTF-8'
      begin
        EscapeUtils.unescape_html(escaped)
      rescue Encoding::CompatibilityError => e
        assert_nil e, "#{e.class.name} raised, expected not to"
      end
    end

    def test_return_value_is_tagged_as_utf8
      escaped = EscapeUtils.escape_html("<b>Bourbon & Branch</b>")
      assert_equal Encoding.find('UTF-8'), EscapeUtils.unescape_html(escaped).encoding
    end
  end
end