escape_utils 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. data/.gitignore +2 -1
  2. data/.travis.yml +13 -0
  3. data/CHANGELOG.md +7 -0
  4. data/MIT-LICENSE +1 -1
  5. data/Rakefile +5 -18
  6. data/benchmark/html_escape.rb +9 -2
  7. data/benchmark/xml_escape.rb +29 -0
  8. data/escape_utils.gemspec +2 -3
  9. data/ext/escape_utils/buffer.c +181 -160
  10. data/ext/escape_utils/buffer.h +90 -68
  11. data/ext/escape_utils/escape_utils.c +77 -39
  12. data/ext/escape_utils/extconf.rb +1 -1
  13. data/ext/escape_utils/houdini.h +37 -8
  14. data/ext/escape_utils/houdini_href_e.c +115 -0
  15. data/ext/escape_utils/houdini_html_e.c +90 -0
  16. data/ext/escape_utils/houdini_html_u.c +122 -0
  17. data/ext/escape_utils/{houdini_js.c → houdini_js_e.c} +17 -75
  18. data/ext/escape_utils/houdini_js_u.c +60 -0
  19. data/ext/escape_utils/{uri_escape.h → houdini_uri_e.c} +68 -2
  20. data/ext/escape_utils/houdini_uri_u.c +65 -0
  21. data/ext/escape_utils/houdini_xml_e.c +136 -0
  22. data/lib/escape_utils/version.rb +1 -1
  23. data/lib/escape_utils/xml/builder.rb +8 -0
  24. data/test/helper.rb +14 -0
  25. data/test/html/escape_test.rb +61 -0
  26. data/test/html/unescape_test.rb +48 -0
  27. data/test/html_safety_test.rb +46 -0
  28. data/test/javascript/escape_test.rb +42 -0
  29. data/test/javascript/unescape_test.rb +46 -0
  30. data/test/query/escape_test.rb +50 -0
  31. data/test/query/unescape_test.rb +52 -0
  32. data/test/uri/escape_test.rb +50 -0
  33. data/test/uri/unescape_test.rb +55 -0
  34. data/test/url/escape_test.rb +58 -0
  35. data/test/url/unescape_test.rb +60 -0
  36. data/test/xml/escape_test.rb +67 -0
  37. metadata +136 -152
  38. data/.rspec +0 -2
  39. data/ext/escape_utils/houdini_html.c +0 -214
  40. data/ext/escape_utils/houdini_uri.c +0 -130
  41. data/spec/html/escape_spec.rb +0 -42
  42. data/spec/html/unescape_spec.rb +0 -37
  43. data/spec/html_safety_spec.rb +0 -48
  44. data/spec/javascript/escape_spec.rb +0 -34
  45. data/spec/javascript/unescape_spec.rb +0 -37
  46. data/spec/query/escape_spec.rb +0 -44
  47. data/spec/query/unescape_spec.rb +0 -46
  48. data/spec/rcov.opts +0 -3
  49. data/spec/spec_helper.rb +0 -5
  50. data/spec/uri/escape_spec.rb +0 -43
  51. data/spec/uri/unescape_spec.rb +0 -57
  52. data/spec/url/escape_spec.rb +0 -52
  53. data/spec/url/unescape_spec.rb +0 -57
@@ -0,0 +1,115 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+
7
+ /*
8
+ * The following characters will not be escaped:
9
+ *
10
+ * -_.+!*'(),%#@?=;:/,+&$ alphanum
11
+ *
12
+ * Note that this character set is the union of:
13
+ *
14
+ * - The characters which are safe to be in an URL
15
+ * - The characters which are *not* safe to be in
16
+ * an URL because they are RESERVED characters.
17
+ *
18
+ * We assume (lazily) that any RESERVED char that
19
+ * appears inside an URL is actually meant to
20
+ * have its native function (i.e. as an URL
21
+ * component/separator) and hence needs no escaping.
22
+ *
23
+ * There are two exceptions: the characters & (amp)
24
+ * and ' (single quote) do not appear in the table.
25
+ * They are meant to appear in the URL as components,
26
+ * yet they require special HTML-entity escaping
27
+ * to generate valid HTML markup.
28
+ *
29
+ * All other characters will be escaped to %XX.
30
+ *
31
+ */
32
+ static const char HREF_SAFE[] = {
33
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
34
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
35
+ 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
36
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
37
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
38
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
39
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
40
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
41
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
42
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
43
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
44
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
45
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
46
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
47
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
48
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
49
+ };
50
+
51
+ int
52
+ houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size)
53
+ {
54
+ static const uint8_t hex_chars[] = "0123456789ABCDEF";
55
+ size_t i = 0, org;
56
+ uint8_t hex_str[3];
57
+
58
+ hex_str[0] = '%';
59
+
60
+ while (i < size) {
61
+ org = i;
62
+ while (i < size && HREF_SAFE[src[i]] != 0)
63
+ i++;
64
+
65
+ if (likely(i > org)) {
66
+ if (unlikely(org == 0)) {
67
+ if (i >= size)
68
+ return 0;
69
+
70
+ gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));
71
+ }
72
+
73
+ gh_buf_put(ob, src + org, i - org);
74
+ }
75
+
76
+ /* escaping */
77
+ if (i >= size)
78
+ break;
79
+
80
+ switch (src[i]) {
81
+ /* amp appears all the time in URLs, but needs
82
+ * HTML-entity escaping to be inside an href */
83
+ case '&':
84
+ gh_buf_PUTS(ob, "&amp;");
85
+ break;
86
+
87
+ /* the single quote is a valid URL character
88
+ * according to the standard; it needs HTML
89
+ * entity escaping too */
90
+ case '\'':
91
+ gh_buf_PUTS(ob, "&#x27;");
92
+ break;
93
+
94
+ /* the space can be escaped to %20 or a plus
95
+ * sign. we're going with the generic escape
96
+ * for now. the plus thing is more commonly seen
97
+ * when building GET strings */
98
+ #if 0
99
+ case ' ':
100
+ gh_buf_putc(ob, '+');
101
+ break;
102
+ #endif
103
+
104
+ /* every other character goes with a %XX escaping */
105
+ default:
106
+ hex_str[1] = hex_chars[(src[i] >> 4) & 0xF];
107
+ hex_str[2] = hex_chars[src[i] & 0xF];
108
+ gh_buf_put(ob, hex_str, 3);
109
+ }
110
+
111
+ i++;
112
+ }
113
+
114
+ return 1;
115
+ }
@@ -0,0 +1,90 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+
7
+ /**
8
+ * According to the OWASP rules:
9
+ *
10
+ * & --> &amp;
11
+ * < --> &lt;
12
+ * > --> &gt;
13
+ * " --> &quot;
14
+ * ' --> &#39; &apos; is not recommended
15
+ * / --> &#47; forward slash is included as it helps end an HTML entity
16
+ *
17
+ */
18
+ static const char HTML_ESCAPE_TABLE[] = {
19
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
20
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
21
+ 0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4,
22
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0,
23
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
24
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
25
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
26
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
27
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
28
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
29
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
30
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
31
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
33
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
34
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
35
+ };
36
+
37
+ static const char *HTML_ESCAPES[] = {
38
+ "",
39
+ "&quot;",
40
+ "&amp;",
41
+ "&#39;",
42
+ "&#47;",
43
+ "&lt;",
44
+ "&gt;"
45
+ };
46
+
47
+ int
48
+ houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure)
49
+ {
50
+ size_t i = 0, org, esc = 0;
51
+
52
+ while (i < size) {
53
+ org = i;
54
+ while (i < size && (esc = HTML_ESCAPE_TABLE[src[i]]) == 0)
55
+ i++;
56
+
57
+ if (i > org) {
58
+ if (unlikely(org == 0)) {
59
+ if (i >= size)
60
+ return 0;
61
+
62
+ gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));
63
+ }
64
+
65
+ gh_buf_put(ob, src + org, i - org);
66
+ }
67
+
68
+ /* escaping */
69
+ if (unlikely(i >= size))
70
+ break;
71
+
72
+ /* The forward slash is only escaped in secure mode */
73
+ if (src[i] == '/' && !secure) {
74
+ gh_buf_putc(ob, '/');
75
+ } else {
76
+ gh_buf_puts(ob, HTML_ESCAPES[esc]);
77
+ }
78
+
79
+ i++;
80
+ }
81
+
82
+ return 1;
83
+ }
84
+
85
+ int
86
+ houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size)
87
+ {
88
+ return houdini_escape_html0(ob, src, size, 1);
89
+ }
90
+
@@ -0,0 +1,122 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+ #include "html_unescape.h"
7
+
8
+ static inline void
9
+ gh_buf_put_utf8(gh_buf *ob, int c)
10
+ {
11
+ unsigned char unichar[4];
12
+
13
+ if (c < 0x80) {
14
+ gh_buf_putc(ob, c);
15
+ }
16
+ else if (c < 0x800) {
17
+ unichar[0] = 192 + (c / 64);
18
+ unichar[1] = 128 + (c % 64);
19
+ gh_buf_put(ob, unichar, 2);
20
+ }
21
+ else if (c - 0xd800u < 0x800) {
22
+ gh_buf_putc(ob, '?');
23
+ }
24
+ else if (c < 0x10000) {
25
+ unichar[0] = 224 + (c / 4096);
26
+ unichar[1] = 128 + (c / 64) % 64;
27
+ unichar[2] = 128 + (c % 64);
28
+ gh_buf_put(ob, unichar, 3);
29
+ }
30
+ else if (c < 0x110000) {
31
+ unichar[0] = 240 + (c / 262144);
32
+ unichar[1] = 128 + (c / 4096) % 64;
33
+ unichar[2] = 128 + (c / 64) % 64;
34
+ unichar[3] = 128 + (c % 64);
35
+ gh_buf_put(ob, unichar, 4);
36
+ }
37
+ else {
38
+ gh_buf_putc(ob, '?');
39
+ }
40
+ }
41
+
42
+ static size_t
43
+ unescape_ent(gh_buf *ob, const uint8_t *src, size_t size)
44
+ {
45
+ size_t i = 0;
46
+
47
+ if (size > 3 && src[0] == '#') {
48
+ int codepoint = 0;
49
+
50
+ if (_isdigit(src[1])) {
51
+ for (i = 1; i < size && _isdigit(src[i]); ++i)
52
+ codepoint = (codepoint * 10) + (src[i] - '0');
53
+ }
54
+
55
+ else if (src[1] == 'x' || src[1] == 'X') {
56
+ for (i = 2; i < size && _isxdigit(src[i]); ++i)
57
+ codepoint = (codepoint * 16) + ((src[i] | 32) % 39 - 9);
58
+ }
59
+
60
+ if (i < size && src[i] == ';') {
61
+ gh_buf_put_utf8(ob, codepoint);
62
+ return i + 1;
63
+ }
64
+ }
65
+
66
+ else {
67
+ if (size > MAX_WORD_LENGTH)
68
+ size = MAX_WORD_LENGTH;
69
+
70
+ for (i = MIN_WORD_LENGTH; i < size; ++i) {
71
+ if (src[i] == ' ')
72
+ break;
73
+
74
+ if (src[i] == ';') {
75
+ const struct html_ent *entity = find_entity((char *)src, i);
76
+
77
+ if (entity != NULL) {
78
+ gh_buf_put(ob, entity->utf8, entity->utf8_len);
79
+ return i + 1;
80
+ }
81
+
82
+ break;
83
+ }
84
+ }
85
+ }
86
+
87
+ gh_buf_putc(ob, '&');
88
+ return 0;
89
+ }
90
+
91
+ int
92
+ houdini_unescape_html(gh_buf *ob, const uint8_t *src, size_t size)
93
+ {
94
+ size_t i = 0, org;
95
+
96
+ while (i < size) {
97
+ org = i;
98
+ while (i < size && src[i] != '&')
99
+ i++;
100
+
101
+ if (likely(i > org)) {
102
+ if (unlikely(org == 0)) {
103
+ if (i >= size)
104
+ return 0;
105
+
106
+ gh_buf_grow(ob, HOUDINI_UNESCAPED_SIZE(size));
107
+ }
108
+
109
+ gh_buf_put(ob, src + org, i - org);
110
+ }
111
+
112
+ /* escaping */
113
+ if (i >= size)
114
+ break;
115
+
116
+ i++;
117
+ i += unescape_ent(ob, src + i, size - i);
118
+ }
119
+
120
+ return 1;
121
+ }
122
+
@@ -4,55 +4,6 @@
4
4
 
5
5
  #include "houdini.h"
6
6
 
7
- #define ESCAPE_GROW_FACTOR(x) (((x) * 12) / 10)
8
- #define UNESCAPE_GROW_FACTOR(x) (x)
9
-
10
- void
11
- houdini_unescape_js(struct buf *ob, const uint8_t *src, size_t size)
12
- {
13
- size_t i = 0, org, ch;
14
-
15
- bufgrow(ob, UNESCAPE_GROW_FACTOR(size));
16
-
17
- while (i < size) {
18
- org = i;
19
- while (i < size && src[i] != '\\')
20
- i++;
21
-
22
- if (i > org)
23
- bufput(ob, src + org, i - org);
24
-
25
- /* escaping */
26
- if (i == size)
27
- break;
28
-
29
- if (++i == size) {
30
- bufputc(ob, '\\');
31
- break;
32
- }
33
-
34
- ch = src[i];
35
-
36
- switch (ch) {
37
- case 'n':
38
- ch = '\n';
39
- /* pass through */
40
-
41
- case '\\':
42
- case '\'':
43
- case '\"':
44
- case '/':
45
- bufputc(ob, ch);
46
- i++;
47
- break;
48
-
49
- default:
50
- bufputc(ob, '\\');
51
- break;
52
- }
53
- }
54
- }
55
-
56
7
  static const char JS_ESCAPE[] = {
57
8
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
58
9
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -72,20 +23,26 @@ static const char JS_ESCAPE[] = {
72
23
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73
24
  };
74
25
 
75
- void
76
- houdini_escape_js(struct buf *ob, const uint8_t *src, size_t size)
26
+ int
27
+ houdini_escape_js(gh_buf *ob, const uint8_t *src, size_t size)
77
28
  {
78
29
  size_t i = 0, org, ch;
79
30
 
80
- bufgrow(ob, ESCAPE_GROW_FACTOR(size));
81
-
82
31
  while (i < size) {
83
32
  org = i;
84
33
  while (i < size && JS_ESCAPE[src[i]] == 0)
85
34
  i++;
86
35
 
87
- if (i > org)
88
- bufput(ob, src + org, i - org);
36
+ if (likely(i > org)) {
37
+ if (unlikely(org == 0)) {
38
+ if (i >= size)
39
+ return 0;
40
+
41
+ gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));
42
+ }
43
+
44
+ gh_buf_put(ob, src + org, i - org);
45
+ }
89
46
 
90
47
  /* escaping */
91
48
  if (i >= size)
@@ -99,9 +56,9 @@ houdini_escape_js(struct buf *ob, const uint8_t *src, size_t size)
99
56
  * Escape only if preceded by a lt
100
57
  */
101
58
  if (i && src[i - 1] == '<')
102
- bufputc(ob, '\\');
59
+ gh_buf_putc(ob, '\\');
103
60
 
104
- bufputc(ob, ch);
61
+ gh_buf_putc(ob, ch);
105
62
  break;
106
63
 
107
64
  case '\r':
@@ -120,29 +77,14 @@ houdini_escape_js(struct buf *ob, const uint8_t *src, size_t size)
120
77
  /*
121
78
  * Normal escaping
122
79
  */
123
- bufputc(ob, '\\');
124
- bufputc(ob, ch);
80
+ gh_buf_putc(ob, '\\');
81
+ gh_buf_putc(ob, ch);
125
82
  break;
126
83
  }
127
84
 
128
85
  i++;
129
86
  }
130
- }
131
-
132
87
 
133
- //#define TEST
134
- #ifdef TEST
135
-
136
- int main()
137
- {
138
- const char TEST_STRING[] = "http% this \200 is a test";
139
- struct buf *buffer;
140
-
141
- buffer = bufnew(128);
142
- houdini_escape_uri(buffer, TEST_STRING, strlen(TEST_STRING));
143
- printf("Result: %.*s\n", (int)buffer->size, buffer->data);
144
- bufrelease(buffer);
145
- return 0;
88
+ return 1;
146
89
  }
147
- #endif
148
90