escape_utils 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/.gitignore +2 -1
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +206 -0
  4. data/benchmark/html_escape.rb +1 -0
  5. data/benchmark/html_unescape.rb +1 -0
  6. data/benchmark/javascript_escape.rb +1 -0
  7. data/benchmark/javascript_unescape.rb +1 -0
  8. data/benchmark/url_escape.rb +1 -0
  9. data/benchmark/url_unescape.rb +1 -0
  10. data/escape_utils.gemspec +0 -3
  11. data/ext/escape_utils/buffer.c +228 -0
  12. data/ext/escape_utils/buffer.h +91 -0
  13. data/ext/escape_utils/escape_utils.c +111 -531
  14. data/ext/escape_utils/houdini.h +15 -0
  15. data/ext/escape_utils/houdini_html.c +214 -0
  16. data/ext/escape_utils/houdini_js.c +148 -0
  17. data/ext/escape_utils/houdini_uri.c +130 -0
  18. data/ext/escape_utils/html_unescape.h +754 -0
  19. data/ext/escape_utils/uri_escape.h +35 -0
  20. data/lib/escape_utils.rb +2 -2
  21. data/lib/escape_utils/html/cgi.rb +0 -2
  22. data/lib/escape_utils/html/erb.rb +0 -2
  23. data/lib/escape_utils/html/haml.rb +0 -2
  24. data/lib/escape_utils/html/rack.rb +0 -2
  25. data/lib/escape_utils/html_safety.rb +0 -2
  26. data/lib/escape_utils/javascript/action_view.rb +0 -2
  27. data/lib/escape_utils/url/cgi.rb +0 -2
  28. data/lib/escape_utils/url/erb.rb +0 -2
  29. data/lib/escape_utils/url/rack.rb +0 -2
  30. data/lib/escape_utils/url/uri.rb +0 -2
  31. data/lib/escape_utils/version.rb +1 -1
  32. data/spec/html/escape_spec.rb +0 -1
  33. data/spec/html/unescape_spec.rb +0 -1
  34. data/spec/html_safety_spec.rb +0 -1
  35. data/spec/javascript/escape_spec.rb +0 -1
  36. data/spec/javascript/unescape_spec.rb +0 -1
  37. data/spec/query/escape_spec.rb +0 -1
  38. data/spec/query/unescape_spec.rb +1 -0
  39. data/spec/spec_helper.rb +0 -1
  40. data/spec/uri/escape_spec.rb +0 -1
  41. data/spec/uri/unescape_spec.rb +1 -0
  42. data/spec/url/escape_spec.rb +0 -1
  43. data/spec/url/unescape_spec.rb +1 -0
  44. metadata +16 -8
  45. data/README.rdoc +0 -146
@@ -0,0 +1,15 @@
1
+ #ifndef __HOUDINI_H__
2
+ #define __HOUDINI_H__
3
+
4
+ #include "buffer.h"
5
+
6
+ extern void houdini_escape_html(struct buf *ob, const uint8_t *src, size_t size, int secure);
7
+ extern void houdini_unescape_html(struct buf *ob, const uint8_t *src, size_t size);
8
+ extern void houdini_escape_uri(struct buf *ob, const uint8_t *src, size_t size);
9
+ extern void houdini_escape_url(struct buf *ob, const uint8_t *src, size_t size);
10
+ extern void houdini_unescape_uri(struct buf *ob, const uint8_t *src, size_t size);
11
+ extern void houdini_unescape_url(struct buf *ob, const uint8_t *src, size_t size);
12
+ extern void houdini_escape_js(struct buf *ob, const uint8_t *src, size_t size);
13
+ extern void houdini_unescape_js(struct buf *ob, const uint8_t *src, size_t size);
14
+
15
+ #endif
@@ -0,0 +1,214 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+ #include "html_unescape.h"
7
+
8
+ #define ESCAPE_GROW_FACTOR(x) (((x) * 12) / 10) /* this is very scientific, yes */
9
+ #define UNESCAPE_GROW_FACTOR(x) (x) /* unescaping shouldn't grow our buffer */
10
+
11
+ /* Helper _isdigit methods -- do not trust the current locale */
12
/*
 * Locale-independent test for an ASCII hexadecimal digit.
 *
 * Returns 1 when `c` is one of 0-9, A-F or a-f, and 0 otherwise.
 *
 * Explicit range checks are used instead of strchr() over a digit string:
 * strchr() treats the terminating NUL as part of the string, so a lookup
 * for c == 0 would succeed and a NUL byte would be reported as a hex digit.
 * strchr() also converts its argument to char, so values outside the char
 * range could alias onto real digits.
 */
int _isxdigit(int c)
{
	return (c >= '0' && c <= '9') ||
	       (c >= 'A' && c <= 'F') ||
	       (c >= 'a' && c <= 'f');
}
16
+
17
/*
 * Locale-independent test for an ASCII decimal digit.
 * Returns 1 for '0' through '9' and 0 for every other value.
 */
int _isdigit(int c)
{
	if (c < '0')
		return 0;

	if (c > '9')
		return 0;

	return 1;
}
21
+
22
+
23
/**
 * Per-byte escape classification used by houdini_escape_html().
 *
 * The table has one entry for each of the 256 possible byte values.  A zero
 * entry means the byte needs no escaping; a non-zero entry is the index of
 * the replacement string in HTML_ESCAPES.  The escaped characters are the
 * ones named by the OWASP rules:
 *
 *   "  -> 1
 *   &  -> 2
 *   '  -> 3   (numeric reference; &apos; is not recommended)
 *   /  -> 4   (forward slash is included as it helps end an HTML entity)
 *   <  -> 5
 *   >  -> 6
 *
 * Every byte not listed below is implicitly zero.
 */
static const char HTML_ESCAPE_TABLE[256] = {
	['"']  = 1,
	['&']  = 2,
	['\''] = 3,
	['/']  = 4,
	['<']  = 5,
	['>']  = 6,
};
52
+
53
/*
 * Replacement strings, indexed by the non-zero values stored in
 * HTML_ESCAPE_TABLE.  Slot 0 is an empty placeholder so that the table
 * values can be used as indices directly.
 */
static const char *HTML_ESCAPES[] = {
	[0] = "",
	[1] = "&quot;",	/* "  */
	[2] = "&amp;",	/* &  */
	[3] = "&#39;",	/* '  */
	[4] = "&#47;",	/* /  */
	[5] = "&lt;",	/* <  */
	[6] = "&gt;"	/* >  */
};
62
+
63
+ void
64
+ houdini_escape_html(struct buf *ob, const uint8_t *src, size_t size, int secure)
65
+ {
66
+ size_t i = 0, org, esc;
67
+
68
+ bufgrow(ob, ESCAPE_GROW_FACTOR(size));
69
+
70
+ while (i < size) {
71
+ org = i;
72
+ while (i < size && (esc = HTML_ESCAPE_TABLE[src[i]]) == 0)
73
+ i++;
74
+
75
+ if (i > org)
76
+ bufput(ob, src + org, i - org);
77
+
78
+ /* escaping */
79
+ if (i >= size)
80
+ break;
81
+
82
+ /* The forward slash is only escaped in secure mode */
83
+ if (src[i] == '/' && !secure) {
84
+ bufputc(ob, '/');
85
+ } else {
86
+ bufputs(ob, HTML_ESCAPES[esc]);
87
+ }
88
+
89
+ i++;
90
+ }
91
+ }
92
+
93
/*
 * Write code point `c` to `ob` as UTF-8.
 *
 * Values below 0x80 are written as a single byte.  Values up to 0x10FFFF
 * are written as two-, three- or four-byte sequences.  Surrogates
 * (0xD800..0xDFFF) and anything at or above 0x110000 are replaced by '?'.
 */
static inline void
bufput_utf8(struct buf *ob, int c)
{
	unsigned char seq[4];

	if (c < 0x80) {
		bufputc(ob, c);
		return;
	}

	if (c < 0x800) {
		seq[0] = 0xC0 | (c >> 6);
		seq[1] = 0x80 | (c & 0x3F);
		bufput(ob, seq, 2);
		return;
	}

	/* UTF-16 surrogate halves are not valid scalar values. */
	if (c >= 0xD800 && c <= 0xDFFF) {
		bufputc(ob, '?');
		return;
	}

	if (c < 0x10000) {
		seq[0] = 0xE0 | (c >> 12);
		seq[1] = 0x80 | ((c >> 6) & 0x3F);
		seq[2] = 0x80 | (c & 0x3F);
		bufput(ob, seq, 3);
		return;
	}

	if (c < 0x110000) {
		seq[0] = 0xF0 | (c >> 18);
		seq[1] = 0x80 | ((c >> 12) & 0x3F);
		seq[2] = 0x80 | ((c >> 6) & 0x3F);
		seq[3] = 0x80 | (c & 0x3F);
		bufput(ob, seq, 4);
		return;
	}

	/* Beyond the Unicode range. */
	bufputc(ob, '?');
}
126
+
127
+ static size_t
128
+ unescape_ent(struct buf *ob, const uint8_t *src, size_t size)
129
+ {
130
+ size_t i = 0;
131
+
132
+ if (size > 3 && src[0] == '#') {
133
+ int codepoint = 0;
134
+
135
+ if (_isdigit(src[1])) {
136
+ for (i = 1; i < size && _isdigit(src[i]); ++i)
137
+ codepoint = (codepoint * 10) + (src[i] - '0');
138
+ }
139
+
140
+ else if (src[1] == 'x' || src[1] == 'X') {
141
+ for (i = 2; i < size && _isxdigit(src[i]); ++i)
142
+ codepoint = (codepoint * 16) + ((src[i] | 32) % 39 - 9);
143
+ }
144
+
145
+ if (i < size && src[i] == ';') {
146
+ bufput_utf8(ob, codepoint);
147
+ return i + 1;
148
+ }
149
+ }
150
+
151
+ else {
152
+ if (size > MAX_WORD_LENGTH)
153
+ size = MAX_WORD_LENGTH;
154
+
155
+ for (i = MIN_WORD_LENGTH; i < size; ++i) {
156
+ if (src[i] == ' ')
157
+ break;
158
+
159
+ if (src[i] == ';') {
160
+ const struct html_ent *entity = find_entity((char *)src, i);
161
+
162
+ if (entity != NULL) {
163
+ bufput(ob, entity->utf8, entity->utf8_len);
164
+ return i + 1;
165
+ }
166
+
167
+ break;
168
+ }
169
+ }
170
+ }
171
+
172
+ bufputc(ob, '&');
173
+ return 0;
174
+ }
175
+
176
/*
 * Unescape HTML entities in the `size` bytes at `src`, writing the result
 * to `ob`.
 *
 * Text between ampersands is copied in runs; each '&' is handed to
 * unescape_ent(), which writes either the decoded entity or a literal '&'
 * and reports how many bytes after the ampersand it consumed.
 */
void
houdini_unescape_html(struct buf *ob, const uint8_t *src, size_t size)
{
	size_t pos = 0;

	bufgrow(ob, UNESCAPE_GROW_FACTOR(size));

	for (;;) {
		size_t run_start = pos;

		/* Find the next ampersand (or the end of the input). */
		while (pos < size && src[pos] != '&')
			pos++;

		if (pos != run_start)
			bufput(ob, src + run_start, pos - run_start);

		if (pos >= size)
			break;

		/* Step over the '&' and try to decode what follows it. */
		pos++;
		pos += unescape_ent(ob, src + pos, size - pos);
	}
}
199
+
200
+ #ifdef TEST
201
+
202
+ int main()
203
+ {
204
+ const char TEST_STRING[] = "This &#x2663; is & just &quot;an example&diams;&quot;";
205
+ struct buf *buffer;
206
+
207
+ buffer = bufnew(128);
208
+ houdini_unescape_html(buffer, TEST_STRING, strlen(TEST_STRING));
209
+ printf("Result: %.*s\n", (int)buffer->size, buffer->data);
210
+ bufrelease(buffer);
211
+ return 0;
212
+ }
213
+ #endif
214
+
@@ -0,0 +1,148 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+
7
+ #define ESCAPE_GROW_FACTOR(x) (((x) * 12) / 10)
8
+ #define UNESCAPE_GROW_FACTOR(x) (x)
9
+
10
/*
 * Undo JavaScript string escaping in the `size` bytes at `src`, writing the
 * result to `ob`.
 *
 * Recognised sequences are \n (written as a newline) and \\, \', \" and \/
 * (written as the escaped character itself).  A backslash followed by any
 * other byte, or a backslash at the very end of the input, is written
 * unchanged.
 */
void
houdini_unescape_js(struct buf *ob, const uint8_t *src, size_t size)
{
	size_t pos = 0;

	bufgrow(ob, UNESCAPE_GROW_FACTOR(size));

	while (pos < size) {
		size_t run_start = pos;
		uint8_t escaped;

		/* Copy everything up to the next backslash. */
		while (pos < size && src[pos] != '\\')
			pos++;

		if (pos != run_start)
			bufput(ob, src + run_start, pos - run_start);

		if (pos == size)
			break;

		/* src[pos] is a backslash; a trailing one is kept as-is. */
		pos++;
		if (pos == size) {
			bufputc(ob, '\\');
			break;
		}

		escaped = src[pos];

		if (escaped == 'n') {
			bufputc(ob, '\n');
			pos++;
		} else if (escaped == '\\' || escaped == '\'' ||
			   escaped == '\"' || escaped == '/') {
			bufputc(ob, escaped);
			pos++;
		} else {
			/*
			 * Unknown escape: keep the backslash; the following
			 * byte is copied by the next pass of the loop.
			 */
			bufputc(ob, '\\');
		}
	}
}
55
+
56
/*
 * Per-byte flag table for houdini_escape_js(): a non-zero entry marks a
 * byte that needs escaping.  Every byte not listed is implicitly zero.
 */
static const char JS_ESCAPE[256] = {
	['\n'] = 1,
	['\r'] = 1,
	['"']  = 1,
	['\''] = 1,
	['/']  = 1,
	['\\'] = 1,
};
74
+
75
+ void
76
+ houdini_escape_js(struct buf *ob, const uint8_t *src, size_t size)
77
+ {
78
+ size_t i = 0, org, ch;
79
+
80
+ bufgrow(ob, ESCAPE_GROW_FACTOR(size));
81
+
82
+ while (i < size) {
83
+ org = i;
84
+ while (i < size && JS_ESCAPE[src[i]] == 0)
85
+ i++;
86
+
87
+ if (i > org)
88
+ bufput(ob, src + org, i - org);
89
+
90
+ /* escaping */
91
+ if (i >= size)
92
+ break;
93
+
94
+ ch = src[i];
95
+
96
+ switch (ch) {
97
+ case '/':
98
+ /*
99
+ * Escape only if preceded by a lt
100
+ */
101
+ if (i && src[i - 1] == '<')
102
+ bufputc(ob, '\\');
103
+
104
+ bufputc(ob, ch);
105
+ break;
106
+
107
+ case '\r':
108
+ /*
109
+ * Escape as \n, and skip the next \n if it's there
110
+ */
111
+ if (i + 1 < size && src[i + 1] == '\n') i++;
112
+
113
+ case '\n':
114
+ /*
115
+ * Escape actually as '\','n', not as '\', '\n'
116
+ */
117
+ ch = 'n';
118
+
119
+ default:
120
+ /*
121
+ * Normal escaping
122
+ */
123
+ bufputc(ob, '\\');
124
+ bufputc(ob, ch);
125
+ break;
126
+ }
127
+
128
+ i++;
129
+ }
130
+ }
131
+
132
+
133
+ //#define TEST
134
+ #ifdef TEST
135
+
136
+ int main()
137
+ {
138
+ const char TEST_STRING[] = "http% this \200 is a test";
139
+ struct buf *buffer;
140
+
141
+ buffer = bufnew(128);
142
+ houdini_escape_uri(buffer, TEST_STRING, strlen(TEST_STRING));
143
+ printf("Result: %.*s\n", (int)buffer->size, buffer->data);
144
+ bufrelease(buffer);
145
+ return 0;
146
+ }
147
+ #endif
148
+
@@ -0,0 +1,130 @@
1
+ #include <assert.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "houdini.h"
6
+ #include "uri_escape.h"
7
+
8
+ #define ESCAPE_GROW_FACTOR(x) (((x) * 12) / 10)
9
+ #define UNESCAPE_GROW_FACTOR(x) (x)
10
+
11
+ extern int _isxdigit(int c);
12
+
13
+ static void
14
+ escape(struct buf *ob, const uint8_t *src, size_t size, int is_url)
15
+ {
16
+ static const char hex_chars[] = "0123456789ABCDEF";
17
+ const char *safe_table = is_url ? URL_SAFE : URI_SAFE;
18
+
19
+ size_t i = 0, org;
20
+ char hex_str[3];
21
+
22
+ bufgrow(ob, ESCAPE_GROW_FACTOR(size));
23
+ hex_str[0] = '%';
24
+
25
+ while (i < size) {
26
+ org = i;
27
+ while (i < size && safe_table[src[i]] != 0)
28
+ i++;
29
+
30
+ if (i > org)
31
+ bufput(ob, src + org, i - org);
32
+
33
+ /* escaping */
34
+ if (i >= size)
35
+ break;
36
+
37
+ if (src[i] == ' ' && is_url) {
38
+ bufputc(ob, '+');
39
+ } else {
40
+ hex_str[1] = hex_chars[(src[i] >> 4) & 0xF];
41
+ hex_str[2] = hex_chars[src[i] & 0xF];
42
+ bufput(ob, hex_str, 3);
43
+ }
44
+
45
+ i++;
46
+ }
47
+ }
48
+
49
/*
 * Value (0..15) of the ASCII hex digit `c`.  Only meaningful for bytes that
 * have already passed _isxdigit().  The argument is parenthesized so that
 * any expression can be passed safely.
 */
#define hex2c(c) ((((c) | 32) % 39) - 9)

/*
 * Decode percent-escapes in the `size` bytes at `src`, writing the result
 * to `ob`.
 *
 * "%XX" with two hex digits becomes the corresponding byte; a '%' that is
 * not followed by two hex digits is kept as a literal '%'.  When `is_url`
 * is non-zero, a '+' in the *input* is decoded to a space.
 *
 * The '+' translation is done while scanning the input rather than as a
 * pass over the output afterwards.  That way a plus sign produced by
 * decoding "%2B" stays a plus sign, bytes already present in `ob` are not
 * touched, and an embedded NUL (from "%00") cannot cut the translation
 * short.
 */
static void
unescape(struct buf *ob, const uint8_t *src, size_t size, int is_url)
{
	size_t i = 0, org;

	bufgrow(ob, UNESCAPE_GROW_FACTOR(size));

	while (i < size) {
		org = i;
		/* Stop at '%' always, and at '+' only in URL mode. */
		while (i < size && src[i] != '%' && !(is_url && src[i] == '+'))
			i++;

		if (i > org)
			bufput(ob, src + org, i - org);

		/* escaping */
		if (i >= size)
			break;

		/* Only reachable in URL mode: the scan above skips '+' otherwise. */
		if (src[i] == '+') {
			bufputc(ob, ' ');
			i++;
			continue;
		}

		i++;

		if (i + 1 < size && _isxdigit(src[i]) && _isxdigit(src[i + 1])) {
			unsigned char new_char = (hex2c(src[i]) << 4) + hex2c(src[i + 1]);
			bufputc(ob, new_char);
			i += 2;
		} else {
			bufputc(ob, '%');
		}
	}
}
87
+
88
+
89
/*
 * Percent-encode `src` using the URI rules (escape() with is_url = 0),
 * writing the result to `ob`.
 */
void
houdini_escape_uri(struct buf *ob, const uint8_t *src, size_t size)
{
	/* No `return expr;` here: ISO C forbids it in a void function. */
	escape(ob, src, size, 0);
}
94
+
95
/*
 * Percent-encode `src` using the URL rules (escape() with is_url = 1, which
 * writes a space as '+'), writing the result to `ob`.
 */
void
houdini_escape_url(struct buf *ob, const uint8_t *src, size_t size)
{
	/* No `return expr;` here: ISO C forbids it in a void function. */
	escape(ob, src, size, 1);
}
100
+
101
/*
 * Decode percent-escapes in `src` using the URI rules (unescape() with
 * is_url = 0), writing the result to `ob`.
 */
void
houdini_unescape_uri(struct buf *ob, const uint8_t *src, size_t size)
{
	/* No `return expr;` here: ISO C forbids it in a void function. */
	unescape(ob, src, size, 0);
}
106
+
107
/*
 * Decode percent-escapes in `src` using the URL rules (unescape() with
 * is_url = 1, which also handles '+'), writing the result to `ob`.
 */
void
houdini_unescape_url(struct buf *ob, const uint8_t *src, size_t size)
{
	/* No `return expr;` here: ISO C forbids it in a void function. */
	unescape(ob, src, size, 1);
}
112
+
113
+
114
+
115
+ //#define TEST
116
+ #ifdef TEST
117
+
118
+ int main()
119
+ {
120
+ const char TEST_STRING[] = "http% this \200 is a test";
121
+ struct buf *buffer;
122
+
123
+ buffer = bufnew(128);
124
+ houdini_escape_uri(buffer, TEST_STRING, strlen(TEST_STRING));
125
+ printf("Result: %.*s\n", (int)buffer->size, buffer->data);
126
+ bufrelease(buffer);
127
+ return 0;
128
+ }
129
+ #endif
130
+