tcell_agent 0.2.19 → 0.2.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE_libinjection +32 -0
- data/Rakefile +14 -1
- data/ext/libinjection/extconf.rb +3 -0
- data/ext/libinjection/libinjection.h +65 -0
- data/ext/libinjection/libinjection_html5.c +847 -0
- data/ext/libinjection/libinjection_html5.h +54 -0
- data/ext/libinjection/libinjection_sqli.c +2317 -0
- data/ext/libinjection/libinjection_sqli.h +295 -0
- data/ext/libinjection/libinjection_sqli_data.h +9004 -0
- data/ext/libinjection/libinjection_wrap.c +3525 -0
- data/ext/libinjection/libinjection_xss.c +531 -0
- data/ext/libinjection/libinjection_xss.h +21 -0
- data/lib/tcell_agent/configuration.rb +0 -48
- data/lib/tcell_agent/logger.rb +1 -0
- data/lib/tcell_agent/policies/appsensor/database_sensor.rb +8 -20
- data/lib/tcell_agent/policies/appsensor/injection_sensor.rb +30 -46
- data/lib/tcell_agent/policies/appsensor/login_sensor.rb +1 -4
- data/lib/tcell_agent/policies/appsensor/misc_sensor.rb +8 -22
- data/lib/tcell_agent/policies/appsensor/payloads_policy.rb +143 -0
- data/lib/tcell_agent/policies/appsensor/response_codes_sensor.rb +3 -1
- data/lib/tcell_agent/policies/appsensor/sensor.rb +21 -2
- data/lib/tcell_agent/policies/appsensor/size_sensor.rb +3 -1
- data/lib/tcell_agent/policies/appsensor/sqli_sensor.rb +9 -0
- data/lib/tcell_agent/policies/appsensor/user_agent_sensor.rb +1 -5
- data/lib/tcell_agent/policies/appsensor/xss_sensor.rb +9 -1
- data/lib/tcell_agent/policies/appsensor_policy.rb +40 -19
- data/lib/tcell_agent/policies/http_redirect_policy.rb +12 -2
- data/lib/tcell_agent/rails/csrf_exception.rb +1 -1
- data/lib/tcell_agent/rails/dlp.rb +98 -76
- data/lib/tcell_agent/rails/middleware/global_middleware.rb +1 -2
- data/lib/tcell_agent/rails/middleware/headers_middleware.rb +2 -2
- data/lib/tcell_agent/rails/on_start.rb +53 -20
- data/lib/tcell_agent/sensor_events/appsensor_event.rb +12 -19
- data/lib/tcell_agent/sensor_events/appsensor_meta_event.rb +7 -2
- data/lib/tcell_agent/sensor_events/sensor.rb +10 -11
- data/lib/tcell_agent/sensor_events/server_agent.rb +17 -12
- data/lib/tcell_agent/sensor_events/util/sanitizer_utilities.rb +148 -139
- data/lib/tcell_agent/utils/params.rb +24 -21
- data/lib/tcell_agent/version.rb +1 -1
- data/spec/lib/tcell_agent/configuration_spec.rb +0 -179
- data/spec/lib/tcell_agent/policies/appsensor/database_sensor_spec.rb +6 -4
- data/spec/lib/tcell_agent/policies/appsensor/misc_sensor_spec.rb +31 -22
- data/spec/lib/tcell_agent/policies/appsensor/payloads_policy_apply_spec.rb +466 -0
- data/spec/lib/tcell_agent/policies/appsensor/payloads_policy_from_json_spec.rb +890 -0
- data/spec/lib/tcell_agent/policies/appsensor/payloads_policy_log_spec.rb +484 -0
- data/spec/lib/tcell_agent/policies/appsensor/request_size_sensor_spec.rb +4 -3
- data/spec/lib/tcell_agent/policies/appsensor/response_codes_sensor_spec.rb +4 -4
- data/spec/lib/tcell_agent/policies/appsensor/response_size_sensor_spec.rb +1 -1
- data/spec/lib/tcell_agent/policies/appsensor/sqli_sensor_spec.rb +85 -0
- data/spec/lib/tcell_agent/policies/appsensor/user_agent_sensor_spec.rb +36 -16
- data/spec/lib/tcell_agent/policies/appsensor/xss_sensor_spec.rb +188 -312
- data/spec/lib/tcell_agent/policies/appsensor_policy_spec.rb +61 -0
- data/spec/lib/tcell_agent/rails/middleware/appsensor_middleware_spec.rb +18 -11
- data/spec/lib/tcell_agent/rails/middleware/redirect_middleware_spec.rb +14 -15
- data/spec/lib/tcell_agent/sensor_events/appsensor_meta_event_spec.rb +1 -1
- data/spec/lib/tcell_agent/sensor_events/util/sanitizer_utilities_spec.rb +6 -5
- data/spec/lib/tcell_agent/utils/params_spec.rb +28 -108
- data/tcell_agent.gemspec +21 -1
- metadata +37 -4
@@ -0,0 +1,847 @@
|
|
1
|
+
#include "libinjection_html5.h"
|
2
|
+
|
3
|
+
#include <string.h>
|
4
|
+
#include <assert.h>
|
5
|
+
|
6
|
+
/*
 * TRACE() is a debugging aid.  In a normal build it expands to nothing;
 * when the file is compiled with DEBUG defined it prints the name of the
 * enclosing function and the current line number.
 */
#ifndef DEBUG
#define TRACE()
#else
#include <stdio.h>
#define TRACE() printf("%s:%d\n", __FUNCTION__, __LINE__)
#endif
|
12
|
+
|
13
|
+
|
14
|
+
/*
 * Byte values the tokenizer compares against.  CHAR_EOF is an out-of-band
 * marker; every other entry is the numeric ASCII code of the character
 * shown in its comment.
 */
enum {
    CHAR_EOF      = -1,
    CHAR_NULL     = 0,
    CHAR_BANG     = 33,  /* ! */
    CHAR_DOUBLE   = 34,  /* " */
    CHAR_PERCENT  = 37,  /* % */
    CHAR_SINGLE   = 39,  /* ' */
    CHAR_DASH     = 45,  /* - */
    CHAR_SLASH    = 47,  /* / */
    CHAR_LT       = 60,  /* < */
    CHAR_EQUALS   = 61,  /* = */
    CHAR_GT       = 62,  /* > */
    CHAR_QUESTION = 63,  /* ? */
    CHAR_RIGHTB   = 93,  /* ] */
    CHAR_TICK     = 96   /* ` */
};
|
28
|
+
|
29
|
+
/* prototypes */
|
30
|
+
|
31
|
+
static int h5_skip_white(h5_state_t* hs);
|
32
|
+
static int h5_is_white(char c);
|
33
|
+
static int h5_state_eof(h5_state_t* hs);
|
34
|
+
static int h5_state_data(h5_state_t* hs);
|
35
|
+
static int h5_state_tag_open(h5_state_t* hs);
|
36
|
+
static int h5_state_tag_name(h5_state_t* hs);
|
37
|
+
static int h5_state_tag_name_close(h5_state_t* hs);
|
38
|
+
static int h5_state_end_tag_open(h5_state_t* hs);
|
39
|
+
static int h5_state_self_closing_start_tag(h5_state_t* hs);
|
40
|
+
static int h5_state_attribute_name(h5_state_t* hs);
|
41
|
+
static int h5_state_after_attribute_name(h5_state_t* hs);
|
42
|
+
static int h5_state_before_attribute_name(h5_state_t* hs);
|
43
|
+
static int h5_state_before_attribute_value(h5_state_t* hs);
|
44
|
+
static int h5_state_attribute_value_double_quote(h5_state_t* hs);
|
45
|
+
static int h5_state_attribute_value_single_quote(h5_state_t* hs);
|
46
|
+
static int h5_state_attribute_value_back_quote(h5_state_t* hs);
|
47
|
+
static int h5_state_attribute_value_no_quote(h5_state_t* hs);
|
48
|
+
static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs);
|
49
|
+
static int h5_state_comment(h5_state_t* hs);
|
50
|
+
static int h5_state_cdata(h5_state_t* hs);
|
51
|
+
|
52
|
+
|
53
|
+
/* 12.2.4.44 */
|
54
|
+
static int h5_state_bogus_comment(h5_state_t* hs);
|
55
|
+
static int h5_state_bogus_comment2(h5_state_t* hs);
|
56
|
+
|
57
|
+
/* 12.2.4.45 */
|
58
|
+
static int h5_state_markup_declaration_open(h5_state_t* hs);
|
59
|
+
|
60
|
+
/* 8.2.4.52 */
|
61
|
+
static int h5_state_doctype(h5_state_t* hs);
|
62
|
+
|
63
|
+
/**
|
64
|
+
* public function
|
65
|
+
*/
|
66
|
+
void libinjection_h5_init(h5_state_t* hs, const char* s, size_t len, enum html5_flags flags)
|
67
|
+
{
|
68
|
+
memset(hs, 0, sizeof(h5_state_t));
|
69
|
+
hs->s = s;
|
70
|
+
hs->len = len;
|
71
|
+
|
72
|
+
switch (flags) {
|
73
|
+
case DATA_STATE:
|
74
|
+
hs->state = h5_state_data;
|
75
|
+
break;
|
76
|
+
case VALUE_NO_QUOTE:
|
77
|
+
hs->state = h5_state_before_attribute_name;
|
78
|
+
break;
|
79
|
+
case VALUE_SINGLE_QUOTE:
|
80
|
+
hs->state = h5_state_attribute_value_single_quote;
|
81
|
+
break;
|
82
|
+
case VALUE_DOUBLE_QUOTE:
|
83
|
+
hs->state = h5_state_attribute_value_double_quote;
|
84
|
+
break;
|
85
|
+
case VALUE_BACK_QUOTE:
|
86
|
+
hs->state = h5_state_attribute_value_back_quote;
|
87
|
+
break;
|
88
|
+
}
|
89
|
+
}
|
90
|
+
|
91
|
+
/**
|
92
|
+
* public function
|
93
|
+
*/
|
94
|
+
int libinjection_h5_next(h5_state_t* hs)
|
95
|
+
{
|
96
|
+
assert(hs->state != NULL);
|
97
|
+
return (*hs->state)(hs);
|
98
|
+
}
|
99
|
+
|
100
|
+
/**
|
101
|
+
* Everything below here is private
|
102
|
+
*
|
103
|
+
*/
|
104
|
+
|
105
|
+
|
106
|
+
/*
 * Return 1 when `ch` counts as whitespace for the tokenizer, else 0.
 *
 * Accepted bytes:
 *   0x20 space, 0x09 \t, 0x0A \n, 0x0B \v, 0x0C \f, 0x0D \r,
 *   and 0x00 (NUL), which is also reported as whitespace.
 */
static int h5_is_white(char ch)
{
    switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
    case '\0':
        return 1;
    default:
        return 0;
    }
}
|
117
|
+
|
118
|
+
/*
 * Advance hs->pos past any run of whitespace-like bytes and report what
 * follows.
 *
 * Skipped bytes: 0x00, 0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D (the entries
 * marked "IE only" are not whitespace in the standard but are skipped
 * here as well).
 *
 * Returns the first byte that is not skipped, as a value in 1..255, with
 * hs->pos left pointing at it; returns CHAR_EOF (-1) when the input is
 * exhausted.
 *
 * The byte is converted through unsigned char before being widened to
 * int.  Where plain char is signed, returning it directly would turn the
 * byte 0xFF into -1, which callers cannot tell apart from CHAR_EOF and
 * would treat as end of input.
 */
static int h5_skip_white(h5_state_t* hs)
{
    char ch;

    while (hs->pos < hs->len) {
        ch = hs->s[hs->pos];
        switch (ch) {
        case 0x00: /* IE only */
        case 0x20:
        case 0x09:
        case 0x0A:
        case 0x0B: /* IE only */
        case 0x0C:
        case 0x0D: /* IE only */
            hs->pos += 1;
            break;
        default:
            return (unsigned char) ch;
        }
    }
    return CHAR_EOF;
}
|
139
|
+
|
140
|
+
/*
 * Terminal state: produces no token and always returns 0.
 */
static int h5_state_eof(h5_state_t* hs)
{
    (void)hs; /* parameter is required by the handler signature only */

    return 0;
}
|
146
|
+
|
147
|
+
/*
 * Data state: emit the text up to the next '<' as a DATA_TEXT token.
 *
 * - No '<' left: the token is the rest of the input and the next state is
 *   h5_state_eof.  An empty remainder yields no token (returns 0).
 * - '<' found: the token is the text before it, hs->pos moves just past
 *   the '<' and the next state is h5_state_tag_open.  When there is no
 *   text before the '<', the tag-open handler is run immediately and its
 *   result is returned instead.
 *
 * Returns 1 when a DATA_TEXT token was stored.
 */
static int h5_state_data(h5_state_t* hs)
{
    const char* text;
    const char* lt;

    TRACE();
    assert(hs->len >= hs->pos);

    text = hs->s + hs->pos;
    lt = (const char*) memchr(text, CHAR_LT, hs->len - hs->pos);

    hs->token_start = text;
    hs->token_type = DATA_TEXT;

    if (lt != NULL) {
        hs->token_len = (size_t)(lt - text);
        hs->pos = (size_t)(lt - hs->s) + 1;
        hs->state = h5_state_tag_open;
        return (hs->token_len == 0) ? h5_state_tag_open(hs) : 1;
    }

    hs->token_len = hs->len - hs->pos;
    hs->state = h5_state_eof;
    return (hs->token_len == 0) ? 0 : 1;
}
|
174
|
+
|
175
|
+
/**
|
176
|
+
* 12 2.4.8
|
177
|
+
*/
|
178
|
+
static int h5_state_tag_open(h5_state_t* hs)
|
179
|
+
{
|
180
|
+
char ch;
|
181
|
+
|
182
|
+
TRACE();
|
183
|
+
ch = hs->s[hs->pos];
|
184
|
+
if (ch == CHAR_BANG) {
|
185
|
+
hs->pos += 1;
|
186
|
+
return h5_state_markup_declaration_open(hs);
|
187
|
+
} else if (ch == CHAR_SLASH) {
|
188
|
+
hs->pos += 1;
|
189
|
+
hs->is_close = 1;
|
190
|
+
return h5_state_end_tag_open(hs);
|
191
|
+
} else if (ch == CHAR_QUESTION) {
|
192
|
+
hs->pos += 1;
|
193
|
+
return h5_state_bogus_comment(hs);
|
194
|
+
} else if (ch == CHAR_PERCENT) {
|
195
|
+
/* this is not in spec.. alternative comment format used
|
196
|
+
by IE <= 9 and Safari < 4.0.3 */
|
197
|
+
hs->pos += 1;
|
198
|
+
return h5_state_bogus_comment2(hs);
|
199
|
+
} else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
|
200
|
+
return h5_state_tag_name(hs);
|
201
|
+
} else if (ch == CHAR_NULL) {
|
202
|
+
/* IE-ism NULL characters are ignored */
|
203
|
+
return h5_state_tag_name(hs);
|
204
|
+
} else {
|
205
|
+
/* user input mistake in configuring state */
|
206
|
+
if (hs->pos == 0) {
|
207
|
+
return h5_state_data(hs);
|
208
|
+
}
|
209
|
+
hs->token_start = hs->s + hs->pos - 1;
|
210
|
+
hs->token_len = 1;
|
211
|
+
hs->token_type = DATA_TEXT;
|
212
|
+
hs->state = h5_state_data;
|
213
|
+
return 1;
|
214
|
+
}
|
215
|
+
}
|
216
|
+
/**
|
217
|
+
* 12.2.4.9
|
218
|
+
*/
|
219
|
+
static int h5_state_end_tag_open(h5_state_t* hs)
|
220
|
+
{
|
221
|
+
char ch;
|
222
|
+
|
223
|
+
TRACE();
|
224
|
+
|
225
|
+
if (hs->pos >= hs->len) {
|
226
|
+
return 0;
|
227
|
+
}
|
228
|
+
ch = hs->s[hs->pos];
|
229
|
+
if (ch == CHAR_GT) {
|
230
|
+
return h5_state_data(hs);
|
231
|
+
} else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
|
232
|
+
return h5_state_tag_name(hs);
|
233
|
+
}
|
234
|
+
|
235
|
+
hs->is_close = 0;
|
236
|
+
return h5_state_bogus_comment(hs);
|
237
|
+
}
|
238
|
+
/*
|
239
|
+
*
|
240
|
+
*/
|
241
|
+
static int h5_state_tag_name_close(h5_state_t* hs)
|
242
|
+
{
|
243
|
+
TRACE();
|
244
|
+
hs->is_close = 0;
|
245
|
+
hs->token_start = hs->s + hs->pos;
|
246
|
+
hs->token_len = 1;
|
247
|
+
hs->token_type = TAG_NAME_CLOSE;
|
248
|
+
hs->pos += 1;
|
249
|
+
if (hs->pos < hs->len) {
|
250
|
+
hs->state = h5_state_data;
|
251
|
+
} else {
|
252
|
+
hs->state = h5_state_eof;
|
253
|
+
}
|
254
|
+
|
255
|
+
return 1;
|
256
|
+
}
|
257
|
+
|
258
|
+
/**
|
259
|
+
* 12.2.4.10
|
260
|
+
*/
|
261
|
+
static int h5_state_tag_name(h5_state_t* hs)
|
262
|
+
{
|
263
|
+
char ch;
|
264
|
+
size_t pos;
|
265
|
+
|
266
|
+
TRACE();
|
267
|
+
pos = hs->pos;
|
268
|
+
while (pos < hs->len) {
|
269
|
+
ch = hs->s[pos];
|
270
|
+
if (ch == 0) {
|
271
|
+
/* special non-standard case */
|
272
|
+
/* allow nulls in tag name */
|
273
|
+
/* some old browsers apparently allow and ignore them */
|
274
|
+
pos += 1;
|
275
|
+
} else if (h5_is_white(ch)) {
|
276
|
+
hs->token_start = hs->s + hs->pos;
|
277
|
+
hs->token_len = pos - hs->pos;
|
278
|
+
hs->token_type = TAG_NAME_OPEN;
|
279
|
+
hs->pos = pos + 1;
|
280
|
+
hs->state = h5_state_before_attribute_name;
|
281
|
+
return 1;
|
282
|
+
} else if (ch == CHAR_SLASH) {
|
283
|
+
hs->token_start = hs->s + hs->pos;
|
284
|
+
hs->token_len = pos - hs->pos;
|
285
|
+
hs->token_type = TAG_NAME_OPEN;
|
286
|
+
hs->pos = pos + 1;
|
287
|
+
hs->state = h5_state_self_closing_start_tag;
|
288
|
+
return 1;
|
289
|
+
} else if (ch == CHAR_GT) {
|
290
|
+
hs->token_start = hs->s + hs->pos;
|
291
|
+
hs->token_len = pos - hs->pos;
|
292
|
+
if (hs->is_close) {
|
293
|
+
hs->pos = pos + 1;
|
294
|
+
hs->is_close = 0;
|
295
|
+
hs->token_type = TAG_CLOSE;
|
296
|
+
hs->state = h5_state_data;
|
297
|
+
} else {
|
298
|
+
hs->pos = pos;
|
299
|
+
hs->token_type = TAG_NAME_OPEN;
|
300
|
+
hs->state = h5_state_tag_name_close;
|
301
|
+
}
|
302
|
+
return 1;
|
303
|
+
} else {
|
304
|
+
pos += 1;
|
305
|
+
}
|
306
|
+
}
|
307
|
+
|
308
|
+
hs->token_start = hs->s + hs->pos;
|
309
|
+
hs->token_len = hs->len - hs->pos;
|
310
|
+
hs->token_type = TAG_NAME_OPEN;
|
311
|
+
hs->state = h5_state_eof;
|
312
|
+
return 1;
|
313
|
+
}
|
314
|
+
|
315
|
+
/**
|
316
|
+
* 12.2.4.34
|
317
|
+
*/
|
318
|
+
static int h5_state_before_attribute_name(h5_state_t* hs)
|
319
|
+
{
|
320
|
+
int ch;
|
321
|
+
|
322
|
+
TRACE();
|
323
|
+
ch = h5_skip_white(hs);
|
324
|
+
switch (ch) {
|
325
|
+
case CHAR_EOF: {
|
326
|
+
return 0;
|
327
|
+
}
|
328
|
+
case CHAR_SLASH: {
|
329
|
+
hs->pos += 1;
|
330
|
+
return h5_state_self_closing_start_tag(hs);
|
331
|
+
}
|
332
|
+
case CHAR_GT: {
|
333
|
+
hs->state = h5_state_data;
|
334
|
+
hs->token_start = hs->s + hs->pos;
|
335
|
+
hs->token_len = 1;
|
336
|
+
hs->token_type = TAG_NAME_CLOSE;
|
337
|
+
hs->pos += 1;
|
338
|
+
return 1;
|
339
|
+
}
|
340
|
+
default: {
|
341
|
+
return h5_state_attribute_name(hs);
|
342
|
+
}
|
343
|
+
}
|
344
|
+
}
|
345
|
+
|
346
|
+
/*
 * Attribute name state: emit the attribute name that starts at hs->pos.
 *
 * The byte at hs->pos is always taken as part of the name; scanning for
 * a terminator starts one byte later.  The name ends at:
 *   whitespace   -> next state after-attribute-name
 *   '/'          -> next state self-closing-start-tag
 *   '='          -> next state before-attribute-value
 *   '>'          -> hs->pos is left on the '>' for h5_state_tag_name_close
 *   end of input -> next state eof, hs->pos set to hs->len
 * In the first three cases the terminator is consumed.
 *
 * Always returns 1 with an ATTR_NAME token.
 */
static int h5_state_attribute_name(h5_state_t* hs)
{
    char c = 0;
    size_t stop;

    TRACE();

    /* locate the first terminator after the leading byte */
    stop = hs->pos + 1;
    while (stop < hs->len) {
        c = hs->s[stop];
        if (h5_is_white(c) || c == CHAR_SLASH || c == CHAR_EQUALS || c == CHAR_GT) {
            break;
        }
        stop += 1;
    }

    hs->token_start = hs->s + hs->pos;
    hs->token_type = ATTR_NAME;

    if (stop >= hs->len) {
        /* EOF */
        hs->token_len = hs->len - hs->pos;
        hs->state = h5_state_eof;
        hs->pos = hs->len;
        return 1;
    }

    hs->token_len = stop - hs->pos;
    if (c == CHAR_GT) {
        hs->state = h5_state_tag_name_close;
        hs->pos = stop;
    } else {
        if (c == CHAR_SLASH) {
            hs->state = h5_state_self_closing_start_tag;
        } else if (c == CHAR_EQUALS) {
            hs->state = h5_state_before_attribute_value;
        } else {
            hs->state = h5_state_after_attribute_name;
        }
        hs->pos = stop + 1;
    }
    return 1;
}
|
395
|
+
|
396
|
+
/**
|
397
|
+
* 12.2.4.36
|
398
|
+
*/
|
399
|
+
static int h5_state_after_attribute_name(h5_state_t* hs)
|
400
|
+
{
|
401
|
+
int c;
|
402
|
+
|
403
|
+
TRACE();
|
404
|
+
c = h5_skip_white(hs);
|
405
|
+
switch (c) {
|
406
|
+
case CHAR_EOF: {
|
407
|
+
return 0;
|
408
|
+
}
|
409
|
+
case CHAR_SLASH: {
|
410
|
+
hs->pos += 1;
|
411
|
+
return h5_state_self_closing_start_tag(hs);
|
412
|
+
}
|
413
|
+
case CHAR_EQUALS: {
|
414
|
+
hs->pos += 1;
|
415
|
+
return h5_state_before_attribute_value(hs);
|
416
|
+
}
|
417
|
+
case CHAR_GT: {
|
418
|
+
return h5_state_tag_name_close(hs);
|
419
|
+
}
|
420
|
+
default: {
|
421
|
+
return h5_state_attribute_name(hs);
|
422
|
+
}
|
423
|
+
}
|
424
|
+
}
|
425
|
+
|
426
|
+
/**
|
427
|
+
* 12.2.4.37
|
428
|
+
*/
|
429
|
+
static int h5_state_before_attribute_value(h5_state_t* hs)
|
430
|
+
{
|
431
|
+
int c;
|
432
|
+
TRACE();
|
433
|
+
|
434
|
+
c = h5_skip_white(hs);
|
435
|
+
|
436
|
+
if (c == CHAR_EOF) {
|
437
|
+
hs->state = h5_state_eof;
|
438
|
+
return 0;
|
439
|
+
}
|
440
|
+
|
441
|
+
if (c == CHAR_DOUBLE) {
|
442
|
+
return h5_state_attribute_value_double_quote(hs);
|
443
|
+
} else if (c == CHAR_SINGLE) {
|
444
|
+
return h5_state_attribute_value_single_quote(hs);
|
445
|
+
} else if (c == CHAR_TICK) {
|
446
|
+
/* NON STANDARD IE */
|
447
|
+
return h5_state_attribute_value_back_quote(hs);
|
448
|
+
} else {
|
449
|
+
return h5_state_attribute_value_no_quote(hs);
|
450
|
+
}
|
451
|
+
}
|
452
|
+
|
453
|
+
|
454
|
+
/*
 * Shared implementation of the quoted attribute value states: emit the
 * bytes up to the next `qchar` as an ATTR_VALUE token.
 *
 * When hs->pos is greater than 0 it is first advanced by one, stepping
 * over the opening quote.  At position 0 nothing is skipped: that is the
 * case where the tokenizer was started directly inside a quoted value,
 * so there is no opening quote in the input.
 *
 * - Closing quote found: the token is the text before it, hs->pos moves
 *   past the closing quote, next state after-attribute-value-quoted.
 * - Not found: the token is the rest of the input, next state eof.
 *
 * Always returns 1.
 */
static int h5_state_attribute_value_quote(h5_state_t* hs, char qchar)
{
    const char* value;
    const char* closing;

    TRACE();

    if (hs->pos > 0) {
        hs->pos += 1;
    }

    value = hs->s + hs->pos;
    closing = (const char*) memchr(value, qchar, hs->len - hs->pos);

    hs->token_type = ATTR_VALUE;
    hs->token_start = value;

    if (closing != NULL) {
        hs->token_len = (size_t)(closing - value);
        hs->state = h5_state_after_attribute_value_quoted_state;
        hs->pos += hs->token_len + 1;
    } else {
        hs->token_len = hs->len - hs->pos;
        hs->state = h5_state_eof;
    }
    return 1;
}
|
485
|
+
|
486
|
+
static
|
487
|
+
int h5_state_attribute_value_double_quote(h5_state_t* hs)
|
488
|
+
{
|
489
|
+
TRACE();
|
490
|
+
return h5_state_attribute_value_quote(hs, CHAR_DOUBLE);
|
491
|
+
}
|
492
|
+
|
493
|
+
static
|
494
|
+
int h5_state_attribute_value_single_quote(h5_state_t* hs)
|
495
|
+
{
|
496
|
+
TRACE();
|
497
|
+
return h5_state_attribute_value_quote(hs, CHAR_SINGLE);
|
498
|
+
}
|
499
|
+
|
500
|
+
static
|
501
|
+
int h5_state_attribute_value_back_quote(h5_state_t* hs)
|
502
|
+
{
|
503
|
+
TRACE();
|
504
|
+
return h5_state_attribute_value_quote(hs, CHAR_TICK);
|
505
|
+
}
|
506
|
+
|
507
|
+
/*
 * Unquoted attribute value state: emit the bytes from hs->pos up to the
 * first terminator as an ATTR_VALUE token.
 *
 *   whitespace   -> consumed; next state before-attribute-name
 *   '>'          -> hs->pos is left on the '>' for h5_state_tag_name_close
 *   end of input -> the token is the rest of the input, next state eof
 *
 * Always returns 1.
 */
static int h5_state_attribute_value_no_quote(h5_state_t* hs)
{
    size_t scan;

    TRACE();
    for (scan = hs->pos; scan < hs->len; scan += 1) {
        const char c = hs->s[scan];
        const int at_gt = (c == CHAR_GT);

        if (!at_gt && !h5_is_white(c)) {
            continue;
        }

        hs->token_type = ATTR_VALUE;
        hs->token_start = hs->s + hs->pos;
        hs->token_len = scan - hs->pos;
        if (at_gt) {
            hs->pos = scan;
            hs->state = h5_state_tag_name_close;
        } else {
            hs->pos = scan + 1;
            hs->state = h5_state_before_attribute_name;
        }
        return 1;
    }

    TRACE();
    /* EOF */
    hs->token_type = ATTR_VALUE;
    hs->token_start = hs->s + hs->pos;
    hs->token_len = hs->len - hs->pos;
    hs->state = h5_state_eof;
    return 1;
}
|
541
|
+
|
542
|
+
/**
|
543
|
+
* 12.2.4.41
|
544
|
+
*/
|
545
|
+
static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs)
|
546
|
+
{
|
547
|
+
char ch;
|
548
|
+
|
549
|
+
TRACE();
|
550
|
+
if (hs->pos >= hs->len) {
|
551
|
+
return 0;
|
552
|
+
}
|
553
|
+
ch = hs->s[hs->pos];
|
554
|
+
if (h5_is_white(ch)) {
|
555
|
+
hs->pos += 1;
|
556
|
+
return h5_state_before_attribute_name(hs);
|
557
|
+
} else if (ch == CHAR_SLASH) {
|
558
|
+
hs->pos += 1;
|
559
|
+
return h5_state_self_closing_start_tag(hs);
|
560
|
+
} else if (ch == CHAR_GT) {
|
561
|
+
hs->token_start = hs->s + hs->pos;
|
562
|
+
hs->token_len = 1;
|
563
|
+
hs->token_type = TAG_NAME_CLOSE;
|
564
|
+
hs->pos += 1;
|
565
|
+
hs->state = h5_state_data;
|
566
|
+
return 1;
|
567
|
+
} else {
|
568
|
+
return h5_state_before_attribute_name(hs);
|
569
|
+
}
|
570
|
+
}
|
571
|
+
|
572
|
+
/**
|
573
|
+
* 12.2.4.43
|
574
|
+
*/
|
575
|
+
static int h5_state_self_closing_start_tag(h5_state_t* hs)
|
576
|
+
{
|
577
|
+
char ch;
|
578
|
+
|
579
|
+
TRACE();
|
580
|
+
if (hs->pos >= hs->len) {
|
581
|
+
return 0;
|
582
|
+
}
|
583
|
+
ch = hs->s[hs->pos];
|
584
|
+
if (ch == CHAR_GT) {
|
585
|
+
assert(hs->pos > 0);
|
586
|
+
hs->token_start = hs->s + hs->pos -1;
|
587
|
+
hs->token_len = 2;
|
588
|
+
hs->token_type = TAG_NAME_SELFCLOSE;
|
589
|
+
hs->state = h5_state_data;
|
590
|
+
hs->pos += 1;
|
591
|
+
return 1;
|
592
|
+
} else {
|
593
|
+
return h5_state_before_attribute_name(hs);
|
594
|
+
}
|
595
|
+
}
|
596
|
+
|
597
|
+
/**
|
598
|
+
* 12.2.4.44
|
599
|
+
*/
|
600
|
+
static int h5_state_bogus_comment(h5_state_t* hs)
|
601
|
+
{
|
602
|
+
const char* idx;
|
603
|
+
|
604
|
+
TRACE();
|
605
|
+
idx = (const char*) memchr(hs->s + hs->pos, CHAR_GT, hs->len - hs->pos);
|
606
|
+
if (idx == NULL) {
|
607
|
+
hs->token_start = hs->s + hs->pos;
|
608
|
+
hs->token_len = hs->len - hs->pos;
|
609
|
+
hs->pos = hs->len;
|
610
|
+
hs->state = h5_state_eof;
|
611
|
+
} else {
|
612
|
+
hs->token_start = hs->s + hs->pos;
|
613
|
+
hs->token_len = (size_t)(idx - hs->s) - hs->pos;
|
614
|
+
hs->pos = (size_t)(idx - hs->s) + 1;
|
615
|
+
hs->state = h5_state_data;
|
616
|
+
}
|
617
|
+
|
618
|
+
hs->token_type = TAG_COMMENT;
|
619
|
+
return 1;
|
620
|
+
}
|
621
|
+
|
622
|
+
/**
|
623
|
+
* 12.2.4.44 ALT
|
624
|
+
*/
|
625
|
+
static int h5_state_bogus_comment2(h5_state_t* hs)
|
626
|
+
{
|
627
|
+
const char* idx;
|
628
|
+
size_t pos;
|
629
|
+
|
630
|
+
TRACE();
|
631
|
+
pos = hs->pos;
|
632
|
+
while (1) {
|
633
|
+
idx = (const char*) memchr(hs->s + pos, CHAR_PERCENT, hs->len - pos);
|
634
|
+
if (idx == NULL || (idx + 1 >= hs->s + hs->len)) {
|
635
|
+
hs->token_start = hs->s + hs->pos;
|
636
|
+
hs->token_len = hs->len - hs->pos;
|
637
|
+
hs->pos = hs->len;
|
638
|
+
hs->token_type = TAG_COMMENT;
|
639
|
+
hs->state = h5_state_eof;
|
640
|
+
return 1;
|
641
|
+
}
|
642
|
+
|
643
|
+
if (*(idx +1) != CHAR_GT) {
|
644
|
+
pos = (size_t)(idx - hs->s) + 1;
|
645
|
+
continue;
|
646
|
+
}
|
647
|
+
|
648
|
+
/* ends in %> */
|
649
|
+
hs->token_start = hs->s + hs->pos;
|
650
|
+
hs->token_len = (size_t)(idx - hs->s) - hs->pos;
|
651
|
+
hs->pos = (size_t)(idx - hs->s) + 2;
|
652
|
+
hs->state = h5_state_data;
|
653
|
+
hs->token_type = TAG_COMMENT;
|
654
|
+
return 1;
|
655
|
+
}
|
656
|
+
}
|
657
|
+
|
658
|
+
/**
|
659
|
+
* 8.2.4.45
|
660
|
+
*/
|
661
|
+
static int h5_state_markup_declaration_open(h5_state_t* hs)
|
662
|
+
{
|
663
|
+
size_t remaining;
|
664
|
+
|
665
|
+
TRACE();
|
666
|
+
remaining = hs->len - hs->pos;
|
667
|
+
if (remaining >= 7 &&
|
668
|
+
/* case insensitive */
|
669
|
+
(hs->s[hs->pos + 0] == 'D' || hs->s[hs->pos + 0] == 'd') &&
|
670
|
+
(hs->s[hs->pos + 1] == 'O' || hs->s[hs->pos + 1] == 'o') &&
|
671
|
+
(hs->s[hs->pos + 2] == 'C' || hs->s[hs->pos + 2] == 'c') &&
|
672
|
+
(hs->s[hs->pos + 3] == 'T' || hs->s[hs->pos + 3] == 't') &&
|
673
|
+
(hs->s[hs->pos + 4] == 'Y' || hs->s[hs->pos + 4] == 'y') &&
|
674
|
+
(hs->s[hs->pos + 5] == 'P' || hs->s[hs->pos + 5] == 'p') &&
|
675
|
+
(hs->s[hs->pos + 6] == 'E' || hs->s[hs->pos + 6] == 'e')
|
676
|
+
) {
|
677
|
+
return h5_state_doctype(hs);
|
678
|
+
} else if (remaining >= 7 &&
|
679
|
+
/* upper case required */
|
680
|
+
hs->s[hs->pos + 0] == '[' &&
|
681
|
+
hs->s[hs->pos + 1] == 'C' &&
|
682
|
+
hs->s[hs->pos + 2] == 'D' &&
|
683
|
+
hs->s[hs->pos + 3] == 'A' &&
|
684
|
+
hs->s[hs->pos + 4] == 'T' &&
|
685
|
+
hs->s[hs->pos + 5] == 'A' &&
|
686
|
+
hs->s[hs->pos + 6] == '['
|
687
|
+
) {
|
688
|
+
hs->pos += 7;
|
689
|
+
return h5_state_cdata(hs);
|
690
|
+
} else if (remaining >= 2 &&
|
691
|
+
hs->s[hs->pos + 0] == '-' &&
|
692
|
+
hs->s[hs->pos + 1] == '-') {
|
693
|
+
hs->pos += 2;
|
694
|
+
return h5_state_comment(hs);
|
695
|
+
}
|
696
|
+
|
697
|
+
return h5_state_bogus_comment(hs);
|
698
|
+
}
|
699
|
+
|
700
|
+
/**
|
701
|
+
* 12.2.4.48
|
702
|
+
* 12.2.4.49
|
703
|
+
* 12.2.4.50
|
704
|
+
* 12.2.4.51
|
705
|
+
* state machine spec is confusing since it can only look
|
706
|
+
* at one character at a time but simply it's comments end by:
|
707
|
+
* 1) EOF
|
708
|
+
* 2) ending in -->
|
709
|
+
* 3) ending in -!>
|
710
|
+
*/
|
711
|
+
static int h5_state_comment(h5_state_t* hs)
|
712
|
+
{
|
713
|
+
char ch;
|
714
|
+
const char* idx;
|
715
|
+
size_t pos;
|
716
|
+
size_t offset;
|
717
|
+
const char* end = hs->s + hs->len;
|
718
|
+
|
719
|
+
TRACE();
|
720
|
+
pos = hs->pos;
|
721
|
+
while (1) {
|
722
|
+
|
723
|
+
idx = (const char*) memchr(hs->s + pos, CHAR_DASH, hs->len - pos);
|
724
|
+
|
725
|
+
/* did not find anything or has less than 3 chars left */
|
726
|
+
if (idx == NULL || idx > hs->s + hs->len - 3) {
|
727
|
+
hs->state = h5_state_eof;
|
728
|
+
hs->token_start = hs->s + hs->pos;
|
729
|
+
hs->token_len = hs->len - hs->pos;
|
730
|
+
hs->token_type = TAG_COMMENT;
|
731
|
+
return 1;
|
732
|
+
}
|
733
|
+
offset = 1;
|
734
|
+
|
735
|
+
/* skip all nulls */
|
736
|
+
while (idx + offset < end && *(idx + offset) == 0) {
|
737
|
+
offset += 1;
|
738
|
+
}
|
739
|
+
if (idx + offset == end) {
|
740
|
+
hs->state = h5_state_eof;
|
741
|
+
hs->token_start = hs->s + hs->pos;
|
742
|
+
hs->token_len = hs->len - hs->pos;
|
743
|
+
hs->token_type = TAG_COMMENT;
|
744
|
+
return 1;
|
745
|
+
}
|
746
|
+
|
747
|
+
ch = *(idx + offset);
|
748
|
+
if (ch != CHAR_DASH && ch != CHAR_BANG) {
|
749
|
+
pos = (size_t)(idx - hs->s) + 1;
|
750
|
+
continue;
|
751
|
+
}
|
752
|
+
|
753
|
+
/* need to test */
|
754
|
+
#if 0
|
755
|
+
/* skip all nulls */
|
756
|
+
while (idx + offset < end && *(idx + offset) == 0) {
|
757
|
+
offset += 1;
|
758
|
+
}
|
759
|
+
if (idx + offset == end) {
|
760
|
+
hs->state = h5_state_eof;
|
761
|
+
hs->token_start = hs->s + hs->pos;
|
762
|
+
hs->token_len = hs->len - hs->pos;
|
763
|
+
hs->token_type = TAG_COMMENT;
|
764
|
+
return 1;
|
765
|
+
}
|
766
|
+
#endif
|
767
|
+
|
768
|
+
offset += 1;
|
769
|
+
if (idx + offset == end) {
|
770
|
+
hs->state = h5_state_eof;
|
771
|
+
hs->token_start = hs->s + hs->pos;
|
772
|
+
hs->token_len = hs->len - hs->pos;
|
773
|
+
hs->token_type = TAG_COMMENT;
|
774
|
+
return 1;
|
775
|
+
}
|
776
|
+
|
777
|
+
|
778
|
+
ch = *(idx + offset);
|
779
|
+
if (ch != CHAR_GT) {
|
780
|
+
pos = (size_t)(idx - hs->s) + 1;
|
781
|
+
continue;
|
782
|
+
}
|
783
|
+
offset += 1;
|
784
|
+
|
785
|
+
/* ends in --> or -!> */
|
786
|
+
hs->token_start = hs->s + hs->pos;
|
787
|
+
hs->token_len = (size_t)(idx - hs->s) - hs->pos;
|
788
|
+
hs->pos = (size_t)(idx + offset - hs->s);
|
789
|
+
hs->state = h5_state_data;
|
790
|
+
hs->token_type = TAG_COMMENT;
|
791
|
+
return 1;
|
792
|
+
}
|
793
|
+
}
|
794
|
+
|
795
|
+
/*
 * CDATA section state: entered with hs->pos just past "<![CDATA[".
 * Everything up to the next "]]>" is emitted as a DATA_TEXT token.
 *
 * - "]]>" found: hs->pos moves just past it, next state data.
 * - Not found (no ']' left, or fewer than 3 bytes remain from it): the
 *   token is the rest of the input, next state eof; hs->pos is left
 *   unchanged.
 *
 * Always returns 1.
 */
static int h5_state_cdata(h5_state_t* hs)
{
    const char* const end = hs->s + hs->len;
    const char* rb;
    size_t scan;

    TRACE();
    scan = hs->pos;
    for (;;) {
        rb = (const char*) memchr(hs->s + scan, CHAR_RIGHTB, hs->len - scan);

        /* did not find anything or has less than 3 chars left */
        if (rb == NULL || (size_t)(end - rb) < 3) {
            hs->token_type = DATA_TEXT;
            hs->token_start = hs->s + hs->pos;
            hs->token_len = hs->len - hs->pos;
            hs->state = h5_state_eof;
            return 1;
        }

        if (rb[1] == CHAR_RIGHTB && rb[2] == CHAR_GT) {
            hs->token_type = DATA_TEXT;
            hs->token_start = hs->s + hs->pos;
            hs->token_len = (size_t)(rb - hs->s) - hs->pos;
            hs->pos = (size_t)(rb - hs->s) + 3;
            hs->state = h5_state_data;
            return 1;
        }

        /* a ']' that does not start "]]>": keep searching after it */
        scan = (size_t)(rb - hs->s) + 1;
    }
}
|
824
|
+
|
825
|
+
/**
|
826
|
+
* 8.2.4.52
|
827
|
+
* http://www.w3.org/html/wg/drafts/html/master/syntax.html#doctype-state
|
828
|
+
*/
|
829
|
+
static int h5_state_doctype(h5_state_t* hs)
|
830
|
+
{
|
831
|
+
const char* idx;
|
832
|
+
|
833
|
+
TRACE();
|
834
|
+
hs->token_start = hs->s + hs->pos;
|
835
|
+
hs->token_type = DOCTYPE;
|
836
|
+
|
837
|
+
idx = (const char*) memchr(hs->s + hs->pos, CHAR_GT, hs->len - hs->pos);
|
838
|
+
if (idx == NULL) {
|
839
|
+
hs->state = h5_state_eof;
|
840
|
+
hs->token_len = hs->len - hs->pos;
|
841
|
+
} else {
|
842
|
+
hs->state = h5_state_data;
|
843
|
+
hs->token_len = (size_t)(idx - hs->s) - hs->pos;
|
844
|
+
hs->pos = (size_t)(idx - hs->s) + 1;
|
845
|
+
}
|
846
|
+
return 1;
|
847
|
+
}
|