tcell_agent 0.2.19 → 0.2.21

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE_libinjection +32 -0
  3. data/Rakefile +14 -1
  4. data/ext/libinjection/extconf.rb +3 -0
  5. data/ext/libinjection/libinjection.h +65 -0
  6. data/ext/libinjection/libinjection_html5.c +847 -0
  7. data/ext/libinjection/libinjection_html5.h +54 -0
  8. data/ext/libinjection/libinjection_sqli.c +2317 -0
  9. data/ext/libinjection/libinjection_sqli.h +295 -0
  10. data/ext/libinjection/libinjection_sqli_data.h +9004 -0
  11. data/ext/libinjection/libinjection_wrap.c +3525 -0
  12. data/ext/libinjection/libinjection_xss.c +531 -0
  13. data/ext/libinjection/libinjection_xss.h +21 -0
  14. data/lib/tcell_agent/configuration.rb +0 -48
  15. data/lib/tcell_agent/logger.rb +1 -0
  16. data/lib/tcell_agent/policies/appsensor/database_sensor.rb +8 -20
  17. data/lib/tcell_agent/policies/appsensor/injection_sensor.rb +30 -46
  18. data/lib/tcell_agent/policies/appsensor/login_sensor.rb +1 -4
  19. data/lib/tcell_agent/policies/appsensor/misc_sensor.rb +8 -22
  20. data/lib/tcell_agent/policies/appsensor/payloads_policy.rb +143 -0
  21. data/lib/tcell_agent/policies/appsensor/response_codes_sensor.rb +3 -1
  22. data/lib/tcell_agent/policies/appsensor/sensor.rb +21 -2
  23. data/lib/tcell_agent/policies/appsensor/size_sensor.rb +3 -1
  24. data/lib/tcell_agent/policies/appsensor/sqli_sensor.rb +9 -0
  25. data/lib/tcell_agent/policies/appsensor/user_agent_sensor.rb +1 -5
  26. data/lib/tcell_agent/policies/appsensor/xss_sensor.rb +9 -1
  27. data/lib/tcell_agent/policies/appsensor_policy.rb +40 -19
  28. data/lib/tcell_agent/policies/http_redirect_policy.rb +12 -2
  29. data/lib/tcell_agent/rails/csrf_exception.rb +1 -1
  30. data/lib/tcell_agent/rails/dlp.rb +98 -76
  31. data/lib/tcell_agent/rails/middleware/global_middleware.rb +1 -2
  32. data/lib/tcell_agent/rails/middleware/headers_middleware.rb +2 -2
  33. data/lib/tcell_agent/rails/on_start.rb +53 -20
  34. data/lib/tcell_agent/sensor_events/appsensor_event.rb +12 -19
  35. data/lib/tcell_agent/sensor_events/appsensor_meta_event.rb +7 -2
  36. data/lib/tcell_agent/sensor_events/sensor.rb +10 -11
  37. data/lib/tcell_agent/sensor_events/server_agent.rb +17 -12
  38. data/lib/tcell_agent/sensor_events/util/sanitizer_utilities.rb +148 -139
  39. data/lib/tcell_agent/utils/params.rb +24 -21
  40. data/lib/tcell_agent/version.rb +1 -1
  41. data/spec/lib/tcell_agent/configuration_spec.rb +0 -179
  42. data/spec/lib/tcell_agent/policies/appsensor/database_sensor_spec.rb +6 -4
  43. data/spec/lib/tcell_agent/policies/appsensor/misc_sensor_spec.rb +31 -22
  44. data/spec/lib/tcell_agent/policies/appsensor/payloads_policy_apply_spec.rb +466 -0
  45. data/spec/lib/tcell_agent/policies/appsensor/payloads_policy_from_json_spec.rb +890 -0
  46. data/spec/lib/tcell_agent/policies/appsensor/payloads_policy_log_spec.rb +484 -0
  47. data/spec/lib/tcell_agent/policies/appsensor/request_size_sensor_spec.rb +4 -3
  48. data/spec/lib/tcell_agent/policies/appsensor/response_codes_sensor_spec.rb +4 -4
  49. data/spec/lib/tcell_agent/policies/appsensor/response_size_sensor_spec.rb +1 -1
  50. data/spec/lib/tcell_agent/policies/appsensor/sqli_sensor_spec.rb +85 -0
  51. data/spec/lib/tcell_agent/policies/appsensor/user_agent_sensor_spec.rb +36 -16
  52. data/spec/lib/tcell_agent/policies/appsensor/xss_sensor_spec.rb +188 -312
  53. data/spec/lib/tcell_agent/policies/appsensor_policy_spec.rb +61 -0
  54. data/spec/lib/tcell_agent/rails/middleware/appsensor_middleware_spec.rb +18 -11
  55. data/spec/lib/tcell_agent/rails/middleware/redirect_middleware_spec.rb +14 -15
  56. data/spec/lib/tcell_agent/sensor_events/appsensor_meta_event_spec.rb +1 -1
  57. data/spec/lib/tcell_agent/sensor_events/util/sanitizer_utilities_spec.rb +6 -5
  58. data/spec/lib/tcell_agent/utils/params_spec.rb +28 -108
  59. data/tcell_agent.gemspec +21 -1
  60. metadata +37 -4
@@ -0,0 +1,847 @@
1
+ #include "libinjection_html5.h"
2
+
3
+ #include <string.h>
4
+ #include <assert.h>
5
+
6
#ifdef DEBUG
#include <stdio.h>
/* Debug builds: print the current function name and line number. */
#define TRACE() printf("%s:%d\n", __FUNCTION__, __LINE__)
#else
/* Non-debug builds: TRACE() expands to nothing. */
#define TRACE()
#endif

/*
 * Character codes used by the tokenizer, written as ASCII values.
 * CHAR_EOF is not a character: it is the "ran out of input" sentinel.
 */
#define CHAR_EOF      -1  /* end-of-input sentinel */
#define CHAR_NULL     0   /* NUL byte          */
#define CHAR_BANG     33  /* '!'               */
#define CHAR_DOUBLE   34  /* '"'               */
#define CHAR_PERCENT  37  /* '%'               */
#define CHAR_SINGLE   39  /* '\''              */
#define CHAR_DASH     45  /* '-'               */
#define CHAR_SLASH    47  /* '/'               */
#define CHAR_LT       60  /* '<'               */
#define CHAR_EQUALS   61  /* '='               */
#define CHAR_GT       62  /* '>'               */
#define CHAR_QUESTION 63  /* '?'               */
#define CHAR_RIGHTB   93  /* ']'               */
#define CHAR_TICK     96  /* '`'               */
28
+
29
+ /* prototypes */
30
+
31
+ static int h5_skip_white(h5_state_t* hs);
32
+ static int h5_is_white(char c);
33
+ static int h5_state_eof(h5_state_t* hs);
34
+ static int h5_state_data(h5_state_t* hs);
35
+ static int h5_state_tag_open(h5_state_t* hs);
36
+ static int h5_state_tag_name(h5_state_t* hs);
37
+ static int h5_state_tag_name_close(h5_state_t* hs);
38
+ static int h5_state_end_tag_open(h5_state_t* hs);
39
+ static int h5_state_self_closing_start_tag(h5_state_t* hs);
40
+ static int h5_state_attribute_name(h5_state_t* hs);
41
+ static int h5_state_after_attribute_name(h5_state_t* hs);
42
+ static int h5_state_before_attribute_name(h5_state_t* hs);
43
+ static int h5_state_before_attribute_value(h5_state_t* hs);
44
+ static int h5_state_attribute_value_double_quote(h5_state_t* hs);
45
+ static int h5_state_attribute_value_single_quote(h5_state_t* hs);
46
+ static int h5_state_attribute_value_back_quote(h5_state_t* hs);
47
+ static int h5_state_attribute_value_no_quote(h5_state_t* hs);
48
+ static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs);
49
+ static int h5_state_comment(h5_state_t* hs);
50
+ static int h5_state_cdata(h5_state_t* hs);
51
+
52
+
53
+ /* 12.2.4.44 */
54
+ static int h5_state_bogus_comment(h5_state_t* hs);
55
+ static int h5_state_bogus_comment2(h5_state_t* hs);
56
+
57
+ /* 12.2.4.45 */
58
+ static int h5_state_markup_declaration_open(h5_state_t* hs);
59
+
60
+ /* 8.2.4.52 */
61
+ static int h5_state_doctype(h5_state_t* hs);
62
+
63
+ /**
64
+ * public function
65
+ */
66
+ void libinjection_h5_init(h5_state_t* hs, const char* s, size_t len, enum html5_flags flags)
67
+ {
68
+ memset(hs, 0, sizeof(h5_state_t));
69
+ hs->s = s;
70
+ hs->len = len;
71
+
72
+ switch (flags) {
73
+ case DATA_STATE:
74
+ hs->state = h5_state_data;
75
+ break;
76
+ case VALUE_NO_QUOTE:
77
+ hs->state = h5_state_before_attribute_name;
78
+ break;
79
+ case VALUE_SINGLE_QUOTE:
80
+ hs->state = h5_state_attribute_value_single_quote;
81
+ break;
82
+ case VALUE_DOUBLE_QUOTE:
83
+ hs->state = h5_state_attribute_value_double_quote;
84
+ break;
85
+ case VALUE_BACK_QUOTE:
86
+ hs->state = h5_state_attribute_value_back_quote;
87
+ break;
88
+ }
89
+ }
90
+
91
+ /**
92
+ * public function
93
+ */
94
+ int libinjection_h5_next(h5_state_t* hs)
95
+ {
96
+ assert(hs->state != NULL);
97
+ return (*hs->state)(hs);
98
+ }
99
+
100
+ /**
101
+ * Everything below here is private
102
+ *
103
+ */
104
+
105
+
106
/*
 * Return 1 if `ch` is treated as whitespace by the tokenizer, else 0.
 *
 * Whitespace here is: space, \t (0x09), \n (0x0A), \v (0x0B), \f (0x0C),
 * \r (0x0D) and also the NUL byte (0x00).
 */
static int h5_is_white(char ch)
{
    switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
    case '\0':
        return 1;
    default:
        return 0;
    }
}
117
+
118
+ static int h5_skip_white(h5_state_t* hs)
119
+ {
120
+ char ch;
121
+ while (hs->pos < hs->len) {
122
+ ch = hs->s[hs->pos];
123
+ switch (ch) {
124
+ case 0x00: /* IE only */
125
+ case 0x20:
126
+ case 0x09:
127
+ case 0x0A:
128
+ case 0x0B: /* IE only */
129
+ case 0x0C:
130
+ case 0x0D: /* IE only */
131
+ hs->pos += 1;
132
+ break;
133
+ default:
134
+ return ch;
135
+ }
136
+ }
137
+ return CHAR_EOF;
138
+ }
139
+
140
+ static int h5_state_eof(h5_state_t* hs)
141
+ {
142
+ /* eliminate unused function argument warning */
143
+ (void)hs;
144
+ return 0;
145
+ }
146
+
147
+ static int h5_state_data(h5_state_t* hs)
148
+ {
149
+ const char* idx;
150
+
151
+ TRACE();
152
+ assert(hs->len >= hs->pos);
153
+ idx = (const char*) memchr(hs->s + hs->pos, CHAR_LT, hs->len - hs->pos);
154
+ if (idx == NULL) {
155
+ hs->token_start = hs->s + hs->pos;
156
+ hs->token_len = hs->len - hs->pos;
157
+ hs->token_type = DATA_TEXT;
158
+ hs->state = h5_state_eof;
159
+ if (hs->token_len == 0) {
160
+ return 0;
161
+ }
162
+ } else {
163
+ hs->token_start = hs->s + hs->pos;
164
+ hs->token_type = DATA_TEXT;
165
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
166
+ hs->pos = (size_t)(idx - hs->s) + 1;
167
+ hs->state = h5_state_tag_open;
168
+ if (hs->token_len == 0) {
169
+ return h5_state_tag_open(hs);
170
+ }
171
+ }
172
+ return 1;
173
+ }
174
+
175
+ /**
176
+ * 12 2.4.8
177
+ */
178
+ static int h5_state_tag_open(h5_state_t* hs)
179
+ {
180
+ char ch;
181
+
182
+ TRACE();
183
+ ch = hs->s[hs->pos];
184
+ if (ch == CHAR_BANG) {
185
+ hs->pos += 1;
186
+ return h5_state_markup_declaration_open(hs);
187
+ } else if (ch == CHAR_SLASH) {
188
+ hs->pos += 1;
189
+ hs->is_close = 1;
190
+ return h5_state_end_tag_open(hs);
191
+ } else if (ch == CHAR_QUESTION) {
192
+ hs->pos += 1;
193
+ return h5_state_bogus_comment(hs);
194
+ } else if (ch == CHAR_PERCENT) {
195
+ /* this is not in spec.. alternative comment format used
196
+ by IE <= 9 and Safari < 4.0.3 */
197
+ hs->pos += 1;
198
+ return h5_state_bogus_comment2(hs);
199
+ } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
200
+ return h5_state_tag_name(hs);
201
+ } else if (ch == CHAR_NULL) {
202
+ /* IE-ism NULL characters are ignored */
203
+ return h5_state_tag_name(hs);
204
+ } else {
205
+ /* user input mistake in configuring state */
206
+ if (hs->pos == 0) {
207
+ return h5_state_data(hs);
208
+ }
209
+ hs->token_start = hs->s + hs->pos - 1;
210
+ hs->token_len = 1;
211
+ hs->token_type = DATA_TEXT;
212
+ hs->state = h5_state_data;
213
+ return 1;
214
+ }
215
+ }
216
+ /**
217
+ * 12.2.4.9
218
+ */
219
+ static int h5_state_end_tag_open(h5_state_t* hs)
220
+ {
221
+ char ch;
222
+
223
+ TRACE();
224
+
225
+ if (hs->pos >= hs->len) {
226
+ return 0;
227
+ }
228
+ ch = hs->s[hs->pos];
229
+ if (ch == CHAR_GT) {
230
+ return h5_state_data(hs);
231
+ } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
232
+ return h5_state_tag_name(hs);
233
+ }
234
+
235
+ hs->is_close = 0;
236
+ return h5_state_bogus_comment(hs);
237
+ }
238
+ /*
239
+ *
240
+ */
241
+ static int h5_state_tag_name_close(h5_state_t* hs)
242
+ {
243
+ TRACE();
244
+ hs->is_close = 0;
245
+ hs->token_start = hs->s + hs->pos;
246
+ hs->token_len = 1;
247
+ hs->token_type = TAG_NAME_CLOSE;
248
+ hs->pos += 1;
249
+ if (hs->pos < hs->len) {
250
+ hs->state = h5_state_data;
251
+ } else {
252
+ hs->state = h5_state_eof;
253
+ }
254
+
255
+ return 1;
256
+ }
257
+
258
+ /**
259
+ * 12.2.4.10
260
+ */
261
+ static int h5_state_tag_name(h5_state_t* hs)
262
+ {
263
+ char ch;
264
+ size_t pos;
265
+
266
+ TRACE();
267
+ pos = hs->pos;
268
+ while (pos < hs->len) {
269
+ ch = hs->s[pos];
270
+ if (ch == 0) {
271
+ /* special non-standard case */
272
+ /* allow nulls in tag name */
273
+ /* some old browsers apparently allow and ignore them */
274
+ pos += 1;
275
+ } else if (h5_is_white(ch)) {
276
+ hs->token_start = hs->s + hs->pos;
277
+ hs->token_len = pos - hs->pos;
278
+ hs->token_type = TAG_NAME_OPEN;
279
+ hs->pos = pos + 1;
280
+ hs->state = h5_state_before_attribute_name;
281
+ return 1;
282
+ } else if (ch == CHAR_SLASH) {
283
+ hs->token_start = hs->s + hs->pos;
284
+ hs->token_len = pos - hs->pos;
285
+ hs->token_type = TAG_NAME_OPEN;
286
+ hs->pos = pos + 1;
287
+ hs->state = h5_state_self_closing_start_tag;
288
+ return 1;
289
+ } else if (ch == CHAR_GT) {
290
+ hs->token_start = hs->s + hs->pos;
291
+ hs->token_len = pos - hs->pos;
292
+ if (hs->is_close) {
293
+ hs->pos = pos + 1;
294
+ hs->is_close = 0;
295
+ hs->token_type = TAG_CLOSE;
296
+ hs->state = h5_state_data;
297
+ } else {
298
+ hs->pos = pos;
299
+ hs->token_type = TAG_NAME_OPEN;
300
+ hs->state = h5_state_tag_name_close;
301
+ }
302
+ return 1;
303
+ } else {
304
+ pos += 1;
305
+ }
306
+ }
307
+
308
+ hs->token_start = hs->s + hs->pos;
309
+ hs->token_len = hs->len - hs->pos;
310
+ hs->token_type = TAG_NAME_OPEN;
311
+ hs->state = h5_state_eof;
312
+ return 1;
313
+ }
314
+
315
+ /**
316
+ * 12.2.4.34
317
+ */
318
+ static int h5_state_before_attribute_name(h5_state_t* hs)
319
+ {
320
+ int ch;
321
+
322
+ TRACE();
323
+ ch = h5_skip_white(hs);
324
+ switch (ch) {
325
+ case CHAR_EOF: {
326
+ return 0;
327
+ }
328
+ case CHAR_SLASH: {
329
+ hs->pos += 1;
330
+ return h5_state_self_closing_start_tag(hs);
331
+ }
332
+ case CHAR_GT: {
333
+ hs->state = h5_state_data;
334
+ hs->token_start = hs->s + hs->pos;
335
+ hs->token_len = 1;
336
+ hs->token_type = TAG_NAME_CLOSE;
337
+ hs->pos += 1;
338
+ return 1;
339
+ }
340
+ default: {
341
+ return h5_state_attribute_name(hs);
342
+ }
343
+ }
344
+ }
345
+
346
+ static int h5_state_attribute_name(h5_state_t* hs)
347
+ {
348
+ char ch;
349
+ size_t pos;
350
+
351
+ TRACE();
352
+ pos = hs->pos + 1;
353
+ while (pos < hs->len) {
354
+ ch = hs->s[pos];
355
+ if (h5_is_white(ch)) {
356
+ hs->token_start = hs->s + hs->pos;
357
+ hs->token_len = pos - hs->pos;
358
+ hs->token_type = ATTR_NAME;
359
+ hs->state = h5_state_after_attribute_name;
360
+ hs->pos = pos + 1;
361
+ return 1;
362
+ } else if (ch == CHAR_SLASH) {
363
+ hs->token_start = hs->s + hs->pos;
364
+ hs->token_len = pos - hs->pos;
365
+ hs->token_type = ATTR_NAME;
366
+ hs->state = h5_state_self_closing_start_tag;
367
+ hs->pos = pos + 1;
368
+ return 1;
369
+ } else if (ch == CHAR_EQUALS) {
370
+ hs->token_start = hs->s + hs->pos;
371
+ hs->token_len = pos - hs->pos;
372
+ hs->token_type = ATTR_NAME;
373
+ hs->state = h5_state_before_attribute_value;
374
+ hs->pos = pos + 1;
375
+ return 1;
376
+ } else if (ch == CHAR_GT) {
377
+ hs->token_start = hs->s + hs->pos;
378
+ hs->token_len = pos - hs->pos;
379
+ hs->token_type = ATTR_NAME;
380
+ hs->state = h5_state_tag_name_close;
381
+ hs->pos = pos;
382
+ return 1;
383
+ } else {
384
+ pos += 1;
385
+ }
386
+ }
387
+ /* EOF */
388
+ hs->token_start = hs->s + hs->pos;
389
+ hs->token_len = hs->len - hs->pos;
390
+ hs->token_type = ATTR_NAME;
391
+ hs->state = h5_state_eof;
392
+ hs->pos = hs->len;
393
+ return 1;
394
+ }
395
+
396
+ /**
397
+ * 12.2.4.36
398
+ */
399
+ static int h5_state_after_attribute_name(h5_state_t* hs)
400
+ {
401
+ int c;
402
+
403
+ TRACE();
404
+ c = h5_skip_white(hs);
405
+ switch (c) {
406
+ case CHAR_EOF: {
407
+ return 0;
408
+ }
409
+ case CHAR_SLASH: {
410
+ hs->pos += 1;
411
+ return h5_state_self_closing_start_tag(hs);
412
+ }
413
+ case CHAR_EQUALS: {
414
+ hs->pos += 1;
415
+ return h5_state_before_attribute_value(hs);
416
+ }
417
+ case CHAR_GT: {
418
+ return h5_state_tag_name_close(hs);
419
+ }
420
+ default: {
421
+ return h5_state_attribute_name(hs);
422
+ }
423
+ }
424
+ }
425
+
426
+ /**
427
+ * 12.2.4.37
428
+ */
429
+ static int h5_state_before_attribute_value(h5_state_t* hs)
430
+ {
431
+ int c;
432
+ TRACE();
433
+
434
+ c = h5_skip_white(hs);
435
+
436
+ if (c == CHAR_EOF) {
437
+ hs->state = h5_state_eof;
438
+ return 0;
439
+ }
440
+
441
+ if (c == CHAR_DOUBLE) {
442
+ return h5_state_attribute_value_double_quote(hs);
443
+ } else if (c == CHAR_SINGLE) {
444
+ return h5_state_attribute_value_single_quote(hs);
445
+ } else if (c == CHAR_TICK) {
446
+ /* NON STANDARD IE */
447
+ return h5_state_attribute_value_back_quote(hs);
448
+ } else {
449
+ return h5_state_attribute_value_no_quote(hs);
450
+ }
451
+ }
452
+
453
+
454
+ static int h5_state_attribute_value_quote(h5_state_t* hs, char qchar)
455
+ {
456
+ const char* idx;
457
+
458
+ TRACE();
459
+
460
+ /* skip initial quote in normal case.
461
+ * don't do this "if (pos == 0)" since it means we have started
462
+ * in a non-data state. given an input of '><foo
463
+ * we want to make 0-length attribute name
464
+ */
465
+ if (hs->pos > 0) {
466
+ hs->pos += 1;
467
+ }
468
+
469
+
470
+ idx = (const char*) memchr(hs->s + hs->pos, qchar, hs->len - hs->pos);
471
+ if (idx == NULL) {
472
+ hs->token_start = hs->s + hs->pos;
473
+ hs->token_len = hs->len - hs->pos;
474
+ hs->token_type = ATTR_VALUE;
475
+ hs->state = h5_state_eof;
476
+ } else {
477
+ hs->token_start = hs->s + hs->pos;
478
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
479
+ hs->token_type = ATTR_VALUE;
480
+ hs->state = h5_state_after_attribute_value_quoted_state;
481
+ hs->pos += hs->token_len + 1;
482
+ }
483
+ return 1;
484
+ }
485
+
486
+ static
487
+ int h5_state_attribute_value_double_quote(h5_state_t* hs)
488
+ {
489
+ TRACE();
490
+ return h5_state_attribute_value_quote(hs, CHAR_DOUBLE);
491
+ }
492
+
493
+ static
494
+ int h5_state_attribute_value_single_quote(h5_state_t* hs)
495
+ {
496
+ TRACE();
497
+ return h5_state_attribute_value_quote(hs, CHAR_SINGLE);
498
+ }
499
+
500
+ static
501
+ int h5_state_attribute_value_back_quote(h5_state_t* hs)
502
+ {
503
+ TRACE();
504
+ return h5_state_attribute_value_quote(hs, CHAR_TICK);
505
+ }
506
+
507
+ static int h5_state_attribute_value_no_quote(h5_state_t* hs)
508
+ {
509
+ char ch;
510
+ size_t pos;
511
+
512
+ TRACE();
513
+ pos = hs->pos;
514
+ while (pos < hs->len) {
515
+ ch = hs->s[pos];
516
+ if (h5_is_white(ch)) {
517
+ hs->token_type = ATTR_VALUE;
518
+ hs->token_start = hs->s + hs->pos;
519
+ hs->token_len = pos - hs->pos;
520
+ hs->pos = pos + 1;
521
+ hs->state = h5_state_before_attribute_name;
522
+ return 1;
523
+ } else if (ch == CHAR_GT) {
524
+ hs->token_type = ATTR_VALUE;
525
+ hs->token_start = hs->s + hs->pos;
526
+ hs->token_len = pos - hs->pos;
527
+ hs->pos = pos;
528
+ hs->state = h5_state_tag_name_close;
529
+ return 1;
530
+ }
531
+ pos += 1;
532
+ }
533
+ TRACE();
534
+ /* EOF */
535
+ hs->state = h5_state_eof;
536
+ hs->token_start = hs->s + hs->pos;
537
+ hs->token_len = hs->len - hs->pos;
538
+ hs->token_type = ATTR_VALUE;
539
+ return 1;
540
+ }
541
+
542
+ /**
543
+ * 12.2.4.41
544
+ */
545
+ static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs)
546
+ {
547
+ char ch;
548
+
549
+ TRACE();
550
+ if (hs->pos >= hs->len) {
551
+ return 0;
552
+ }
553
+ ch = hs->s[hs->pos];
554
+ if (h5_is_white(ch)) {
555
+ hs->pos += 1;
556
+ return h5_state_before_attribute_name(hs);
557
+ } else if (ch == CHAR_SLASH) {
558
+ hs->pos += 1;
559
+ return h5_state_self_closing_start_tag(hs);
560
+ } else if (ch == CHAR_GT) {
561
+ hs->token_start = hs->s + hs->pos;
562
+ hs->token_len = 1;
563
+ hs->token_type = TAG_NAME_CLOSE;
564
+ hs->pos += 1;
565
+ hs->state = h5_state_data;
566
+ return 1;
567
+ } else {
568
+ return h5_state_before_attribute_name(hs);
569
+ }
570
+ }
571
+
572
+ /**
573
+ * 12.2.4.43
574
+ */
575
+ static int h5_state_self_closing_start_tag(h5_state_t* hs)
576
+ {
577
+ char ch;
578
+
579
+ TRACE();
580
+ if (hs->pos >= hs->len) {
581
+ return 0;
582
+ }
583
+ ch = hs->s[hs->pos];
584
+ if (ch == CHAR_GT) {
585
+ assert(hs->pos > 0);
586
+ hs->token_start = hs->s + hs->pos -1;
587
+ hs->token_len = 2;
588
+ hs->token_type = TAG_NAME_SELFCLOSE;
589
+ hs->state = h5_state_data;
590
+ hs->pos += 1;
591
+ return 1;
592
+ } else {
593
+ return h5_state_before_attribute_name(hs);
594
+ }
595
+ }
596
+
597
+ /**
598
+ * 12.2.4.44
599
+ */
600
+ static int h5_state_bogus_comment(h5_state_t* hs)
601
+ {
602
+ const char* idx;
603
+
604
+ TRACE();
605
+ idx = (const char*) memchr(hs->s + hs->pos, CHAR_GT, hs->len - hs->pos);
606
+ if (idx == NULL) {
607
+ hs->token_start = hs->s + hs->pos;
608
+ hs->token_len = hs->len - hs->pos;
609
+ hs->pos = hs->len;
610
+ hs->state = h5_state_eof;
611
+ } else {
612
+ hs->token_start = hs->s + hs->pos;
613
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
614
+ hs->pos = (size_t)(idx - hs->s) + 1;
615
+ hs->state = h5_state_data;
616
+ }
617
+
618
+ hs->token_type = TAG_COMMENT;
619
+ return 1;
620
+ }
621
+
622
+ /**
623
+ * 12.2.4.44 ALT
624
+ */
625
+ static int h5_state_bogus_comment2(h5_state_t* hs)
626
+ {
627
+ const char* idx;
628
+ size_t pos;
629
+
630
+ TRACE();
631
+ pos = hs->pos;
632
+ while (1) {
633
+ idx = (const char*) memchr(hs->s + pos, CHAR_PERCENT, hs->len - pos);
634
+ if (idx == NULL || (idx + 1 >= hs->s + hs->len)) {
635
+ hs->token_start = hs->s + hs->pos;
636
+ hs->token_len = hs->len - hs->pos;
637
+ hs->pos = hs->len;
638
+ hs->token_type = TAG_COMMENT;
639
+ hs->state = h5_state_eof;
640
+ return 1;
641
+ }
642
+
643
+ if (*(idx +1) != CHAR_GT) {
644
+ pos = (size_t)(idx - hs->s) + 1;
645
+ continue;
646
+ }
647
+
648
+ /* ends in %> */
649
+ hs->token_start = hs->s + hs->pos;
650
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
651
+ hs->pos = (size_t)(idx - hs->s) + 2;
652
+ hs->state = h5_state_data;
653
+ hs->token_type = TAG_COMMENT;
654
+ return 1;
655
+ }
656
+ }
657
+
658
+ /**
659
+ * 8.2.4.45
660
+ */
661
+ static int h5_state_markup_declaration_open(h5_state_t* hs)
662
+ {
663
+ size_t remaining;
664
+
665
+ TRACE();
666
+ remaining = hs->len - hs->pos;
667
+ if (remaining >= 7 &&
668
+ /* case insensitive */
669
+ (hs->s[hs->pos + 0] == 'D' || hs->s[hs->pos + 0] == 'd') &&
670
+ (hs->s[hs->pos + 1] == 'O' || hs->s[hs->pos + 1] == 'o') &&
671
+ (hs->s[hs->pos + 2] == 'C' || hs->s[hs->pos + 2] == 'c') &&
672
+ (hs->s[hs->pos + 3] == 'T' || hs->s[hs->pos + 3] == 't') &&
673
+ (hs->s[hs->pos + 4] == 'Y' || hs->s[hs->pos + 4] == 'y') &&
674
+ (hs->s[hs->pos + 5] == 'P' || hs->s[hs->pos + 5] == 'p') &&
675
+ (hs->s[hs->pos + 6] == 'E' || hs->s[hs->pos + 6] == 'e')
676
+ ) {
677
+ return h5_state_doctype(hs);
678
+ } else if (remaining >= 7 &&
679
+ /* upper case required */
680
+ hs->s[hs->pos + 0] == '[' &&
681
+ hs->s[hs->pos + 1] == 'C' &&
682
+ hs->s[hs->pos + 2] == 'D' &&
683
+ hs->s[hs->pos + 3] == 'A' &&
684
+ hs->s[hs->pos + 4] == 'T' &&
685
+ hs->s[hs->pos + 5] == 'A' &&
686
+ hs->s[hs->pos + 6] == '['
687
+ ) {
688
+ hs->pos += 7;
689
+ return h5_state_cdata(hs);
690
+ } else if (remaining >= 2 &&
691
+ hs->s[hs->pos + 0] == '-' &&
692
+ hs->s[hs->pos + 1] == '-') {
693
+ hs->pos += 2;
694
+ return h5_state_comment(hs);
695
+ }
696
+
697
+ return h5_state_bogus_comment(hs);
698
+ }
699
+
700
+ /**
701
+ * 12.2.4.48
702
+ * 12.2.4.49
703
+ * 12.2.4.50
704
+ * 12.2.4.51
705
+ * state machine spec is confusing since it can only look
706
+ * at one character at a time but simply it's comments end by:
707
+ * 1) EOF
708
+ * 2) ending in -->
709
+ * 3) ending in -!>
710
+ */
711
+ static int h5_state_comment(h5_state_t* hs)
712
+ {
713
+ char ch;
714
+ const char* idx;
715
+ size_t pos;
716
+ size_t offset;
717
+ const char* end = hs->s + hs->len;
718
+
719
+ TRACE();
720
+ pos = hs->pos;
721
+ while (1) {
722
+
723
+ idx = (const char*) memchr(hs->s + pos, CHAR_DASH, hs->len - pos);
724
+
725
+ /* did not find anything or has less than 3 chars left */
726
+ if (idx == NULL || idx > hs->s + hs->len - 3) {
727
+ hs->state = h5_state_eof;
728
+ hs->token_start = hs->s + hs->pos;
729
+ hs->token_len = hs->len - hs->pos;
730
+ hs->token_type = TAG_COMMENT;
731
+ return 1;
732
+ }
733
+ offset = 1;
734
+
735
+ /* skip all nulls */
736
+ while (idx + offset < end && *(idx + offset) == 0) {
737
+ offset += 1;
738
+ }
739
+ if (idx + offset == end) {
740
+ hs->state = h5_state_eof;
741
+ hs->token_start = hs->s + hs->pos;
742
+ hs->token_len = hs->len - hs->pos;
743
+ hs->token_type = TAG_COMMENT;
744
+ return 1;
745
+ }
746
+
747
+ ch = *(idx + offset);
748
+ if (ch != CHAR_DASH && ch != CHAR_BANG) {
749
+ pos = (size_t)(idx - hs->s) + 1;
750
+ continue;
751
+ }
752
+
753
+ /* need to test */
754
+ #if 0
755
+ /* skip all nulls */
756
+ while (idx + offset < end && *(idx + offset) == 0) {
757
+ offset += 1;
758
+ }
759
+ if (idx + offset == end) {
760
+ hs->state = h5_state_eof;
761
+ hs->token_start = hs->s + hs->pos;
762
+ hs->token_len = hs->len - hs->pos;
763
+ hs->token_type = TAG_COMMENT;
764
+ return 1;
765
+ }
766
+ #endif
767
+
768
+ offset += 1;
769
+ if (idx + offset == end) {
770
+ hs->state = h5_state_eof;
771
+ hs->token_start = hs->s + hs->pos;
772
+ hs->token_len = hs->len - hs->pos;
773
+ hs->token_type = TAG_COMMENT;
774
+ return 1;
775
+ }
776
+
777
+
778
+ ch = *(idx + offset);
779
+ if (ch != CHAR_GT) {
780
+ pos = (size_t)(idx - hs->s) + 1;
781
+ continue;
782
+ }
783
+ offset += 1;
784
+
785
+ /* ends in --> or -!> */
786
+ hs->token_start = hs->s + hs->pos;
787
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
788
+ hs->pos = (size_t)(idx + offset - hs->s);
789
+ hs->state = h5_state_data;
790
+ hs->token_type = TAG_COMMENT;
791
+ return 1;
792
+ }
793
+ }
794
+
795
+ static int h5_state_cdata(h5_state_t* hs)
796
+ {
797
+ const char* idx;
798
+ size_t pos;
799
+
800
+ TRACE();
801
+ pos = hs->pos;
802
+ while (1) {
803
+ idx = (const char*) memchr(hs->s + pos, CHAR_RIGHTB, hs->len - pos);
804
+
805
+ /* did not find anything or has less than 3 chars left */
806
+ if (idx == NULL || idx > hs->s + hs->len - 3) {
807
+ hs->state = h5_state_eof;
808
+ hs->token_start = hs->s + hs->pos;
809
+ hs->token_len = hs->len - hs->pos;
810
+ hs->token_type = DATA_TEXT;
811
+ return 1;
812
+ } else if ( *(idx+1) == CHAR_RIGHTB && *(idx+2) == CHAR_GT) {
813
+ hs->state = h5_state_data;
814
+ hs->token_start = hs->s + hs->pos;
815
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
816
+ hs->pos = (size_t)(idx - hs->s) + 3;
817
+ hs->token_type = DATA_TEXT;
818
+ return 1;
819
+ } else {
820
+ pos = (size_t)(idx - hs->s) + 1;
821
+ }
822
+ }
823
+ }
824
+
825
+ /**
826
+ * 8.2.4.52
827
+ * http://www.w3.org/html/wg/drafts/html/master/syntax.html#doctype-state
828
+ */
829
+ static int h5_state_doctype(h5_state_t* hs)
830
+ {
831
+ const char* idx;
832
+
833
+ TRACE();
834
+ hs->token_start = hs->s + hs->pos;
835
+ hs->token_type = DOCTYPE;
836
+
837
+ idx = (const char*) memchr(hs->s + hs->pos, CHAR_GT, hs->len - hs->pos);
838
+ if (idx == NULL) {
839
+ hs->state = h5_state_eof;
840
+ hs->token_len = hs->len - hs->pos;
841
+ } else {
842
+ hs->state = h5_state_data;
843
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
844
+ hs->pos = (size_t)(idx - hs->s) + 1;
845
+ }
846
+ return 1;
847
+ }