threatstack-agent-ruby 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +3 -0
  3. data/LICENSE +6 -0
  4. data/ext/libinjection/extconf.rb +4 -0
  5. data/ext/libinjection/libinjection.h +65 -0
  6. data/ext/libinjection/libinjection.i +13 -0
  7. data/ext/libinjection/libinjection_html5.c +850 -0
  8. data/ext/libinjection/libinjection_html5.h +54 -0
  9. data/ext/libinjection/libinjection_sqli.c +2325 -0
  10. data/ext/libinjection/libinjection_sqli.h +298 -0
  11. data/ext/libinjection/libinjection_sqli_data.h +9654 -0
  12. data/ext/libinjection/libinjection_wrap.c +2393 -0
  13. data/ext/libinjection/libinjection_xss.c +532 -0
  14. data/ext/libinjection/libinjection_xss.h +21 -0
  15. data/lib/constants.rb +110 -0
  16. data/lib/control.rb +61 -0
  17. data/lib/events/event_accumulator.rb +36 -0
  18. data/lib/events/models/attack_event.rb +58 -0
  19. data/lib/events/models/base_event.rb +41 -0
  20. data/lib/events/models/dependency_event.rb +93 -0
  21. data/lib/events/models/environment_event.rb +93 -0
  22. data/lib/events/models/instrumentation_event.rb +46 -0
  23. data/lib/exceptions/request_blocked_error.rb +11 -0
  24. data/lib/instrumentation/common.rb +172 -0
  25. data/lib/instrumentation/instrumenter.rb +144 -0
  26. data/lib/instrumentation/kernel.rb +45 -0
  27. data/lib/instrumentation/rails.rb +61 -0
  28. data/lib/jobs/delayed_job.rb +26 -0
  29. data/lib/jobs/event_submitter.rb +101 -0
  30. data/lib/jobs/job_queue.rb +38 -0
  31. data/lib/jobs/recurrent_job.rb +61 -0
  32. data/lib/threatstack-agent-ruby.rb +7 -0
  33. data/lib/utils/aws_utils.rb +46 -0
  34. data/lib/utils/formatter.rb +47 -0
  35. data/lib/utils/logger.rb +43 -0
  36. data/threatstack-agent-ruby.gemspec +35 -0
  37. metadata +221 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d56612a7470c1f394b66c6473479883be0ddc44810bebd77206acd42da1b31a6
4
+ data.tar.gz: 0d1bbad69a0ef6bb2ab53b662a8f004d06d353b2b02746ffb5616731eaef1c90
5
+ SHA512:
6
+ metadata.gz: 0c98e82db4c39fbfd7d930049b9ffe84c6efeda17e5f6b97ea971e240ffaec39c45683630af92ba95626a6571e0d9ecc02d01431c1c70284959b240576bb3e03
7
+ data.tar.gz: 767fb9d9a56bb8b4efdd9d7d9a28a373e86851550ba8c9e4401d4b2803d6bda1964ba0f47b566c25c12b54f8db5df49c48249bc12734f29f6dcb57a7e706879c
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,6 @@
1
+ Threat Stack AppSec for Ruby is free-to-use, proprietary software.
2
+ Please refer to our terms for more information: https://www.threatstack.com/terms
3
+
4
+ The Threat Stack Ruby AppSec agent also uses code from the following open source projects under the following licenses:
5
+
6
+ libinjection http://opensource.org/licenses/BSD-3-Clause
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mkmf'
4
+ create_makefile('libinjection')
@@ -0,0 +1,65 @@
1
+ /**
2
+ * Copyright 2012-2016 Nick Galbreath
3
+ * nickg@client9.com
4
+ * BSD License -- see COPYING.txt for details
5
+ *
6
+ * https://libinjection.client9.com/
7
+ *
8
+ */
9
+
10
+ #ifndef LIBINJECTION_H
11
+ #define LIBINJECTION_H
12
+
13
+ #ifdef __cplusplus
14
+ # define LIBINJECTION_BEGIN_DECLS extern "C" {
15
+ # define LIBINJECTION_END_DECLS }
16
+ #else
17
+ # define LIBINJECTION_BEGIN_DECLS
18
+ # define LIBINJECTION_END_DECLS
19
+ #endif
20
+
21
+ LIBINJECTION_BEGIN_DECLS
22
+
23
+ /*
24
+ * Pull in size_t
25
+ */
26
+ #include <string.h>
27
+
28
+ /*
29
+ * Version info.
30
+ *
31
+ * This is moved into a function to allow SWIG and other auto-generated
32
+ * binding to not be modified during minor release changes. We change
33
+ * the version number in the c source file, and do not regenerate
34
+ * the binding
35
+ *
36
+ * See python's normalized version
37
+ * http://www.python.org/dev/peps/pep-0386/#normalizedversion
38
+ */
39
+ const char* libinjection_version(void);
40
+
41
+ /**
42
+ * Simple API for SQLi detection - sets a SQLi fingerprint, or an empty
42
+ * string if the input is benign
44
+ *
45
+ * \param[in] s input string, may contain nulls, does not need to be null-terminated
46
+ * \param[in] slen input string length
47
+ * \param[out] fingerprint buffer of 8+ characters. c-string,
48
+ * \return 1 if SQLi, 0 if benign. fingerprint will be set or set to empty string.
49
+ */
50
+ int libinjection_sqli(const char* s, size_t slen, char fingerprint[]);
51
+
52
+ /** ALPHA version of xss detector.
53
+ *
54
+ * NOT DONE.
55
+ *
56
+ * \param[in] s input string, may contain nulls, does not need to be null-terminated
57
+ * \param[in] slen input string length
58
+ * \return 1 if XSS found, 0 if benign
59
+ *
60
+ */
61
+ int libinjection_xss(const char* s, size_t slen);
62
+
63
+ LIBINJECTION_END_DECLS
64
+
65
+ #endif /* LIBINJECTION_H */
@@ -0,0 +1,13 @@
1
+ /* libinjection.i SWIG interface file */
2
+
3
+ %module libinjection
4
+ %{
5
+ #include "libinjection.h"
6
+ #include "libinjection_sqli.h"
7
+ #include "libinjection_xss.h"
8
+ %}
9
+
10
+ %include "libinjection.h"
11
+
12
+ // int libinjection_sqli(char *, size_t, char *OUTPUT);
13
+ // int libinjection_xss(char *, size_t);
@@ -0,0 +1,850 @@
1
+ #include "libinjection_html5.h"
2
+
3
+ #include <string.h>
4
+ #include <assert.h>
5
+
6
+ #ifdef DEBUG
7
+ #include <stdio.h>
8
+ #define TRACE() printf("%s:%d\n", __FUNCTION__, __LINE__)
9
+ #else
10
+ #define TRACE()
11
+ #endif
12
+
13
+
14
+ #define CHAR_EOF -1
15
+ #define CHAR_NULL 0
16
+ #define CHAR_BANG 33
17
+ #define CHAR_DOUBLE 34
18
+ #define CHAR_PERCENT 37
19
+ #define CHAR_SINGLE 39
20
+ #define CHAR_DASH 45
21
+ #define CHAR_SLASH 47
22
+ #define CHAR_LT 60
23
+ #define CHAR_EQUALS 61
24
+ #define CHAR_GT 62
25
+ #define CHAR_QUESTION 63
26
+ #define CHAR_RIGHTB 93
27
+ #define CHAR_TICK 96
28
+
29
+ /* prototypes */
30
+
31
+ static int h5_skip_white(h5_state_t* hs);
32
+ static int h5_is_white(char c);
33
+ static int h5_state_eof(h5_state_t* hs);
34
+ static int h5_state_data(h5_state_t* hs);
35
+ static int h5_state_tag_open(h5_state_t* hs);
36
+ static int h5_state_tag_name(h5_state_t* hs);
37
+ static int h5_state_tag_name_close(h5_state_t* hs);
38
+ static int h5_state_end_tag_open(h5_state_t* hs);
39
+ static int h5_state_self_closing_start_tag(h5_state_t* hs);
40
+ static int h5_state_attribute_name(h5_state_t* hs);
41
+ static int h5_state_after_attribute_name(h5_state_t* hs);
42
+ static int h5_state_before_attribute_name(h5_state_t* hs);
43
+ static int h5_state_before_attribute_value(h5_state_t* hs);
44
+ static int h5_state_attribute_value_double_quote(h5_state_t* hs);
45
+ static int h5_state_attribute_value_single_quote(h5_state_t* hs);
46
+ static int h5_state_attribute_value_back_quote(h5_state_t* hs);
47
+ static int h5_state_attribute_value_no_quote(h5_state_t* hs);
48
+ static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs);
49
+ static int h5_state_comment(h5_state_t* hs);
50
+ static int h5_state_cdata(h5_state_t* hs);
51
+
52
+
53
+ /* 12.2.4.44 */
54
+ static int h5_state_bogus_comment(h5_state_t* hs);
55
+ static int h5_state_bogus_comment2(h5_state_t* hs);
56
+
57
+ /* 12.2.4.45 */
58
+ static int h5_state_markup_declaration_open(h5_state_t* hs);
59
+
60
+ /* 8.2.4.52 */
61
+ static int h5_state_doctype(h5_state_t* hs);
62
+
63
+ /**
64
+ * public function
65
+ */
66
+ void libinjection_h5_init(h5_state_t* hs, const char* s, size_t len, enum html5_flags flags)
67
+ {
68
+ memset(hs, 0, sizeof(h5_state_t));
69
+ hs->s = s;
70
+ hs->len = len;
71
+
72
+ switch (flags) {
73
+ case DATA_STATE:
74
+ hs->state = h5_state_data;
75
+ break;
76
+ case VALUE_NO_QUOTE:
77
+ hs->state = h5_state_before_attribute_name;
78
+ break;
79
+ case VALUE_SINGLE_QUOTE:
80
+ hs->state = h5_state_attribute_value_single_quote;
81
+ break;
82
+ case VALUE_DOUBLE_QUOTE:
83
+ hs->state = h5_state_attribute_value_double_quote;
84
+ break;
85
+ case VALUE_BACK_QUOTE:
86
+ hs->state = h5_state_attribute_value_back_quote;
87
+ break;
88
+ }
89
+ }
90
+
91
+ /**
92
+ * public function
93
+ */
94
+ int libinjection_h5_next(h5_state_t* hs)
95
+ {
96
+ assert(hs->state != NULL);
97
+ return (*hs->state)(hs);
98
+ }
99
+
100
+ /**
101
+ * Everything below here is private
102
+ *
103
+ */
104
+
105
+
/*
 * Returns 1 when `ch` is one of:
 *   ' '  = space          = 0x20
 *   \t   = horizontal tab = 0x09
 *   \n   = newline        = 0x0A
 *   \v   = vertical tab   = 0x0B
 *   \f   = form feed      = 0x0C
 *   \r   = cr             = 0x0D
 *   \0   = NUL            = 0x00
 * and 0 otherwise. NUL is listed explicitly so that it keeps being
 * reported as whitespace.
 */
static int h5_is_white(char ch)
{
    switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
    case '\0':
        return 1;
    default:
        return 0;
    }
}
117
+
118
+ static int h5_skip_white(h5_state_t* hs)
119
+ {
120
+ char ch;
121
+ while (hs->pos < hs->len) {
122
+ ch = hs->s[hs->pos];
123
+ switch (ch) {
124
+ case 0x00: /* IE only */
125
+ case 0x20:
126
+ case 0x09:
127
+ case 0x0A:
128
+ case 0x0B: /* IE only */
129
+ case 0x0C:
130
+ case 0x0D: /* IE only */
131
+ hs->pos += 1;
132
+ break;
133
+ default:
134
+ return ch;
135
+ }
136
+ }
137
+ return CHAR_EOF;
138
+ }
139
+
140
+ static int h5_state_eof(h5_state_t* hs)
141
+ {
142
+ /* eliminate unused function argument warning */
143
+ (void)hs;
144
+ return 0;
145
+ }
146
+
147
+ static int h5_state_data(h5_state_t* hs)
148
+ {
149
+ const char* idx;
150
+
151
+ TRACE();
152
+ assert(hs->len >= hs->pos);
153
+ idx = (const char*) memchr(hs->s + hs->pos, CHAR_LT, hs->len - hs->pos);
154
+ if (idx == NULL) {
155
+ hs->token_start = hs->s + hs->pos;
156
+ hs->token_len = hs->len - hs->pos;
157
+ hs->token_type = DATA_TEXT;
158
+ hs->state = h5_state_eof;
159
+ if (hs->token_len == 0) {
160
+ return 0;
161
+ }
162
+ } else {
163
+ hs->token_start = hs->s + hs->pos;
164
+ hs->token_type = DATA_TEXT;
165
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
166
+ hs->pos = (size_t)(idx - hs->s) + 1;
167
+ hs->state = h5_state_tag_open;
168
+ if (hs->token_len == 0) {
169
+ return h5_state_tag_open(hs);
170
+ }
171
+ }
172
+ return 1;
173
+ }
174
+
175
+ /**
176
+ * 12 2.4.8
177
+ */
178
+ static int h5_state_tag_open(h5_state_t* hs)
179
+ {
180
+ char ch;
181
+
182
+ TRACE();
183
+ if (hs->pos >= hs->len) {
184
+ return 0;
185
+ }
186
+ ch = hs->s[hs->pos];
187
+ if (ch == CHAR_BANG) {
188
+ hs->pos += 1;
189
+ return h5_state_markup_declaration_open(hs);
190
+ } else if (ch == CHAR_SLASH) {
191
+ hs->pos += 1;
192
+ hs->is_close = 1;
193
+ return h5_state_end_tag_open(hs);
194
+ } else if (ch == CHAR_QUESTION) {
195
+ hs->pos += 1;
196
+ return h5_state_bogus_comment(hs);
197
+ } else if (ch == CHAR_PERCENT) {
198
+ /* this is not in spec.. alternative comment format used
199
+ by IE <= 9 and Safari < 4.0.3 */
200
+ hs->pos += 1;
201
+ return h5_state_bogus_comment2(hs);
202
+ } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
203
+ return h5_state_tag_name(hs);
204
+ } else if (ch == CHAR_NULL) {
205
+ /* IE-ism NULL characters are ignored */
206
+ return h5_state_tag_name(hs);
207
+ } else {
208
+ /* user input mistake in configuring state */
209
+ if (hs->pos == 0) {
210
+ return h5_state_data(hs);
211
+ }
212
+ hs->token_start = hs->s + hs->pos - 1;
213
+ hs->token_len = 1;
214
+ hs->token_type = DATA_TEXT;
215
+ hs->state = h5_state_data;
216
+ return 1;
217
+ }
218
+ }
219
+ /**
220
+ * 12.2.4.9
221
+ */
222
+ static int h5_state_end_tag_open(h5_state_t* hs)
223
+ {
224
+ char ch;
225
+
226
+ TRACE();
227
+
228
+ if (hs->pos >= hs->len) {
229
+ return 0;
230
+ }
231
+ ch = hs->s[hs->pos];
232
+ if (ch == CHAR_GT) {
233
+ return h5_state_data(hs);
234
+ } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
235
+ return h5_state_tag_name(hs);
236
+ }
237
+
238
+ hs->is_close = 0;
239
+ return h5_state_bogus_comment(hs);
240
+ }
241
+ /*
242
+ *
243
+ */
244
+ static int h5_state_tag_name_close(h5_state_t* hs)
245
+ {
246
+ TRACE();
247
+ hs->is_close = 0;
248
+ hs->token_start = hs->s + hs->pos;
249
+ hs->token_len = 1;
250
+ hs->token_type = TAG_NAME_CLOSE;
251
+ hs->pos += 1;
252
+ if (hs->pos < hs->len) {
253
+ hs->state = h5_state_data;
254
+ } else {
255
+ hs->state = h5_state_eof;
256
+ }
257
+
258
+ return 1;
259
+ }
260
+
261
+ /**
262
+ * 12.2.4.10
263
+ */
264
+ static int h5_state_tag_name(h5_state_t* hs)
265
+ {
266
+ char ch;
267
+ size_t pos;
268
+
269
+ TRACE();
270
+ pos = hs->pos;
271
+ while (pos < hs->len) {
272
+ ch = hs->s[pos];
273
+ if (ch == 0) {
274
+ /* special non-standard case */
275
+ /* allow nulls in tag name */
276
+ /* some old browsers apparently allow and ignore them */
277
+ pos += 1;
278
+ } else if (h5_is_white(ch)) {
279
+ hs->token_start = hs->s + hs->pos;
280
+ hs->token_len = pos - hs->pos;
281
+ hs->token_type = TAG_NAME_OPEN;
282
+ hs->pos = pos + 1;
283
+ hs->state = h5_state_before_attribute_name;
284
+ return 1;
285
+ } else if (ch == CHAR_SLASH) {
286
+ hs->token_start = hs->s + hs->pos;
287
+ hs->token_len = pos - hs->pos;
288
+ hs->token_type = TAG_NAME_OPEN;
289
+ hs->pos = pos + 1;
290
+ hs->state = h5_state_self_closing_start_tag;
291
+ return 1;
292
+ } else if (ch == CHAR_GT) {
293
+ hs->token_start = hs->s + hs->pos;
294
+ hs->token_len = pos - hs->pos;
295
+ if (hs->is_close) {
296
+ hs->pos = pos + 1;
297
+ hs->is_close = 0;
298
+ hs->token_type = TAG_CLOSE;
299
+ hs->state = h5_state_data;
300
+ } else {
301
+ hs->pos = pos;
302
+ hs->token_type = TAG_NAME_OPEN;
303
+ hs->state = h5_state_tag_name_close;
304
+ }
305
+ return 1;
306
+ } else {
307
+ pos += 1;
308
+ }
309
+ }
310
+
311
+ hs->token_start = hs->s + hs->pos;
312
+ hs->token_len = hs->len - hs->pos;
313
+ hs->token_type = TAG_NAME_OPEN;
314
+ hs->state = h5_state_eof;
315
+ return 1;
316
+ }
317
+
318
+ /**
319
+ * 12.2.4.34
320
+ */
321
+ static int h5_state_before_attribute_name(h5_state_t* hs)
322
+ {
323
+ int ch;
324
+
325
+ TRACE();
326
+ ch = h5_skip_white(hs);
327
+ switch (ch) {
328
+ case CHAR_EOF: {
329
+ return 0;
330
+ }
331
+ case CHAR_SLASH: {
332
+ hs->pos += 1;
333
+ return h5_state_self_closing_start_tag(hs);
334
+ }
335
+ case CHAR_GT: {
336
+ hs->state = h5_state_data;
337
+ hs->token_start = hs->s + hs->pos;
338
+ hs->token_len = 1;
339
+ hs->token_type = TAG_NAME_CLOSE;
340
+ hs->pos += 1;
341
+ return 1;
342
+ }
343
+ default: {
344
+ return h5_state_attribute_name(hs);
345
+ }
346
+ }
347
+ }
348
+
349
+ static int h5_state_attribute_name(h5_state_t* hs)
350
+ {
351
+ char ch;
352
+ size_t pos;
353
+
354
+ TRACE();
355
+ pos = hs->pos + 1;
356
+ while (pos < hs->len) {
357
+ ch = hs->s[pos];
358
+ if (h5_is_white(ch)) {
359
+ hs->token_start = hs->s + hs->pos;
360
+ hs->token_len = pos - hs->pos;
361
+ hs->token_type = ATTR_NAME;
362
+ hs->state = h5_state_after_attribute_name;
363
+ hs->pos = pos + 1;
364
+ return 1;
365
+ } else if (ch == CHAR_SLASH) {
366
+ hs->token_start = hs->s + hs->pos;
367
+ hs->token_len = pos - hs->pos;
368
+ hs->token_type = ATTR_NAME;
369
+ hs->state = h5_state_self_closing_start_tag;
370
+ hs->pos = pos + 1;
371
+ return 1;
372
+ } else if (ch == CHAR_EQUALS) {
373
+ hs->token_start = hs->s + hs->pos;
374
+ hs->token_len = pos - hs->pos;
375
+ hs->token_type = ATTR_NAME;
376
+ hs->state = h5_state_before_attribute_value;
377
+ hs->pos = pos + 1;
378
+ return 1;
379
+ } else if (ch == CHAR_GT) {
380
+ hs->token_start = hs->s + hs->pos;
381
+ hs->token_len = pos - hs->pos;
382
+ hs->token_type = ATTR_NAME;
383
+ hs->state = h5_state_tag_name_close;
384
+ hs->pos = pos;
385
+ return 1;
386
+ } else {
387
+ pos += 1;
388
+ }
389
+ }
390
+ /* EOF */
391
+ hs->token_start = hs->s + hs->pos;
392
+ hs->token_len = hs->len - hs->pos;
393
+ hs->token_type = ATTR_NAME;
394
+ hs->state = h5_state_eof;
395
+ hs->pos = hs->len;
396
+ return 1;
397
+ }
398
+
399
+ /**
400
+ * 12.2.4.36
401
+ */
402
+ static int h5_state_after_attribute_name(h5_state_t* hs)
403
+ {
404
+ int c;
405
+
406
+ TRACE();
407
+ c = h5_skip_white(hs);
408
+ switch (c) {
409
+ case CHAR_EOF: {
410
+ return 0;
411
+ }
412
+ case CHAR_SLASH: {
413
+ hs->pos += 1;
414
+ return h5_state_self_closing_start_tag(hs);
415
+ }
416
+ case CHAR_EQUALS: {
417
+ hs->pos += 1;
418
+ return h5_state_before_attribute_value(hs);
419
+ }
420
+ case CHAR_GT: {
421
+ return h5_state_tag_name_close(hs);
422
+ }
423
+ default: {
424
+ return h5_state_attribute_name(hs);
425
+ }
426
+ }
427
+ }
428
+
429
+ /**
430
+ * 12.2.4.37
431
+ */
432
+ static int h5_state_before_attribute_value(h5_state_t* hs)
433
+ {
434
+ int c;
435
+ TRACE();
436
+
437
+ c = h5_skip_white(hs);
438
+
439
+ if (c == CHAR_EOF) {
440
+ hs->state = h5_state_eof;
441
+ return 0;
442
+ }
443
+
444
+ if (c == CHAR_DOUBLE) {
445
+ return h5_state_attribute_value_double_quote(hs);
446
+ } else if (c == CHAR_SINGLE) {
447
+ return h5_state_attribute_value_single_quote(hs);
448
+ } else if (c == CHAR_TICK) {
449
+ /* NON STANDARD IE */
450
+ return h5_state_attribute_value_back_quote(hs);
451
+ } else {
452
+ return h5_state_attribute_value_no_quote(hs);
453
+ }
454
+ }
455
+
456
+
457
+ static int h5_state_attribute_value_quote(h5_state_t* hs, char qchar)
458
+ {
459
+ const char* idx;
460
+
461
+ TRACE();
462
+
463
+ /* skip initial quote in normal case.
464
+ * don't do this "if (pos == 0)" since it means we have started
465
+ * in a non-data state. given an input of '><foo
466
+ * we want to make 0-length attribute name
467
+ */
468
+ if (hs->pos > 0) {
469
+ hs->pos += 1;
470
+ }
471
+
472
+
473
+ idx = (const char*) memchr(hs->s + hs->pos, qchar, hs->len - hs->pos);
474
+ if (idx == NULL) {
475
+ hs->token_start = hs->s + hs->pos;
476
+ hs->token_len = hs->len - hs->pos;
477
+ hs->token_type = ATTR_VALUE;
478
+ hs->state = h5_state_eof;
479
+ } else {
480
+ hs->token_start = hs->s + hs->pos;
481
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
482
+ hs->token_type = ATTR_VALUE;
483
+ hs->state = h5_state_after_attribute_value_quoted_state;
484
+ hs->pos += hs->token_len + 1;
485
+ }
486
+ return 1;
487
+ }
488
+
489
+ static
490
+ int h5_state_attribute_value_double_quote(h5_state_t* hs)
491
+ {
492
+ TRACE();
493
+ return h5_state_attribute_value_quote(hs, CHAR_DOUBLE);
494
+ }
495
+
496
+ static
497
+ int h5_state_attribute_value_single_quote(h5_state_t* hs)
498
+ {
499
+ TRACE();
500
+ return h5_state_attribute_value_quote(hs, CHAR_SINGLE);
501
+ }
502
+
503
+ static
504
+ int h5_state_attribute_value_back_quote(h5_state_t* hs)
505
+ {
506
+ TRACE();
507
+ return h5_state_attribute_value_quote(hs, CHAR_TICK);
508
+ }
509
+
510
+ static int h5_state_attribute_value_no_quote(h5_state_t* hs)
511
+ {
512
+ char ch;
513
+ size_t pos;
514
+
515
+ TRACE();
516
+ pos = hs->pos;
517
+ while (pos < hs->len) {
518
+ ch = hs->s[pos];
519
+ if (h5_is_white(ch)) {
520
+ hs->token_type = ATTR_VALUE;
521
+ hs->token_start = hs->s + hs->pos;
522
+ hs->token_len = pos - hs->pos;
523
+ hs->pos = pos + 1;
524
+ hs->state = h5_state_before_attribute_name;
525
+ return 1;
526
+ } else if (ch == CHAR_GT) {
527
+ hs->token_type = ATTR_VALUE;
528
+ hs->token_start = hs->s + hs->pos;
529
+ hs->token_len = pos - hs->pos;
530
+ hs->pos = pos;
531
+ hs->state = h5_state_tag_name_close;
532
+ return 1;
533
+ }
534
+ pos += 1;
535
+ }
536
+ TRACE();
537
+ /* EOF */
538
+ hs->state = h5_state_eof;
539
+ hs->token_start = hs->s + hs->pos;
540
+ hs->token_len = hs->len - hs->pos;
541
+ hs->token_type = ATTR_VALUE;
542
+ return 1;
543
+ }
544
+
545
+ /**
546
+ * 12.2.4.41
547
+ */
548
+ static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs)
549
+ {
550
+ char ch;
551
+
552
+ TRACE();
553
+ if (hs->pos >= hs->len) {
554
+ return 0;
555
+ }
556
+ ch = hs->s[hs->pos];
557
+ if (h5_is_white(ch)) {
558
+ hs->pos += 1;
559
+ return h5_state_before_attribute_name(hs);
560
+ } else if (ch == CHAR_SLASH) {
561
+ hs->pos += 1;
562
+ return h5_state_self_closing_start_tag(hs);
563
+ } else if (ch == CHAR_GT) {
564
+ hs->token_start = hs->s + hs->pos;
565
+ hs->token_len = 1;
566
+ hs->token_type = TAG_NAME_CLOSE;
567
+ hs->pos += 1;
568
+ hs->state = h5_state_data;
569
+ return 1;
570
+ } else {
571
+ return h5_state_before_attribute_name(hs);
572
+ }
573
+ }
574
+
575
+ /**
576
+ * 12.2.4.43
577
+ */
578
+ static int h5_state_self_closing_start_tag(h5_state_t* hs)
579
+ {
580
+ char ch;
581
+
582
+ TRACE();
583
+ if (hs->pos >= hs->len) {
584
+ return 0;
585
+ }
586
+ ch = hs->s[hs->pos];
587
+ if (ch == CHAR_GT) {
588
+ assert(hs->pos > 0);
589
+ hs->token_start = hs->s + hs->pos -1;
590
+ hs->token_len = 2;
591
+ hs->token_type = TAG_NAME_SELFCLOSE;
592
+ hs->state = h5_state_data;
593
+ hs->pos += 1;
594
+ return 1;
595
+ } else {
596
+ return h5_state_before_attribute_name(hs);
597
+ }
598
+ }
599
+
600
+ /**
601
+ * 12.2.4.44
602
+ */
603
+ static int h5_state_bogus_comment(h5_state_t* hs)
604
+ {
605
+ const char* idx;
606
+
607
+ TRACE();
608
+ idx = (const char*) memchr(hs->s + hs->pos, CHAR_GT, hs->len - hs->pos);
609
+ if (idx == NULL) {
610
+ hs->token_start = hs->s + hs->pos;
611
+ hs->token_len = hs->len - hs->pos;
612
+ hs->pos = hs->len;
613
+ hs->state = h5_state_eof;
614
+ } else {
615
+ hs->token_start = hs->s + hs->pos;
616
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
617
+ hs->pos = (size_t)(idx - hs->s) + 1;
618
+ hs->state = h5_state_data;
619
+ }
620
+
621
+ hs->token_type = TAG_COMMENT;
622
+ return 1;
623
+ }
624
+
625
+ /**
626
+ * 12.2.4.44 ALT
627
+ */
628
+ static int h5_state_bogus_comment2(h5_state_t* hs)
629
+ {
630
+ const char* idx;
631
+ size_t pos;
632
+
633
+ TRACE();
634
+ pos = hs->pos;
635
+ while (1) {
636
+ idx = (const char*) memchr(hs->s + pos, CHAR_PERCENT, hs->len - pos);
637
+ if (idx == NULL || (idx + 1 >= hs->s + hs->len)) {
638
+ hs->token_start = hs->s + hs->pos;
639
+ hs->token_len = hs->len - hs->pos;
640
+ hs->pos = hs->len;
641
+ hs->token_type = TAG_COMMENT;
642
+ hs->state = h5_state_eof;
643
+ return 1;
644
+ }
645
+
646
+ if (*(idx +1) != CHAR_GT) {
647
+ pos = (size_t)(idx - hs->s) + 1;
648
+ continue;
649
+ }
650
+
651
+ /* ends in %> */
652
+ hs->token_start = hs->s + hs->pos;
653
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
654
+ hs->pos = (size_t)(idx - hs->s) + 2;
655
+ hs->state = h5_state_data;
656
+ hs->token_type = TAG_COMMENT;
657
+ return 1;
658
+ }
659
+ }
660
+
661
+ /**
662
+ * 8.2.4.45
663
+ */
664
+ static int h5_state_markup_declaration_open(h5_state_t* hs)
665
+ {
666
+ size_t remaining;
667
+
668
+ TRACE();
669
+ remaining = hs->len - hs->pos;
670
+ if (remaining >= 7 &&
671
+ /* case insensitive */
672
+ (hs->s[hs->pos + 0] == 'D' || hs->s[hs->pos + 0] == 'd') &&
673
+ (hs->s[hs->pos + 1] == 'O' || hs->s[hs->pos + 1] == 'o') &&
674
+ (hs->s[hs->pos + 2] == 'C' || hs->s[hs->pos + 2] == 'c') &&
675
+ (hs->s[hs->pos + 3] == 'T' || hs->s[hs->pos + 3] == 't') &&
676
+ (hs->s[hs->pos + 4] == 'Y' || hs->s[hs->pos + 4] == 'y') &&
677
+ (hs->s[hs->pos + 5] == 'P' || hs->s[hs->pos + 5] == 'p') &&
678
+ (hs->s[hs->pos + 6] == 'E' || hs->s[hs->pos + 6] == 'e')
679
+ ) {
680
+ return h5_state_doctype(hs);
681
+ } else if (remaining >= 7 &&
682
+ /* upper case required */
683
+ hs->s[hs->pos + 0] == '[' &&
684
+ hs->s[hs->pos + 1] == 'C' &&
685
+ hs->s[hs->pos + 2] == 'D' &&
686
+ hs->s[hs->pos + 3] == 'A' &&
687
+ hs->s[hs->pos + 4] == 'T' &&
688
+ hs->s[hs->pos + 5] == 'A' &&
689
+ hs->s[hs->pos + 6] == '['
690
+ ) {
691
+ hs->pos += 7;
692
+ return h5_state_cdata(hs);
693
+ } else if (remaining >= 2 &&
694
+ hs->s[hs->pos + 0] == '-' &&
695
+ hs->s[hs->pos + 1] == '-') {
696
+ hs->pos += 2;
697
+ return h5_state_comment(hs);
698
+ }
699
+
700
+ return h5_state_bogus_comment(hs);
701
+ }
702
+
703
+ /**
704
+ * 12.2.4.48
705
+ * 12.2.4.49
706
+ * 12.2.4.50
707
+ * 12.2.4.51
708
+ * state machine spec is confusing since it can only look
709
+ * at one character at a time but simply it's comments end by:
710
+ * 1) EOF
711
+ * 2) ending in -->
712
+ * 3) ending in -!>
713
+ */
714
+ static int h5_state_comment(h5_state_t* hs)
715
+ {
716
+ char ch;
717
+ const char* idx;
718
+ size_t pos;
719
+ size_t offset;
720
+ const char* end = hs->s + hs->len;
721
+
722
+ TRACE();
723
+ pos = hs->pos;
724
+ while (1) {
725
+
726
+ idx = (const char*) memchr(hs->s + pos, CHAR_DASH, hs->len - pos);
727
+
728
+ /* did not find anything or has less than 3 chars left */
729
+ if (idx == NULL || idx > hs->s + hs->len - 3) {
730
+ hs->state = h5_state_eof;
731
+ hs->token_start = hs->s + hs->pos;
732
+ hs->token_len = hs->len - hs->pos;
733
+ hs->token_type = TAG_COMMENT;
734
+ return 1;
735
+ }
736
+ offset = 1;
737
+
738
+ /* skip all nulls */
739
+ while (idx + offset < end && *(idx + offset) == 0) {
740
+ offset += 1;
741
+ }
742
+ if (idx + offset == end) {
743
+ hs->state = h5_state_eof;
744
+ hs->token_start = hs->s + hs->pos;
745
+ hs->token_len = hs->len - hs->pos;
746
+ hs->token_type = TAG_COMMENT;
747
+ return 1;
748
+ }
749
+
750
+ ch = *(idx + offset);
751
+ if (ch != CHAR_DASH && ch != CHAR_BANG) {
752
+ pos = (size_t)(idx - hs->s) + 1;
753
+ continue;
754
+ }
755
+
756
+ /* need to test */
757
+ #if 0
758
+ /* skip all nulls */
759
+ while (idx + offset < end && *(idx + offset) == 0) {
760
+ offset += 1;
761
+ }
762
+ if (idx + offset == end) {
763
+ hs->state = h5_state_eof;
764
+ hs->token_start = hs->s + hs->pos;
765
+ hs->token_len = hs->len - hs->pos;
766
+ hs->token_type = TAG_COMMENT;
767
+ return 1;
768
+ }
769
+ #endif
770
+
771
+ offset += 1;
772
+ if (idx + offset == end) {
773
+ hs->state = h5_state_eof;
774
+ hs->token_start = hs->s + hs->pos;
775
+ hs->token_len = hs->len - hs->pos;
776
+ hs->token_type = TAG_COMMENT;
777
+ return 1;
778
+ }
779
+
780
+
781
+ ch = *(idx + offset);
782
+ if (ch != CHAR_GT) {
783
+ pos = (size_t)(idx - hs->s) + 1;
784
+ continue;
785
+ }
786
+ offset += 1;
787
+
788
+ /* ends in --> or -!> */
789
+ hs->token_start = hs->s + hs->pos;
790
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
791
+ hs->pos = (size_t)(idx + offset - hs->s);
792
+ hs->state = h5_state_data;
793
+ hs->token_type = TAG_COMMENT;
794
+ return 1;
795
+ }
796
+ }
797
+
798
+ static int h5_state_cdata(h5_state_t* hs)
799
+ {
800
+ const char* idx;
801
+ size_t pos;
802
+
803
+ TRACE();
804
+ pos = hs->pos;
805
+ while (1) {
806
+ idx = (const char*) memchr(hs->s + pos, CHAR_RIGHTB, hs->len - pos);
807
+
808
+ /* did not find anything or has less than 3 chars left */
809
+ if (idx == NULL || idx > hs->s + hs->len - 3) {
810
+ hs->state = h5_state_eof;
811
+ hs->token_start = hs->s + hs->pos;
812
+ hs->token_len = hs->len - hs->pos;
813
+ hs->token_type = DATA_TEXT;
814
+ return 1;
815
+ } else if ( *(idx+1) == CHAR_RIGHTB && *(idx+2) == CHAR_GT) {
816
+ hs->state = h5_state_data;
817
+ hs->token_start = hs->s + hs->pos;
818
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
819
+ hs->pos = (size_t)(idx - hs->s) + 3;
820
+ hs->token_type = DATA_TEXT;
821
+ return 1;
822
+ } else {
823
+ pos = (size_t)(idx - hs->s) + 1;
824
+ }
825
+ }
826
+ }
827
+
828
+ /**
829
+ * 8.2.4.52
830
+ * http://www.w3.org/html/wg/drafts/html/master/syntax.html#doctype-state
831
+ */
832
+ static int h5_state_doctype(h5_state_t* hs)
833
+ {
834
+ const char* idx;
835
+
836
+ TRACE();
837
+ hs->token_start = hs->s + hs->pos;
838
+ hs->token_type = DOCTYPE;
839
+
840
+ idx = (const char*) memchr(hs->s + hs->pos, CHAR_GT, hs->len - hs->pos);
841
+ if (idx == NULL) {
842
+ hs->state = h5_state_eof;
843
+ hs->token_len = hs->len - hs->pos;
844
+ } else {
845
+ hs->state = h5_state_data;
846
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
847
+ hs->pos = (size_t)(idx - hs->s) + 1;
848
+ }
849
+ return 1;
850
+ }