threatstack-agent-ruby 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +3 -0
  3. data/LICENSE +6 -0
  4. data/ext/libinjection/extconf.rb +4 -0
  5. data/ext/libinjection/libinjection.h +65 -0
  6. data/ext/libinjection/libinjection.i +13 -0
  7. data/ext/libinjection/libinjection_html5.c +850 -0
  8. data/ext/libinjection/libinjection_html5.h +54 -0
  9. data/ext/libinjection/libinjection_sqli.c +2325 -0
  10. data/ext/libinjection/libinjection_sqli.h +298 -0
  11. data/ext/libinjection/libinjection_sqli_data.h +9654 -0
  12. data/ext/libinjection/libinjection_wrap.c +2393 -0
  13. data/ext/libinjection/libinjection_xss.c +532 -0
  14. data/ext/libinjection/libinjection_xss.h +21 -0
  15. data/lib/constants.rb +110 -0
  16. data/lib/control.rb +61 -0
  17. data/lib/events/event_accumulator.rb +36 -0
  18. data/lib/events/models/attack_event.rb +58 -0
  19. data/lib/events/models/base_event.rb +41 -0
  20. data/lib/events/models/dependency_event.rb +93 -0
  21. data/lib/events/models/environment_event.rb +93 -0
  22. data/lib/events/models/instrumentation_event.rb +46 -0
  23. data/lib/exceptions/request_blocked_error.rb +11 -0
  24. data/lib/instrumentation/common.rb +172 -0
  25. data/lib/instrumentation/instrumenter.rb +144 -0
  26. data/lib/instrumentation/kernel.rb +45 -0
  27. data/lib/instrumentation/rails.rb +61 -0
  28. data/lib/jobs/delayed_job.rb +26 -0
  29. data/lib/jobs/event_submitter.rb +101 -0
  30. data/lib/jobs/job_queue.rb +38 -0
  31. data/lib/jobs/recurrent_job.rb +61 -0
  32. data/lib/threatstack-agent-ruby.rb +7 -0
  33. data/lib/utils/aws_utils.rb +46 -0
  34. data/lib/utils/formatter.rb +47 -0
  35. data/lib/utils/logger.rb +43 -0
  36. data/threatstack-agent-ruby.gemspec +35 -0
  37. metadata +221 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d56612a7470c1f394b66c6473479883be0ddc44810bebd77206acd42da1b31a6
4
+ data.tar.gz: 0d1bbad69a0ef6bb2ab53b662a8f004d06d353b2b02746ffb5616731eaef1c90
5
+ SHA512:
6
+ metadata.gz: 0c98e82db4c39fbfd7d930049b9ffe84c6efeda17e5f6b97ea971e240ffaec39c45683630af92ba95626a6571e0d9ecc02d01431c1c70284959b240576bb3e03
7
+ data.tar.gz: 767fb9d9a56bb8b4efdd9d7d9a28a373e86851550ba8c9e4401d4b2803d6bda1964ba0f47b566c25c12b54f8db5df49c48249bc12734f29f6dcb57a7e706879c
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,6 @@
1
+ Threat Stack AppSec for Ruby is free-to-use, proprietary software.
2
+ Please refer to our terms for more information: https://www.threatstack.com/terms
3
+
4
+ The Threat Stack Ruby AppSec agent also uses code from the following open source projects under the following licenses:
5
+
6
+ libinjection http://opensource.org/licenses/BSD-3-Clause
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mkmf'
4
+ create_makefile('libinjection')
@@ -0,0 +1,65 @@
1
+ /**
2
+ * Copyright 2012-2016 Nick Galbreath
3
+ * nickg@client9.com
4
+ * BSD License -- see COPYING.txt for details
5
+ *
6
+ * https://libinjection.client9.com/
7
+ *
8
+ */
9
+
10
+ #ifndef LIBINJECTION_H
11
+ #define LIBINJECTION_H
12
+
13
+ #ifdef __cplusplus
14
+ # define LIBINJECTION_BEGIN_DECLS extern "C" {
15
+ # define LIBINJECTION_END_DECLS }
16
+ #else
17
+ # define LIBINJECTION_BEGIN_DECLS
18
+ # define LIBINJECTION_END_DECLS
19
+ #endif
20
+
21
+ LIBINJECTION_BEGIN_DECLS
22
+
23
+ /*
24
+ * Pull in size_t
25
+ */
26
+ #include <string.h>
27
+
28
+ /*
29
+ * Version info.
30
+ *
31
+ * This is moved into a function to allow SWIG and other auto-generated
32
+ * binding to not be modified during minor release changes. We change
33
+ * the version number in the C source file, and do not regenerate
34
+ * the binding.
35
+ *
36
+ * See python's normalized version
37
+ * http://www.python.org/dev/peps/pep-0386/#normalizedversion
38
+ */
39
+ const char* libinjection_version(void);
40
+
41
+ /**
42
+ * Simple API for SQLi detection - fills in a SQLi fingerprint, or an
43
+ * empty string if the input is benign
44
+ *
45
+ * \param[in] s input string, may contain nulls, does not need to be null-terminated
46
+ * \param[in] slen input string length
47
+ * \param[out] fingerprint buffer of 8+ characters. c-string,
48
+ * \return 1 if SQLi, 0 if benign. fingerprint will be set or set to empty string.
49
+ */
50
+ int libinjection_sqli(const char* s, size_t slen, char fingerprint[]);
51
+
52
+ /** ALPHA version of xss detector.
53
+ *
54
+ * NOT DONE.
55
+ *
56
+ * \param[in] s input string, may contain nulls, does not need to be null-terminated
57
+ * \param[in] slen input string length
58
+ * \return 1 if XSS found, 0 if benign
59
+ *
60
+ */
61
+ int libinjection_xss(const char* s, size_t slen);
62
+
63
+ LIBINJECTION_END_DECLS
64
+
65
+ #endif /* LIBINJECTION_H */
@@ -0,0 +1,13 @@
1
+ /* libinjection.i SWIG interface file */
2
+
3
+ %module libinjection
4
+ %{
5
+ #include "libinjection.h"
6
+ #include "libinjection_sqli.h"
7
+ #include "libinjection_xss.h"
8
+ %}
9
+
10
+ %include "libinjection.h"
11
+
12
+ // int libinjection_sqli(char *, size_t, char *OUTPUT);
13
+ // int libinjection_xss(char *, size_t);
@@ -0,0 +1,850 @@
1
+ #include "libinjection_html5.h"
2
+
3
+ #include <string.h>
4
+ #include <assert.h>
5
+
6
+ #ifdef DEBUG
7
+ #include <stdio.h>
8
+ #define TRACE() printf("%s:%d\n", __FUNCTION__, __LINE__)
9
+ #else
10
+ #define TRACE()
11
+ #endif
12
+
13
+
14
+ #define CHAR_EOF -1
15
+ #define CHAR_NULL 0
16
+ #define CHAR_BANG 33
17
+ #define CHAR_DOUBLE 34
18
+ #define CHAR_PERCENT 37
19
+ #define CHAR_SINGLE 39
20
+ #define CHAR_DASH 45
21
+ #define CHAR_SLASH 47
22
+ #define CHAR_LT 60
23
+ #define CHAR_EQUALS 61
24
+ #define CHAR_GT 62
25
+ #define CHAR_QUESTION 63
26
+ #define CHAR_RIGHTB 93
27
+ #define CHAR_TICK 96
28
+
29
+ /* prototypes */
30
+
31
+ static int h5_skip_white(h5_state_t* hs);
32
+ static int h5_is_white(char c);
33
+ static int h5_state_eof(h5_state_t* hs);
34
+ static int h5_state_data(h5_state_t* hs);
35
+ static int h5_state_tag_open(h5_state_t* hs);
36
+ static int h5_state_tag_name(h5_state_t* hs);
37
+ static int h5_state_tag_name_close(h5_state_t* hs);
38
+ static int h5_state_end_tag_open(h5_state_t* hs);
39
+ static int h5_state_self_closing_start_tag(h5_state_t* hs);
40
+ static int h5_state_attribute_name(h5_state_t* hs);
41
+ static int h5_state_after_attribute_name(h5_state_t* hs);
42
+ static int h5_state_before_attribute_name(h5_state_t* hs);
43
+ static int h5_state_before_attribute_value(h5_state_t* hs);
44
+ static int h5_state_attribute_value_double_quote(h5_state_t* hs);
45
+ static int h5_state_attribute_value_single_quote(h5_state_t* hs);
46
+ static int h5_state_attribute_value_back_quote(h5_state_t* hs);
47
+ static int h5_state_attribute_value_no_quote(h5_state_t* hs);
48
+ static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs);
49
+ static int h5_state_comment(h5_state_t* hs);
50
+ static int h5_state_cdata(h5_state_t* hs);
51
+
52
+
53
+ /* 12.2.4.44 */
54
+ static int h5_state_bogus_comment(h5_state_t* hs);
55
+ static int h5_state_bogus_comment2(h5_state_t* hs);
56
+
57
+ /* 12.2.4.45 */
58
+ static int h5_state_markup_declaration_open(h5_state_t* hs);
59
+
60
+ /* 8.2.4.52 */
61
+ static int h5_state_doctype(h5_state_t* hs);
62
+
63
+ /**
64
+ * public function
65
+ */
66
+ void libinjection_h5_init(h5_state_t* hs, const char* s, size_t len, enum html5_flags flags)
67
+ {
68
+ memset(hs, 0, sizeof(h5_state_t));
69
+ hs->s = s;
70
+ hs->len = len;
71
+
72
+ switch (flags) {
73
+ case DATA_STATE:
74
+ hs->state = h5_state_data;
75
+ break;
76
+ case VALUE_NO_QUOTE:
77
+ hs->state = h5_state_before_attribute_name;
78
+ break;
79
+ case VALUE_SINGLE_QUOTE:
80
+ hs->state = h5_state_attribute_value_single_quote;
81
+ break;
82
+ case VALUE_DOUBLE_QUOTE:
83
+ hs->state = h5_state_attribute_value_double_quote;
84
+ break;
85
+ case VALUE_BACK_QUOTE:
86
+ hs->state = h5_state_attribute_value_back_quote;
87
+ break;
88
+ }
89
+ }
90
+
91
+ /**
92
+ * public function
93
+ */
94
+ int libinjection_h5_next(h5_state_t* hs)
95
+ {
96
+ assert(hs->state != NULL);
97
+ return (*hs->state)(hs);
98
+ }
99
+
100
+ /**
101
+ * Everything below here is private
102
+ *
103
+ */
104
+
105
+
106
/*
 * Return 1 if ch is treated as whitespace by the tokenizer, 0 otherwise.
 *
 *   ' '  = space          = 0x20
 *   \t   = horizontal tab = 0x09
 *   \n   = newline        = 0x0A
 *   \v   = vertical tab   = 0x0B
 *   \f   = form feed      = 0x0C
 *   \r   = cr             = 0x0D
 *
 * NUL (0x00) is also reported as whitespace, so callers that need to
 * treat NUL differently must test for it before calling this.
 */
static int h5_is_white(char ch)
{
    switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
    case '\0':
        return 1;
    default:
        return 0;
    }
}
117
+
118
+ static int h5_skip_white(h5_state_t* hs)
119
+ {
120
+ char ch;
121
+ while (hs->pos < hs->len) {
122
+ ch = hs->s[hs->pos];
123
+ switch (ch) {
124
+ case 0x00: /* IE only */
125
+ case 0x20:
126
+ case 0x09:
127
+ case 0x0A:
128
+ case 0x0B: /* IE only */
129
+ case 0x0C:
130
+ case 0x0D: /* IE only */
131
+ hs->pos += 1;
132
+ break;
133
+ default:
134
+ return ch;
135
+ }
136
+ }
137
+ return CHAR_EOF;
138
+ }
139
+
140
+ static int h5_state_eof(h5_state_t* hs)
141
+ {
142
+ /* eliminate unused function argument warning */
143
+ (void)hs;
144
+ return 0;
145
+ }
146
+
147
+ static int h5_state_data(h5_state_t* hs)
148
+ {
149
+ const char* idx;
150
+
151
+ TRACE();
152
+ assert(hs->len >= hs->pos);
153
+ idx = (const char*) memchr(hs->s + hs->pos, CHAR_LT, hs->len - hs->pos);
154
+ if (idx == NULL) {
155
+ hs->token_start = hs->s + hs->pos;
156
+ hs->token_len = hs->len - hs->pos;
157
+ hs->token_type = DATA_TEXT;
158
+ hs->state = h5_state_eof;
159
+ if (hs->token_len == 0) {
160
+ return 0;
161
+ }
162
+ } else {
163
+ hs->token_start = hs->s + hs->pos;
164
+ hs->token_type = DATA_TEXT;
165
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
166
+ hs->pos = (size_t)(idx - hs->s) + 1;
167
+ hs->state = h5_state_tag_open;
168
+ if (hs->token_len == 0) {
169
+ return h5_state_tag_open(hs);
170
+ }
171
+ }
172
+ return 1;
173
+ }
174
+
175
+ /**
176
+ * 12 2.4.8
177
+ */
178
+ static int h5_state_tag_open(h5_state_t* hs)
179
+ {
180
+ char ch;
181
+
182
+ TRACE();
183
+ if (hs->pos >= hs->len) {
184
+ return 0;
185
+ }
186
+ ch = hs->s[hs->pos];
187
+ if (ch == CHAR_BANG) {
188
+ hs->pos += 1;
189
+ return h5_state_markup_declaration_open(hs);
190
+ } else if (ch == CHAR_SLASH) {
191
+ hs->pos += 1;
192
+ hs->is_close = 1;
193
+ return h5_state_end_tag_open(hs);
194
+ } else if (ch == CHAR_QUESTION) {
195
+ hs->pos += 1;
196
+ return h5_state_bogus_comment(hs);
197
+ } else if (ch == CHAR_PERCENT) {
198
+ /* this is not in spec.. alternative comment format used
199
+ by IE <= 9 and Safari < 4.0.3 */
200
+ hs->pos += 1;
201
+ return h5_state_bogus_comment2(hs);
202
+ } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
203
+ return h5_state_tag_name(hs);
204
+ } else if (ch == CHAR_NULL) {
205
+ /* IE-ism NULL characters are ignored */
206
+ return h5_state_tag_name(hs);
207
+ } else {
208
+ /* user input mistake in configuring state */
209
+ if (hs->pos == 0) {
210
+ return h5_state_data(hs);
211
+ }
212
+ hs->token_start = hs->s + hs->pos - 1;
213
+ hs->token_len = 1;
214
+ hs->token_type = DATA_TEXT;
215
+ hs->state = h5_state_data;
216
+ return 1;
217
+ }
218
+ }
219
+ /**
220
+ * 12.2.4.9
221
+ */
222
+ static int h5_state_end_tag_open(h5_state_t* hs)
223
+ {
224
+ char ch;
225
+
226
+ TRACE();
227
+
228
+ if (hs->pos >= hs->len) {
229
+ return 0;
230
+ }
231
+ ch = hs->s[hs->pos];
232
+ if (ch == CHAR_GT) {
233
+ return h5_state_data(hs);
234
+ } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
235
+ return h5_state_tag_name(hs);
236
+ }
237
+
238
+ hs->is_close = 0;
239
+ return h5_state_bogus_comment(hs);
240
+ }
241
+ /*
242
+ *
243
+ */
244
+ static int h5_state_tag_name_close(h5_state_t* hs)
245
+ {
246
+ TRACE();
247
+ hs->is_close = 0;
248
+ hs->token_start = hs->s + hs->pos;
249
+ hs->token_len = 1;
250
+ hs->token_type = TAG_NAME_CLOSE;
251
+ hs->pos += 1;
252
+ if (hs->pos < hs->len) {
253
+ hs->state = h5_state_data;
254
+ } else {
255
+ hs->state = h5_state_eof;
256
+ }
257
+
258
+ return 1;
259
+ }
260
+
261
+ /**
262
+ * 12.2.4.10
263
+ */
264
+ static int h5_state_tag_name(h5_state_t* hs)
265
+ {
266
+ char ch;
267
+ size_t pos;
268
+
269
+ TRACE();
270
+ pos = hs->pos;
271
+ while (pos < hs->len) {
272
+ ch = hs->s[pos];
273
+ if (ch == 0) {
274
+ /* special non-standard case */
275
+ /* allow nulls in tag name */
276
+ /* some old browsers apparently allow and ignore them */
277
+ pos += 1;
278
+ } else if (h5_is_white(ch)) {
279
+ hs->token_start = hs->s + hs->pos;
280
+ hs->token_len = pos - hs->pos;
281
+ hs->token_type = TAG_NAME_OPEN;
282
+ hs->pos = pos + 1;
283
+ hs->state = h5_state_before_attribute_name;
284
+ return 1;
285
+ } else if (ch == CHAR_SLASH) {
286
+ hs->token_start = hs->s + hs->pos;
287
+ hs->token_len = pos - hs->pos;
288
+ hs->token_type = TAG_NAME_OPEN;
289
+ hs->pos = pos + 1;
290
+ hs->state = h5_state_self_closing_start_tag;
291
+ return 1;
292
+ } else if (ch == CHAR_GT) {
293
+ hs->token_start = hs->s + hs->pos;
294
+ hs->token_len = pos - hs->pos;
295
+ if (hs->is_close) {
296
+ hs->pos = pos + 1;
297
+ hs->is_close = 0;
298
+ hs->token_type = TAG_CLOSE;
299
+ hs->state = h5_state_data;
300
+ } else {
301
+ hs->pos = pos;
302
+ hs->token_type = TAG_NAME_OPEN;
303
+ hs->state = h5_state_tag_name_close;
304
+ }
305
+ return 1;
306
+ } else {
307
+ pos += 1;
308
+ }
309
+ }
310
+
311
+ hs->token_start = hs->s + hs->pos;
312
+ hs->token_len = hs->len - hs->pos;
313
+ hs->token_type = TAG_NAME_OPEN;
314
+ hs->state = h5_state_eof;
315
+ return 1;
316
+ }
317
+
318
+ /**
319
+ * 12.2.4.34
320
+ */
321
+ static int h5_state_before_attribute_name(h5_state_t* hs)
322
+ {
323
+ int ch;
324
+
325
+ TRACE();
326
+ ch = h5_skip_white(hs);
327
+ switch (ch) {
328
+ case CHAR_EOF: {
329
+ return 0;
330
+ }
331
+ case CHAR_SLASH: {
332
+ hs->pos += 1;
333
+ return h5_state_self_closing_start_tag(hs);
334
+ }
335
+ case CHAR_GT: {
336
+ hs->state = h5_state_data;
337
+ hs->token_start = hs->s + hs->pos;
338
+ hs->token_len = 1;
339
+ hs->token_type = TAG_NAME_CLOSE;
340
+ hs->pos += 1;
341
+ return 1;
342
+ }
343
+ default: {
344
+ return h5_state_attribute_name(hs);
345
+ }
346
+ }
347
+ }
348
+
349
+ static int h5_state_attribute_name(h5_state_t* hs)
350
+ {
351
+ char ch;
352
+ size_t pos;
353
+
354
+ TRACE();
355
+ pos = hs->pos + 1;
356
+ while (pos < hs->len) {
357
+ ch = hs->s[pos];
358
+ if (h5_is_white(ch)) {
359
+ hs->token_start = hs->s + hs->pos;
360
+ hs->token_len = pos - hs->pos;
361
+ hs->token_type = ATTR_NAME;
362
+ hs->state = h5_state_after_attribute_name;
363
+ hs->pos = pos + 1;
364
+ return 1;
365
+ } else if (ch == CHAR_SLASH) {
366
+ hs->token_start = hs->s + hs->pos;
367
+ hs->token_len = pos - hs->pos;
368
+ hs->token_type = ATTR_NAME;
369
+ hs->state = h5_state_self_closing_start_tag;
370
+ hs->pos = pos + 1;
371
+ return 1;
372
+ } else if (ch == CHAR_EQUALS) {
373
+ hs->token_start = hs->s + hs->pos;
374
+ hs->token_len = pos - hs->pos;
375
+ hs->token_type = ATTR_NAME;
376
+ hs->state = h5_state_before_attribute_value;
377
+ hs->pos = pos + 1;
378
+ return 1;
379
+ } else if (ch == CHAR_GT) {
380
+ hs->token_start = hs->s + hs->pos;
381
+ hs->token_len = pos - hs->pos;
382
+ hs->token_type = ATTR_NAME;
383
+ hs->state = h5_state_tag_name_close;
384
+ hs->pos = pos;
385
+ return 1;
386
+ } else {
387
+ pos += 1;
388
+ }
389
+ }
390
+ /* EOF */
391
+ hs->token_start = hs->s + hs->pos;
392
+ hs->token_len = hs->len - hs->pos;
393
+ hs->token_type = ATTR_NAME;
394
+ hs->state = h5_state_eof;
395
+ hs->pos = hs->len;
396
+ return 1;
397
+ }
398
+
399
+ /**
400
+ * 12.2.4.36
401
+ */
402
+ static int h5_state_after_attribute_name(h5_state_t* hs)
403
+ {
404
+ int c;
405
+
406
+ TRACE();
407
+ c = h5_skip_white(hs);
408
+ switch (c) {
409
+ case CHAR_EOF: {
410
+ return 0;
411
+ }
412
+ case CHAR_SLASH: {
413
+ hs->pos += 1;
414
+ return h5_state_self_closing_start_tag(hs);
415
+ }
416
+ case CHAR_EQUALS: {
417
+ hs->pos += 1;
418
+ return h5_state_before_attribute_value(hs);
419
+ }
420
+ case CHAR_GT: {
421
+ return h5_state_tag_name_close(hs);
422
+ }
423
+ default: {
424
+ return h5_state_attribute_name(hs);
425
+ }
426
+ }
427
+ }
428
+
429
+ /**
430
+ * 12.2.4.37
431
+ */
432
+ static int h5_state_before_attribute_value(h5_state_t* hs)
433
+ {
434
+ int c;
435
+ TRACE();
436
+
437
+ c = h5_skip_white(hs);
438
+
439
+ if (c == CHAR_EOF) {
440
+ hs->state = h5_state_eof;
441
+ return 0;
442
+ }
443
+
444
+ if (c == CHAR_DOUBLE) {
445
+ return h5_state_attribute_value_double_quote(hs);
446
+ } else if (c == CHAR_SINGLE) {
447
+ return h5_state_attribute_value_single_quote(hs);
448
+ } else if (c == CHAR_TICK) {
449
+ /* NON STANDARD IE */
450
+ return h5_state_attribute_value_back_quote(hs);
451
+ } else {
452
+ return h5_state_attribute_value_no_quote(hs);
453
+ }
454
+ }
455
+
456
+
457
+ static int h5_state_attribute_value_quote(h5_state_t* hs, char qchar)
458
+ {
459
+ const char* idx;
460
+
461
+ TRACE();
462
+
463
+ /* skip initial quote in normal case.
464
+ * don't do this "if (pos == 0)" since it means we have started
465
+ * in a non-data state. given an input of '><foo
466
+ * we want to make 0-length attribute name
467
+ */
468
+ if (hs->pos > 0) {
469
+ hs->pos += 1;
470
+ }
471
+
472
+
473
+ idx = (const char*) memchr(hs->s + hs->pos, qchar, hs->len - hs->pos);
474
+ if (idx == NULL) {
475
+ hs->token_start = hs->s + hs->pos;
476
+ hs->token_len = hs->len - hs->pos;
477
+ hs->token_type = ATTR_VALUE;
478
+ hs->state = h5_state_eof;
479
+ } else {
480
+ hs->token_start = hs->s + hs->pos;
481
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
482
+ hs->token_type = ATTR_VALUE;
483
+ hs->state = h5_state_after_attribute_value_quoted_state;
484
+ hs->pos += hs->token_len + 1;
485
+ }
486
+ return 1;
487
+ }
488
+
489
+ static
490
+ int h5_state_attribute_value_double_quote(h5_state_t* hs)
491
+ {
492
+ TRACE();
493
+ return h5_state_attribute_value_quote(hs, CHAR_DOUBLE);
494
+ }
495
+
496
+ static
497
+ int h5_state_attribute_value_single_quote(h5_state_t* hs)
498
+ {
499
+ TRACE();
500
+ return h5_state_attribute_value_quote(hs, CHAR_SINGLE);
501
+ }
502
+
503
+ static
504
+ int h5_state_attribute_value_back_quote(h5_state_t* hs)
505
+ {
506
+ TRACE();
507
+ return h5_state_attribute_value_quote(hs, CHAR_TICK);
508
+ }
509
+
510
+ static int h5_state_attribute_value_no_quote(h5_state_t* hs)
511
+ {
512
+ char ch;
513
+ size_t pos;
514
+
515
+ TRACE();
516
+ pos = hs->pos;
517
+ while (pos < hs->len) {
518
+ ch = hs->s[pos];
519
+ if (h5_is_white(ch)) {
520
+ hs->token_type = ATTR_VALUE;
521
+ hs->token_start = hs->s + hs->pos;
522
+ hs->token_len = pos - hs->pos;
523
+ hs->pos = pos + 1;
524
+ hs->state = h5_state_before_attribute_name;
525
+ return 1;
526
+ } else if (ch == CHAR_GT) {
527
+ hs->token_type = ATTR_VALUE;
528
+ hs->token_start = hs->s + hs->pos;
529
+ hs->token_len = pos - hs->pos;
530
+ hs->pos = pos;
531
+ hs->state = h5_state_tag_name_close;
532
+ return 1;
533
+ }
534
+ pos += 1;
535
+ }
536
+ TRACE();
537
+ /* EOF */
538
+ hs->state = h5_state_eof;
539
+ hs->token_start = hs->s + hs->pos;
540
+ hs->token_len = hs->len - hs->pos;
541
+ hs->token_type = ATTR_VALUE;
542
+ return 1;
543
+ }
544
+
545
+ /**
546
+ * 12.2.4.41
547
+ */
548
+ static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs)
549
+ {
550
+ char ch;
551
+
552
+ TRACE();
553
+ if (hs->pos >= hs->len) {
554
+ return 0;
555
+ }
556
+ ch = hs->s[hs->pos];
557
+ if (h5_is_white(ch)) {
558
+ hs->pos += 1;
559
+ return h5_state_before_attribute_name(hs);
560
+ } else if (ch == CHAR_SLASH) {
561
+ hs->pos += 1;
562
+ return h5_state_self_closing_start_tag(hs);
563
+ } else if (ch == CHAR_GT) {
564
+ hs->token_start = hs->s + hs->pos;
565
+ hs->token_len = 1;
566
+ hs->token_type = TAG_NAME_CLOSE;
567
+ hs->pos += 1;
568
+ hs->state = h5_state_data;
569
+ return 1;
570
+ } else {
571
+ return h5_state_before_attribute_name(hs);
572
+ }
573
+ }
574
+
575
+ /**
576
+ * 12.2.4.43
577
+ */
578
+ static int h5_state_self_closing_start_tag(h5_state_t* hs)
579
+ {
580
+ char ch;
581
+
582
+ TRACE();
583
+ if (hs->pos >= hs->len) {
584
+ return 0;
585
+ }
586
+ ch = hs->s[hs->pos];
587
+ if (ch == CHAR_GT) {
588
+ assert(hs->pos > 0);
589
+ hs->token_start = hs->s + hs->pos -1;
590
+ hs->token_len = 2;
591
+ hs->token_type = TAG_NAME_SELFCLOSE;
592
+ hs->state = h5_state_data;
593
+ hs->pos += 1;
594
+ return 1;
595
+ } else {
596
+ return h5_state_before_attribute_name(hs);
597
+ }
598
+ }
599
+
600
+ /**
601
+ * 12.2.4.44
602
+ */
603
+ static int h5_state_bogus_comment(h5_state_t* hs)
604
+ {
605
+ const char* idx;
606
+
607
+ TRACE();
608
+ idx = (const char*) memchr(hs->s + hs->pos, CHAR_GT, hs->len - hs->pos);
609
+ if (idx == NULL) {
610
+ hs->token_start = hs->s + hs->pos;
611
+ hs->token_len = hs->len - hs->pos;
612
+ hs->pos = hs->len;
613
+ hs->state = h5_state_eof;
614
+ } else {
615
+ hs->token_start = hs->s + hs->pos;
616
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
617
+ hs->pos = (size_t)(idx - hs->s) + 1;
618
+ hs->state = h5_state_data;
619
+ }
620
+
621
+ hs->token_type = TAG_COMMENT;
622
+ return 1;
623
+ }
624
+
625
+ /**
626
+ * 12.2.4.44 ALT
627
+ */
628
+ static int h5_state_bogus_comment2(h5_state_t* hs)
629
+ {
630
+ const char* idx;
631
+ size_t pos;
632
+
633
+ TRACE();
634
+ pos = hs->pos;
635
+ while (1) {
636
+ idx = (const char*) memchr(hs->s + pos, CHAR_PERCENT, hs->len - pos);
637
+ if (idx == NULL || (idx + 1 >= hs->s + hs->len)) {
638
+ hs->token_start = hs->s + hs->pos;
639
+ hs->token_len = hs->len - hs->pos;
640
+ hs->pos = hs->len;
641
+ hs->token_type = TAG_COMMENT;
642
+ hs->state = h5_state_eof;
643
+ return 1;
644
+ }
645
+
646
+ if (*(idx +1) != CHAR_GT) {
647
+ pos = (size_t)(idx - hs->s) + 1;
648
+ continue;
649
+ }
650
+
651
+ /* ends in %> */
652
+ hs->token_start = hs->s + hs->pos;
653
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
654
+ hs->pos = (size_t)(idx - hs->s) + 2;
655
+ hs->state = h5_state_data;
656
+ hs->token_type = TAG_COMMENT;
657
+ return 1;
658
+ }
659
+ }
660
+
661
+ /**
662
+ * 8.2.4.45
663
+ */
664
+ static int h5_state_markup_declaration_open(h5_state_t* hs)
665
+ {
666
+ size_t remaining;
667
+
668
+ TRACE();
669
+ remaining = hs->len - hs->pos;
670
+ if (remaining >= 7 &&
671
+ /* case insensitive */
672
+ (hs->s[hs->pos + 0] == 'D' || hs->s[hs->pos + 0] == 'd') &&
673
+ (hs->s[hs->pos + 1] == 'O' || hs->s[hs->pos + 1] == 'o') &&
674
+ (hs->s[hs->pos + 2] == 'C' || hs->s[hs->pos + 2] == 'c') &&
675
+ (hs->s[hs->pos + 3] == 'T' || hs->s[hs->pos + 3] == 't') &&
676
+ (hs->s[hs->pos + 4] == 'Y' || hs->s[hs->pos + 4] == 'y') &&
677
+ (hs->s[hs->pos + 5] == 'P' || hs->s[hs->pos + 5] == 'p') &&
678
+ (hs->s[hs->pos + 6] == 'E' || hs->s[hs->pos + 6] == 'e')
679
+ ) {
680
+ return h5_state_doctype(hs);
681
+ } else if (remaining >= 7 &&
682
+ /* upper case required */
683
+ hs->s[hs->pos + 0] == '[' &&
684
+ hs->s[hs->pos + 1] == 'C' &&
685
+ hs->s[hs->pos + 2] == 'D' &&
686
+ hs->s[hs->pos + 3] == 'A' &&
687
+ hs->s[hs->pos + 4] == 'T' &&
688
+ hs->s[hs->pos + 5] == 'A' &&
689
+ hs->s[hs->pos + 6] == '['
690
+ ) {
691
+ hs->pos += 7;
692
+ return h5_state_cdata(hs);
693
+ } else if (remaining >= 2 &&
694
+ hs->s[hs->pos + 0] == '-' &&
695
+ hs->s[hs->pos + 1] == '-') {
696
+ hs->pos += 2;
697
+ return h5_state_comment(hs);
698
+ }
699
+
700
+ return h5_state_bogus_comment(hs);
701
+ }
702
+
703
+ /**
704
+ * 12.2.4.48
705
+ * 12.2.4.49
706
+ * 12.2.4.50
707
+ * 12.2.4.51
708
+ * state machine spec is confusing since it can only look
709
+ * at one character at a time but simply it's comments end by:
710
+ * 1) EOF
711
+ * 2) ending in -->
712
+ * 3) ending in -!>
713
+ */
714
+ static int h5_state_comment(h5_state_t* hs)
715
+ {
716
+ char ch;
717
+ const char* idx;
718
+ size_t pos;
719
+ size_t offset;
720
+ const char* end = hs->s + hs->len;
721
+
722
+ TRACE();
723
+ pos = hs->pos;
724
+ while (1) {
725
+
726
+ idx = (const char*) memchr(hs->s + pos, CHAR_DASH, hs->len - pos);
727
+
728
+ /* did not find anything or has less than 3 chars left */
729
+ if (idx == NULL || idx > hs->s + hs->len - 3) {
730
+ hs->state = h5_state_eof;
731
+ hs->token_start = hs->s + hs->pos;
732
+ hs->token_len = hs->len - hs->pos;
733
+ hs->token_type = TAG_COMMENT;
734
+ return 1;
735
+ }
736
+ offset = 1;
737
+
738
+ /* skip all nulls */
739
+ while (idx + offset < end && *(idx + offset) == 0) {
740
+ offset += 1;
741
+ }
742
+ if (idx + offset == end) {
743
+ hs->state = h5_state_eof;
744
+ hs->token_start = hs->s + hs->pos;
745
+ hs->token_len = hs->len - hs->pos;
746
+ hs->token_type = TAG_COMMENT;
747
+ return 1;
748
+ }
749
+
750
+ ch = *(idx + offset);
751
+ if (ch != CHAR_DASH && ch != CHAR_BANG) {
752
+ pos = (size_t)(idx - hs->s) + 1;
753
+ continue;
754
+ }
755
+
756
+ /* need to test */
757
+ #if 0
758
+ /* skip all nulls */
759
+ while (idx + offset < end && *(idx + offset) == 0) {
760
+ offset += 1;
761
+ }
762
+ if (idx + offset == end) {
763
+ hs->state = h5_state_eof;
764
+ hs->token_start = hs->s + hs->pos;
765
+ hs->token_len = hs->len - hs->pos;
766
+ hs->token_type = TAG_COMMENT;
767
+ return 1;
768
+ }
769
+ #endif
770
+
771
+ offset += 1;
772
+ if (idx + offset == end) {
773
+ hs->state = h5_state_eof;
774
+ hs->token_start = hs->s + hs->pos;
775
+ hs->token_len = hs->len - hs->pos;
776
+ hs->token_type = TAG_COMMENT;
777
+ return 1;
778
+ }
779
+
780
+
781
+ ch = *(idx + offset);
782
+ if (ch != CHAR_GT) {
783
+ pos = (size_t)(idx - hs->s) + 1;
784
+ continue;
785
+ }
786
+ offset += 1;
787
+
788
+ /* ends in --> or -!> */
789
+ hs->token_start = hs->s + hs->pos;
790
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
791
+ hs->pos = (size_t)(idx + offset - hs->s);
792
+ hs->state = h5_state_data;
793
+ hs->token_type = TAG_COMMENT;
794
+ return 1;
795
+ }
796
+ }
797
+
798
+ static int h5_state_cdata(h5_state_t* hs)
799
+ {
800
+ const char* idx;
801
+ size_t pos;
802
+
803
+ TRACE();
804
+ pos = hs->pos;
805
+ while (1) {
806
+ idx = (const char*) memchr(hs->s + pos, CHAR_RIGHTB, hs->len - pos);
807
+
808
+ /* did not find anything or has less than 3 chars left */
809
+ if (idx == NULL || idx > hs->s + hs->len - 3) {
810
+ hs->state = h5_state_eof;
811
+ hs->token_start = hs->s + hs->pos;
812
+ hs->token_len = hs->len - hs->pos;
813
+ hs->token_type = DATA_TEXT;
814
+ return 1;
815
+ } else if ( *(idx+1) == CHAR_RIGHTB && *(idx+2) == CHAR_GT) {
816
+ hs->state = h5_state_data;
817
+ hs->token_start = hs->s + hs->pos;
818
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
819
+ hs->pos = (size_t)(idx - hs->s) + 3;
820
+ hs->token_type = DATA_TEXT;
821
+ return 1;
822
+ } else {
823
+ pos = (size_t)(idx - hs->s) + 1;
824
+ }
825
+ }
826
+ }
827
+
828
+ /**
829
+ * 8.2.4.52
830
+ * http://www.w3.org/html/wg/drafts/html/master/syntax.html#doctype-state
831
+ */
832
+ static int h5_state_doctype(h5_state_t* hs)
833
+ {
834
+ const char* idx;
835
+
836
+ TRACE();
837
+ hs->token_start = hs->s + hs->pos;
838
+ hs->token_type = DOCTYPE;
839
+
840
+ idx = (const char*) memchr(hs->s + hs->pos, CHAR_GT, hs->len - hs->pos);
841
+ if (idx == NULL) {
842
+ hs->state = h5_state_eof;
843
+ hs->token_len = hs->len - hs->pos;
844
+ } else {
845
+ hs->state = h5_state_data;
846
+ hs->token_len = (size_t)(idx - hs->s) - hs->pos;
847
+ hs->pos = (size_t)(idx - hs->s) + 1;
848
+ }
849
+ return 1;
850
+ }