threatstack-agent-ruby 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +3 -0
- data/LICENSE +6 -0
- data/ext/libinjection/extconf.rb +4 -0
- data/ext/libinjection/libinjection.h +65 -0
- data/ext/libinjection/libinjection.i +13 -0
- data/ext/libinjection/libinjection_html5.c +850 -0
- data/ext/libinjection/libinjection_html5.h +54 -0
- data/ext/libinjection/libinjection_sqli.c +2325 -0
- data/ext/libinjection/libinjection_sqli.h +298 -0
- data/ext/libinjection/libinjection_sqli_data.h +9654 -0
- data/ext/libinjection/libinjection_wrap.c +2393 -0
- data/ext/libinjection/libinjection_xss.c +532 -0
- data/ext/libinjection/libinjection_xss.h +21 -0
- data/lib/constants.rb +110 -0
- data/lib/control.rb +61 -0
- data/lib/events/event_accumulator.rb +36 -0
- data/lib/events/models/attack_event.rb +58 -0
- data/lib/events/models/base_event.rb +41 -0
- data/lib/events/models/dependency_event.rb +93 -0
- data/lib/events/models/environment_event.rb +93 -0
- data/lib/events/models/instrumentation_event.rb +46 -0
- data/lib/exceptions/request_blocked_error.rb +11 -0
- data/lib/instrumentation/common.rb +172 -0
- data/lib/instrumentation/instrumenter.rb +144 -0
- data/lib/instrumentation/kernel.rb +45 -0
- data/lib/instrumentation/rails.rb +61 -0
- data/lib/jobs/delayed_job.rb +26 -0
- data/lib/jobs/event_submitter.rb +101 -0
- data/lib/jobs/job_queue.rb +38 -0
- data/lib/jobs/recurrent_job.rb +61 -0
- data/lib/threatstack-agent-ruby.rb +7 -0
- data/lib/utils/aws_utils.rb +46 -0
- data/lib/utils/formatter.rb +47 -0
- data/lib/utils/logger.rb +43 -0
- data/threatstack-agent-ruby.gemspec +35 -0
- metadata +221 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d56612a7470c1f394b66c6473479883be0ddc44810bebd77206acd42da1b31a6
|
4
|
+
data.tar.gz: 0d1bbad69a0ef6bb2ab53b662a8f004d06d353b2b02746ffb5616731eaef1c90
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0c98e82db4c39fbfd7d930049b9ffe84c6efeda17e5f6b97ea971e240ffaec39c45683630af92ba95626a6571e0d9ecc02d01431c1c70284959b240576bb3e03
|
7
|
+
data.tar.gz: 767fb9d9a56bb8b4efdd9d7d9a28a373e86851550ba8c9e4401d4b2803d6bda1964ba0f47b566c25c12b54f8db5df49c48249bc12734f29f6dcb57a7e706879c
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,6 @@
|
|
1
|
+
Threat Stack AppSec for Ruby is free-to-use, proprietary software.
|
2
|
+
Please refer to our terms for more information: https://www.threatstack.com/terms
|
3
|
+
|
4
|
+
The Threat Stack Ruby AppSec agent also uses code from the following open source projects under the following licenses:
|
5
|
+
|
6
|
+
libinjection http://opensource.org/licenses/BSD-3-Clause
|
@@ -0,0 +1,65 @@
|
|
1
|
+
/**
|
2
|
+
* Copyright 2012-2016 Nick Galbreath
|
3
|
+
* nickg@client9.com
|
4
|
+
* BSD License -- see COPYING.txt for details
|
5
|
+
*
|
6
|
+
* https://libinjection.client9.com/
|
7
|
+
*
|
8
|
+
*/
|
9
|
+
|
10
|
+
#ifndef LIBINJECTION_H
|
11
|
+
#define LIBINJECTION_H
|
12
|
+
|
13
|
+
#ifdef __cplusplus
|
14
|
+
# define LIBINJECTION_BEGIN_DECLS extern "C" {
|
15
|
+
# define LIBINJECTION_END_DECLS }
|
16
|
+
#else
|
17
|
+
# define LIBINJECTION_BEGIN_DECLS
|
18
|
+
# define LIBINJECTION_END_DECLS
|
19
|
+
#endif
|
20
|
+
|
21
|
+
LIBINJECTION_BEGIN_DECLS
|
22
|
+
|
23
|
+
/*
|
24
|
+
* Pull in size_t
|
25
|
+
*/
|
26
|
+
#include <string.h>
|
27
|
+
|
28
|
+
/*
|
29
|
+
* Version info.
|
30
|
+
*
|
31
|
+
* This is moved into a function to allow SWIG and other auto-generated
|
32
|
+
* binding to not be modified during minor release changes. We change
|
33
|
+
* the version number in the C source file, and do not regenerate
|
34
|
+
* the binding
|
35
|
+
*
|
36
|
+
* See python's normalized version
|
37
|
+
* http://www.python.org/dev/peps/pep-0386/#normalizedversion
|
38
|
+
*/
|
39
|
+
const char* libinjection_version(void);
|
40
|
+
|
41
|
+
/**
|
42
|
+
* Simple API for SQLi detection - fills in a SQLi fingerprint, or an
|
43
|
+
* empty string if the input is benign
|
44
|
+
*
|
45
|
+
* \param[in] s input string, may contain nulls, does not need to be null-terminated
|
46
|
+
* \param[in] slen input string length
|
47
|
+
* \param[out] fingerprint buffer of 8+ characters. c-string,
|
48
|
+
* \return 1 if SQLi, 0 if benign. fingerprint will be set or set to empty string.
|
49
|
+
*/
|
50
|
+
int libinjection_sqli(const char* s, size_t slen, char fingerprint[]);
|
51
|
+
|
52
|
+
/** ALPHA version of xss detector.
|
53
|
+
*
|
54
|
+
* NOT DONE.
|
55
|
+
*
|
56
|
+
* \param[in] s input string, may contain nulls, does not need to be null-terminated
|
57
|
+
* \param[in] slen input string length
|
58
|
+
* \return 1 if XSS found, 0 if benign
|
59
|
+
*
|
60
|
+
*/
|
61
|
+
int libinjection_xss(const char* s, size_t slen);
|
62
|
+
|
63
|
+
LIBINJECTION_END_DECLS
|
64
|
+
|
65
|
+
#endif /* LIBINJECTION_H */
|
@@ -0,0 +1,13 @@
|
|
1
|
+
/* libinjection.i SWIG interface file */
|
2
|
+
|
3
|
+
%module libinjection
|
4
|
+
%{
|
5
|
+
#include "libinjection.h"
|
6
|
+
#include "libinjection_sqli.h"
|
7
|
+
#include "libinjection_xss.h"
|
8
|
+
%}
|
9
|
+
|
10
|
+
%include "libinjection.h"
|
11
|
+
|
12
|
+
// int libinjection_sqli(char *, size_t, char *OUTPUT);
|
13
|
+
// int libinjection_xss(char *, size_t);
|
@@ -0,0 +1,850 @@
|
|
1
|
+
#include "libinjection_html5.h"
|
2
|
+
|
3
|
+
#include <string.h>
|
4
|
+
#include <assert.h>
|
5
|
+
|
6
|
+
#ifdef DEBUG
|
7
|
+
#include <stdio.h>
|
8
|
+
#define TRACE() printf("%s:%d\n", __FUNCTION__, __LINE__)
|
9
|
+
#else
|
10
|
+
#define TRACE()
|
11
|
+
#endif
|
12
|
+
|
13
|
+
|
14
|
+
/*
 * Byte values the tokenizer compares input against.
 * CHAR_EOF is not a byte: it is the "no more input" result of h5_skip_white().
 */
#define CHAR_EOF      -1
#define CHAR_NULL     0x00
#define CHAR_BANG     0x21  /* ! */
#define CHAR_DOUBLE   0x22  /* " */
#define CHAR_PERCENT  0x25  /* % */
#define CHAR_SINGLE   0x27  /* ' */
#define CHAR_DASH     0x2D  /* - */
#define CHAR_SLASH    0x2F  /* / */
#define CHAR_LT       0x3C  /* < */
#define CHAR_EQUALS   0x3D  /* = */
#define CHAR_GT       0x3E  /* > */
#define CHAR_QUESTION 0x3F  /* ? */
#define CHAR_RIGHTB   0x5D  /* ] */
#define CHAR_TICK     0x60  /* ` */
|
28
|
+
|
29
|
+
/* prototypes */
|
30
|
+
|
31
|
+
static int h5_skip_white(h5_state_t* hs);
|
32
|
+
static int h5_is_white(char c);
|
33
|
+
static int h5_state_eof(h5_state_t* hs);
|
34
|
+
static int h5_state_data(h5_state_t* hs);
|
35
|
+
static int h5_state_tag_open(h5_state_t* hs);
|
36
|
+
static int h5_state_tag_name(h5_state_t* hs);
|
37
|
+
static int h5_state_tag_name_close(h5_state_t* hs);
|
38
|
+
static int h5_state_end_tag_open(h5_state_t* hs);
|
39
|
+
static int h5_state_self_closing_start_tag(h5_state_t* hs);
|
40
|
+
static int h5_state_attribute_name(h5_state_t* hs);
|
41
|
+
static int h5_state_after_attribute_name(h5_state_t* hs);
|
42
|
+
static int h5_state_before_attribute_name(h5_state_t* hs);
|
43
|
+
static int h5_state_before_attribute_value(h5_state_t* hs);
|
44
|
+
static int h5_state_attribute_value_double_quote(h5_state_t* hs);
|
45
|
+
static int h5_state_attribute_value_single_quote(h5_state_t* hs);
|
46
|
+
static int h5_state_attribute_value_back_quote(h5_state_t* hs);
|
47
|
+
static int h5_state_attribute_value_no_quote(h5_state_t* hs);
|
48
|
+
static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs);
|
49
|
+
static int h5_state_comment(h5_state_t* hs);
|
50
|
+
static int h5_state_cdata(h5_state_t* hs);
|
51
|
+
|
52
|
+
|
53
|
+
/* 12.2.4.44 */
|
54
|
+
static int h5_state_bogus_comment(h5_state_t* hs);
|
55
|
+
static int h5_state_bogus_comment2(h5_state_t* hs);
|
56
|
+
|
57
|
+
/* 12.2.4.45 */
|
58
|
+
static int h5_state_markup_declaration_open(h5_state_t* hs);
|
59
|
+
|
60
|
+
/* 8.2.4.52 */
|
61
|
+
static int h5_state_doctype(h5_state_t* hs);
|
62
|
+
|
63
|
+
/**
|
64
|
+
* public function
|
65
|
+
*/
|
66
|
+
/**
 * Prepare a tokenizer state for scanning `len` bytes starting at `s`.
 *
 * The whole structure is zeroed first, so the scan position starts at 0 and
 * no token or close flag is pending.  The pointer `s` is stored as-is; the
 * input is not copied.
 *
 * \param[out] hs     tokenizer state to initialise (dereferenced, must not be NULL)
 * \param[in]  s      input bytes
 * \param[in]  len    number of bytes at `s`
 * \param[in]  flags  context the input is assumed to begin in
 */
void libinjection_h5_init(h5_state_t* hs, const char* s, size_t len, enum html5_flags flags)
{
    memset(hs, 0, sizeof(h5_state_t));
    hs->s = s;
    hs->len = len;

    switch (flags) {
    case DATA_STATE:
        hs->state = h5_state_data;
        break;
    case VALUE_NO_QUOTE:
        hs->state = h5_state_before_attribute_name;
        break;
    case VALUE_SINGLE_QUOTE:
        hs->state = h5_state_attribute_value_single_quote;
        break;
    case VALUE_DOUBLE_QUOTE:
        hs->state = h5_state_attribute_value_double_quote;
        break;
    case VALUE_BACK_QUOTE:
        hs->state = h5_state_attribute_value_back_quote;
        break;
    default:
        /*
         * Unrecognised flag value.  Without a fallback hs->state would stay
         * NULL after the memset above, and libinjection_h5_next() would call
         * through a null function pointer whenever assert() is compiled out.
         * Start in the data state so the input is still tokenized.
         */
        hs->state = h5_state_data;
        break;
    }
}
|
90
|
+
|
91
|
+
/**
|
92
|
+
* public function
|
93
|
+
*/
|
94
|
+
/**
 * Run the current state handler once.
 *
 * \param[in,out] hs  tokenizer state; its handler must have been set
 * \return the handler's result (the handlers in this file return 1 after
 *         filling in a token and 0 when they have nothing to report)
 */
int libinjection_h5_next(h5_state_t* hs)
{
    assert(hs->state != NULL);

    return hs->state(hs);
}
|
99
|
+
|
100
|
+
/**
|
101
|
+
* Everything below here is private
|
102
|
+
*
|
103
|
+
*/
|
104
|
+
|
105
|
+
|
106
|
+
/*
 * Report whether `ch` counts as whitespace for the tokenizer.
 *
 * Returns 1 for space (0x20), \t (0x09), \n (0x0A), \v (0x0B), \f (0x0C)
 * and \r (0x0D), and also for NUL (0x00).  Returns 0 for everything else.
 * Callers that want NUL handled differently (h5_state_tag_name does) test
 * for it before calling this function.
 */
static int h5_is_white(char ch)
{
    switch (ch) {
    case '\0':
    case ' ':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
        return 1;
    default:
        return 0;
    }
}
|
117
|
+
|
118
|
+
/*
 * Advance hs->pos past whitespace and NUL bytes.
 *
 * Skipped bytes: 0x00, 0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D.
 *
 * Returns the first byte that is not skipped, as a value in 0..255, leaving
 * hs->pos pointing at it; or CHAR_EOF if the input ran out.
 *
 * The byte is converted through unsigned char on purpose: where plain char
 * is signed, returning it directly would turn the byte 0xFF into -1, which
 * callers cannot tell apart from CHAR_EOF and would treat as end of input.
 */
static int h5_skip_white(h5_state_t* hs)
{
    while (hs->pos < hs->len) {
        const char ch = hs->s[hs->pos];

        switch (ch) {
        case 0x00: /* IE only */
        case 0x20:
        case 0x09:
        case 0x0A:
        case 0x0B: /* IE only */
        case 0x0C:
        case 0x0D: /* IE only */
            hs->pos += 1;
            break;
        default:
            return (unsigned char) ch;
        }
    }
    return CHAR_EOF;
}
|
139
|
+
|
140
|
+
/*
 * Terminal state: produces no token and always returns 0.
 */
static int h5_state_eof(h5_state_t* hs)
{
    (void) hs; /* parameter is required by the handler signature only */

    return 0;
}
|
146
|
+
|
147
|
+
/*
 * Data state: report the text up to the next '<' as a DATA_TEXT token.
 *
 * If a '<' is found, hs->pos moves just past it and the next state is
 * h5_state_tag_open; when no text precedes the '<' the tag-open handler is
 * run immediately instead of reporting an empty token.
 * If no '<' is found, the rest of the input is the token and the next state
 * is h5_state_eof; an empty remainder returns 0.
 */
static int h5_state_data(h5_state_t* hs)
{
    const char* text;
    const char* lt;
    size_t remaining;

    TRACE();
    assert(hs->len >= hs->pos);

    text = hs->s + hs->pos;
    remaining = hs->len - hs->pos;
    lt = (const char*) memchr(text, CHAR_LT, remaining);

    hs->token_start = text;
    hs->token_type = DATA_TEXT;

    if (lt != NULL) {
        hs->token_len = (size_t)(lt - text);
        hs->pos = (size_t)(lt - hs->s) + 1;
        hs->state = h5_state_tag_open;
        return (hs->token_len == 0) ? h5_state_tag_open(hs) : 1;
    }

    hs->token_len = remaining;
    hs->state = h5_state_eof;
    return (remaining == 0) ? 0 : 1;
}
|
174
|
+
|
175
|
+
/**
|
176
|
+
* 12.2.4.8
|
177
|
+
*/
|
178
|
+
/*
 * Tag open state: hs->pos is at the byte following a '<'.
 *
 * Dispatches on that byte:
 *   '!'            -> markup declaration (comment, DOCTYPE, CDATA)
 *   '/'            -> end tag; sets hs->is_close
 *   '?'            -> bogus comment
 *   '%'            -> non-standard "<% ... %>" comment
 *   letter or NUL  -> tag name
 * Anything else is not a tag: the preceding '<' is reported as a one-byte
 * DATA_TEXT token and scanning continues in the data state.
 *
 * Returns 0 when there is no byte to look at.
 */
static int h5_state_tag_open(h5_state_t* hs)
{
    char next;

    TRACE();
    if (hs->pos >= hs->len) {
        return 0;
    }

    next = hs->s[hs->pos];
    switch (next) {
    case CHAR_BANG:
        hs->pos += 1;
        return h5_state_markup_declaration_open(hs);
    case CHAR_SLASH:
        hs->pos += 1;
        hs->is_close = 1;
        return h5_state_end_tag_open(hs);
    case CHAR_QUESTION:
        hs->pos += 1;
        return h5_state_bogus_comment(hs);
    case CHAR_PERCENT:
        /* this is not in spec.. alternative comment format used
           by IE <= 9 and Safari < 4.0.3 */
        hs->pos += 1;
        return h5_state_bogus_comment2(hs);
    case CHAR_NULL:
        /* IE-ism NULL characters are ignored */
        return h5_state_tag_name(hs);
    default:
        break;
    }

    if ((next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z')) {
        return h5_state_tag_name(hs);
    }

    /* user input mistake in configuring state */
    if (hs->pos == 0) {
        return h5_state_data(hs);
    }

    /* report the '<' that brought us here as plain text */
    hs->token_start = hs->s + hs->pos - 1;
    hs->token_len = 1;
    hs->token_type = DATA_TEXT;
    hs->state = h5_state_data;
    return 1;
}
|
219
|
+
/**
|
220
|
+
* 12.2.4.9
|
221
|
+
*/
|
222
|
+
/*
 * End tag open state: hs->pos is at the byte following "</".
 * The caller has already set hs->is_close.
 *
 *   letter  -> tag name (is_close stays set so the name is reported as a
 *              close tag)
 *   '>'     -> "</>" names no tag; continue in the data state
 *   other   -> bogus comment
 *
 * is_close is cleared on every path that does not go on to read a tag name.
 * Otherwise the flag would outlive the empty "</>" and the next start tag
 * in the input would be reported as TAG_CLOSE by h5_state_tag_name.
 *
 * Returns 0 when there is no byte to look at.
 */
static int h5_state_end_tag_open(h5_state_t* hs)
{
    char ch;

    TRACE();

    if (hs->pos >= hs->len) {
        return 0;
    }
    ch = hs->s[hs->pos];
    if (ch == CHAR_GT) {
        /* no end tag was actually opened: do not leak the flag */
        hs->is_close = 0;
        return h5_state_data(hs);
    } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
        return h5_state_tag_name(hs);
    }

    hs->is_close = 0;
    return h5_state_bogus_comment(hs);
}
|
241
|
+
/*
|
242
|
+
*
|
243
|
+
*/
|
244
|
+
/*
 * Report the single byte at hs->pos (the '>' ending a tag) as a
 * TAG_NAME_CLOSE token, clear the close flag and step past it.
 * The next state is data, or eof if that byte was the last one.
 */
static int h5_state_tag_name_close(h5_state_t* hs)
{
    TRACE();

    hs->token_type = TAG_NAME_CLOSE;
    hs->token_start = hs->s + hs->pos;
    hs->token_len = 1;
    hs->is_close = 0;

    hs->pos += 1;
    hs->state = (hs->pos < hs->len) ? h5_state_data : h5_state_eof;

    return 1;
}
|
260
|
+
|
261
|
+
/**
|
262
|
+
* 12.2.4.10
|
263
|
+
*/
|
264
|
+
/*
 * Tag name state: scan from hs->pos to the end of the tag name.
 *
 * NUL bytes are stepped over and stay part of the name (non-standard; some
 * old browsers apparently allow and ignore them).  The name ends at:
 *   whitespace -> TAG_NAME_OPEN, next state before_attribute_name
 *   '/'        -> TAG_NAME_OPEN, next state self_closing_start_tag
 *   '>'        -> TAG_CLOSE if hs->is_close is set (flag cleared, '>' consumed,
 *                 next state data); otherwise TAG_NAME_OPEN with hs->pos left
 *                 on the '>' for h5_state_tag_name_close
 *   end input  -> TAG_NAME_OPEN covering the rest, next state eof
 *
 * Always returns 1.
 */
static int h5_state_tag_name(h5_state_t* hs)
{
    size_t cur;

    TRACE();
    for (cur = hs->pos; cur < hs->len; ++cur) {
        const char c = hs->s[cur];

        /* NUL must be tested first: h5_is_white() would accept it */
        if (c == CHAR_NULL) {
            continue;
        }
        if (c != CHAR_SLASH && c != CHAR_GT && !h5_is_white(c)) {
            continue;
        }

        /* c terminates the name */
        hs->token_start = hs->s + hs->pos;
        hs->token_len = cur - hs->pos;

        if (c == CHAR_GT) {
            if (hs->is_close) {
                hs->pos = cur + 1;
                hs->is_close = 0;
                hs->token_type = TAG_CLOSE;
                hs->state = h5_state_data;
            } else {
                hs->pos = cur;
                hs->token_type = TAG_NAME_OPEN;
                hs->state = h5_state_tag_name_close;
            }
        } else {
            hs->token_type = TAG_NAME_OPEN;
            hs->pos = cur + 1;
            hs->state = (c == CHAR_SLASH) ? h5_state_self_closing_start_tag
                                          : h5_state_before_attribute_name;
        }
        return 1;
    }

    /* ran out of input inside the name */
    hs->token_start = hs->s + hs->pos;
    hs->token_len = hs->len - hs->pos;
    hs->token_type = TAG_NAME_OPEN;
    hs->state = h5_state_eof;
    return 1;
}
|
317
|
+
|
318
|
+
/**
|
319
|
+
* 12.2.4.34
|
320
|
+
*/
|
321
|
+
/*
 * Before attribute name state: skip whitespace, then dispatch on the next
 * byte.
 *
 *   end input -> return 0
 *   '/'       -> consume it, self-closing start tag
 *   '>'       -> report it as TAG_NAME_CLOSE, next state data
 *   other     -> attribute name
 *
 * The '>' branch ends the current tag, so hs->is_close is cleared there,
 * matching h5_state_tag_name_close.  If the flag were left set after a close
 * tag written with trailing whitespace ("</a >"), the next start tag in the
 * input would be reported as TAG_CLOSE by h5_state_tag_name.
 */
static int h5_state_before_attribute_name(h5_state_t* hs)
{
    int ch;

    TRACE();
    ch = h5_skip_white(hs);
    switch (ch) {
    case CHAR_EOF: {
        return 0;
    }
    case CHAR_SLASH: {
        hs->pos += 1;
        return h5_state_self_closing_start_tag(hs);
    }
    case CHAR_GT: {
        hs->is_close = 0;
        hs->state = h5_state_data;
        hs->token_start = hs->s + hs->pos;
        hs->token_len = 1;
        hs->token_type = TAG_NAME_CLOSE;
        hs->pos += 1;
        return 1;
    }
    default: {
        return h5_state_attribute_name(hs);
    }
    }
}
|
348
|
+
|
349
|
+
/*
 * Attribute name state: report an ATTR_NAME token starting at hs->pos.
 *
 * The byte at hs->pos is always taken as part of the name; scanning for the
 * end starts one byte later.  The name ends at:
 *   whitespace -> next state after_attribute_name
 *   '/'        -> next state self_closing_start_tag
 *   '='        -> next state before_attribute_value
 *   '>'        -> next state tag_name_close, hs->pos left on the '>'
 *   end input  -> next state eof, hs->pos set to hs->len
 * In the first three cases the terminating byte is consumed.
 *
 * Always returns 1.
 */
static int h5_state_attribute_name(h5_state_t* hs)
{
    size_t cur;

    TRACE();
    for (cur = hs->pos + 1; cur < hs->len; ++cur) {
        const char c = hs->s[cur];
        const int is_ws = h5_is_white(c);

        if (!is_ws && c != CHAR_SLASH && c != CHAR_EQUALS && c != CHAR_GT) {
            continue;
        }

        hs->token_start = hs->s + hs->pos;
        hs->token_len = cur - hs->pos;
        hs->token_type = ATTR_NAME;

        if (is_ws) {
            hs->state = h5_state_after_attribute_name;
            hs->pos = cur + 1;
        } else if (c == CHAR_SLASH) {
            hs->state = h5_state_self_closing_start_tag;
            hs->pos = cur + 1;
        } else if (c == CHAR_EQUALS) {
            hs->state = h5_state_before_attribute_value;
            hs->pos = cur + 1;
        } else {
            /* '>' is left in place for h5_state_tag_name_close */
            hs->state = h5_state_tag_name_close;
            hs->pos = cur;
        }
        return 1;
    }

    /* EOF */
    hs->token_start = hs->s + hs->pos;
    hs->token_len = hs->len - hs->pos;
    hs->token_type = ATTR_NAME;
    hs->state = h5_state_eof;
    hs->pos = hs->len;
    return 1;
}
|
398
|
+
|
399
|
+
/**
|
400
|
+
* 12.2.4.36
|
401
|
+
*/
|
402
|
+
/*
 * After attribute name state: skip whitespace, then dispatch on the next
 * byte.
 *
 *   end input -> return 0
 *   '/'       -> consume it, self-closing start tag
 *   '='       -> consume it, before attribute value
 *   '>'       -> tag name close
 *   other     -> another attribute name
 */
static int h5_state_after_attribute_name(h5_state_t* hs)
{
    int next;

    TRACE();
    next = h5_skip_white(hs);

    if (next == CHAR_EOF) {
        return 0;
    }
    if (next == CHAR_GT) {
        return h5_state_tag_name_close(hs);
    }
    if (next == CHAR_SLASH) {
        hs->pos += 1;
        return h5_state_self_closing_start_tag(hs);
    }
    if (next == CHAR_EQUALS) {
        hs->pos += 1;
        return h5_state_before_attribute_value(hs);
    }
    return h5_state_attribute_name(hs);
}
|
428
|
+
|
429
|
+
/**
|
430
|
+
* 12.2.4.37
|
431
|
+
*/
|
432
|
+
/*
 * Before attribute value state: skip whitespace, then choose the value
 * handler from the opening byte.
 *
 *   end input -> next state eof, return 0
 *   '"'       -> double-quoted value
 *   '\''      -> single-quoted value
 *   '`'       -> back-quoted value (non-standard, IE)
 *   other     -> unquoted value
 */
static int h5_state_before_attribute_value(h5_state_t* hs)
{
    int opener;

    TRACE();
    opener = h5_skip_white(hs);

    switch (opener) {
    case CHAR_EOF:
        hs->state = h5_state_eof;
        return 0;
    case CHAR_DOUBLE:
        return h5_state_attribute_value_double_quote(hs);
    case CHAR_SINGLE:
        return h5_state_attribute_value_single_quote(hs);
    case CHAR_TICK:
        /* NON STANDARD IE */
        return h5_state_attribute_value_back_quote(hs);
    default:
        return h5_state_attribute_value_no_quote(hs);
    }
}
|
455
|
+
|
456
|
+
|
457
|
+
/*
 * Shared body of the quoted attribute value states.
 *
 * Reports the bytes up to the next `qchar` as an ATTR_VALUE token.
 *
 * When hs->pos is greater than 0 the byte at hs->pos is taken to be the
 * opening quote and is stepped over.  At position 0 nothing is skipped:
 * that means scanning started directly in a value context, so an input
 * beginning with the quote character yields a zero-length value.
 *
 * If the closing quote is found it is consumed and the next state is
 * after_attribute_value_quoted_state; otherwise the rest of the input is
 * the value and the next state is eof.
 *
 * Always returns 1.
 */
static int h5_state_attribute_value_quote(h5_state_t* hs, char qchar)
{
    const char* value;
    const char* closing;

    TRACE();

    if (hs->pos > 0) {
        hs->pos += 1;
    }

    value = hs->s + hs->pos;
    closing = (const char*) memchr(value, qchar, hs->len - hs->pos);

    hs->token_start = value;
    hs->token_type = ATTR_VALUE;

    if (closing != NULL) {
        hs->token_len = (size_t)(closing - value);
        hs->state = h5_state_after_attribute_value_quoted_state;
        hs->pos += hs->token_len + 1; /* value plus the closing quote */
    } else {
        hs->token_len = hs->len - hs->pos;
        hs->state = h5_state_eof;
    }
    return 1;
}
|
488
|
+
|
489
|
+
static
|
490
|
+
int h5_state_attribute_value_double_quote(h5_state_t* hs)
|
491
|
+
{
|
492
|
+
TRACE();
|
493
|
+
return h5_state_attribute_value_quote(hs, CHAR_DOUBLE);
|
494
|
+
}
|
495
|
+
|
496
|
+
static
|
497
|
+
int h5_state_attribute_value_single_quote(h5_state_t* hs)
|
498
|
+
{
|
499
|
+
TRACE();
|
500
|
+
return h5_state_attribute_value_quote(hs, CHAR_SINGLE);
|
501
|
+
}
|
502
|
+
|
503
|
+
static
|
504
|
+
int h5_state_attribute_value_back_quote(h5_state_t* hs)
|
505
|
+
{
|
506
|
+
TRACE();
|
507
|
+
return h5_state_attribute_value_quote(hs, CHAR_TICK);
|
508
|
+
}
|
509
|
+
|
510
|
+
/*
 * Unquoted attribute value state: report an ATTR_VALUE token starting at
 * hs->pos.
 *
 * The value ends at:
 *   whitespace -> consumed, next state before_attribute_name
 *   '>'        -> left in place, next state tag_name_close
 *   end input  -> value is the rest of the input, next state eof
 *
 * Always returns 1.
 */
static int h5_state_attribute_value_no_quote(h5_state_t* hs)
{
    size_t cur;

    TRACE();
    for (cur = hs->pos; cur < hs->len; ++cur) {
        const char c = hs->s[cur];
        const int ends_tag = (c == CHAR_GT);

        if (!ends_tag && !h5_is_white(c)) {
            continue;
        }

        hs->token_type = ATTR_VALUE;
        hs->token_start = hs->s + hs->pos;
        hs->token_len = cur - hs->pos;
        if (ends_tag) {
            hs->pos = cur;
            hs->state = h5_state_tag_name_close;
        } else {
            hs->pos = cur + 1;
            hs->state = h5_state_before_attribute_name;
        }
        return 1;
    }

    TRACE();
    /* EOF */
    hs->token_type = ATTR_VALUE;
    hs->token_start = hs->s + hs->pos;
    hs->token_len = hs->len - hs->pos;
    hs->state = h5_state_eof;
    return 1;
}
|
544
|
+
|
545
|
+
/**
|
546
|
+
* 12.2.4.41
|
547
|
+
*/
|
548
|
+
/*
 * After a quoted attribute value: hs->pos is just past the closing quote.
 *
 *   end input  -> return 0
 *   whitespace -> consume it, before attribute name
 *   '/'        -> consume it, self-closing start tag
 *   '>'        -> report it as TAG_NAME_CLOSE, next state data
 *   other      -> before attribute name, nothing consumed
 *
 * The '>' branch ends the current tag, so hs->is_close is cleared there,
 * matching h5_state_tag_name_close.  Leaving it set after a close tag that
 * carries an attribute would make h5_state_tag_name report the next start
 * tag in the input as TAG_CLOSE.
 */
static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs)
{
    char ch;

    TRACE();
    if (hs->pos >= hs->len) {
        return 0;
    }
    ch = hs->s[hs->pos];
    if (h5_is_white(ch)) {
        hs->pos += 1;
        return h5_state_before_attribute_name(hs);
    } else if (ch == CHAR_SLASH) {
        hs->pos += 1;
        return h5_state_self_closing_start_tag(hs);
    } else if (ch == CHAR_GT) {
        hs->is_close = 0;
        hs->token_start = hs->s + hs->pos;
        hs->token_len = 1;
        hs->token_type = TAG_NAME_CLOSE;
        hs->pos += 1;
        hs->state = h5_state_data;
        return 1;
    } else {
        return h5_state_before_attribute_name(hs);
    }
}
|
574
|
+
|
575
|
+
/**
|
576
|
+
* 12.2.4.43
|
577
|
+
*/
|
578
|
+
/*
 * Self-closing start tag state: hs->pos is just past a '/'.
 *
 *   end input -> return 0
 *   '>'       -> report the two bytes "/>" as TAG_NAME_SELFCLOSE,
 *                next state data
 *   other     -> the '/' was not a terminator; before attribute name
 *
 * The "/>" branch ends the current tag, so hs->is_close is cleared there,
 * matching h5_state_tag_name_close.  Leaving it set after input such as
 * "</a/>" would make h5_state_tag_name report the next start tag in the
 * input as TAG_CLOSE.
 */
static int h5_state_self_closing_start_tag(h5_state_t* hs)
{
    char ch;

    TRACE();
    if (hs->pos >= hs->len) {
        return 0;
    }
    ch = hs->s[hs->pos];
    if (ch == CHAR_GT) {
        /* the token starts on the '/' one byte back */
        assert(hs->pos > 0);
        hs->is_close = 0;
        hs->token_start = hs->s + hs->pos -1;
        hs->token_len = 2;
        hs->token_type = TAG_NAME_SELFCLOSE;
        hs->state = h5_state_data;
        hs->pos += 1;
        return 1;
    } else {
        return h5_state_before_attribute_name(hs);
    }
}
|
599
|
+
|
600
|
+
/**
|
601
|
+
* 12.2.4.44
|
602
|
+
*/
|
603
|
+
/*
 * Bogus comment state: report everything from hs->pos up to the next '>'
 * as a TAG_COMMENT token.
 *
 * If a '>' is found it is consumed and the next state is data; otherwise
 * the comment runs to the end of the input, hs->pos is set to hs->len and
 * the next state is eof.
 *
 * Always returns 1.
 */
static int h5_state_bogus_comment(h5_state_t* hs)
{
    const char* body;
    const char* gt;

    TRACE();
    body = hs->s + hs->pos;
    gt = (const char*) memchr(body, CHAR_GT, hs->len - hs->pos);

    hs->token_start = body;
    hs->token_type = TAG_COMMENT;

    if (gt != NULL) {
        hs->token_len = (size_t)(gt - body);
        hs->pos = (size_t)(gt - hs->s) + 1;
        hs->state = h5_state_data;
    } else {
        hs->token_len = hs->len - hs->pos;
        hs->pos = hs->len;
        hs->state = h5_state_eof;
    }
    return 1;
}
|
624
|
+
|
625
|
+
/**
|
626
|
+
* 12.2.4.44 ALT
|
627
|
+
*/
|
628
|
+
/*
 * Alternative bogus comment state for the non-standard "<% ... %>" form:
 * report everything from hs->pos up to the next "%>" as a TAG_COMMENT token.
 *
 * If "%>" is found both bytes are consumed and the next state is data.
 * If no '%' is left, or the only one found is the last byte of the input,
 * the comment runs to the end, hs->pos is set to hs->len and the next state
 * is eof.
 *
 * Always returns 1.
 */
static int h5_state_bogus_comment2(h5_state_t* hs)
{
    const char* const end = hs->s + hs->len;
    const char* pct;
    size_t scan;

    TRACE();
    scan = hs->pos;
    for (;;) {
        pct = (const char*) memchr(hs->s + scan, CHAR_PERCENT, hs->len - scan);

        if (pct == NULL || pct + 1 >= end) {
            /* no terminator can follow */
            hs->token_start = hs->s + hs->pos;
            hs->token_len = hs->len - hs->pos;
            hs->pos = hs->len;
            hs->token_type = TAG_COMMENT;
            hs->state = h5_state_eof;
            return 1;
        }

        if (pct[1] == CHAR_GT) {
            /* ends in %> */
            hs->token_start = hs->s + hs->pos;
            hs->token_len = (size_t)(pct - hs->s) - hs->pos;
            hs->pos = (size_t)(pct - hs->s) + 2;
            hs->state = h5_state_data;
            hs->token_type = TAG_COMMENT;
            return 1;
        }

        /* a lone '%': keep looking after it */
        scan = (size_t)(pct - hs->s) + 1;
    }
}
|
660
|
+
|
661
|
+
/**
|
662
|
+
* 8.2.4.45
|
663
|
+
*/
|
664
|
+
/*
 * Markup declaration open state: hs->pos is just past "<!".
 *
 * Looks at the bytes that follow, in this order:
 *   "DOCTYPE" (each letter in either case) -> doctype; hs->pos is not moved
 *   "[CDATA[" (exact case)                 -> consume 7 bytes, cdata
 *   "--"                                   -> consume 2 bytes, comment
 *   anything else                          -> bogus comment
 */
static int h5_state_markup_declaration_open(h5_state_t* hs)
{
    static const char doctype_upper[] = "DOCTYPE";
    static const char doctype_lower[] = "doctype";
    const char* p;
    size_t avail;
    size_t i;

    TRACE();
    p = hs->s + hs->pos;
    avail = hs->len - hs->pos;

    if (avail >= 7) {
        /* case insensitive */
        for (i = 0; i < 7; ++i) {
            if (p[i] != doctype_upper[i] && p[i] != doctype_lower[i]) {
                break;
            }
        }
        if (i == 7) {
            return h5_state_doctype(hs);
        }

        /* upper case required */
        if (memcmp(p, "[CDATA[", 7) == 0) {
            hs->pos += 7;
            return h5_state_cdata(hs);
        }
    }

    if (avail >= 2 && p[0] == CHAR_DASH && p[1] == CHAR_DASH) {
        hs->pos += 2;
        return h5_state_comment(hs);
    }

    return h5_state_bogus_comment(hs);
}
|
702
|
+
|
703
|
+
/**
|
704
|
+
* 12.2.4.48
|
705
|
+
* 12.2.4.49
|
706
|
+
* 12.2.4.50
|
707
|
+
* 12.2.4.51
|
708
|
+
* state machine spec is confusing since it can only look
|
709
|
+
* at one character at a time, but put simply, comments end by:
|
710
|
+
* 1) EOF
|
711
|
+
* 2) ending in -->
|
712
|
+
* 3) ending in -!>
|
713
|
+
*/
|
714
|
+
/*
 * Report the rest of the input, from hs->pos, as one TAG_COMMENT token and
 * make eof the next state.  hs->pos itself is left unchanged.
 */
static int h5_comment_to_eof(h5_state_t* hs)
{
    hs->state = h5_state_eof;
    hs->token_start = hs->s + hs->pos;
    hs->token_len = hs->len - hs->pos;
    hs->token_type = TAG_COMMENT;
    return 1;
}

/*
 * Comment state: hs->pos is just past "<!--".
 *
 * Searches for a terminator of the form
 *     '-'  NUL*  ('-' | '!')  '>'
 * i.e. "-->" or "-!>", with any number of NUL bytes tolerated after the
 * first dash.  The TAG_COMMENT token is everything before that first dash;
 * the terminator is consumed and the next state is data.
 *
 * If no terminator is found the comment runs to the end of the input and
 * the next state is eof.
 *
 * Always returns 1.
 */
static int h5_state_comment(h5_state_t* hs)
{
    const char* const end = hs->s + hs->len;
    const char* dash;
    const char* p;
    size_t scan;

    TRACE();
    scan = hs->pos;
    for (;;) {
        dash = (const char*) memchr(hs->s + scan, CHAR_DASH, hs->len - scan);

        /* did not find anything or has less than 3 chars left */
        if (dash == NULL || dash > end - 3) {
            return h5_comment_to_eof(hs);
        }

        /* where to resume if this dash turns out not to start a terminator */
        scan = (size_t)(dash - hs->s) + 1;

        /* skip all nulls */
        p = dash + 1;
        while (p < end && *p == 0) {
            ++p;
        }
        if (p == end) {
            return h5_comment_to_eof(hs);
        }

        if (*p != CHAR_DASH && *p != CHAR_BANG) {
            continue;
        }

        ++p;
        if (p == end) {
            return h5_comment_to_eof(hs);
        }

        if (*p != CHAR_GT) {
            continue;
        }
        ++p;

        /* ends in --> or -!> */
        hs->token_start = hs->s + hs->pos;
        hs->token_len = (size_t)(dash - hs->s) - hs->pos;
        hs->pos = (size_t)(p - hs->s);
        hs->state = h5_state_data;
        hs->token_type = TAG_COMMENT;
        return 1;
    }
}
|
797
|
+
|
798
|
+
/*
 * CDATA state: hs->pos is just past "<![CDATA[".
 *
 * Reports everything up to the next "]]>" as a DATA_TEXT token, consumes
 * the terminator and continues in the data state.  If no terminator is
 * found the token is the rest of the input and the next state is eof.
 *
 * Always returns 1.
 */
static int h5_state_cdata(h5_state_t* hs)
{
    const char* const end = hs->s + hs->len;
    const char* rb;
    size_t scan;

    TRACE();
    scan = hs->pos;
    for (;;) {
        rb = (const char*) memchr(hs->s + scan, CHAR_RIGHTB, hs->len - scan);

        /* did not find anything or has less than 3 chars left */
        if (rb == NULL || rb > end - 3) {
            hs->state = h5_state_eof;
            hs->token_start = hs->s + hs->pos;
            hs->token_len = hs->len - hs->pos;
            hs->token_type = DATA_TEXT;
            return 1;
        }

        if (rb[1] == CHAR_RIGHTB && rb[2] == CHAR_GT) {
            hs->state = h5_state_data;
            hs->token_start = hs->s + hs->pos;
            hs->token_len = (size_t)(rb - hs->s) - hs->pos;
            hs->pos = (size_t)(rb - hs->s) + 3;
            hs->token_type = DATA_TEXT;
            return 1;
        }

        /* a ']' that does not start "]]>": keep looking after it */
        scan = (size_t)(rb - hs->s) + 1;
    }
}
|
827
|
+
|
828
|
+
/**
|
829
|
+
* 8.2.4.52
|
830
|
+
* http://www.w3.org/html/wg/drafts/html/master/syntax.html#doctype-state
|
831
|
+
*/
|
832
|
+
/*
 * Doctype state: report everything from hs->pos up to the next '>' as a
 * DOCTYPE token.
 *
 * If a '>' is found it is consumed and the next state is data; otherwise
 * the token is the rest of the input and the next state is eof.
 *
 * Always returns 1.
 */
static int h5_state_doctype(h5_state_t* hs)
{
    const char* body;
    const char* gt;

    TRACE();
    body = hs->s + hs->pos;
    hs->token_start = body;
    hs->token_type = DOCTYPE;

    gt = (const char*) memchr(body, CHAR_GT, hs->len - hs->pos);
    if (gt != NULL) {
        hs->state = h5_state_data;
        hs->token_len = (size_t)(gt - body);
        hs->pos = (size_t)(gt - hs->s) + 1;
    } else {
        hs->state = h5_state_eof;
        hs->token_len = hs->len - hs->pos;
    }
    return 1;
}
|