nokogumbo 0.3 → 0.4

Sign up to get free protection for your applications and to get access to all the features.
data/work/tokenizer.h ADDED
@@ -0,0 +1,123 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // This contains an implementation of a tokenizer for HTML5. It consumes a
18
+ // buffer of UTF-8 characters, and then emits a stream of tokens.
19
+
20
+ #ifndef GUMBO_TOKENIZER_H_
21
+ #define GUMBO_TOKENIZER_H_
22
+
23
+ #include <stdbool.h>
24
+ #include <stddef.h>
25
+
26
+ #include "gumbo.h"
27
+ #include "token_type.h"
28
+ #include "tokenizer_states.h"
29
+
30
+ #ifdef __cplusplus
31
+ extern "C" {
32
+ #endif
33
+
34
+ struct _GumboParser;
35
+
36
+ // Struct containing all information pertaining to doctype tokens.
37
+ typedef struct _GumboTokenDocType {
38
+ const char* name;
39
+ const char* public_identifier;
40
+ const char* system_identifier;
41
+ bool force_quirks;
42
+ // There's no way to tell a 0-length public or system ID apart from the
43
+ // absence of a public or system ID, but they're handled differently by the
44
+ // spec, so we need bool flags for them.
45
+ bool has_public_identifier;  // Distinguishes an empty public ID from a missing one.
46
+ bool has_system_identifier;  // Distinguishes an empty system ID from a missing one.
47
+ } GumboTokenDocType;
48
+
49
+ // Struct containing all information pertaining to start tag tokens.
50
+ typedef struct _GumboTokenStartTag {
51
+ GumboTag tag;
52
+ GumboVector /* GumboAttribute */ attributes;
53
+ bool is_self_closing;  // NOTE(review): presumably set for "<tag/>" syntax - confirm in the tokenizer.
54
+ } GumboTokenStartTag;
55
+
56
+ // A data structure representing a single token in the input stream. This
57
+ // contains an enum for the type, the source position, a GumboStringPiece
58
+ // pointing to the original text, and then a union for any parsed data.
59
+ typedef struct _GumboToken {
60
+ GumboTokenType type;
61
+ GumboSourcePosition position;
62
+ GumboStringPiece original_text;
63
+ union {
64
+ GumboTokenDocType doc_type;  // For doctype tokens.
65
+ GumboTokenStartTag start_tag;  // For start tag tokens.
66
+ GumboTag end_tag;  // Presumably for end tag tokens (inferred from the name).
67
+ const char* text; // For comments.
68
+ int character; // For character, whitespace, null, and EOF tokens.
69
+ } v;
70
+ } GumboToken;
71
+
72
+ // Initializes the tokenizer state within the GumboParser object, setting up a
73
+ // parse of the specified text.
74
+ void gumbo_tokenizer_state_init(
75
+ struct _GumboParser* parser, const char* text, size_t text_length);
76
+
77
+ // Destroys the tokenizer state within the GumboParser object, freeing any
78
+ // dynamically-allocated structures within it.
79
+ void gumbo_tokenizer_state_destroy(struct _GumboParser* parser);
80
+
81
+ // Sets the tokenizer state to the specified value. This is needed by some
82
+ // parser states, which alter the state of the tokenizer in response to tags
83
+ // seen.
84
+ void gumbo_tokenizer_set_state(
85
+ struct _GumboParser* parser, GumboTokenizerEnum state);
86
+
87
+ // Flags whether the current node is a foreign content element. This is
88
+ // necessary for the markup declaration open state, where the tokenizer must be
89
+ // aware of the state of the parser to properly tokenize bad comment tags.
90
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state
91
+ void gumbo_tokenizer_set_is_current_node_foreign(
92
+ struct _GumboParser* parser, bool is_foreign);
93
+
94
+ // Lexes a single token from the specified buffer, filling the output with the
95
+ // parsed GumboToken data structure. Returns true for a successful
96
+ // tokenization, false if a parse error occurs.
97
+ //
98
+ // Example:
99
+ // struct _GumboParser parser;
100
+ // GumboToken output;
101
+ // gumbo_tokenizer_state_init(&parser, text, strlen(text));
102
+ // while (gumbo_lex(&parser, &output)) {
103
+ // ...do stuff with output.
104
+ // gumbo_token_destroy(&parser, &output);
105
+ // }
106
+ // gumbo_tokenizer_state_destroy(&parser);
107
+ bool gumbo_lex(struct _GumboParser* parser, GumboToken* output);
108
+
109
+ // Frees the internally-allocated pointers within a GumboToken. Note that this
110
+ // doesn't free the token itself, since oftentimes it will be allocated on the
111
+ // stack. A simple call to free() (or struct _GumboParser->deallocator, if
112
+ // appropriate) can handle that.
113
+ //
114
+ // Note that if you are handing over ownership of the internal strings to some
115
+ // other data structure - for example, a parse tree - these do not need to be
116
+ // freed.
117
+ void gumbo_token_destroy(struct _GumboParser* parser, GumboToken* token);
118
+
119
+ #ifdef __cplusplus
120
+ }
121
+ #endif
122
+
123
+ #endif // GUMBO_TOKENIZER_H_
@@ -0,0 +1,103 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // This contains the list of states used in the tokenizer. Although at first
18
+ // glance it seems like these could be kept internal to the tokenizer, several
19
+ // of the actions in the parser require that it reach into the tokenizer and
20
+ // reset the tokenizer state. For that to work, it needs to have the
21
+ // definitions of individual states available.
22
+ //
23
+ // This may also be useful for providing more detailed error messages for parse
24
+ // errors, as we can match up states and inputs in a table without having to
25
+ // clutter the tokenizer code with lots of precise error messages.
26
+
27
+ #ifndef GUMBO_TOKENIZER_STATES_H_
28
+ #define GUMBO_TOKENIZER_STATES_H_
29
+
30
+ // The ordering of this enum is also used to build the dispatch table for the
31
+ // tokenizer state machine, so if it is changed, be sure to update that too.
32
+ typedef enum _GumboTokenizerEnum {
33
+ GUMBO_LEX_DATA,  // 0; no enumerator has an explicit value, so states number consecutively from here.
34
+ GUMBO_LEX_CHAR_REF_IN_DATA,
35
+ GUMBO_LEX_RCDATA,
36
+ GUMBO_LEX_CHAR_REF_IN_RCDATA,
37
+ GUMBO_LEX_RAWTEXT,
38
+ GUMBO_LEX_SCRIPT,
39
+ GUMBO_LEX_PLAINTEXT,
40
+ GUMBO_LEX_TAG_OPEN,
41
+ GUMBO_LEX_END_TAG_OPEN,
42
+ GUMBO_LEX_TAG_NAME,
43
+ GUMBO_LEX_RCDATA_LT,
44
+ GUMBO_LEX_RCDATA_END_TAG_OPEN,
45
+ GUMBO_LEX_RCDATA_END_TAG_NAME,
46
+ GUMBO_LEX_RAWTEXT_LT,
47
+ GUMBO_LEX_RAWTEXT_END_TAG_OPEN,
48
+ GUMBO_LEX_RAWTEXT_END_TAG_NAME,
49
+ GUMBO_LEX_SCRIPT_LT,
50
+ GUMBO_LEX_SCRIPT_END_TAG_OPEN,
51
+ GUMBO_LEX_SCRIPT_END_TAG_NAME,
52
+ GUMBO_LEX_SCRIPT_ESCAPED_START,
53
+ GUMBO_LEX_SCRIPT_ESCAPED_START_DASH,
54
+ GUMBO_LEX_SCRIPT_ESCAPED,
55
+ GUMBO_LEX_SCRIPT_ESCAPED_DASH,
56
+ GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH,
57
+ GUMBO_LEX_SCRIPT_ESCAPED_LT,
58
+ GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN,
59
+ GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME,
60
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START,
61
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED,
62
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH,
63
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH,
64
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT,
65
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END,
66
+ GUMBO_LEX_BEFORE_ATTR_NAME,
67
+ GUMBO_LEX_ATTR_NAME,
68
+ GUMBO_LEX_AFTER_ATTR_NAME,
69
+ GUMBO_LEX_BEFORE_ATTR_VALUE,
70
+ GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED,
71
+ GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED,
72
+ GUMBO_LEX_ATTR_VALUE_UNQUOTED,
73
+ GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE,
74
+ GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED,
75
+ GUMBO_LEX_SELF_CLOSING_START_TAG,
76
+ GUMBO_LEX_BOGUS_COMMENT,
77
+ GUMBO_LEX_MARKUP_DECLARATION,
78
+ GUMBO_LEX_COMMENT_START,
79
+ GUMBO_LEX_COMMENT_START_DASH,
80
+ GUMBO_LEX_COMMENT,
81
+ GUMBO_LEX_COMMENT_END_DASH,
82
+ GUMBO_LEX_COMMENT_END,
83
+ GUMBO_LEX_COMMENT_END_BANG,
84
+ GUMBO_LEX_DOCTYPE,
85
+ GUMBO_LEX_BEFORE_DOCTYPE_NAME,
86
+ GUMBO_LEX_DOCTYPE_NAME,
87
+ GUMBO_LEX_AFTER_DOCTYPE_NAME,
88
+ GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD,
89
+ GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID,
90
+ GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED,
91
+ GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED,
92
+ GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID,
93
+ GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID,
94
+ GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD,
95
+ GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID,
96
+ GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED,
97
+ GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED,
98
+ GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID,
99
+ GUMBO_LEX_BOGUS_DOCTYPE,
100
+ GUMBO_LEX_CDATA  // Last state; the enum defines no count/sentinel value after it.
101
+ } GumboTokenizerEnum;
102
+
103
+ #endif // GUMBO_TOKENIZER_STATES_H_
data/work/utf8.c ADDED
@@ -0,0 +1,268 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "utf8.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdint.h>
21
+ #include <string.h>
22
+ #include <strings.h> // For strncasecmp.
23
+
24
+ #include "error.h"
25
+ #include "gumbo.h"
26
+ #include "parser.h"
27
+ #include "util.h"
28
+ #include "vector.h"
29
+
30
+ const int kUtf8ReplacementChar = 0xFFFD;
31
+
32
+ // Reference material:
33
+ // Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
34
+ // RFC 3629: http://tools.ietf.org/html/rfc3629
35
+ // HTML5 Unicode handling:
36
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
37
+
38
+ // Adds a decoding error to the parser's error list, based on the current state
39
+ // of the Utf8Iterator.
40
+ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
41
+ GumboParser* parser = iter->_parser;
42
+
43
+ GumboError* error = gumbo_add_error(parser);
44
+ if (!error) {
45
+ return;
46
+ }
47
+ error->type = type;
48
+ error->position = iter->_pos;
49
+ error->original_text = iter->_start;
50
+
51
+ // At the point the error is recorded, the code point hasn't been computed
52
+ // yet (and can't be, because it's invalid), so we need to build up the raw
53
+ // hex value from the bytes under the cursor.
54
+ uint64_t code_point = 0;
55
+ for (int i = 0; i < iter->_width; ++i) {
56
+ code_point = (code_point << 8) | (unsigned char) iter->_start[i];
57
+ }
58
+ error->v.codepoint = code_point;
59
+ }
60
+
61
+ // Reads the next UTF-8 character in the iter.
62
+ // This assumes that iter->_start points to the beginning of the character.
63
+ // When this method returns, iter->_width and iter->_current will be set
64
+ // appropriately, as well as any error flags.
65
+ static void read_char(Utf8Iterator* iter) {
66
+ unsigned char c;
67
+ unsigned char mask = '\0';
68
+ int is_bad_char = false;
69
+
70
+ c = (unsigned char) *iter->_start;
71
+ if (c < 0x80) {
72
+ // Valid one-byte sequence.
73
+ iter->_width = 1;
74
+ mask = 0xFF;
75
+ } else if (c < 0xC0) {
76
+ // Continuation character not following a multibyte sequence.
77
+ // The HTML5 spec here says to consume the byte and output a replacement
78
+ // character.
79
+ iter->_width = 1;
80
+ is_bad_char = true;
81
+ } else if (c < 0xE0) {
82
+ iter->_width = 2;
83
+ mask = 0x1F; // 00011111 in binary.
84
+ if (c < 0xC2) {
85
+ // Overlong encoding; error according to UTF8/HTML5 spec.
86
+ is_bad_char = true;
87
+ }
88
+ } else if (c < 0xF0) {
89
+ iter->_width = 3;
90
+ mask = 0xF; // 00001111 in binary.
91
+ } else if (c < 0xF5) {
92
+ iter->_width = 4;
93
+ mask = 0x7; // 00000111 in binary.
94
+ } else if (c < 0xF8) {
95
+ // The following cases are all errors, but we need to handle them separately
96
+ // so that we consume the proper number of bytes from the input stream
97
+ // before replacing them with the replacement char. The HTML5 spec
98
+ // specifies that we should consume the shorter of the length specified by
99
+ // the first bit or the run leading up to the first non-continuation
100
+ // character.
101
+ iter->_width = 5;
102
+ is_bad_char = true;
103
+ } else if (c < 0xFC) {
104
+ iter->_width = 6;
105
+ is_bad_char = true;
106
+ } else if (c < 0xFE) {
107
+ iter->_width = 7;
108
+ is_bad_char = true;
109
+ } else {
110
+ iter->_width = 1;
111
+ is_bad_char = true;
112
+ }
113
+
114
+ // Check to make sure we have enough bytes left in the iter to read all that
115
+ // we want. If not, we set the iter_truncated flag, mark this as a bad
116
+ // character, and adjust the current width so that it consumes the rest of the
117
+ // iter.
118
+ int code_point = c & mask;
119
+ if (iter->_start + iter->_width > iter->_end) {
120
+ iter->_width = iter->_end - iter->_start;
121
+ add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
122
+ is_bad_char = true;
123
+ }
124
+
125
+ // Now we decode continuation bytes, shift them appropriately, and build up
126
+ // the appropriate code point.
127
+ assert(iter->_width < 8);
128
+ for (int i = 1; i < iter->_width; ++i) {
129
+ c = (unsigned char) iter->_start[i];
130
+ if (c < 0x80 || c > 0xBF) {
131
+ // Per HTML5 spec, we don't include the invalid continuation char in the
132
+ // run that we consume here.
133
+ iter->_width = i;
134
+ is_bad_char = true;
135
+ break;
136
+ }
137
+ code_point = (code_point << 6) | (c & ~0x80);
138
+ }
139
+
140
+ // If we had a decode error, set the current code point to the replacement
141
+ // character and flip the flag indicating that a decode error occurred.
142
+ // Ditto if we have a code point that is explicitly on the list of characters
143
+ // prohibited by the HTML5 spec, such as control characters.
144
+ if (is_bad_char || utf8_is_invalid_code_point(code_point)) {
145
+ add_error(iter, GUMBO_ERR_UTF8_INVALID);
146
+ code_point = kUtf8ReplacementChar;
147
+ }
148
+
149
+ // This is the special handling for carriage returns that is mandated by the
150
+ // HTML5 spec. Since we're looking for particular 7-bit literal characters,
151
+ // we operate in terms of chars and only need a check for iter overrun,
152
+ // instead of having to read in a full next code point.
153
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
154
+ if (code_point == '\r') {
155
+ const char* next = iter->_start + iter->_width;
156
+ if (next < iter->_end && *next == '\n') {
157
+ // Advance the iter, as if the carriage return didn't exist.
158
+ ++iter->_start;
159
+ // Preserve the true offset, since other tools that look at it may be
160
+ // unaware of HTML5's rules for converting \r into \n.
161
+ ++iter->_pos.offset;
162
+ }
163
+ code_point = '\n';
164
+ }
165
+
166
+ // At this point, we know we have a valid character as the code point, so we
167
+ // set it, and we're done.
168
+ iter->_current = code_point;
169
+ }
170
+
171
+ static void update_position(Utf8Iterator* iter) {
172
+ iter->_pos.offset += iter->_width;
173
+ if (iter->_current == '\n') {
174
+ ++iter->_pos.line;
175
+ iter->_pos.column = 1;
176
+ } else if(iter->_current == '\t') {
177
+ int tab_stop = iter->_parser->_options->tab_stop;
178
+ iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
179
+ } else {
180
+ ++iter->_pos.column;
181
+ }
182
+ }
183
+
184
// Reports whether a code point belongs to the set this file treats as
// forbidden: the control ranges 0x01-0x08, 0x0B, 0x0E-0x1F and 0x7F-0x9F, the
// block 0xFDD0-0xFDEF, and any value whose low 16 bits are 0xFFFE or 0xFFFF.
bool utf8_is_invalid_code_point(int c) {
  if (c >= 0x1 && c <= 0x8) {
    return true;
  }
  if (c == 0xB) {
    return true;
  }
  if (c >= 0xE && c <= 0x1F) {
    return true;
  }
  if (c >= 0x7F && c <= 0x9F) {
    return true;
  }
  if (c >= 0xFDD0 && c <= 0xFDEF) {
    return true;
  }
  // Only the low 16 bits matter here, so the last two values of every
  // 0x10000-sized block are matched.
  const int low_bits = c & 0xFFFF;
  return low_bits == 0xFFFE || low_bits == 0xFFFF;
}
191
+
192
+ void utf8iterator_init(
193
+ GumboParser* parser, const char* source, size_t source_length,
194
+ Utf8Iterator* iter) {
195
+ iter->_start = source;
196
+ iter->_end = source + source_length;
197
+ iter->_width = 0;
198
+ iter->_pos.line = 1;
199
+ iter->_pos.column = 1;
200
+ iter->_pos.offset = 0;
201
+ iter->_parser = parser;
202
+ if (source_length) {
203
+ read_char(iter);
204
+ } else {
205
+ iter->_current = -1;
206
+ }
207
+ }
208
+
209
+ void utf8iterator_next(Utf8Iterator* iter) {
210
+ iter->_start += iter->_width;
211
+ // We update positions based on the *last* character read, so that the first
212
+ // character following a newline is at column 1 in the next line.
213
+ update_position(iter);
214
+ if (iter->_start < iter->_end) {
215
+ read_char(iter);
216
+ } else { // EOF
217
+ iter->_current = -1;
218
+ }
219
+ }
220
+
221
+ int utf8iterator_current(const Utf8Iterator* iter) {
222
+ return iter->_current;
223
+ }
224
+
225
+ void utf8iterator_get_position(
226
+ const Utf8Iterator* iter, GumboSourcePosition* output) {
227
+ *output = iter->_pos;
228
+ }
229
+
230
+ const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
231
+ return iter->_start;
232
+ }
233
+
234
+ bool utf8iterator_maybe_consume_match(
235
+ Utf8Iterator* iter, const char* prefix, size_t length,
236
+ bool case_sensitive) {
237
+ bool matched = (iter->_start + length <= iter->_end) && (case_sensitive ?
238
+ !strncmp(iter->_start, prefix, length) :
239
+ !strncasecmp(iter->_start, prefix, length));
240
+ if (matched) {
241
+ for (int i = 0; i < length; ++i) {
242
+ utf8iterator_next(iter);
243
+ }
244
+ return true;
245
+ } else {
246
+ return false;
247
+ }
248
+ }
249
+
250
+ void utf8iterator_mark(Utf8Iterator* iter) {
251
+ iter->_mark = iter->_start;
252
+ iter->_mark_pos = iter->_pos;
253
+ }
254
+
255
+ // Returns the current input stream position to the mark.
256
+ void utf8iterator_reset(Utf8Iterator* iter) {
257
+ iter->_start = iter->_mark;
258
+ iter->_pos = iter->_mark_pos;
259
+ read_char(iter);
260
+ }
261
+
262
+ // Sets the position and original text fields of an error to the value at the
263
+ // mark.
264
+ void utf8iterator_fill_error_at_mark(
265
+ Utf8Iterator* iter, GumboError* error) {
266
+ error->position = iter->_mark_pos;
267
+ error->original_text = iter->_mark;
268
+ }