nokogumbo 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
data/work/tokenizer.h DELETED
@@ -1,123 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // This contains an implementation of a tokenizer for HTML5. It consumes a
18
- // buffer of UTF-8 characters, and then emits a stream of tokens.
19
-
20
- #ifndef GUMBO_TOKENIZER_H_
21
- #define GUMBO_TOKENIZER_H_
22
-
23
- #include <stdbool.h>
24
- #include <stddef.h>
25
-
26
- #include "gumbo.h"
27
- #include "token_type.h"
28
- #include "tokenizer_states.h"
29
-
30
- #ifdef __cplusplus
31
- extern "C" {
32
- #endif
33
-
34
- struct _GumboParser;
35
-
36
- // Struct containing all information pertaining to doctype tokens.
37
- typedef struct _GumboTokenDocType {  // Payload of a doctype token; stored as the doc_type member of GumboToken.v.
38
- const char* name;  // Ownership of this string is not visible in this header.
39
- const char* public_identifier;  // May be zero-length; see has_public_identifier.
40
- const char* system_identifier;  // May be zero-length; see has_system_identifier.
41
- bool force_quirks;  // NOTE(review): presumably the HTML5 "force-quirks" flag - confirm against the tokenizer.
42
- // There's no way to tell a 0-length public or system ID apart from the
43
- // absence of a public or system ID, but they're handled differently by the
44
- // spec, so we need bool flags for them.
45
- bool has_public_identifier;
46
- bool has_system_identifier;
47
- } GumboTokenDocType;
48
-
49
- // Struct containing all information pertaining to start tag tokens.
50
- typedef struct _GumboTokenStartTag {  // Payload of a start-tag token; stored as the start_tag member of GumboToken.v.
51
- GumboTag tag;
52
- GumboVector /* GumboAttribute */ attributes;  // Element type per the inline annotation; who frees the vector is not visible here.
53
- bool is_self_closing;
54
- } GumboTokenStartTag;
55
-
56
- // A data structure representing a single token in the input stream. This
57
- // contains an enum for the type, the source position, a GumboStringPiece
58
- // pointing to the original text, and then a union for any parsed data.
59
- typedef struct _GumboToken {  // One lexed token: kind, source location, raw text, and kind-specific data.
60
- GumboTokenType type;  // NOTE(review): presumably selects which member of `v` is meaningful - confirm in the tokenizer.
61
- GumboSourcePosition position;
62
- GumboStringPiece original_text;  // Per the comment above, points into the original input text.
63
- union {  // Members share storage, so only one of them holds a value at a time.
64
- GumboTokenDocType doc_type;
65
- GumboTokenStartTag start_tag;
66
- GumboTag end_tag;
67
- const char* text; // For comments.
68
- int character; // For character, whitespace, null, and EOF tokens.
69
- } v;
70
- } GumboToken;
71
-
72
- // Initializes the tokenizer state within the GumboParser object, setting up a
73
- // parse of the specified text.
74
- void gumbo_tokenizer_state_init(
75
- struct _GumboParser* parser, const char* text, size_t text_length);
76
-
77
- // Destroys the tokenizer state within the GumboParser object, freeing any
78
- // dynamically-allocated structures within it.
79
- void gumbo_tokenizer_state_destroy(struct _GumboParser* parser);
80
-
81
- // Sets the tokenizer state to the specified value. This is needed by some
82
- // parser states, which alter the state of the tokenizer in response to tags
83
- // seen.
84
- void gumbo_tokenizer_set_state(
85
- struct _GumboParser* parser, GumboTokenizerEnum state);
86
-
87
- // Flags whether the current node is a foreign content element. This is
88
- // necessary for the markup declaration open state, where the tokenizer must be
89
- // aware of the state of the parser to properly tokenize bad comment tags.
90
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state
91
- void gumbo_tokenizer_set_is_current_node_foreign(
92
- struct _GumboParser* parser, bool is_foreign);
93
-
94
- // Lexes a single token from the specified buffer, filling the output with the
95
- // parsed GumboToken data structure. Returns true for a successful
96
- // tokenization, false if a parse error occurs.
97
- //
98
- // Example:
99
- // struct _GumboParser parser;
100
- // GumboToken output;
101
- // gumbo_tokenizer_state_init(&parser, text, strlen(text));
102
- // while (gumbo_lex(&parser, &output)) {
103
- // ...do stuff with output.
104
- // gumbo_token_destroy(&parser, &output);
105
- // }
106
- // gumbo_tokenizer_state_destroy(&parser);
107
- bool gumbo_lex(struct _GumboParser* parser, GumboToken* output);
108
-
109
- // Frees the internally-allocated pointers within a GumboToken. Note that this
110
- // doesn't free the token itself, since oftentimes it will be allocated on the
111
- // stack. A simple call to free() (or struct _GumboParser->deallocator, if
112
- // appropriate) can handle that.
113
- //
114
- // Note that if you are handing over ownership of the internal strings to some
115
- // other data structure - for example, a parse tree - these do not need to be
116
- // freed.
117
- void gumbo_token_destroy(struct _GumboParser* parser, GumboToken* token);
118
-
119
- #ifdef __cplusplus
120
- }
121
- #endif
122
-
123
- #endif // GUMBO_TOKENIZER_H_
@@ -1,103 +0,0 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // This contains the list of states used in the tokenizer. Although at first
18
- // glance it seems like these could be kept internal to the tokenizer, several
19
- // of the actions in the parser require that it reach into the tokenizer and
20
- // reset the tokenizer state. For that to work, it needs to have the
21
- // definitions of individual states available.
22
- //
23
- // This may also be useful for providing more detailed error messages for parse
24
- // errors, as we can match up states and inputs in a table without having to
25
- // clutter the tokenizer code with lots of precise error messages.
26
-
27
- #ifndef GUMBO_TOKENIZER_STATES_H_
28
- #define GUMBO_TOKENIZER_STATES_H_
29
-
30
- // The ordering of this enum is also used to build the dispatch table for the
31
- // tokenizer state machine, so if it is changed, be sure to update that too.
32
- typedef enum _GumboTokenizerEnum {  // Tokenizer states; no enumerator has an explicit value, so each is one more than the previous.
33
- GUMBO_LEX_DATA,  // First enumerator, so its value is 0.
34
- GUMBO_LEX_CHAR_REF_IN_DATA,
35
- GUMBO_LEX_RCDATA,
36
- GUMBO_LEX_CHAR_REF_IN_RCDATA,
37
- GUMBO_LEX_RAWTEXT,
38
- GUMBO_LEX_SCRIPT,
39
- GUMBO_LEX_PLAINTEXT,
40
- GUMBO_LEX_TAG_OPEN,
41
- GUMBO_LEX_END_TAG_OPEN,
42
- GUMBO_LEX_TAG_NAME,
43
- GUMBO_LEX_RCDATA_LT,
44
- GUMBO_LEX_RCDATA_END_TAG_OPEN,
45
- GUMBO_LEX_RCDATA_END_TAG_NAME,
46
- GUMBO_LEX_RAWTEXT_LT,
47
- GUMBO_LEX_RAWTEXT_END_TAG_OPEN,
48
- GUMBO_LEX_RAWTEXT_END_TAG_NAME,
49
- GUMBO_LEX_SCRIPT_LT,
50
- GUMBO_LEX_SCRIPT_END_TAG_OPEN,
51
- GUMBO_LEX_SCRIPT_END_TAG_NAME,
52
- GUMBO_LEX_SCRIPT_ESCAPED_START,
53
- GUMBO_LEX_SCRIPT_ESCAPED_START_DASH,
54
- GUMBO_LEX_SCRIPT_ESCAPED,
55
- GUMBO_LEX_SCRIPT_ESCAPED_DASH,
56
- GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH,
57
- GUMBO_LEX_SCRIPT_ESCAPED_LT,
58
- GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN,
59
- GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME,
60
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START,
61
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED,
62
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH,
63
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH,
64
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT,
65
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END,
66
- GUMBO_LEX_BEFORE_ATTR_NAME,
67
- GUMBO_LEX_ATTR_NAME,
68
- GUMBO_LEX_AFTER_ATTR_NAME,
69
- GUMBO_LEX_BEFORE_ATTR_VALUE,
70
- GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED,
71
- GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED,
72
- GUMBO_LEX_ATTR_VALUE_UNQUOTED,
73
- GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE,
74
- GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED,
75
- GUMBO_LEX_SELF_CLOSING_START_TAG,
76
- GUMBO_LEX_BOGUS_COMMENT,
77
- GUMBO_LEX_MARKUP_DECLARATION,
78
- GUMBO_LEX_COMMENT_START,
79
- GUMBO_LEX_COMMENT_START_DASH,
80
- GUMBO_LEX_COMMENT,
81
- GUMBO_LEX_COMMENT_END_DASH,
82
- GUMBO_LEX_COMMENT_END,
83
- GUMBO_LEX_COMMENT_END_BANG,
84
- GUMBO_LEX_DOCTYPE,
85
- GUMBO_LEX_BEFORE_DOCTYPE_NAME,
86
- GUMBO_LEX_DOCTYPE_NAME,
87
- GUMBO_LEX_AFTER_DOCTYPE_NAME,
88
- GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD,
89
- GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID,
90
- GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED,
91
- GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED,
92
- GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID,
93
- GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID,
94
- GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD,
95
- GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID,
96
- GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED,
97
- GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED,
98
- GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID,
99
- GUMBO_LEX_BOGUS_DOCTYPE,
100
- GUMBO_LEX_CDATA  // Last enumerator; append new states with care, since the order feeds the dispatch table.
101
- } GumboTokenizerEnum;
102
-
103
- #endif // GUMBO_TOKENIZER_STATES_H_
data/work/utf8.c DELETED
@@ -1,268 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "utf8.h"
18
-
19
- #include <assert.h>
20
- #include <stdint.h>
21
- #include <string.h>
22
- #include <strings.h> // For strncasecmp.
23
-
24
- #include "error.h"
25
- #include "gumbo.h"
26
- #include "parser.h"
27
- #include "util.h"
28
- #include "vector.h"
29
-
30
- const int kUtf8ReplacementChar = 0xFFFD;  // Code point read_char() substitutes for any sequence it rejects.
31
-
32
- // Reference material:
33
- // Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
34
- // RFC 3629: http://tools.ietf.org/html/rfc3629
35
- // HTML5 Unicode handling:
36
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
37
-
38
- // Adds a decoding error to the parser's error list, based on the current state
39
- // of the Utf8Iterator.
40
- static void add_error(Utf8Iterator* iter, GumboErrorType type) {
41
- GumboParser* parser = iter->_parser;
42
-
43
- GumboError* error = gumbo_add_error(parser);
44
- if (!error) {
45
- return;
46
- }
47
- error->type = type;
48
- error->position = iter->_pos;
49
- error->original_text = iter->_start;
50
-
51
- // At the point the error is recorded, the code point hasn't been computed
52
- // yet (and can't be, because it's invalid), so we need to build up the raw
53
- // hex value from the bytes under the cursor.
54
- uint64_t code_point = 0;
55
- for (int i = 0; i < iter->_width; ++i) {
56
- code_point = (code_point << 8) | (unsigned char) iter->_start[i];
57
- }
58
- error->v.codepoint = code_point;
59
- }
60
-
61
- // Reads the next UTF-8 character in the iter.
62
- // This assumes that iter->_start points to the beginning of the character.
63
- // When this method returns, iter->_width and iter->_current will be set
64
- // appropriately, as well as any error flags.
65
- static void read_char(Utf8Iterator* iter) {
66
- unsigned char c;
67
- unsigned char mask = '\0';
68
- int is_bad_char = false;
69
-
70
- c = (unsigned char) *iter->_start;
71
- if (c < 0x80) {
72
- // Valid one-byte sequence.
73
- iter->_width = 1;
74
- mask = 0xFF;
75
- } else if (c < 0xC0) {
76
- // Continuation character not following a multibyte sequence.
77
- // The HTML5 spec here says to consume the byte and output a replacement
78
- // character.
79
- iter->_width = 1;
80
- is_bad_char = true;
81
- } else if (c < 0xE0) {
82
- iter->_width = 2;
83
- mask = 0x1F; // 00011111 in binary.
84
- if (c < 0xC2) {
85
- // Overlong encoding; error according to UTF8/HTML5 spec.
86
- is_bad_char = true;
87
- }
88
- } else if (c < 0xF0) {
89
- iter->_width = 3;
90
- mask = 0xF; // 00001111 in binary.
91
- } else if (c < 0xF5) {
92
- iter->_width = 4;
93
- mask = 0x7; // 00000111 in binary.
94
- } else if (c < 0xF8) {
95
- // The following cases are all errors, but we need to handle them separately
96
- // so that we consume the proper number of bytes from the input stream
97
- // before replacing them with the replacement char. The HTML5 spec
98
- // specifies that we should consume the shorter of the length specified by
99
- // the first bit or the run leading up to the first non-continuation
100
- // character.
101
- iter->_width = 5;
102
- is_bad_char = true;
103
- } else if (c < 0xFC) {
104
- iter->_width = 6;
105
- is_bad_char = true;
106
- } else if (c < 0xFE) {
107
- iter->_width = 7;
108
- is_bad_char = true;
109
- } else {
110
- iter->_width = 1;
111
- is_bad_char = true;
112
- }
113
-
114
- // Check to make sure we have enough bytes left in the iter to read all that
115
- // we want. If not, we set the iter_truncated flag, mark this as a bad
116
- // character, and adjust the current width so that it consumes the rest of the
117
- // iter.
118
- int code_point = c & mask;
119
- if (iter->_start + iter->_width > iter->_end) {
120
- iter->_width = iter->_end - iter->_start;
121
- add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
122
- is_bad_char = true;
123
- }
124
-
125
- // Now we decode continuation bytes, shift them appropriately, and build up
126
- // the appropriate code point.
127
- assert(iter->_width < 8);
128
- for (int i = 1; i < iter->_width; ++i) {
129
- c = (unsigned char) iter->_start[i];
130
- if (c < 0x80 || c > 0xBF) {
131
- // Per HTML5 spec, we don't include the invalid continuation char in the
132
- // run that we consume here.
133
- iter->_width = i;
134
- is_bad_char = true;
135
- break;
136
- }
137
- code_point = (code_point << 6) | (c & ~0x80);
138
- }
139
-
140
- // If we had a decode error, set the current code point to the replacement
141
- // character and flip the flag indicating that a decode error occurred.
142
- // Ditto if we have a code point that is explicitly on the list of characters
143
- // prohibited by the HTML5 spec, such as control characters.
144
- if (is_bad_char || utf8_is_invalid_code_point(code_point)) {
145
- add_error(iter, GUMBO_ERR_UTF8_INVALID);
146
- code_point = kUtf8ReplacementChar;
147
- }
148
-
149
- // This is the special handling for carriage returns that is mandated by the
150
- // HTML5 spec. Since we're looking for particular 7-bit literal characters,
151
- // we operate in terms of chars and only need a check for iter overrun,
152
- // instead of having to read in a full next code point.
153
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
154
- if (code_point == '\r') {
155
- const char* next = iter->_start + iter->_width;
156
- if (next < iter->_end && *next == '\n') {
157
- // Advance the iter, as if the carriage return didn't exist.
158
- ++iter->_start;
159
- // Preserve the true offset, since other tools that look at it may be
160
- // unaware of HTML5's rules for converting \r into \n.
161
- ++iter->_pos.offset;
162
- }
163
- code_point = '\n';
164
- }
165
-
166
- // At this point, we know we have a valid character as the code point, so we
167
- // set it, and we're done.
168
- iter->_current = code_point;
169
- }
170
-
171
- static void update_position(Utf8Iterator* iter) {
172
- iter->_pos.offset += iter->_width;
173
- if (iter->_current == '\n') {
174
- ++iter->_pos.line;
175
- iter->_pos.column = 1;
176
- } else if(iter->_current == '\t') {
177
- int tab_stop = iter->_parser->_options->tab_stop;
178
- iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
179
- } else {
180
- ++iter->_pos.column;
181
- }
182
- }
183
-
184
// Reports whether code point `c` belongs to one of the ranges this decoder
// refuses to pass through: 0x01-0x08, 0x0B, 0x0E-0x1F, 0x7F-0x9F,
// 0xFDD0-0xFDEF, and any value whose low 16 bits are 0xFFFE or 0xFFFF.
bool utf8_is_invalid_code_point(int c) {
  if (c >= 0x1 && c <= 0x8) {
    return true;
  }
  if (c == 0xB) {
    return true;
  }
  if (c >= 0xE && c <= 0x1F) {
    return true;
  }
  if (c >= 0x7F && c <= 0x9F) {
    return true;
  }
  if (c >= 0xFDD0 && c <= 0xFDEF) {
    return true;
  }
  // The last two values of every 64K plane: U+FFFE, U+FFFF, U+1FFFE, ...
  const int low16 = c & 0xFFFF;
  return low16 == 0xFFFE || low16 == 0xFFFF;
}
191
-
192
- void utf8iterator_init(
193
- GumboParser* parser, const char* source, size_t source_length,
194
- Utf8Iterator* iter) {
195
- iter->_start = source;
196
- iter->_end = source + source_length;
197
- iter->_width = 0;
198
- iter->_pos.line = 1;
199
- iter->_pos.column = 1;
200
- iter->_pos.offset = 0;
201
- iter->_parser = parser;
202
- if (source_length) {
203
- read_char(iter);
204
- } else {
205
- iter->_current = -1;
206
- }
207
- }
208
-
209
- void utf8iterator_next(Utf8Iterator* iter) {
210
- iter->_start += iter->_width;
211
- // We update positions based on the *last* character read, so that the first
212
- // character following a newline is at column 1 in the next line.
213
- update_position(iter);
214
- if (iter->_start < iter->_end) {
215
- read_char(iter);
216
- } else { // EOF
217
- iter->_current = -1;
218
- }
219
- }
220
-
221
- int utf8iterator_current(const Utf8Iterator* iter) {
222
- return iter->_current;
223
- }
224
-
225
- void utf8iterator_get_position(
226
- const Utf8Iterator* iter, GumboSourcePosition* output) {
227
- *output = iter->_pos;
228
- }
229
-
230
- const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
231
- return iter->_start;
232
- }
233
-
234
- bool utf8iterator_maybe_consume_match(
235
- Utf8Iterator* iter, const char* prefix, size_t length,
236
- bool case_sensitive) {
237
- bool matched = (iter->_start + length <= iter->_end) && (case_sensitive ?
238
- !strncmp(iter->_start, prefix, length) :
239
- !strncasecmp(iter->_start, prefix, length));
240
- if (matched) {
241
- for (int i = 0; i < length; ++i) {
242
- utf8iterator_next(iter);
243
- }
244
- return true;
245
- } else {
246
- return false;
247
- }
248
- }
249
-
250
- void utf8iterator_mark(Utf8Iterator* iter) {
251
- iter->_mark = iter->_start;
252
- iter->_mark_pos = iter->_pos;
253
- }
254
-
255
- // Returns the current input stream position to the mark.
256
- void utf8iterator_reset(Utf8Iterator* iter) {
257
- iter->_start = iter->_mark;
258
- iter->_pos = iter->_mark_pos;
259
- read_char(iter);
260
- }
261
-
262
- // Sets the position and original text fields of an error to the value at the
263
- // mark.
264
- void utf8iterator_fill_error_at_mark(
265
- Utf8Iterator* iter, GumboError* error) {
266
- error->position = iter->_mark_pos;
267
- error->original_text = iter->_mark;
268
- }