nokogumbo 1.4.8 → 1.4.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,123 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // This contains an implementation of a tokenizer for HTML5. It consumes a
18
+ // buffer of UTF-8 characters, and then emits a stream of tokens.
19
+
20
+ #ifndef GUMBO_TOKENIZER_H_
21
+ #define GUMBO_TOKENIZER_H_
22
+
23
+ #include <stdbool.h>
24
+ #include <stddef.h>
25
+
26
+ #include "gumbo.h"
27
+ #include "token_type.h"
28
+ #include "tokenizer_states.h"
29
+
30
+ #ifdef __cplusplus
31
+ extern "C" {
32
+ #endif
33
+
34
+ struct GumboInternalParser;
35
+
36
+ // Struct containing all information pertaining to doctype tokens. GumboToken embeds one as the `doc_type` member of its union.
37
+ typedef struct GumboInternalTokenDocType {
38
+ const char* name;
39
+ const char* public_identifier;
40
+ const char* system_identifier;
41
+ bool force_quirks;
42
+ // There's no way to tell a 0-length public or system ID apart from the
43
+ // absence of a public or system ID, but they're handled differently by the
44
+ // spec, so we need bool flags for them.
45
+ bool has_public_identifier;
46
+ bool has_system_identifier;
47
+ } GumboTokenDocType;
48
+
49
+ // Struct containing all information pertaining to start tag tokens. GumboToken embeds one as the `start_tag` member of its union.
50
+ typedef struct GumboInternalTokenStartTag {
51
+ GumboTag tag;
52
+ GumboVector /* GumboAttribute */ attributes;
53
+ bool is_self_closing;
54
+ } GumboTokenStartTag;
55
+
56
+ // A data structure representing a single token in the input stream. This
57
+ // contains an enum for the type, the source position, a GumboStringPiece
58
+ // pointing to the original text, and then a union for any parsed data. gumbo_lex() fills one in; gumbo_token_destroy() frees its internal pointers.
59
+ typedef struct GumboInternalToken {
60
+ GumboTokenType type;
61
+ GumboSourcePosition position;
62
+ GumboStringPiece original_text;
63
+ union {
64
+ GumboTokenDocType doc_type; // Parsed doctype data; see GumboTokenDocType.
65
+ GumboTokenStartTag start_tag; // Parsed start tag data; see GumboTokenStartTag.
66
+ GumboTag end_tag; // Presumably the tag named by an end tag token - TODO confirm.
67
+ const char* text; // For comments.
68
+ int character; // For character, whitespace, null, and EOF tokens.
69
+ } v;
70
+ } GumboToken;
71
+
72
+ // Initializes the tokenizer state within the GumboParser object, setting up a
73
+ // parse of the specified text. The gumbo_lex() usage example passes strlen(text) as text_length.
74
+ void gumbo_tokenizer_state_init(
75
+ struct GumboInternalParser* parser, const char* text, size_t text_length);
76
+
77
+ // Destroys the tokenizer state within the GumboParser object, freeing any
78
+ // dynamically-allocated structures within it. Counterpart of gumbo_tokenizer_state_init().
79
+ void gumbo_tokenizer_state_destroy(struct GumboInternalParser* parser);
80
+
81
+ // Sets the tokenizer state to the specified value. This is needed by some
82
+ // parser states, which alter the state of the tokenizer in response to tags
83
+ // seen. `state` is a GumboTokenizerEnum value, declared in tokenizer_states.h.
84
+ void gumbo_tokenizer_set_state(
85
+ struct GumboInternalParser* parser, GumboTokenizerEnum state);
86
+
87
+ // Flags whether the current node is a foreign content element (is_foreign is true when it is). This is
88
+ // necessary for the markup declaration open state, where the tokenizer must be
89
+ // aware of the state of the parser to properly tokenize bad comment tags.
90
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state
91
+ void gumbo_tokenizer_set_is_current_node_foreign(
92
+ struct GumboInternalParser* parser, bool is_foreign);
93
+
94
+ // Lexes a single token from the specified buffer, filling the output with the
95
+ // parsed GumboToken data structure. Returns true for a successful
96
+ // tokenization, false if a parse error occurs.
97
+ //
98
+ // Example:
99
+ // struct GumboInternalParser parser;
100
+ // GumboToken output;
101
+ // gumbo_tokenizer_state_init(&parser, text, strlen(text));
102
+ // while (gumbo_lex(&parser, &output)) {
103
+ // ...do stuff with output.
104
+ // gumbo_token_destroy(&parser, &output);
105
+ // }
106
+ // gumbo_tokenizer_state_destroy(&parser);
107
+ bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
108
+
109
+ // Frees the internally-allocated pointers within a GumboToken. Note that this
110
+ // doesn't free the token itself, since oftentimes it will be allocated on the
111
+ // stack. A simple call to free() (or GumboParser->deallocator, if
112
+ // appropriate) can handle that.
113
+ //
114
+ // Note that if you are handing over ownership of the internal strings to some
115
+ // other data structure - for example, a parse tree - these do not need to be
116
+ // freed.
117
+ void gumbo_token_destroy(struct GumboInternalParser* parser, GumboToken* token);
118
+
119
+ #ifdef __cplusplus
120
+ }
121
+ #endif
122
+
123
+ #endif // GUMBO_TOKENIZER_H_
@@ -0,0 +1,103 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // This contains the list of states used in the tokenizer. Although at first
18
+ // glance it seems like these could be kept internal to the tokenizer, several
19
+ // of the actions in the parser require that it reach into the tokenizer and
20
+ // reset the tokenizer state. For that to work, it needs to have the
21
+ // definitions of individual states available.
22
+ //
23
+ // This may also be useful for providing more detailed error messages for parse
24
+ // errors, as we can match up states and inputs in a table without having to
25
+ // clutter the tokenizer code with lots of precise error messages.
26
+
27
+ #ifndef GUMBO_TOKENIZER_STATES_H_
28
+ #define GUMBO_TOKENIZER_STATES_H_
29
+
30
+ // The ordering of this enum is also used to build the dispatch table for the
31
+ // tokenizer state machine, so if it is changed, be sure to update that too. No enumerator is given an explicit value, so they are numbered consecutively from GUMBO_LEX_DATA == 0.
32
+ typedef enum {
33
+ GUMBO_LEX_DATA, // == 0, the first enumerator.
34
+ GUMBO_LEX_CHAR_REF_IN_DATA,
35
+ GUMBO_LEX_RCDATA,
36
+ GUMBO_LEX_CHAR_REF_IN_RCDATA,
37
+ GUMBO_LEX_RAWTEXT,
38
+ GUMBO_LEX_SCRIPT,
39
+ GUMBO_LEX_PLAINTEXT,
40
+ GUMBO_LEX_TAG_OPEN,
41
+ GUMBO_LEX_END_TAG_OPEN,
42
+ GUMBO_LEX_TAG_NAME,
43
+ GUMBO_LEX_RCDATA_LT,
44
+ GUMBO_LEX_RCDATA_END_TAG_OPEN,
45
+ GUMBO_LEX_RCDATA_END_TAG_NAME,
46
+ GUMBO_LEX_RAWTEXT_LT,
47
+ GUMBO_LEX_RAWTEXT_END_TAG_OPEN,
48
+ GUMBO_LEX_RAWTEXT_END_TAG_NAME,
49
+ GUMBO_LEX_SCRIPT_LT,
50
+ GUMBO_LEX_SCRIPT_END_TAG_OPEN,
51
+ GUMBO_LEX_SCRIPT_END_TAG_NAME,
52
+ GUMBO_LEX_SCRIPT_ESCAPED_START,
53
+ GUMBO_LEX_SCRIPT_ESCAPED_START_DASH,
54
+ GUMBO_LEX_SCRIPT_ESCAPED,
55
+ GUMBO_LEX_SCRIPT_ESCAPED_DASH,
56
+ GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH,
57
+ GUMBO_LEX_SCRIPT_ESCAPED_LT,
58
+ GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN,
59
+ GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME,
60
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START,
61
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED,
62
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH,
63
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH,
64
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT,
65
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END,
66
+ GUMBO_LEX_BEFORE_ATTR_NAME,
67
+ GUMBO_LEX_ATTR_NAME,
68
+ GUMBO_LEX_AFTER_ATTR_NAME,
69
+ GUMBO_LEX_BEFORE_ATTR_VALUE,
70
+ GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED,
71
+ GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED,
72
+ GUMBO_LEX_ATTR_VALUE_UNQUOTED,
73
+ GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE,
74
+ GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED,
75
+ GUMBO_LEX_SELF_CLOSING_START_TAG,
76
+ GUMBO_LEX_BOGUS_COMMENT,
77
+ GUMBO_LEX_MARKUP_DECLARATION,
78
+ GUMBO_LEX_COMMENT_START,
79
+ GUMBO_LEX_COMMENT_START_DASH,
80
+ GUMBO_LEX_COMMENT,
81
+ GUMBO_LEX_COMMENT_END_DASH,
82
+ GUMBO_LEX_COMMENT_END,
83
+ GUMBO_LEX_COMMENT_END_BANG,
84
+ GUMBO_LEX_DOCTYPE,
85
+ GUMBO_LEX_BEFORE_DOCTYPE_NAME,
86
+ GUMBO_LEX_DOCTYPE_NAME,
87
+ GUMBO_LEX_AFTER_DOCTYPE_NAME,
88
+ GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD,
89
+ GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID,
90
+ GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED,
91
+ GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED,
92
+ GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID,
93
+ GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID,
94
+ GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD,
95
+ GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID,
96
+ GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED,
97
+ GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED,
98
+ GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID,
99
+ GUMBO_LEX_BOGUS_DOCTYPE,
100
+ GUMBO_LEX_CDATA
101
+ } GumboTokenizerEnum;
102
+
103
+ #endif // GUMBO_TOKENIZER_STATES_H_
@@ -0,0 +1,270 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "utf8.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdint.h>
21
+ #include <string.h>
22
+ #include <strings.h> // For strncasecmp.
23
+
24
+ #include "error.h"
25
+ #include "gumbo.h"
26
+ #include "parser.h"
27
+ #include "util.h"
28
+ #include "vector.h"
29
+
30
+ const int kUtf8ReplacementChar = 0xFFFD;  // read_char() stores this as the current code point for undecodable or forbidden input.
31
+
32
+ // Reference material:
33
+ // Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
34
+ // RFC 3629: http://tools.ietf.org/html/rfc3629
35
+ // HTML5 Unicode handling:
36
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream
37
+ //
38
+ // This implementation is based on a DFA-based decoder by Bjoern Hoehrmann
39
+ // <bjoern@hoehrmann.de>. We wrap the inner table-based decoder routine in our
40
+ // own handling for newlines, tabs, invalid continuation bytes, and other
41
+ // conditions that the HTML5 spec fully specifies but normal UTF8 decoders do
42
+ // not handle.
43
+ // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. Full text of
44
+ // the license agreement and code follows.
45
+
46
+ // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
47
+
48
+ // Permission is hereby granted, free of charge, to any person obtaining a copy
49
+ // of this software and associated documentation files (the "Software"), to deal
50
+ // in the Software without restriction, including without limitation the rights
51
+ // to
52
+ // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
53
+ // of the Software, and to permit persons to whom the Software is furnished to
54
+ // do
55
+ // so, subject to the following conditions:
56
+
57
+ // The above copyright notice and this permission notice shall be included in
58
+ // all copies or substantial portions of the Software.
59
+
60
// DFA states that callers of decode() test for. Every state is a multiple of
// 12 because it doubles as a row offset into the transition table below.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

// Lookup data for the DFA decoder: a 256-entry byte -> character-class map
// followed immediately by the 9 x 12 state-transition table.
static const uint8_t utf8d[] = {
    // The first part of the table maps bytes to character classes, to reduce
    // the size of the transition table and create bitmasks.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x00..0x0F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x10..0x1F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x20..0x2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x30..0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x40..0x4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x50..0x5F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x60..0x6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x70..0x7F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 0x80..0x8F
    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,   // 0x90..0x9F
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,   // 0xA0..0xAF
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,   // 0xB0..0xBF
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   // 0xC0..0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   // 0xD0..0xDF
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,  // 0xE0..0xEF
    11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  // 0xF0..0xFF

    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
    // One row per state, one column per character class (0..11).
    0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72,    // state 0 (accept)
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,   // state 12 (reject)
    12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12,      // state 24
    12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,   // state 36
    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12,   // state 48
    12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,   // state 60
    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,   // state 72
    12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,   // state 84
    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,   // state 96
};

// Feeds one input byte to the UTF-8 DFA.
//
//   state  in/out: current DFA state; start each sequence at UTF8_ACCEPT.
//   codep  in/out: code point accumulated so far; only meaningful once the
//                  state has returned to UTF8_ACCEPT.
//   byte   the next input byte; it indexes a 256-entry table, so it must be in
//          the range 0..255.
//
// Returns the new state: UTF8_ACCEPT when a full code point has been decoded,
// UTF8_REJECT when the byte cannot occur at this point, and any other value
// when more continuation bytes are required.
static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  uint32_t type = utf8d[byte];

  // A lead byte contributes only the payload bits left after masking by its
  // class; a continuation byte appends its low six bits.
  *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
                                   : (0xff >> type) & (byte);

  *state = utf8d[256 + *state + type];
  return *state;
}
97
+
98
+ // END COPIED CODE.
99
+
100
+ // Adds a decoding error to the parser's error list, based on the current state
101
+ // of the Utf8Iterator.
102
+ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
103
+ GumboParser* parser = iter->_parser;
104
+
105
+ GumboError* error = gumbo_add_error(parser);
106
+ if (!error) {
107
+ return;
108
+ }
109
+ error->type = type;
110
+ error->position = iter->_pos;
111
+ error->original_text = iter->_start;
112
+
113
+ // At the point the error is recorded, the code point hasn't been computed
114
+ // yet (and can't be, because it's invalid), so we need to build up the raw
115
+ // hex value from the bytes under the cursor.
116
+ uint64_t code_point = 0;
117
+ for (int i = 0; i < iter->_width; ++i) {
118
+ code_point = (code_point << 8) | (unsigned char) iter->_start[i];
119
+ }
120
+ error->v.codepoint = code_point;
121
+ }
122
+
123
+ // Reads the next UTF-8 character in the iter.
124
+ // This assumes that iter->_start points to the beginning of the character.
125
+ // When this method returns, iter->_width and iter->_current will be set
126
+ // appropriately, as well as any error flags.
127
+ static void read_char(Utf8Iterator* iter) {
128
+ if (iter->_start >= iter->_end) {
129
+ // No input left to consume; emit an EOF and set width = 0.
130
+ iter->_current = -1;
131
+ iter->_width = 0;
132
+ return;
133
+ }
134
+
135
+ uint32_t code_point = 0;
136
+ uint32_t state = UTF8_ACCEPT;
137
+ for (const char* c = iter->_start; c < iter->_end; ++c) {
138
+ decode(&state, &code_point, (uint32_t)(unsigned char) (*c));
139
+ if (state == UTF8_ACCEPT) {
140
+ iter->_width = c - iter->_start + 1;
141
+ // This is the special handling for carriage returns that is mandated by
142
+ // the HTML5 spec. Since we're looking for particular 7-bit literal
143
+ // characters, we operate in terms of chars and only need a check for iter
144
+ // overrun, instead of having to read in a full next code point.
145
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
146
+ if (code_point == '\r') {
147
+ assert(iter->_width == 1);
148
+ const char* next = c + 1;
149
+ if (next < iter->_end && *next == '\n') {
150
+ // Advance the iter, as if the carriage return didn't exist.
151
+ ++iter->_start;
152
+ // Preserve the true offset, since other tools that look at it may be
153
+ // unaware of HTML5's rules for converting \r into \n.
154
+ ++iter->_pos.offset;
155
+ }
156
+ code_point = '\n';
157
+ }
158
+ if (utf8_is_invalid_code_point(code_point)) {
159
+ add_error(iter, GUMBO_ERR_UTF8_INVALID);
160
+ code_point = kUtf8ReplacementChar;
161
+ }
162
+ iter->_current = code_point;
163
+ return;
164
+ } else if (state == UTF8_REJECT) {
165
+ // We don't want to consume the invalid continuation byte of a multi-byte
166
+ // run, but we do want to skip past an invalid first byte.
167
+ iter->_width = c - iter->_start + (c == iter->_start);
168
+ iter->_current = kUtf8ReplacementChar;
169
+ add_error(iter, GUMBO_ERR_UTF8_INVALID);
170
+ return;
171
+ }
172
+ }
173
+ // If we got here without exiting early, then we've reached the end of the
174
+ // iterator. Add an error for truncated input, set the width to consume the
175
+ // rest of the iterator, and emit a replacement character. The next time we
176
+ // enter this method, it will detect that there's no input to consume and
177
+ // output an EOF.
178
+ iter->_current = kUtf8ReplacementChar;
179
+ iter->_width = iter->_end - iter->_start;
180
+ add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
181
+ }
182
+
183
+ static void update_position(Utf8Iterator* iter) {
184
+ iter->_pos.offset += iter->_width;
185
+ if (iter->_current == '\n') {
186
+ ++iter->_pos.line;
187
+ iter->_pos.column = 1;
188
+ } else if (iter->_current == '\t') {
189
+ int tab_stop = iter->_parser->_options->tab_stop;
190
+ iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
191
+ } else if (iter->_current != -1) {
192
+ ++iter->_pos.column;
193
+ }
194
+ }
195
+
196
// Reports whether `c` is on the list of code points that the HTML5 spec
// forbids, such as undefined control characters.
bool utf8_is_invalid_code_point(int c) {
  // Control characters 0x01..0x08, 0x0B and 0x0E..0x1F.
  if (c >= 0x1 && c <= 0x8) {
    return true;
  }
  if (c == 0xB) {
    return true;
  }
  if (c >= 0xE && c <= 0x1F) {
    return true;
  }
  // 0x7F..0x9F.
  if (c >= 0x7F && c <= 0x9F) {
    return true;
  }
  // 0xFDD0..0xFDEF.
  if (c >= 0xFDD0 && c <= 0xFDEF) {
    return true;
  }
  // Any value whose low 16 bits are 0xFFFE or 0xFFFF.
  const int low16 = c & 0xFFFF;
  return low16 == 0xFFFE || low16 == 0xFFFF;
}
203
+
204
+ void utf8iterator_init(GumboParser* parser, const char* source,
205
+ size_t source_length, Utf8Iterator* iter) {
206
+ iter->_start = source;
207
+ iter->_end = source + source_length;
208
+ iter->_pos.line = 1;
209
+ iter->_pos.column = 1;
210
+ iter->_pos.offset = 0;
211
+ iter->_parser = parser;
212
+ read_char(iter);
213
+ }
214
+
215
+ void utf8iterator_next(Utf8Iterator* iter) {  // Advances the iterator by one decoded character.
216
+ // We update positions based on the *last* character read, so that the first
217
+ // character following a newline is at column 1 in the next line.
218
+ update_position(iter);
219
+ iter->_start += iter->_width;  // Step over the bytes of the character just consumed.
220
+ read_char(iter);  // Decode the character now under the cursor.
221
+ }
222
+
223
+ int utf8iterator_current(const Utf8Iterator* iter) { return iter->_current; }
224
+
225
+ void utf8iterator_get_position(
226
+ const Utf8Iterator* iter, GumboSourcePosition* output) {
227
+ *output = iter->_pos;
228
+ }
229
+
230
+ const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
231
+ return iter->_start;
232
+ }
233
+
234
+ const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
235
+ return iter->_end;
236
+ }
237
+
238
+ bool utf8iterator_maybe_consume_match(Utf8Iterator* iter, const char* prefix,
239
+ size_t length, bool case_sensitive) {
240
+ bool matched = (iter->_start + length <= iter->_end) &&
241
+ (case_sensitive ? !strncmp(iter->_start, prefix, length)
242
+ : !strncasecmp(iter->_start, prefix, length));
243
+ if (matched) {
244
+ for (unsigned int i = 0; i < length; ++i) {
245
+ utf8iterator_next(iter);
246
+ }
247
+ return true;
248
+ } else {
249
+ return false;
250
+ }
251
+ }
252
+
253
+ void utf8iterator_mark(Utf8Iterator* iter) {
254
+ iter->_mark = iter->_start;
255
+ iter->_mark_pos = iter->_pos;
256
+ }
257
+
258
+ // Returns the current input stream position to the mark.
259
+ void utf8iterator_reset(Utf8Iterator* iter) {
260
+ iter->_start = iter->_mark;
261
+ iter->_pos = iter->_mark_pos;
262
+ read_char(iter);
263
+ }
264
+
265
+ // Sets the position and original text fields of an error to the value at the
266
+ // mark.
267
+ void utf8iterator_fill_error_at_mark(Utf8Iterator* iter, GumboError* error) {
268
+ error->position = iter->_mark_pos;
269
+ error->original_text = iter->_mark;
270
+ }