ruby-gumbo 1.0.2 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.mkd +28 -31
  4. data/Rakefile +60 -59
  5. data/ext/extconf.rb +17 -9
  6. data/ext/{gumbo.c → ruby_gumbo_ext.c} +29 -28
  7. data/lib/gumbo.rb +19 -0
  8. data/lib/gumbo/element.rb +52 -0
  9. data/lib/gumbo/{extra.rb → node.rb} +19 -22
  10. data/lib/gumbo/text.rb +29 -0
  11. data/vendor/gumbo-parser/src/attribute.c +44 -0
  12. data/vendor/gumbo-parser/src/attribute.h +37 -0
  13. data/vendor/gumbo-parser/src/char_ref.c +2561 -0
  14. data/vendor/gumbo-parser/src/char_ref.h +61 -0
  15. data/vendor/gumbo-parser/src/error.c +258 -0
  16. data/vendor/gumbo-parser/src/error.h +227 -0
  17. data/vendor/gumbo-parser/src/gumbo.h +807 -0
  18. data/vendor/gumbo-parser/src/insertion_mode.h +57 -0
  19. data/vendor/gumbo-parser/src/parser.c +3917 -0
  20. data/vendor/gumbo-parser/src/parser.h +57 -0
  21. data/vendor/gumbo-parser/src/string_buffer.c +106 -0
  22. data/vendor/gumbo-parser/src/string_buffer.h +81 -0
  23. data/vendor/gumbo-parser/src/string_piece.c +49 -0
  24. data/vendor/gumbo-parser/src/string_piece.h +39 -0
  25. data/vendor/gumbo-parser/src/tag.c +225 -0
  26. data/vendor/gumbo-parser/src/token_type.h +40 -0
  27. data/vendor/gumbo-parser/src/tokenizer.c +2980 -0
  28. data/vendor/gumbo-parser/src/tokenizer.h +123 -0
  29. data/vendor/gumbo-parser/src/tokenizer_states.h +103 -0
  30. data/vendor/gumbo-parser/src/utf8.c +275 -0
  31. data/vendor/gumbo-parser/src/utf8.h +127 -0
  32. data/vendor/gumbo-parser/src/util.c +58 -0
  33. data/vendor/gumbo-parser/src/util.h +62 -0
  34. data/vendor/gumbo-parser/src/vector.c +123 -0
  35. data/vendor/gumbo-parser/src/vector.h +69 -0
  36. metadata +40 -10
  37. data/ext/extconf.h +0 -3
@@ -0,0 +1,123 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // This contains an implementation of a tokenizer for HTML5. It consumes a
18
+ // buffer of UTF-8 characters, and then emits a stream of tokens.
19
+
20
+ #ifndef GUMBO_TOKENIZER_H_
21
+ #define GUMBO_TOKENIZER_H_
22
+
23
+ #include <stdbool.h>
24
+ #include <stddef.h>
25
+
26
+ #include "gumbo.h"
27
+ #include "token_type.h"
28
+ #include "tokenizer_states.h"
29
+
30
+ #ifdef __cplusplus
31
+ extern "C" {
32
+ #endif
33
+
34
+ struct GumboInternalParser;
35
+
36
+ // Struct containing all information pertaining to doctype tokens.
37
+ typedef struct GumboInternalTokenDocType {
38
+ const char* name;
39
+ const char* public_identifier;
40
+ const char* system_identifier;
41
+ bool force_quirks;
42
+ // There's no way to tell a 0-length public or system ID apart from the
43
+ // absence of a public or system ID, but they're handled differently by the
44
+ // spec, so we need bool flags for them.
45
+ bool has_public_identifier;
46
+ bool has_system_identifier;
47
+ } GumboTokenDocType;
48
+
49
+ // Struct containing all information pertaining to start tag tokens.
50
+ typedef struct GumboInternalTokenStartTag {
51
+ GumboTag tag;
52
+ GumboVector /* GumboAttribute */ attributes; // Entries are GumboAttribute, per the inline note.
53
+ bool is_self_closing;
54
+ } GumboTokenStartTag;
55
+
56
+ // A data structure representing a single token in the input stream. This
57
+ // contains an enum for the type, the source position, a GumboStringPiece
58
+ // pointing to the original text, and then a union (v) for any parsed data.
59
+ typedef struct GumboInternalToken {
60
+ GumboTokenType type;
61
+ GumboSourcePosition position;
62
+ GumboStringPiece original_text;
63
+ union {
64
+ GumboTokenDocType doc_type; // For doctype tokens.
65
+ GumboTokenStartTag start_tag; // For start tag tokens.
66
+ GumboTag end_tag; // End tags carry only the tag, with no attribute list.
67
+ const char* text; // For comments.
68
+ int character; // For character, whitespace, null, and EOF tokens.
69
+ } v;
70
+ } GumboToken;
71
+
72
+ // Initializes the tokenizer state within the GumboParser object, setting up a
73
+ // parse of the specified text.
74
+ void gumbo_tokenizer_state_init(
75
+ struct GumboInternalParser* parser, const char* text, size_t text_length);
76
+
77
+ // Destroys the tokenizer state within the GumboParser object, freeing any
78
+ // dynamically-allocated structures within it.
79
+ void gumbo_tokenizer_state_destroy(struct GumboInternalParser* parser);
80
+
81
+ // Sets the tokenizer state to the specified value. This is needed by some
82
+ // parser states, which alter the state of the tokenizer in response to tags
83
+ // seen.
84
+ void gumbo_tokenizer_set_state(
85
+ struct GumboInternalParser* parser, GumboTokenizerEnum state);
86
+
87
+ // Flags whether the current node is a foreign content element. This is
88
+ // necessary for the markup declaration open state, where the tokenizer must be
89
+ // aware of the state of the parser to properly tokenize bad comment tags.
90
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state
91
+ void gumbo_tokenizer_set_is_current_node_foreign(
92
+ struct GumboInternalParser* parser, bool is_foreign);
93
+
94
+ // Lexes a single token from the specified buffer, filling the output with the
95
+ // parsed GumboToken data structure. Returns true for a successful
96
+ // tokenization, false if a parse error occurs.
97
+ //
98
+ // Example:
99
+ // struct GumboInternalParser parser;
100
+ // GumboToken output;
101
+ // gumbo_tokenizer_state_init(&parser, text, strlen(text));
102
+ // while (gumbo_lex(&parser, &output)) {
103
+ // ...do stuff with output.
104
+ // gumbo_token_destroy(&parser, &output);
105
+ // }
106
+ // gumbo_tokenizer_state_destroy(&parser);
107
+ bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
108
+
109
+ // Frees the internally-allocated pointers within a GumboToken. Note that this
110
+ // doesn't free the token itself, since oftentimes it will be allocated on the
111
+ // stack. A simple call to free() (or GumboParser->deallocator, if
112
+ // appropriate) can handle that.
113
+ //
114
+ // Note that if you are handing over ownership of the internal strings to some
115
+ // other data structure - for example, a parse tree - these do not need to be
116
+ // freed.
117
+ void gumbo_token_destroy(struct GumboInternalParser* parser, GumboToken* token);
118
+
119
+ #ifdef __cplusplus
120
+ }
121
+ #endif
122
+
123
+ #endif // GUMBO_TOKENIZER_H_
@@ -0,0 +1,103 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // This contains the list of states used in the tokenizer. Although at first
18
+ // glance it seems like these could be kept internal to the tokenizer, several
19
+ // of the actions in the parser require that it reach into the tokenizer and
20
+ // reset the tokenizer state. For that to work, it needs to have the
21
+ // definitions of individual states available.
22
+ //
23
+ // This may also be useful for providing more detailed error messages for parse
24
+ // errors, as we can match up states and inputs in a table without having to
25
+ // clutter the tokenizer code with lots of precise error messages.
26
+
27
+ #ifndef GUMBO_TOKENIZER_STATES_H_
28
+ #define GUMBO_TOKENIZER_STATES_H_
29
+
30
+ // The ordering of this enum is also used to build the dispatch table for the
31
+ // tokenizer state machine, so if it is changed, be sure to update that table too.
32
+ typedef enum {
33
+ GUMBO_LEX_DATA, // No explicit values are given: enumerators count up from 0 in this order.
34
+ GUMBO_LEX_CHAR_REF_IN_DATA,
35
+ GUMBO_LEX_RCDATA,
36
+ GUMBO_LEX_CHAR_REF_IN_RCDATA,
37
+ GUMBO_LEX_RAWTEXT,
38
+ GUMBO_LEX_SCRIPT,
39
+ GUMBO_LEX_PLAINTEXT,
40
+ GUMBO_LEX_TAG_OPEN,
41
+ GUMBO_LEX_END_TAG_OPEN,
42
+ GUMBO_LEX_TAG_NAME,
43
+ GUMBO_LEX_RCDATA_LT,
44
+ GUMBO_LEX_RCDATA_END_TAG_OPEN,
45
+ GUMBO_LEX_RCDATA_END_TAG_NAME,
46
+ GUMBO_LEX_RAWTEXT_LT,
47
+ GUMBO_LEX_RAWTEXT_END_TAG_OPEN,
48
+ GUMBO_LEX_RAWTEXT_END_TAG_NAME,
49
+ GUMBO_LEX_SCRIPT_LT,
50
+ GUMBO_LEX_SCRIPT_END_TAG_OPEN,
51
+ GUMBO_LEX_SCRIPT_END_TAG_NAME,
52
+ GUMBO_LEX_SCRIPT_ESCAPED_START,
53
+ GUMBO_LEX_SCRIPT_ESCAPED_START_DASH,
54
+ GUMBO_LEX_SCRIPT_ESCAPED,
55
+ GUMBO_LEX_SCRIPT_ESCAPED_DASH,
56
+ GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH,
57
+ GUMBO_LEX_SCRIPT_ESCAPED_LT,
58
+ GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN,
59
+ GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME,
60
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START,
61
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED,
62
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH,
63
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH,
64
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT,
65
+ GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END,
66
+ GUMBO_LEX_BEFORE_ATTR_NAME,
67
+ GUMBO_LEX_ATTR_NAME,
68
+ GUMBO_LEX_AFTER_ATTR_NAME,
69
+ GUMBO_LEX_BEFORE_ATTR_VALUE,
70
+ GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED,
71
+ GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED,
72
+ GUMBO_LEX_ATTR_VALUE_UNQUOTED,
73
+ GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE,
74
+ GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED,
75
+ GUMBO_LEX_SELF_CLOSING_START_TAG,
76
+ GUMBO_LEX_BOGUS_COMMENT,
77
+ GUMBO_LEX_MARKUP_DECLARATION,
78
+ GUMBO_LEX_COMMENT_START,
79
+ GUMBO_LEX_COMMENT_START_DASH,
80
+ GUMBO_LEX_COMMENT,
81
+ GUMBO_LEX_COMMENT_END_DASH,
82
+ GUMBO_LEX_COMMENT_END,
83
+ GUMBO_LEX_COMMENT_END_BANG,
84
+ GUMBO_LEX_DOCTYPE,
85
+ GUMBO_LEX_BEFORE_DOCTYPE_NAME,
86
+ GUMBO_LEX_DOCTYPE_NAME,
87
+ GUMBO_LEX_AFTER_DOCTYPE_NAME,
88
+ GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD,
89
+ GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID,
90
+ GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED,
91
+ GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED,
92
+ GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID,
93
+ GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID,
94
+ GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD,
95
+ GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID,
96
+ GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED,
97
+ GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED,
98
+ GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID,
99
+ GUMBO_LEX_BOGUS_DOCTYPE,
100
+ GUMBO_LEX_CDATA
101
+ } GumboTokenizerEnum;
102
+
103
+ #endif // GUMBO_TOKENIZER_STATES_H_
@@ -0,0 +1,275 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "utf8.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdint.h>
21
+ #include <string.h>
22
+ #include <strings.h> // For strncasecmp.
23
+
24
+ #include "error.h"
25
+ #include "gumbo.h"
26
+ #include "parser.h"
27
+ #include "util.h"
28
+ #include "vector.h"
29
+
30
+ const int kUtf8ReplacementChar = 0xFFFD; // U+FFFD; read_char substitutes it for undecodable or forbidden input.
31
+
32
+ // Reference material:
33
+ // Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
34
+ // RFC 3629: http://tools.ietf.org/html/rfc3629
35
+ // HTML5 Unicode handling:
36
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
37
+
38
+ // Adds a decoding error to the parser's error list, based on the current state
39
+ // of the Utf8Iterator.
40
+ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
41
+ GumboParser* parser = iter->_parser;
42
+
43
+ GumboError* error = gumbo_add_error(parser);
44
+ if (!error) { // No error record was handed back, so there is nothing to fill in.
45
+ return;
46
+ }
47
+ error->type = type;
48
+ error->position = iter->_pos;
49
+ error->original_text = iter->_start;
50
+
51
+ // At the point the error is recorded, the code point hasn't been computed
52
+ // yet (and can't be, because it's invalid), so we need to build up the raw
53
+ // hex value from the bytes under the cursor.
54
+ uint64_t code_point = 0;
55
+ for (int i = 0; i < iter->_width; ++i) { // The first byte ends up most significant.
56
+ code_point = (code_point << 8) | (unsigned char) iter->_start[i];
57
+ }
58
+ error->v.codepoint = code_point;
59
+ }
60
+
61
+ // Reads the next UTF-8 character in the iter.
62
+ // This assumes that iter->_start points to the beginning of the character.
63
+ // When this method returns, iter->_width and iter->_current will be set
64
+ // appropriately, and any decoding errors will have been passed to add_error.
65
+ static void read_char(Utf8Iterator* iter) {
66
+ unsigned char c;
67
+ unsigned char mask = '\0'; // Stays 0 on the bad-lead-byte paths, so the lead contributes no bits.
68
+ int is_bad_char = false;
69
+
70
+ c = (unsigned char) *iter->_start;
71
+ if (c < 0x80) {
72
+ // Valid one-byte sequence.
73
+ iter->_width = 1;
74
+ mask = 0xFF;
75
+ } else if (c < 0xC0) {
76
+ // Continuation character not following a multibyte sequence.
77
+ // The HTML5 spec here says to consume the byte and output a replacement
78
+ // character.
79
+ iter->_width = 1;
80
+ is_bad_char = true;
81
+ } else if (c < 0xE0) {
82
+ iter->_width = 2;
83
+ mask = 0x1F; // 00011111 in binary.
84
+ if (c < 0xC2) {
85
+ // Overlong encoding; error according to UTF8/HTML5 spec.
86
+ is_bad_char = true;
87
+ }
88
+ } else if (c < 0xF0) { // NOTE(review): no overlong or surrogate check is visible for 3-/4-byte leads - confirm intended.
89
+ iter->_width = 3;
90
+ mask = 0xF; // 00001111 in binary.
91
+ } else if (c < 0xF5) {
92
+ iter->_width = 4;
93
+ mask = 0x7; // 00000111 in binary.
94
+ } else if (c < 0xF8) {
95
+ // The following cases are all errors, but we need to handle them separately
96
+ // so that we consume the proper number of bytes from the input stream
97
+ // before replacing them with the replacement char. The HTML5 spec
98
+ // specifies that we should consume the shorter of the length specified by
99
+ // the first byte or the run leading up to the first non-continuation
100
+ // character.
101
+ iter->_width = 5; // NOTE(review): 0xF5..0xF7 still match the 11110xxx (4-byte) lead pattern - confirm 5/6/7 are intended.
102
+ is_bad_char = true;
103
+ } else if (c < 0xFC) {
104
+ iter->_width = 6;
105
+ is_bad_char = true;
106
+ } else if (c < 0xFE) {
107
+ iter->_width = 7;
108
+ is_bad_char = true;
109
+ } else {
110
+ iter->_width = 1;
111
+ is_bad_char = true;
112
+ }
113
+
114
+ // Check to make sure we have enough bytes left in the iter to read all that
115
+ // we want. If not, we record a GUMBO_ERR_UTF8_TRUNCATED error, mark this as a
116
+ // bad character, and adjust the current width so that it consumes the rest of
117
+ // the iter.
118
+ uint64_t code_point = c & mask;
119
+ if (iter->_start + iter->_width > iter->_end) {
120
+ iter->_width = iter->_end - iter->_start;
121
+ add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
122
+ is_bad_char = true; // Also causes the GUMBO_ERR_UTF8_INVALID report further down.
123
+ }
124
+
125
+ // Now we decode continuation bytes, shift them appropriately, and build up
126
+ // the appropriate code point.
127
+ assert(iter->_width < 8);
128
+ for (int i = 1; i < iter->_width; ++i) {
129
+ c = (unsigned char) iter->_start[i];
130
+ if (c < 0x80 || c > 0xBF) {
131
+ // Per HTML5 spec, we don't include the invalid continuation char in the
132
+ // run that we consume here.
133
+ iter->_width = i;
134
+ is_bad_char = true;
135
+ break;
136
+ }
137
+ code_point = (code_point << 6) | (c & ~0x80); // c is 0x80..0xBF here, so this keeps its low six bits.
138
+ }
139
+ if (code_point > 0x10FFFF) is_bad_char = true;
140
+
141
+ // If we had a decode error, set the current code point to the replacement
142
+ // character and flip the flag indicating that a decode error occurred.
143
+ // Ditto if we have a code point that is explicitly on the list of characters
144
+ // prohibited by the HTML5 spec, such as control characters.
145
+ if (is_bad_char || utf8_is_invalid_code_point(code_point)) {
146
+ add_error(iter, GUMBO_ERR_UTF8_INVALID);
147
+ code_point = kUtf8ReplacementChar;
148
+ }
149
+
150
+ // This is the special handling for carriage returns that is mandated by the
151
+ // HTML5 spec. Since we're looking for particular 7-bit literal characters,
152
+ // we operate in terms of chars and only need a check for iter overrun,
153
+ // instead of having to read in a full next code point.
154
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
155
+ if (code_point == '\r') {
156
+ const char* next = iter->_start + iter->_width;
157
+ if (next < iter->_end && *next == '\n') {
158
+ // Advance the iter, as if the carriage return didn't exist.
159
+ ++iter->_start;
160
+ // Preserve the true offset, since other tools that look at it may be
161
+ // unaware of HTML5's rules for converting \r into \n.
162
+ ++iter->_pos.offset;
163
+ }
164
+ code_point = '\n';
165
+ }
166
+
167
+ // At this point the code point is final (it may be the replacement character
168
+ // or a newline substituted for a carriage return), so we set it, and we're done.
169
+ iter->_current = code_point;
170
+ }
171
+
172
+ static void update_position(Utf8Iterator* iter) { // Moves _pos forward using the _width and _current now stored in iter.
173
+ iter->_pos.offset += iter->_width;
174
+ if (iter->_current == '\n') {
175
+ ++iter->_pos.line;
176
+ iter->_pos.column = 1;
177
+ } else if(iter->_current == '\t') {
178
+ int tab_stop = iter->_parser->_options->tab_stop;
179
+ iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop; // NOTE(review): columns start at 1, yet this lands on a multiple of tab_stop - confirm intended.
180
+ } else {
181
+ ++iter->_pos.column;
182
+ }
183
+ }
184
+
185
+ // Returns true if this Unicode code point is in the list of characters
186
+ // forbidden by the HTML5 spec, such as undefined control chars.
187
+ bool utf8_is_invalid_code_point(int c) {
188
+ return (c >= 0x1 && c <= 0x8) || c == 0xB || (c >= 0xE && c <= 0x1F) || // 0x01..0x1F except tab, LF, FF and CR; 0 is not rejected.
189
+ (c >= 0x7F && c <= 0x9F) || (c >= 0xFDD0 && c <= 0xFDEF) || // 0x7F..0x9F, then U+FDD0..U+FDEF.
190
+ ((c & 0xFFFF) == 0xFFFE) || ((c & 0xFFFF) == 0xFFFF); // Low 16 bits FFFE/FFFF, whatever the upper bits are.
191
+ }
192
+
193
+ void utf8iterator_init(
194
+ GumboParser* parser, const char* source, size_t source_length,
195
+ Utf8Iterator* iter) { // Points iter at source[0 .. source_length) and loads the first character.
196
+ iter->_start = source;
197
+ iter->_end = source + source_length;
198
+ iter->_width = 0;
199
+ iter->_pos.line = 1; // Line and column are 1-based; offset is 0-based.
200
+ iter->_pos.column = 1;
201
+ iter->_pos.offset = 0;
202
+ iter->_parser = parser;
203
+ if (source_length) {
204
+ read_char(iter);
205
+ } else {
206
+ iter->_current = -1; // Empty input: start out at EOF without touching the buffer.
207
+ }
208
+ }
209
+
210
+ void utf8iterator_next(Utf8Iterator* iter) { // Steps past the current character; _current becomes -1 at EOF.
211
+ if (iter->_current == -1) {
212
+ // If we're already at EOF, bail out before advancing anything to avoid
213
+ // reading past the end of the buffer. It's easier to catch this case here
214
+ // than litter the code with lots of individual checks for EOF.
215
+ return;
216
+ }
217
+ iter->_start += iter->_width;
218
+ // We update positions based on the *last* character read, so that the first
219
+ // character following a newline is at column 1 in the next line.
220
+ update_position(iter);
221
+ if (iter->_start < iter->_end) {
222
+ read_char(iter);
223
+ } else { // EOF
224
+ iter->_current = -1;
225
+ }
226
+ }
227
+
228
+ int utf8iterator_current(const Utf8Iterator* iter) { // Returns the stored code point; -1 means EOF.
229
+ return iter->_current;
230
+ }
231
+
232
+ void utf8iterator_get_position(
233
+ const Utf8Iterator* iter, GumboSourcePosition* output) { // Copies the iterator's current position into *output.
234
+ *output = iter->_pos;
235
+ }
236
+
237
+ const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) { // Returns iter->_start, the cursor into the source buffer.
238
+ return iter->_start;
239
+ }
240
+
241
+ // If the next `length` bytes match `prefix` (case-insensitively unless
242
+ // case_sensitive), steps the iterator `length` times and returns true; else false.
243
+ bool utf8iterator_maybe_consume_match(
244
+ Utf8Iterator* iter, const char* prefix, size_t length,
245
+ bool case_sensitive) {
246
+ // Compare sizes so that no out-of-bounds `_start + length` pointer is formed.
247
+ if (length > (size_t) (iter->_end - iter->_start)) return false;
248
+ bool matched = case_sensitive ? !strncmp(iter->_start, prefix, length)
249
+ : !strncasecmp(iter->_start, prefix, length);
250
+ if (!matched) return false;
251
+ // One step per prefix byte; assumes an ASCII prefix - TODO confirm with callers.
252
+ for (size_t i = 0; i < length; ++i) {
253
+ utf8iterator_next(iter);
254
+ }
255
+ return true;
256
+ }
256
+
257
+ void utf8iterator_mark(Utf8Iterator* iter) { // Saves the current cursor and position into _mark and _mark_pos.
258
+ iter->_mark = iter->_start;
259
+ iter->_mark_pos = iter->_pos;
260
+ }
261
+
262
+ // Returns the input stream position to the mark; a mark at end of input restores EOF (-1).
263
+ void utf8iterator_reset(Utf8Iterator* iter) {
264
+ iter->_start = iter->_mark;
265
+ iter->_pos = iter->_mark_pos;
266
+ if (iter->_start < iter->_end) { read_char(iter); } else { iter->_current = -1; }
267
+ }
268
+
269
+ // Sets the position and original text fields of an error to the value at the
270
+ // mark (iter->_mark_pos and iter->_mark), not the current cursor.
271
+ void utf8iterator_fill_error_at_mark(
272
+ Utf8Iterator* iter, GumboError* error) {
273
+ error->position = iter->_mark_pos;
274
+ error->original_text = iter->_mark;
275
+ }