nokogumbo 1.5.0 → 2.0.0.pre.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +56 -0
  3. data/README.md +146 -22
  4. data/ext/nokogumbo/extconf.rb +116 -0
  5. data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
  6. data/gumbo-parser/src/ascii.c +33 -0
  7. data/gumbo-parser/src/ascii.h +31 -0
  8. data/gumbo-parser/src/attribute.c +26 -28
  9. data/gumbo-parser/src/attribute.h +3 -23
  10. data/gumbo-parser/src/char_ref.c +135 -2351
  11. data/gumbo-parser/src/char_ref.h +13 -29
  12. data/gumbo-parser/src/error.c +215 -133
  13. data/gumbo-parser/src/error.h +34 -49
  14. data/gumbo-parser/src/foreign_attrs.c +104 -0
  15. data/gumbo-parser/src/gumbo.h +506 -304
  16. data/gumbo-parser/src/insertion_mode.h +4 -28
  17. data/gumbo-parser/src/macros.h +91 -0
  18. data/gumbo-parser/src/parser.c +1989 -1431
  19. data/gumbo-parser/src/parser.h +6 -22
  20. data/gumbo-parser/src/replacement.h +33 -0
  21. data/gumbo-parser/src/string_buffer.c +43 -50
  22. data/gumbo-parser/src/string_buffer.h +24 -40
  23. data/gumbo-parser/src/string_piece.c +39 -39
  24. data/gumbo-parser/src/svg_attrs.c +174 -0
  25. data/gumbo-parser/src/svg_tags.c +137 -0
  26. data/gumbo-parser/src/tag.c +186 -59
  27. data/gumbo-parser/src/tag_lookup.c +382 -0
  28. data/gumbo-parser/src/tag_lookup.h +13 -0
  29. data/gumbo-parser/src/token_type.h +1 -25
  30. data/gumbo-parser/src/tokenizer.c +899 -495
  31. data/gumbo-parser/src/tokenizer.h +37 -37
  32. data/gumbo-parser/src/tokenizer_states.h +6 -22
  33. data/gumbo-parser/src/utf8.c +103 -86
  34. data/gumbo-parser/src/utf8.h +37 -41
  35. data/gumbo-parser/src/util.c +48 -38
  36. data/gumbo-parser/src/util.h +10 -40
  37. data/gumbo-parser/src/vector.c +45 -57
  38. data/gumbo-parser/src/vector.h +17 -39
  39. data/lib/nokogumbo.rb +10 -174
  40. data/lib/nokogumbo/html5.rb +250 -0
  41. data/lib/nokogumbo/html5/document.rb +37 -0
  42. data/lib/nokogumbo/html5/document_fragment.rb +46 -0
  43. data/lib/nokogumbo/version.rb +3 -0
  44. data/lib/nokogumbo/xml/node.rb +57 -0
  45. metadata +32 -19
  46. data/ext/nokogumboc/extconf.rb +0 -60
  47. data/gumbo-parser/src/char_ref.rl +0 -2554
  48. data/gumbo-parser/src/string_piece.h +0 -38
  49. data/gumbo-parser/src/tag.in +0 -150
  50. data/gumbo-parser/src/tag_enum.h +0 -153
  51. data/gumbo-parser/src/tag_gperf.h +0 -105
  52. data/gumbo-parser/src/tag_sizes.h +0 -4
  53. data/gumbo-parser/src/tag_strings.h +0 -153
  54. data/gumbo-parser/visualc/include/strings.h +0 -4
  55. data/test-nokogumbo.rb +0 -190
@@ -0,0 +1,13 @@
1
+ #ifndef GUMBO_TAG_LOOKUP_H_
2
+ #define GUMBO_TAG_LOOKUP_H_
3
+
4
+ #include "gumbo.h"
5
+
6
+ typedef struct {
7
+ const char *key;
8
+ const GumboTag tag;
9
+ } TagHashSlot;
10
+
11
+ const TagHashSlot *gumbo_tag_lookup(const char *str, size_t len);
12
+
13
+ #endif // GUMBO_TAG_LOOKUP_H_
@@ -1,26 +1,6 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
1
  #ifndef GUMBO_TOKEN_TYPE_H_
18
2
  #define GUMBO_TOKEN_TYPE_H_
19
3
 
20
- #ifdef __cplusplus
21
- extern "C" {
22
- #endif
23
-
24
4
  // An enum representing the type of token.
25
5
  typedef enum {
26
6
  GUMBO_TOKEN_DOCTYPE,
@@ -34,8 +14,4 @@ typedef enum {
34
14
  GUMBO_TOKEN_EOF
35
15
  } GumboTokenType;
36
16
 
37
- #ifdef __cplusplus
38
- } // extern C
39
- #endif
40
-
41
- #endif // GUMBO_TOKEN_TYPE_H_
17
+ #endif // GUMBO_TOKEN_TYPE_H_
@@ -1,69 +1,68 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Coding conventions specific to this file:
18
- //
19
- // 1. Functions that fill in a token should be named emit_*, and should be
20
- // followed immediately by a return from the tokenizer (true if no error
21
- // occurred, false if an error occurred). Sometimes the emit functions
22
- // themselves return a boolean so that they can be combined with the return
23
- // statement; in this case, they should match this convention.
24
- // 2. Functions that shuffle data from temporaries to final API structures
25
- // should be named finish_*, and be called just before the tokenizer exits the
26
- // state that accumulates the temporary.
27
- // 3. All internal data structures should be kept in an initialized state from
28
- // tokenizer creation onwards, ready to accept input. When a buffer's flushed
29
- // and reset, it should be deallocated and immediately reinitialized.
30
- // 4. Make sure there are appropriate break statements following each state.
31
- // 5. Assertions on the state of the temporary and tag buffers are usually a
32
- // good idea, and should go at the entry point of each state when added.
33
- // 6. Statement order within states goes:
34
- // 1. Add parse errors, if appropriate.
35
- // 2. Call finish_* functions to build up tag state.
36
- // 2. Switch to new state. Set _reconsume flag if appropriate.
37
- // 3. Perform any other temporary buffer manipulation.
38
- // 4. Emit tokens
39
- // 5. Return/break.
40
- // This order ensures that we can verify that every emit is followed by a
41
- // return, ensures that the correct state is recorded with any parse errors, and
42
- // prevents parse error position from being messed up by possible mark/resets in
43
- // temporary buffer manipulation.
44
-
45
- #include "tokenizer.h"
1
+ /*
2
+ Copyright 2010 Google Inc.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ */
16
+
17
+ /*
18
+ Coding conventions specific to this file:
19
+
20
+ 1. Functions that fill in a token should be named emit_*, and should be
21
+ followed immediately by a return from the tokenizer (true if no error
22
+ occurred, false if an error occurred). Sometimes the emit functions
23
+ themselves return a boolean so that they can be combined with the return
24
+ statement; in this case, they should match this convention.
25
+ 2. Functions that shuffle data from temporaries to final API structures
26
+ should be named finish_*, and be called just before the tokenizer exits the
27
+ state that accumulates the temporary.
28
+ 3. All internal data structures should be kept in an initialized state from
29
+ tokenizer creation onwards, ready to accept input. When a buffer's flushed
30
+ and reset, it should be deallocated and immediately reinitialized.
31
+ 4. Make sure there are appropriate break statements following each state.
32
+ 5. Assertions on the state of the temporary and tag buffers are usually a
33
+ good idea, and should go at the entry point of each state when added.
34
+ 6. Statement order within states goes:
35
+ 1. Add parse errors, if appropriate.
36
+ 2. Call finish_* functions to build up tag state.
37
+ 3. Switch to new state. Set _reconsume flag if appropriate.
38
+ 4. Perform any other temporary buffer manipulation.
39
+ 5. Emit tokens
40
+ 6. Return/break.
41
+ This order ensures that we can verify that every emit is followed by
42
+ a return, ensures that the correct state is recorded with any parse
43
+ errors, and prevents parse error position from being messed up by
44
+ possible mark/resets in temporary buffer manipulation.
45
+ */
46
46
 
47
47
  #include <assert.h>
48
- #include <stdbool.h>
49
48
  #include <string.h>
50
-
49
+ #include "tokenizer.h"
50
+ #include "ascii.h"
51
51
  #include "attribute.h"
52
52
  #include "char_ref.h"
53
53
  #include "error.h"
54
54
  #include "gumbo.h"
55
55
  #include "parser.h"
56
56
  #include "string_buffer.h"
57
- #include "string_piece.h"
58
57
  #include "token_type.h"
59
58
  #include "tokenizer_states.h"
60
59
  #include "utf8.h"
61
60
  #include "util.h"
62
61
  #include "vector.h"
63
62
 
64
- // Compared against _script_data_buffer to determine if we're in double-escaped
65
- // script mode.
66
- const GumboStringPiece kScriptTag = {"script", 6};
63
+ // Compared against _script_data_buffer to determine if we're in
64
+ // double-escaped script mode.
65
+ static const GumboStringPiece kScriptTag = {.data = "script", .length = 6};
67
66
 
68
67
  // An enum for the return value of each individual state.
69
68
  typedef enum {
@@ -86,31 +85,35 @@ typedef struct GumboInternalTagState {
86
85
  // the buffer can be re-used for building up attributes.
87
86
  GumboTag _tag;
88
87
 
88
+ // The current tag name. It's set at the same time that _tag is set if _tag
89
+ // is set to GUMBO_TAG_UNKNOWN.
90
+ char *_name;
91
+
89
92
  // The starting location of the text in the buffer.
90
93
  GumboSourcePosition _start_pos;
91
94
 
92
- // The current list of attributes. This is copied (and ownership of its data
93
- // transferred) to the GumboStartTag token upon completion of the tag. New
95
+ // The current list of attributes. This is copied (and ownership of its data
96
+ // transferred) to the GumboStartTag token upon completion of the tag. New
94
97
  // attributes are added as soon as their attribute name state is complete, and
95
98
  // values are filled in by operating on _attributes.data[attributes.length-1].
96
99
  GumboVector /* GumboAttribute */ _attributes;
97
100
 
98
- // If true, the next attribute value to be finished should be dropped. This
101
+ // If true, the next attribute value to be finished should be dropped. This
99
102
  // happens if a duplicate attribute name is encountered - we want to consume
100
103
  // the attribute value, but shouldn't overwrite the existing value.
101
104
  bool _drop_next_attr_value;
102
105
 
103
106
  // The state that caused the tokenizer to switch into a character reference in
104
- // attribute value state. This is used to set the additional allowed
105
- // character, and is switched back to on completion. Initialized as the
107
+ // attribute value state. This is used to set the additional allowed
108
+ // character, and is switched back to on completion. Initialized as the
106
109
  // tokenizer enters the character reference state.
107
110
  GumboTokenizerEnum _attr_value_state;
108
111
 
109
- // The last start tag to have been emitted by the tokenizer. This is
112
+ // The last start tag to have been emitted by the tokenizer. This is
110
113
  // necessary to check for appropriate end tags.
111
114
  GumboTag _last_start_tag;
112
115
 
113
- // If true, then this is a start tag. If false, it's an end tag. This is
116
+ // If true, then this is a start tag. If false, it's an end tag. This is
114
117
  // necessary to generate the appropriate token type at tag-closing time.
115
118
  bool _is_start_tag;
116
119
 
@@ -121,43 +124,43 @@ typedef struct GumboInternalTagState {
121
124
  // This is the main tokenizer state struct, containing all state used by in
122
125
  // tokenizing the input stream.
123
126
  typedef struct GumboInternalTokenizerState {
124
- // The current lexer state. Starts in GUMBO_LEX_DATA.
127
+ // The current lexer state. Starts in GUMBO_LEX_DATA.
125
128
  GumboTokenizerEnum _state;
126
129
 
127
130
  // A flag indicating whether the current input character needs to reconsumed
128
131
  // in another state, or whether the next input character should be read for
129
- // the next iteration of the state loop. This is set when the spec reads
132
+ // the next iteration of the state loop. This is set when the spec reads
130
133
  // "Reconsume the current input character in..."
131
134
  bool _reconsume_current_input;
132
135
 
133
- // A flag indicating whether the current node is a foreign element. This is
136
+ // A flag indicating whether the current node is a foreign element. This is
134
137
  // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
135
138
  // markup declaration state.
136
139
  bool _is_current_node_foreign;
137
140
 
138
- // A flag indicating whether the tokenizer is in a CDATA section. If so, then
141
+ // A flag indicating whether the tokenizer is in a CDATA section. If so, then
139
142
  // text tokens emitted will be GUMBO_TOKEN_CDATA.
140
143
  bool _is_in_cdata;
141
144
 
142
145
  // Certain states (notably character references) may emit two character tokens
143
- // at once, but the contract for lex() fills in only one token at a time. The
146
+ // at once, but the contract for lex() fills in only one token at a time. The
144
147
  // extra character is buffered here, and then this is checked on entry to
145
- // lex(). If a character is stored here, it's immediately emitted and control
146
- // returns from the lexer. kGumboNoChar is used to represent 'no character
148
+ // lex(). If a character is stored here, it's immediately emitted and control
149
+ // returns from the lexer. kGumboNoChar is used to represent 'no character
147
150
  // stored.'
148
151
  //
149
152
  // Note that characters emitted through this mechanism will have their source
150
153
  // position marked as the character under the mark, i.e. multiple characters
151
- // may be emitted with the same position. This is desirable for character
152
- // references, but unsuitable for many other cases. Use the _temporary_buffer
154
+ // may be emitted with the same position. This is desirable for character
155
+ // references, but unsuitable for many other cases. Use the _temporary_buffer
153
156
  // mechanism if the buffered characters must have their original positions in
154
157
  // the document.
155
158
  int _buffered_emit_char;
156
159
 
157
160
  // A temporary buffer to accumulate characters, as described by the "temporary
158
- // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
161
+ // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
159
162
  // way: we record the specific character to go into the buffer, which may
160
- // sometimes be a lowercased version of the actual input character. However,
163
+ // sometimes be a lowercased version of the actual input character. However,
161
164
  // we *also* use utf8iterator_mark() to record the position at tag start.
162
165
  // When we start flushing the temporary buffer, we set _temporary_buffer_emit
163
166
  // to the start of it, and then increment it for each call to the tokenizer.
@@ -167,13 +170,13 @@ typedef struct GumboInternalTokenizerState {
167
170
  GumboStringBuffer _temporary_buffer;
168
171
 
169
172
  // The current cursor position we're emitting from within
170
- // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
173
+ // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
171
174
  const char* _temporary_buffer_emit;
172
175
 
173
176
  // The temporary buffer is also used by the spec to check whether we should
174
177
  // enter the script data double escaped state, but we can't use the same
175
178
  // buffer for both because we have to flush out "<s" as emits while still
176
- // maintaining the context that will eventually become "script". This is a
179
+ // maintaining the context that will eventually become "script". This is a
177
180
  // separate buffer that's used in place of the temporary buffer for states
178
181
  // that may enter the script data double escape start state.
179
182
  GumboStringBuffer _script_data_buffer;
@@ -189,7 +192,7 @@ typedef struct GumboInternalTokenizerState {
189
192
  // Current tag state.
190
193
  GumboTagState _tag_state;
191
194
 
192
- // Doctype state. We use the temporary buffer to accumulate characters (it's
195
+ // Doctype state. We use the temporary buffer to accumulate characters (it's
193
196
  // not used for anything else in the doctype states), and then freshly
194
197
  // allocate the strings in the doctype token, then copy it over on emit.
195
198
  GumboTokenDocType _doc_type_state;
@@ -199,8 +202,10 @@ typedef struct GumboInternalTokenizerState {
199
202
  } GumboTokenizerState;
200
203
 
201
204
  // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
202
- static void tokenizer_add_parse_error(
203
- GumboParser* parser, GumboErrorType type) {
205
+ static void tokenizer_add_parse_error (
206
+ GumboParser* parser,
207
+ GumboErrorType type
208
+ ) {
204
209
  GumboError* error = gumbo_add_error(parser);
205
210
  if (!error) {
206
211
  return;
@@ -309,14 +314,14 @@ static void tokenizer_add_parse_error(
309
314
  }
310
315
 
311
316
  static bool is_alpha(int c) {
312
- // We don't use ISO C isupper/islower functions here because they
313
- // depend upon the program's locale, while the behavior of the HTML5 spec is
314
- // independent of which locale the program is run in.
315
- return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
317
+ // We don't use the ISO C isalpha() function here because it depends
318
+ // on the current locale, whereas the behavior in the HTML5 spec is
319
+ // locale-independent.
320
+ return ((unsigned) c | 32) - 'a' < 26;
316
321
  }
317
322
 
318
323
  static int ensure_lowercase(int c) {
319
- return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
324
+ return gumbo_ascii_tolower(c);
320
325
  }
321
326
 
322
327
  static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
@@ -346,7 +351,7 @@ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
346
351
  // text that will eventually be emitted, it needs to be called a couple of
347
352
  // states before the spec says "Set the temporary buffer to the empty string".
348
353
  // In general, this should be called whenever there's a transition to a
349
- // "less-than sign state". The initial < and possibly / then need to be
354
+ // "less-than sign state". The initial < and possibly / then need to be
350
355
  // appended to the temporary buffer, their presence needs to be accounted for in
351
356
  // states that compare the temporary buffer against a literal value, and
352
357
  // spec stanzas that say "emit a < and / character token along with a character
@@ -356,30 +361,40 @@ static void clear_temporary_buffer(GumboParser* parser) {
356
361
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
357
362
  assert(!tokenizer->_temporary_buffer_emit);
358
363
  utf8iterator_mark(&tokenizer->_input);
359
- gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
364
+ gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
360
365
  // The temporary buffer and script data buffer are the same object in the
361
366
  // spec, so the script data buffer should be cleared as well.
362
- gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
367
+ gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
363
368
  }
364
369
 
365
370
  // Appends a codepoint to the temporary buffer.
366
- static void append_char_to_temporary_buffer(
367
- GumboParser* parser, int codepoint) {
368
- gumbo_string_buffer_append_codepoint(
369
- parser, codepoint, &parser->_tokenizer_state->_temporary_buffer);
371
+ static void append_char_to_temporary_buffer (
372
+ GumboParser* parser,
373
+ int codepoint
374
+ ) {
375
+ gumbo_string_buffer_append_codepoint (
376
+ codepoint,
377
+ &parser->_tokenizer_state->_temporary_buffer
378
+ );
370
379
  }
371
380
 
372
- // Checks to see if the temporary buffer equals a certain string.
373
- // Make sure this remains side-effect free; it's used in assertions.
374
381
  #ifndef NDEBUG
375
- static bool temporary_buffer_equals(GumboParser* parser, const char* text) {
376
- GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
377
- // TODO(jdtang): See if the extra strlen is a performance problem, and replace
378
- // it with an explicit sizeof(literal) if necessary. I don't think it will
379
- // be, as this is only used in a couple of rare states.
380
- int text_len = strlen(text);
381
- return text_len == buffer->length &&
382
- memcmp(buffer->data, text, text_len) == 0;
382
+ static bool temporary_buffer_equals__ (
383
+ const GumboParser* parser,
384
+ const char* text,
385
+ size_t text_len
386
+ ) {
387
+ const GumboStringBuffer* buf = &parser->_tokenizer_state->_temporary_buffer;
388
+ return
389
+ text_len == buf->length
390
+ && memcmp(buf->data, text, text_len) == 0;
391
+ }
392
+
393
+ #define temporary_buffer_equals(parser, text) \
394
+ temporary_buffer_equals__(parser, "" text, sizeof(text) - 1)
395
+
396
+ static bool temporary_buffer_is_empty(const GumboParser* parser) {
397
+ return parser->_tokenizer_state->_temporary_buffer.length == 0;
383
398
  }
384
399
  #endif
385
400
 
@@ -387,9 +402,9 @@ static void doc_type_state_init(GumboParser* parser) {
387
402
  GumboTokenDocType* doc_type_state =
388
403
  &parser->_tokenizer_state->_doc_type_state;
389
404
  // We initialize these to NULL here so that we don't end up leaking memory if
390
- // we never see a doctype token. When we do see a doctype token, we reset
405
+ // we never see a doctype token. When we do see a doctype token, we reset
391
406
  // them to a freshly-allocated empty string so that we can present a uniform
392
- // interface to client code and not make them check for null. Ownership is
407
+ // interface to client code and not make them check for null. Ownership is
393
408
  // transferred to the doctype token when it's emitted.
394
409
  doc_type_state->name = NULL;
395
410
  doc_type_state->public_identifier = NULL;
@@ -408,7 +423,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
408
423
  }
409
424
 
410
425
  // Sets the tag buffer original text and start point to the current iterator
411
- // position. This is necessary because attribute names & values may have
426
+ // position. This is necessary because attribute names & values may have
412
427
  // whitespace preceeding them, and so we can't assume that the actual token
413
428
  // starting point was the end of the last tag buffer usage.
414
429
  static void reset_tag_buffer_start_point(GumboParser* parser) {
@@ -423,15 +438,14 @@ static void reset_tag_buffer_start_point(GumboParser* parser) {
423
438
  // and clears the temporary buffer.
424
439
  static void finish_temporary_buffer(GumboParser* parser, const char** output) {
425
440
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
426
- *output =
427
- gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer);
441
+ *output = gumbo_string_buffer_to_string(&tokenizer->_temporary_buffer);
428
442
  clear_temporary_buffer(parser);
429
443
  }
430
444
 
431
445
  // Advances the iterator past the end of the token, and then fills in the
432
- // relevant position fields. It's assumed that after every emit, the tokenizer
446
+ // relevant position fields. It's assumed that after every emit, the tokenizer
433
447
  // will immediately return (letting the tree-construction stage read the filled
434
- // in Token). Thus, it's safe to advance the input stream here, since it will
448
+ // in Token). Thus, it's safe to advance the input stream here, since it will
435
449
  // bypass the advance at the bottom of the state machine loop.
436
450
  //
437
451
  // Since this advances the iterator and resets the current input, make sure to
@@ -450,7 +464,7 @@ static void finish_token(GumboParser* parser, GumboToken* token) {
450
464
  if (token->original_text.length > 0 &&
451
465
  token->original_text.data[token->original_text.length - 1] == '\r') {
452
466
  // The UTF8 iterator will ignore carriage returns in the input stream, which
453
- // means that the next token may start one past a \r character. The pointer
467
+ // means that the next token may start one past a \r character. The pointer
454
468
  // arithmetic above results in that \r being appended to the original text
455
469
  // of the preceding token, so we have to adjust its length here to chop the
456
470
  // \r off.
@@ -463,7 +477,7 @@ static void finish_token(GumboParser* parser, GumboToken* token) {
463
477
  static void finish_doctype_public_id(GumboParser* parser) {
464
478
  GumboTokenDocType* doc_type_state =
465
479
  &parser->_tokenizer_state->_doc_type_state;
466
- gumbo_parser_deallocate(parser, (void*) doc_type_state->public_identifier);
480
+ gumbo_free((void*) doc_type_state->public_identifier);
467
481
  finish_temporary_buffer(parser, &doc_type_state->public_identifier);
468
482
  doc_type_state->has_public_identifier = true;
469
483
  }
@@ -473,7 +487,7 @@ static void finish_doctype_public_id(GumboParser* parser) {
473
487
  static void finish_doctype_system_id(GumboParser* parser) {
474
488
  GumboTokenDocType* doc_type_state =
475
489
  &parser->_tokenizer_state->_doc_type_state;
476
- gumbo_parser_deallocate(parser, (void*) doc_type_state->system_identifier);
490
+ gumbo_free((void*) doc_type_state->system_identifier);
477
491
  finish_temporary_buffer(parser, &doc_type_state->system_identifier);
478
492
  doc_type_state->has_system_identifier = true;
479
493
  }
@@ -495,7 +509,7 @@ static StateResult emit_replacement_char(
495
509
  return RETURN_ERROR;
496
510
  }
497
511
 
498
- // Writes an EOF character token. Always returns RETURN_SUCCESS.
512
+ // Writes an EOF character token. Always returns RETURN_SUCCESS.
499
513
  static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
500
514
  emit_char(parser, -1, output);
501
515
  return RETURN_SUCCESS;
@@ -520,7 +534,9 @@ static void emit_doctype(GumboParser* parser, GumboToken* output) {
520
534
  // Debug-only function that explicitly sets the attribute vector data to NULL so
521
535
  // it can be asserted on tag creation, verifying that there are no memory leaks.
522
536
  static void mark_tag_state_as_empty(GumboTagState* tag_state) {
537
+ UNUSED_IF_NDEBUG(tag_state);
523
538
  #ifndef NDEBUG
539
+ tag_state->_name = NULL;
524
540
  tag_state->_attributes = kGumboEmptyVector;
525
541
  #endif
526
542
  }
@@ -532,6 +548,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
532
548
  if (tag_state->_is_start_tag) {
533
549
  output->type = GUMBO_TOKEN_START_TAG;
534
550
  output->v.start_tag.tag = tag_state->_tag;
551
+ output->v.start_tag.name = tag_state->_name;
535
552
  output->v.start_tag.attributes = tag_state->_attributes;
536
553
  output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
537
554
  tag_state->_last_start_tag = tag_state->_tag;
@@ -540,23 +557,28 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
540
557
  "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
541
558
  } else {
542
559
  output->type = GUMBO_TOKEN_END_TAG;
543
- output->v.end_tag = tag_state->_tag;
560
+ output->v.end_tag.tag = tag_state->_tag;
561
+ output->v.end_tag.name = tag_state->_name;
562
+ output->v.end_tag.is_self_closing = tag_state->_is_self_closing;
544
563
  // In end tags, ownership of the attributes vector is not transferred to the
545
564
  // token, but it's still initialized as normal, so it must be manually
546
- // deallocated. There may also be attributes to destroy, in certain broken
565
+ // deallocated. There may also be attributes to destroy, in certain broken
547
566
  // cases like </div</th> (the "th" is an attribute there).
548
567
  for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
549
- gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
568
+ gumbo_destroy_attribute(tag_state->_attributes.data[i]);
550
569
  }
551
- gumbo_parser_deallocate(parser, tag_state->_attributes.data);
570
+ gumbo_free(tag_state->_attributes.data);
552
571
  mark_tag_state_as_empty(tag_state);
553
572
  gumbo_debug(
554
573
  "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
555
574
  }
556
- gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
575
+ gumbo_string_buffer_destroy(&tag_state->_buffer);
557
576
  finish_token(parser, output);
558
- gumbo_debug("Original text = %.*s.\n", output->original_text.length,
559
- output->original_text.data);
577
+ gumbo_debug (
578
+ "Original text = %.*s.\n",
579
+ (int) output->original_text.length,
580
+ output->original_text.data
581
+ );
560
582
  assert(output->original_text.length >= 2);
561
583
  assert(output->original_text.data[0] == '<');
562
584
  assert(output->original_text.data[output->original_text.length - 1] == '>');
@@ -570,26 +592,36 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
570
592
  static void abandon_current_tag(GumboParser* parser) {
571
593
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
572
594
  for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
573
- gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
595
+ gumbo_destroy_attribute(tag_state->_attributes.data[i]);
574
596
  }
575
- gumbo_parser_deallocate(parser, tag_state->_attributes.data);
597
+ gumbo_free(tag_state->_attributes.data);
576
598
  mark_tag_state_as_empty(tag_state);
577
- gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
599
+ gumbo_string_buffer_destroy(&tag_state->_buffer);
578
600
  gumbo_debug("Abandoning current tag.\n");
579
601
  }
580
602
 
581
- // Wraps the consume_char_ref function to handle its output and make the
582
- // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
603
+ // Wraps the gumbo_consume_char_ref function to handle its output and make the
604
+ // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
583
605
  // error occurred, RETURN_SUCCESS otherwise.
584
- static StateResult emit_char_ref(GumboParser* parser,
585
- int additional_allowed_char, bool is_in_attribute, GumboToken* output) {
606
+ static StateResult emit_char_ref (
607
+ GumboParser* parser,
608
+ int additional_allowed_char,
609
+ bool UNUSED_ARG(is_in_attribute),
610
+ GumboToken* output
611
+ ) {
586
612
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
587
613
  OneOrTwoCodepoints char_ref;
588
- bool status = consume_char_ref(
589
- parser, &tokenizer->_input, additional_allowed_char, false, &char_ref);
614
+ bool status = gumbo_consume_char_ref (
615
+ parser,
616
+ &tokenizer->_input,
617
+ additional_allowed_char,
618
+ false,
619
+ &char_ref
620
+ );
590
621
  if (char_ref.first != kGumboNoChar) {
591
- // consume_char_ref ends with the iterator pointing at the next character,
592
- // so we need to be sure not advance it again before reading the next token.
622
+ // gumbo_consume_char_ref ends with the iterator pointing at the next
623
+ // character, so we need to be sure not to advance it again before
624
+ // reading the next token.
593
625
  tokenizer->_reconsume_current_input = true;
594
626
  emit_char(parser, char_ref.first, output);
595
627
  tokenizer->_buffered_emit_char = char_ref.second;
@@ -599,9 +631,9 @@ static StateResult emit_char_ref(GumboParser* parser,
599
631
  return status ? RETURN_SUCCESS : RETURN_ERROR;
600
632
  }
601
633
 
602
- // Emits a comment token. Comments use the temporary buffer to accumulate their
634
+ // Emits a comment token. Comments use the temporary buffer to accumulate their
603
635
  // data, and then it's copied over and released to the 'text' field of the
604
- // GumboToken union. Always returns RETURN_SUCCESS.
636
+ // GumboToken union. Always returns RETURN_SUCCESS.
605
637
  static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
606
638
  output->type = GUMBO_TOKEN_COMMENT;
607
639
  finish_temporary_buffer(parser, &output->v.text);
@@ -626,11 +658,11 @@ static bool maybe_emit_from_temporary_buffer(
626
658
  }
627
659
 
628
660
  assert(*c == utf8iterator_current(&tokenizer->_input));
629
- // emit_char also advances the input stream. We need to do some juggling of
661
+ // emit_char also advances the input stream. We need to do some juggling of
630
662
  // the _reconsume_current_input flag to get the proper behavior when emitting
631
- // previous tokens. Basically, _reconsume_current_input should *never* be set
663
+ // previous tokens. Basically, _reconsume_current_input should *never* be set
632
664
  // when emitting anything from the temporary buffer, since those characters
633
- // have already been advanced past. However, it should be preserved so that
665
+ // have already been advanced past. However, it should be preserved so that
634
666
  // when the *next* character is encountered again, the tokenizer knows not to
635
667
  // advance past it.
636
668
  bool saved_reconsume_state = tokenizer->_reconsume_current_input;
@@ -644,7 +676,7 @@ static bool maybe_emit_from_temporary_buffer(
644
676
  // Sets up the tokenizer to begin flushing the temporary buffer.
645
677
  // This resets the input iterator stream to the start of the last tag, sets up
646
678
  // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
647
- // the first character in it. It returns true if a character was emitted, false
679
+ // the first character in it. It returns true if a character was emitted, false
648
680
  // otherwise.
649
681
  static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
650
682
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -654,32 +686,35 @@ static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
654
686
  return maybe_emit_from_temporary_buffer(parser, output);
655
687
  }
656
688
 
657
- // Appends a codepoint to the current tag buffer. If
689
+ // Appends a codepoint to the current tag buffer. If
658
690
  // reinitilize_position_on_first is set, this also initializes the tag buffer
659
691
  // start point; the only time you would *not* want to pass true for this
660
692
  // parameter is if you want the original_text to include character (like an
661
693
  // opening quote) that doesn't appear in the value.
662
- static void append_char_to_tag_buffer(
663
- GumboParser* parser, int codepoint, bool reinitilize_position_on_first) {
694
+ static void append_char_to_tag_buffer (
695
+ GumboParser* parser,
696
+ int codepoint,
697
+ bool reinitilize_position_on_first
698
+ ) {
664
699
  GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
665
700
  if (buffer->length == 0 && reinitilize_position_on_first) {
666
701
  reset_tag_buffer_start_point(parser);
667
702
  }
668
- gumbo_string_buffer_append_codepoint(parser, codepoint, buffer);
703
+ gumbo_string_buffer_append_codepoint(codepoint, buffer);
669
704
  }
670
705
 
671
- // (Re-)initialize the tag buffer. This also resets the original_text pointer
706
+ // (Re-)initialize the tag buffer. This also resets the original_text pointer
672
707
  // and _start_pos field to point to the current position.
673
708
  static void initialize_tag_buffer(GumboParser* parser) {
674
709
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
675
710
  GumboTagState* tag_state = &tokenizer->_tag_state;
676
711
 
677
- gumbo_string_buffer_init(parser, &tag_state->_buffer);
712
+ gumbo_string_buffer_init(&tag_state->_buffer);
678
713
  reset_tag_buffer_start_point(parser);
679
714
  }
680
715
 
681
716
  // Initializes the tag_state to start a new tag, keeping track of the opening
682
- // positions and original text. Takes a boolean indicating whether this is a
717
+ // positions and original text. Takes a boolean indicating whether this is a
683
718
  // start or end tag.
684
719
  static void start_new_tag(GumboParser* parser, bool is_start_tag) {
685
720
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -690,14 +725,15 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
690
725
  assert(is_alpha(c));
691
726
 
692
727
  initialize_tag_buffer(parser);
693
- gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
728
+ gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer);
694
729
 
730
+ assert(tag_state->_name == NULL);
695
731
  assert(tag_state->_attributes.data == NULL);
696
732
  // Initial size chosen by statistical analysis of a corpus of 60k webpages.
697
- // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
733
+ // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
698
734
  // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
699
735
  // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
700
- gumbo_vector_init(parser, 1, &tag_state->_attributes);
736
+ gumbo_vector_init(1, &tag_state->_attributes);
701
737
  tag_state->_drop_next_attr_value = false;
702
738
  tag_state->_is_start_tag = is_start_tag;
703
739
  tag_state->_is_self_closing = false;
@@ -708,7 +744,7 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
708
744
  static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
709
745
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
710
746
  GumboTagState* tag_state = &tokenizer->_tag_state;
711
- *output = gumbo_string_buffer_to_string(parser, &tag_state->_buffer);
747
+ *output = gumbo_string_buffer_to_string(&tag_state->_buffer);
712
748
  }
713
749
 
714
750
  // Fills in:
@@ -717,9 +753,12 @@ static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
717
753
  // * The start_pos GumboSourcePosition with the start position of the tag
718
754
  // buffer.
719
755
  // * The end_pos GumboSourcePosition with the current source position.
720
- static void copy_over_original_tag_text(GumboParser* parser,
721
- GumboStringPiece* original_text, GumboSourcePosition* start_pos,
722
- GumboSourcePosition* end_pos) {
756
+ static void copy_over_original_tag_text (
757
+ GumboParser* parser,
758
+ GumboStringPiece* original_text,
759
+ GumboSourcePosition* start_pos,
760
+ GumboSourcePosition* end_pos
761
+ ) {
723
762
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
724
763
  GumboTagState* tag_state = &tokenizer->_tag_state;
725
764
 
@@ -729,7 +768,7 @@ static void copy_over_original_tag_text(GumboParser* parser,
729
768
  if (original_text->data[original_text->length - 1] == '\r') {
730
769
  // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
731
770
  // appended to the end of original text even when it's really the first part
732
- // of the next character. If we detect this situation, shrink the length of
771
+ // of the next character. If we detect this situation, shrink the length of
733
772
  // the original text by 1 to remove the carriage return.
734
773
  --original_text->length;
735
774
  }
@@ -739,8 +778,7 @@ static void copy_over_original_tag_text(GumboParser* parser,
739
778
 
740
779
  // Releases and then re-initializes the tag buffer.
741
780
  static void reinitialize_tag_buffer(GumboParser* parser) {
742
- gumbo_parser_deallocate(
743
- parser, parser->_tokenizer_state->_tag_state._buffer.data);
781
+ gumbo_free(parser->_tokenizer_state->_tag_state._buffer.data);
744
782
  initialize_tag_buffer(parser);
745
783
  }
746
784
 
@@ -750,14 +788,24 @@ static void finish_tag_name(GumboParser* parser) {
750
788
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
751
789
  GumboTagState* tag_state = &tokenizer->_tag_state;
752
790
 
753
- tag_state->_tag =
754
- gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
791
+ const char *data = tag_state->_buffer.data;
792
+ size_t length = tag_state->_buffer.length;
793
+ tag_state->_tag = gumbo_tagn_enum(data, length);
794
+ if (tag_state->_tag == GUMBO_TAG_UNKNOWN) {
795
+ char *name = gumbo_alloc(length + 1);
796
+ memcpy(name, data, length);
797
+ name[length] = 0;
798
+ tag_state->_name = name;
799
+ }
755
800
  reinitialize_tag_buffer(parser);
756
801
  }
757
802
 
758
803
  // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
759
- static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
760
- int original_index, int new_index) {
804
+ static void add_duplicate_attr_error (
805
+ GumboParser* parser,
806
+ int original_index,
807
+ int new_index
808
+ ) {
761
809
  GumboError* error = gumbo_add_error(parser);
762
810
  if (!error) {
763
811
  return;
@@ -773,11 +821,11 @@ static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
773
821
  }
774
822
 
775
823
  // Creates a new attribute in the current tag, copying the current tag buffer to
776
- // the attribute's name. The attribute's value starts out as the empty string
824
+ // the attribute's name. The attribute's value starts out as the empty string
777
825
  // (following the "Boolean attributes" section of the spec) and is only
778
- // overwritten on finish_attribute_value(). If the attribute has already been
826
+ // overwritten on finish_attribute_value(). If the attribute has already been
779
827
  // specified, the new attribute is dropped, a parse error is added, and the
780
- // function returns false. Otherwise, this returns true.
828
+ // function returns false. Otherwise, this returns true.
781
829
  static bool finish_attribute_name(GumboParser* parser) {
782
830
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
783
831
  GumboTagState* tag_state = &tokenizer->_tag_state;
@@ -789,30 +837,43 @@ static bool finish_attribute_name(GumboParser* parser) {
789
837
  GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
790
838
  for (unsigned int i = 0; i < attributes->length; ++i) {
791
839
  GumboAttribute* attr = attributes->data[i];
792
- if (strlen(attr->name) == tag_state->_buffer.length &&
793
- memcmp(attr->name, tag_state->_buffer.data,
794
- tag_state->_buffer.length) == 0) {
840
+ if (
841
+ strlen(attr->name) == tag_state->_buffer.length
842
+ && 0 == memcmp (
843
+ attr->name,
844
+ tag_state->_buffer.data,
845
+ tag_state->_buffer.length
846
+ )
847
+ ) {
795
848
  // Identical attribute; bail.
796
- add_duplicate_attr_error(parser, attr->name, i, attributes->length);
849
+ add_duplicate_attr_error(parser, i, attributes->length);
797
850
  tag_state->_drop_next_attr_value = true;
798
851
  return false;
799
852
  }
800
853
  }
801
854
 
802
- GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
855
+ GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute));
803
856
  attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
804
857
  copy_over_tag_buffer(parser, &attr->name);
805
- copy_over_original_tag_text(
806
- parser, &attr->original_name, &attr->name_start, &attr->name_end);
807
- attr->value = gumbo_copy_stringz(parser, "");
808
- copy_over_original_tag_text(
809
- parser, &attr->original_value, &attr->name_start, &attr->name_end);
810
- gumbo_vector_add(parser, attr, attributes);
858
+ copy_over_original_tag_text (
859
+ parser,
860
+ &attr->original_name,
861
+ &attr->name_start,
862
+ &attr->name_end
863
+ );
864
+ attr->value = gumbo_strdup("");
865
+ copy_over_original_tag_text (
866
+ parser,
867
+ &attr->original_value,
868
+ &attr->name_start,
869
+ &attr->name_end
870
+ );
871
+ gumbo_vector_add(attr, attributes);
811
872
  reinitialize_tag_buffer(parser);
812
873
  return true;
813
874
  }
814
875
 
815
- // Finishes an attribute value. This sets the value of the most recently added
876
+ // Finishes an attribute value. This sets the value of the most recently added
816
877
  // attribute to the current contents of the tag buffer.
817
878
  static void finish_attribute_value(GumboParser* parser) {
818
879
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
@@ -826,7 +887,7 @@ static void finish_attribute_value(GumboParser* parser) {
826
887
 
827
888
  GumboAttribute* attr =
828
889
  tag_state->_attributes.data[tag_state->_attributes.length - 1];
829
- gumbo_parser_deallocate(parser, (void*) attr->value);
890
+ gumbo_free((void*) attr->value);
830
891
  copy_over_tag_buffer(parser, &attr->value);
831
892
  copy_over_original_tag_text(
832
893
  parser, &attr->original_value, &attr->value_start, &attr->value_end);
@@ -842,24 +903,27 @@ static bool is_appropriate_end_tag(GumboParser* parser) {
842
903
  tag_state->_buffer.length);
843
904
  }
844
905
 
845
- void gumbo_tokenizer_state_init(
846
- GumboParser* parser, const char* text, size_t text_length) {
847
- GumboTokenizerState* tokenizer =
848
- gumbo_parser_allocate(parser, sizeof(GumboTokenizerState));
906
+ void gumbo_tokenizer_state_init (
907
+ GumboParser* parser,
908
+ const char* text,
909
+ size_t text_length
910
+ ) {
911
+ GumboTokenizerState* tokenizer = gumbo_alloc(sizeof(GumboTokenizerState));
849
912
  parser->_tokenizer_state = tokenizer;
850
913
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
851
914
  tokenizer->_reconsume_current_input = false;
852
915
  tokenizer->_is_current_node_foreign = false;
853
916
  tokenizer->_is_in_cdata = false;
854
917
  tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
918
+ tokenizer->_tag_state._name = NULL;
855
919
 
856
920
  tokenizer->_buffered_emit_char = kGumboNoChar;
857
- gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
921
+ gumbo_string_buffer_init(&tokenizer->_temporary_buffer);
858
922
  tokenizer->_temporary_buffer_emit = NULL;
859
923
 
860
924
  mark_tag_state_as_empty(&tokenizer->_tag_state);
861
925
 
862
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
926
+ gumbo_string_buffer_init(&tokenizer->_script_data_buffer);
863
927
  tokenizer->_token_start = text;
864
928
  utf8iterator_init(parser, text, text_length, &tokenizer->_input);
865
929
  utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
@@ -871,27 +935,37 @@ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
871
935
  assert(tokenizer->_doc_type_state.name == NULL);
872
936
  assert(tokenizer->_doc_type_state.public_identifier == NULL);
873
937
  assert(tokenizer->_doc_type_state.system_identifier == NULL);
874
- gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
875
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
876
- gumbo_parser_deallocate(parser, tokenizer);
938
+ gumbo_string_buffer_destroy(&tokenizer->_temporary_buffer);
939
+ gumbo_string_buffer_destroy(&tokenizer->_script_data_buffer);
940
+ assert(tokenizer->_tag_state._name == NULL);
941
+ assert(tokenizer->_tag_state._attributes.data == NULL);
942
+ gumbo_free(tokenizer);
877
943
  }
878
944
 
879
945
  void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
880
946
  parser->_tokenizer_state->_state = state;
881
947
  }
882
948
 
883
- void gumbo_tokenizer_set_is_current_node_foreign(
884
- GumboParser* parser, bool is_foreign) {
949
+ void gumbo_tokenizer_set_is_current_node_foreign (
950
+ GumboParser* parser,
951
+ bool is_foreign
952
+ ) {
885
953
  if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
886
- gumbo_debug("Toggling is_current_node_foreign to %s.\n",
887
- is_foreign ? "true" : "false");
954
+ gumbo_debug (
955
+ "Toggling is_current_node_foreign to %s.\n",
956
+ is_foreign ? "true" : "false"
957
+ );
888
958
  }
889
959
  parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
890
960
  }
891
961
 
892
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
893
- static StateResult handle_data_state(GumboParser* parser,
894
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
962
+ // https://html.spec.whatwg.org/multipage/parsing.html#data-state
963
+ static StateResult handle_data_state (
964
+ GumboParser* parser,
965
+ GumboTokenizerState* tokenizer,
966
+ int c,
967
+ GumboToken* output
968
+ ) {
895
969
  switch (c) {
896
970
  case '&':
897
971
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
@@ -914,16 +988,24 @@ static StateResult handle_data_state(GumboParser* parser,
914
988
  }
915
989
  }
916
990
 
917
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
918
- static StateResult handle_char_ref_in_data_state(GumboParser* parser,
919
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
991
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-data-state
992
+ static StateResult handle_char_ref_in_data_state (
993
+ GumboParser* parser,
994
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
995
+ int UNUSED_ARG(c),
996
+ GumboToken* output
997
+ ) {
920
998
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
921
999
  return emit_char_ref(parser, ' ', false, output);
922
1000
  }
923
1001
 
924
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
925
- static StateResult handle_rcdata_state(GumboParser* parser,
926
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1002
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
1003
+ static StateResult handle_rcdata_state (
1004
+ GumboParser* parser,
1005
+ GumboTokenizerState* tokenizer,
1006
+ int c,
1007
+ GumboToken* output
1008
+ ) {
927
1009
  switch (c) {
928
1010
  case '&':
929
1011
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
@@ -943,16 +1025,24 @@ static StateResult handle_rcdata_state(GumboParser* parser,
943
1025
  }
944
1026
  }
945
1027
 
946
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
947
- static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser,
948
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1028
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-rcdata-state
1029
+ static StateResult handle_char_ref_in_rcdata_state (
1030
+ GumboParser* parser,
1031
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1032
+ int UNUSED_ARG(c),
1033
+ GumboToken* output
1034
+ ) {
949
1035
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
950
1036
  return emit_char_ref(parser, ' ', false, output);
951
1037
  }
952
1038
 
953
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
954
- static StateResult handle_rawtext_state(GumboParser* parser,
955
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1039
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
1040
+ static StateResult handle_rawtext_state (
1041
+ GumboParser* parser,
1042
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1043
+ int c,
1044
+ GumboToken* output
1045
+ ) {
956
1046
  switch (c) {
957
1047
  case '<':
958
1048
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
@@ -968,9 +1058,13 @@ static StateResult handle_rawtext_state(GumboParser* parser,
968
1058
  }
969
1059
  }
970
1060
 
971
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
972
- static StateResult handle_script_state(GumboParser* parser,
973
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1061
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
1062
+ static StateResult handle_script_state (
1063
+ GumboParser* parser,
1064
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1065
+ int c,
1066
+ GumboToken* output
1067
+ ) {
974
1068
  switch (c) {
975
1069
  case '<':
976
1070
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
@@ -986,9 +1080,13 @@ static StateResult handle_script_state(GumboParser* parser,
986
1080
  }
987
1081
  }
988
1082
 
989
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
990
- static StateResult handle_plaintext_state(GumboParser* parser,
991
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1083
+ // https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
1084
+ static StateResult handle_plaintext_state (
1085
+ GumboParser* parser,
1086
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1087
+ int c,
1088
+ GumboToken* output
1089
+ ) {
992
1090
  switch (c) {
993
1091
  case '\0':
994
1092
  return emit_replacement_char(parser, output);
@@ -999,9 +1097,13 @@ static StateResult handle_plaintext_state(GumboParser* parser,
999
1097
  }
1000
1098
  }
1001
1099
 
1002
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
1003
- static StateResult handle_tag_open_state(GumboParser* parser,
1004
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1100
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
1101
+ static StateResult handle_tag_open_state (
1102
+ GumboParser* parser,
1103
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1104
+ int c,
1105
+ GumboToken* output
1106
+ ) {
1005
1107
  assert(temporary_buffer_equals(parser, "<"));
1006
1108
  switch (c) {
1007
1109
  case '!':
@@ -1032,9 +1134,13 @@ static StateResult handle_tag_open_state(GumboParser* parser,
1032
1134
  }
1033
1135
  }
1034
1136
 
1035
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
1036
- static StateResult handle_end_tag_open_state(GumboParser* parser,
1037
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1137
+ // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
1138
+ static StateResult handle_end_tag_open_state (
1139
+ GumboParser* parser,
1140
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1141
+ int c,
1142
+ GumboToken* output
1143
+ ) {
1038
1144
  assert(temporary_buffer_equals(parser, "</"));
1039
1145
  switch (c) {
1040
1146
  case '>':
@@ -1059,9 +1165,13 @@ static StateResult handle_end_tag_open_state(GumboParser* parser,
1059
1165
  }
1060
1166
  }
1061
1167
 
1062
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
1063
- static StateResult handle_tag_name_state(GumboParser* parser,
1064
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1168
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
1169
+ static StateResult handle_tag_name_state (
1170
+ GumboParser* parser,
1171
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1172
+ int c,
1173
+ GumboToken* output
1174
+ ) {
1065
1175
  switch (c) {
1066
1176
  case '\t':
1067
1177
  case '\n':
@@ -1093,9 +1203,13 @@ static StateResult handle_tag_name_state(GumboParser* parser,
1093
1203
  }
1094
1204
  }
1095
1205
 
1096
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
1097
- static StateResult handle_rcdata_lt_state(GumboParser* parser,
1098
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1206
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
1207
+ static StateResult handle_rcdata_lt_state (
1208
+ GumboParser* parser,
1209
+ GumboTokenizerState* tokenizer,
1210
+ int c,
1211
+ GumboToken* output
1212
+ ) {
1099
1213
  assert(temporary_buffer_equals(parser, "<"));
1100
1214
  if (c == '/') {
1101
1215
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
@@ -1108,9 +1222,13 @@ static StateResult handle_rcdata_lt_state(GumboParser* parser,
1108
1222
  }
1109
1223
  }
1110
1224
 
1111
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
1112
- static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
1113
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1225
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
1226
+ static StateResult handle_rcdata_end_tag_open_state (
1227
+ GumboParser* parser,
1228
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1229
+ int c,
1230
+ GumboToken* output
1231
+ ) {
1114
1232
  assert(temporary_buffer_equals(parser, "</"));
1115
1233
  if (is_alpha(c)) {
1116
1234
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
@@ -1124,9 +1242,14 @@ static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
1124
1242
  return true;
1125
1243
  }
1126
1244
 
1127
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
1128
- static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
1129
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1245
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
1246
+ static StateResult handle_rcdata_end_tag_name_state (
1247
+ GumboParser* parser,
1248
+ GumboTokenizerState* tokenizer,
1249
+ int c,
1250
+ GumboToken* output
1251
+ ) {
1252
+ UNUSED_IF_NDEBUG(tokenizer);
1130
1253
  assert(tokenizer->_temporary_buffer.length >= 2);
1131
1254
  if (is_alpha(c)) {
1132
1255
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
@@ -1156,9 +1279,13 @@ static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
1156
1279
  return emit_temporary_buffer(parser, output);
1157
1280
  }
1158
1281
 
1159
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
1160
- static StateResult handle_rawtext_lt_state(GumboParser* parser,
1161
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1282
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
1283
+ static StateResult handle_rawtext_lt_state (
1284
+ GumboParser* parser,
1285
+ GumboTokenizerState* tokenizer,
1286
+ int c,
1287
+ GumboToken* output
1288
+ ) {
1162
1289
  assert(temporary_buffer_equals(parser, "<"));
1163
1290
  if (c == '/') {
1164
1291
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
@@ -1171,9 +1298,13 @@ static StateResult handle_rawtext_lt_state(GumboParser* parser,
1171
1298
  }
1172
1299
  }
1173
1300
 
1174
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
1175
- static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
1176
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1301
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
1302
+ static StateResult handle_rawtext_end_tag_open_state (
1303
+ GumboParser* parser,
1304
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1305
+ int c,
1306
+ GumboToken* output
1307
+ ) {
1177
1308
  assert(temporary_buffer_equals(parser, "</"));
1178
1309
  if (is_alpha(c)) {
1179
1310
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
@@ -1186,9 +1317,13 @@ static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
1186
1317
  }
1187
1318
  }
1188
1319
 
1189
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
1190
- static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
1191
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1320
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
1321
+ static StateResult handle_rawtext_end_tag_name_state (
1322
+ GumboParser* parser,
1323
+ GumboTokenizerState* tokenizer,
1324
+ int c,
1325
+ GumboToken* output
1326
+ ) {
1192
1327
  assert(tokenizer->_temporary_buffer.length >= 2);
1193
1328
  gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1194
1329
  tokenizer->_tag_state._buffer.data);
@@ -1221,9 +1356,13 @@ static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
1221
1356
  return emit_temporary_buffer(parser, output);
1222
1357
  }
1223
1358
 
1224
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
1225
- static StateResult handle_script_lt_state(GumboParser* parser,
1226
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1359
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
1360
+ static StateResult handle_script_lt_state (
1361
+ GumboParser* parser,
1362
+ GumboTokenizerState* tokenizer,
1363
+ int c,
1364
+ GumboToken* output
1365
+ ) {
1227
1366
  assert(temporary_buffer_equals(parser, "<"));
1228
1367
  if (c == '/') {
1229
1368
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
@@ -1240,9 +1379,13 @@ static StateResult handle_script_lt_state(GumboParser* parser,
1240
1379
  }
1241
1380
  }
1242
1381
 
1243
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
1244
- static StateResult handle_script_end_tag_open_state(GumboParser* parser,
1245
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1382
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
1383
+ static StateResult handle_script_end_tag_open_state (
1384
+ GumboParser* parser,
1385
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1386
+ int c,
1387
+ GumboToken* output
1388
+ ) {
1246
1389
  assert(temporary_buffer_equals(parser, "</"));
1247
1390
  if (is_alpha(c)) {
1248
1391
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
@@ -1255,9 +1398,14 @@ static StateResult handle_script_end_tag_open_state(GumboParser* parser,
1255
1398
  }
1256
1399
  }
1257
1400
 
1258
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
1259
- static StateResult handle_script_end_tag_name_state(GumboParser* parser,
1260
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1401
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
1402
+ static StateResult handle_script_end_tag_name_state (
1403
+ GumboParser* parser,
1404
+ GumboTokenizerState* tokenizer,
1405
+ int c,
1406
+ GumboToken* output
1407
+ ) {
1408
+ UNUSED_IF_NDEBUG(tokenizer);
1261
1409
  assert(tokenizer->_temporary_buffer.length >= 2);
1262
1410
  if (is_alpha(c)) {
1263
1411
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
@@ -1287,9 +1435,13 @@ static StateResult handle_script_end_tag_name_state(GumboParser* parser,
1287
1435
  return emit_temporary_buffer(parser, output);
1288
1436
  }
1289
1437
 
1290
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
1291
- static StateResult handle_script_escaped_start_state(GumboParser* parser,
1292
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1438
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
1439
+ static StateResult handle_script_escaped_start_state (
1440
+ GumboParser* parser,
1441
+ GumboTokenizerState* tokenizer,
1442
+ int c,
1443
+ GumboToken* output
1444
+ ) {
1293
1445
  if (c == '-') {
1294
1446
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1295
1447
  return emit_current_char(parser, output);
@@ -1300,9 +1452,13 @@ static StateResult handle_script_escaped_start_state(GumboParser* parser,
1300
1452
  }
1301
1453
  }
1302
1454
 
1303
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
1304
- static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
1305
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1455
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
1456
+ static StateResult handle_script_escaped_start_dash_state (
1457
+ GumboParser* parser,
1458
+ GumboTokenizerState* tokenizer,
1459
+ int c,
1460
+ GumboToken* output
1461
+ ) {
1306
1462
  if (c == '-') {
1307
1463
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1308
1464
  return emit_current_char(parser, output);
@@ -1313,9 +1469,13 @@ static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
1313
1469
  }
1314
1470
  }
1315
1471
 
1316
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
1317
- static StateResult handle_script_escaped_state(GumboParser* parser,
1318
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1472
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
1473
+ static StateResult handle_script_escaped_state (
1474
+ GumboParser* parser,
1475
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1476
+ int c,
1477
+ GumboToken* output
1478
+ ) {
1319
1479
  switch (c) {
1320
1480
  case '-':
1321
1481
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
@@ -1335,9 +1495,13 @@ static StateResult handle_script_escaped_state(GumboParser* parser,
1335
1495
  }
1336
1496
  }
1337
1497
 
1338
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
1339
- static StateResult handle_script_escaped_dash_state(GumboParser* parser,
1340
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1498
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
1499
+ static StateResult handle_script_escaped_dash_state (
1500
+ GumboParser* parser,
1501
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1502
+ int c,
1503
+ GumboToken* output
1504
+ ) {
1341
1505
  switch (c) {
1342
1506
  case '-':
1343
1507
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
@@ -1360,9 +1524,13 @@ static StateResult handle_script_escaped_dash_state(GumboParser* parser,
1360
1524
  }
1361
1525
  }
1362
1526
 
1363
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
1364
- static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
1365
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1527
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
1528
+ static StateResult handle_script_escaped_dash_dash_state (
1529
+ GumboParser* parser,
1530
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1531
+ int c,
1532
+ GumboToken* output
1533
+ ) {
1366
1534
  switch (c) {
1367
1535
  case '-':
1368
1536
  return emit_current_char(parser, output);
@@ -1387,9 +1555,13 @@ static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
1387
1555
  }
1388
1556
  }
1389
1557
 
1390
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
1391
- static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1392
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1558
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
1559
+ static StateResult handle_script_escaped_lt_state (
1560
+ GumboParser* parser,
1561
+ GumboTokenizerState* tokenizer,
1562
+ int c,
1563
+ GumboToken* output
1564
+ ) {
1393
1565
  assert(temporary_buffer_equals(parser, "<"));
1394
1566
  assert(!tokenizer->_script_data_buffer.length);
1395
1567
  if (c == '/') {
@@ -1399,8 +1571,10 @@ static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1399
1571
  } else if (is_alpha(c)) {
1400
1572
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1401
1573
  append_char_to_temporary_buffer(parser, c);
1402
- gumbo_string_buffer_append_codepoint(
1403
- parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1574
+ gumbo_string_buffer_append_codepoint (
1575
+ ensure_lowercase(c),
1576
+ &tokenizer->_script_data_buffer
1577
+ );
1404
1578
  return emit_temporary_buffer(parser, output);
1405
1579
  } else {
1406
1580
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
@@ -1408,9 +1582,13 @@ static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1408
1582
  }
1409
1583
  }
1410
1584
 
1411
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
1412
- static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
1413
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1585
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
1586
+ static StateResult handle_script_escaped_end_tag_open_state (
1587
+ GumboParser* parser,
1588
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1589
+ int c,
1590
+ GumboToken* output
1591
+ ) {
1414
1592
  assert(temporary_buffer_equals(parser, "</"));
1415
1593
  if (is_alpha(c)) {
1416
1594
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
@@ -1423,9 +1601,14 @@ static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
1423
1601
  }
1424
1602
  }
1425
1603
 
1426
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
1427
- static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
1428
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1604
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
1605
+ static StateResult handle_script_escaped_end_tag_name_state (
1606
+ GumboParser* parser,
1607
+ GumboTokenizerState* tokenizer,
1608
+ int c,
1609
+ GumboToken* output
1610
+ ) {
1611
+ UNUSED_IF_NDEBUG(tokenizer);
1429
1612
  assert(tokenizer->_temporary_buffer.length >= 2);
1430
1613
  if (is_alpha(c)) {
1431
1614
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
@@ -1455,9 +1638,13 @@ static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
1455
1638
  return emit_temporary_buffer(parser, output);
1456
1639
  }
1457
1640
 
1458
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
1459
- static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1460
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1641
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
1642
+ static StateResult handle_script_double_escaped_start_state (
1643
+ GumboParser* parser,
1644
+ GumboTokenizerState* tokenizer,
1645
+ int c,
1646
+ GumboToken* output
1647
+ ) {
1461
1648
  switch (c) {
1462
1649
  case '\t':
1463
1650
  case '\n':
@@ -1465,16 +1652,22 @@ static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1465
1652
  case ' ':
1466
1653
  case '/':
1467
1654
  case '>':
1468
- gumbo_tokenizer_set_state(
1469
- parser, gumbo_string_equals(&kScriptTag,
1470
- (GumboStringPiece*) &tokenizer->_script_data_buffer)
1471
- ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1472
- : GUMBO_LEX_SCRIPT_ESCAPED);
1655
+ gumbo_tokenizer_set_state (
1656
+ parser,
1657
+ gumbo_string_equals (
1658
+ &kScriptTag,
1659
+ (GumboStringPiece*) &tokenizer->_script_data_buffer
1660
+ )
1661
+ ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1662
+ : GUMBO_LEX_SCRIPT_ESCAPED
1663
+ );
1473
1664
  return emit_current_char(parser, output);
1474
1665
  default:
1475
1666
  if (is_alpha(c)) {
1476
- gumbo_string_buffer_append_codepoint(
1477
- parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1667
+ gumbo_string_buffer_append_codepoint (
1668
+ ensure_lowercase(c),
1669
+ &tokenizer->_script_data_buffer
1670
+ );
1478
1671
  return emit_current_char(parser, output);
1479
1672
  } else {
1480
1673
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
@@ -1484,9 +1677,13 @@ static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1484
1677
  }
1485
1678
  }
1486
1679
 
1487
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
1488
- static StateResult handle_script_double_escaped_state(GumboParser* parser,
1489
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1680
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
1681
+ static StateResult handle_script_double_escaped_state (
1682
+ GumboParser* parser,
1683
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1684
+ int c,
1685
+ GumboToken* output
1686
+ ) {
1490
1687
  switch (c) {
1491
1688
  case '-':
1492
1689
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
@@ -1505,9 +1702,13 @@ static StateResult handle_script_double_escaped_state(GumboParser* parser,
1505
1702
  }
1506
1703
  }
1507
1704
 
1508
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
1509
- static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
1510
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1705
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
1706
+ static StateResult handle_script_double_escaped_dash_state (
1707
+ GumboParser* parser,
1708
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1709
+ int c,
1710
+ GumboToken* output
1711
+ ) {
1511
1712
  switch (c) {
1512
1713
  case '-':
1513
1714
  gumbo_tokenizer_set_state(
@@ -1529,10 +1730,13 @@ static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
1529
1730
  }
1530
1731
  }
1531
1732
 
1532
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
1533
- static StateResult handle_script_double_escaped_dash_dash_state(
1534
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
1535
- GumboToken* output) {
1733
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
1734
+ static StateResult handle_script_double_escaped_dash_dash_state (
1735
+ GumboParser* parser,
1736
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1737
+ int c,
1738
+ GumboToken* output
1739
+ ) {
1536
1740
  switch (c) {
1537
1741
  case '-':
1538
1742
  return emit_current_char(parser, output);
@@ -1555,12 +1759,16 @@ static StateResult handle_script_double_escaped_dash_dash_state(
1555
1759
  }
1556
1760
  }
1557
1761
 
1558
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
1559
- static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
1560
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1762
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
1763
+ static StateResult handle_script_double_escaped_lt_state (
1764
+ GumboParser* parser,
1765
+ GumboTokenizerState* tokenizer,
1766
+ int c,
1767
+ GumboToken* output
1768
+ ) {
1561
1769
  if (c == '/') {
1562
1770
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1563
- gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
1771
+ gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
1564
1772
  return emit_current_char(parser, output);
1565
1773
  } else {
1566
1774
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
@@ -1569,9 +1777,13 @@ static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
1569
1777
  }
1570
1778
  }
1571
1779
 
1572
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
1573
- static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1574
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1780
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
1781
+ static StateResult handle_script_double_escaped_end_state (
1782
+ GumboParser* parser,
1783
+ GumboTokenizerState* tokenizer,
1784
+ int c,
1785
+ GumboToken* output
1786
+ ) {
1575
1787
  switch (c) {
1576
1788
  case '\t':
1577
1789
  case '\n':
@@ -1587,8 +1799,10 @@ static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1587
1799
  return emit_current_char(parser, output);
1588
1800
  default:
1589
1801
  if (is_alpha(c)) {
1590
- gumbo_string_buffer_append_codepoint(
1591
- parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1802
+ gumbo_string_buffer_append_codepoint (
1803
+ ensure_lowercase(c),
1804
+ &tokenizer->_script_data_buffer
1805
+ );
1592
1806
  return emit_current_char(parser, output);
1593
1807
  } else {
1594
1808
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
@@ -1598,9 +1812,13 @@ static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1598
1812
  }
1599
1813
  }
1600
1814
 
1601
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
1602
- static StateResult handle_before_attr_name_state(GumboParser* parser,
1603
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1815
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
1816
+ static StateResult handle_before_attr_name_state (
1817
+ GumboParser* parser,
1818
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1819
+ int c,
1820
+ GumboToken* output
1821
+ ) {
1604
1822
  switch (c) {
1605
1823
  case '\t':
1606
1824
  case '\n':
@@ -1636,9 +1854,13 @@ static StateResult handle_before_attr_name_state(GumboParser* parser,
1636
1854
  }
1637
1855
  }
1638
1856
 
1639
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
1640
- static StateResult handle_attr_name_state(GumboParser* parser,
1641
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1857
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
1858
+ static StateResult handle_attr_name_state (
1859
+ GumboParser* parser,
1860
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1861
+ int c,
1862
+ GumboToken* output
1863
+ ) {
1642
1864
  switch (c) {
1643
1865
  case '\t':
1644
1866
  case '\n':
@@ -1679,9 +1901,13 @@ static StateResult handle_attr_name_state(GumboParser* parser,
1679
1901
  }
1680
1902
  }
1681
1903
 
1682
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
1683
- static StateResult handle_after_attr_name_state(GumboParser* parser,
1684
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1904
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
1905
+ static StateResult handle_after_attr_name_state (
1906
+ GumboParser* parser,
1907
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1908
+ int c,
1909
+ GumboToken* output
1910
+ ) {
1685
1911
  switch (c) {
1686
1912
  case '\t':
1687
1913
  case '\n':
@@ -1719,9 +1945,13 @@ static StateResult handle_after_attr_name_state(GumboParser* parser,
1719
1945
  }
1720
1946
  }
1721
1947
 
1722
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
1723
- static StateResult handle_before_attr_value_state(GumboParser* parser,
1724
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1948
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
1949
+ static StateResult handle_before_attr_value_state (
1950
+ GumboParser* parser,
1951
+ GumboTokenizerState* tokenizer,
1952
+ int c,
1953
+ GumboToken* output
1954
+ ) {
1725
1955
  switch (c) {
1726
1956
  case '\t':
1727
1957
  case '\n':
@@ -1768,9 +1998,13 @@ static StateResult handle_before_attr_value_state(GumboParser* parser,
1768
1998
  }
1769
1999
  }
1770
2000
 
1771
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
1772
- static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
1773
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2001
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
2002
+ static StateResult handle_attr_value_double_quoted_state (
2003
+ GumboParser* parser,
2004
+ GumboTokenizerState* tokenizer,
2005
+ int c,
2006
+ GumboToken* UNUSED_ARG(output)
2007
+ ) {
1774
2008
  switch (c) {
1775
2009
  case '"':
1776
2010
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
@@ -1796,9 +2030,13 @@ static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
1796
2030
  }
1797
2031
  }
1798
2032
 
1799
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
1800
- static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
1801
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2033
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state
2034
+ static StateResult handle_attr_value_single_quoted_state (
2035
+ GumboParser* parser,
2036
+ GumboTokenizerState* tokenizer,
2037
+ int c,
2038
+ GumboToken* UNUSED_ARG(output)
2039
+ ) {
1802
2040
  switch (c) {
1803
2041
  case '\'':
1804
2042
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
@@ -1824,9 +2062,13 @@ static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
1824
2062
  }
1825
2063
  }
1826
2064
 
1827
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
1828
- static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
1829
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2065
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state
2066
+ static StateResult handle_attr_value_unquoted_state (
2067
+ GumboParser* parser,
2068
+ GumboTokenizerState* tokenizer,
2069
+ int c,
2070
+ GumboToken* output
2071
+ ) {
1830
2072
  switch (c) {
1831
2073
  case '\t':
1832
2074
  case '\n':
@@ -1867,9 +2109,13 @@ static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
1867
2109
  }
1868
2110
  }
1869
2111
 
1870
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
1871
- static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1872
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2112
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-attribute-value-state
2113
+ static StateResult handle_char_ref_in_attr_value_state (
2114
+ GumboParser* parser,
2115
+ GumboTokenizerState* tokenizer,
2116
+ int UNUSED_ARG(c),
2117
+ GumboToken* UNUSED_ARG(output)
2118
+ ) {
1873
2119
  OneOrTwoCodepoints char_ref;
1874
2120
  int allowed_char;
1875
2121
  bool is_unquoted = false;
@@ -1893,9 +2139,15 @@ static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1893
2139
 
1894
2140
  // Ignore the status, since we don't have a convenient way of signalling that
1895
2141
  // a parser error has occurred when the error occurs in the middle of a
1896
- // multi-state token. We'd need a flag inside the TokenizerState to do this,
2142
+ // multi-state token. We'd need a flag inside the TokenizerState to do this,
1897
2143
  // but that's a low priority fix.
1898
- consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
2144
+ gumbo_consume_char_ref (
2145
+ parser,
2146
+ &tokenizer->_input,
2147
+ allowed_char,
2148
+ true,
2149
+ &char_ref
2150
+ );
1899
2151
  if (char_ref.first != kGumboNoChar) {
1900
2152
  tokenizer->_reconsume_current_input = true;
1901
2153
  append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
@@ -1909,9 +2161,13 @@ static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1909
2161
  return NEXT_CHAR;
1910
2162
  }
1911
2163
 
1912
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
1913
- static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
1914
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2164
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
2165
+ static StateResult handle_after_attr_value_quoted_state (
2166
+ GumboParser* parser,
2167
+ GumboTokenizerState* tokenizer,
2168
+ int c,
2169
+ GumboToken* output
2170
+ ) {
1915
2171
  finish_attribute_value(parser);
1916
2172
  switch (c) {
1917
2173
  case '\t':
@@ -1940,9 +2196,13 @@ static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
1940
2196
  }
1941
2197
  }
1942
2198
 
1943
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
1944
- static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
1945
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2199
+ // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
2200
+ static StateResult handle_self_closing_start_tag_state (
2201
+ GumboParser* parser,
2202
+ GumboTokenizerState* tokenizer,
2203
+ int c,
2204
+ GumboToken* output
2205
+ ) {
1946
2206
  switch (c) {
1947
2207
  case '>':
1948
2208
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
@@ -1961,11 +2221,16 @@ static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
1961
2221
  }
1962
2222
  }
1963
2223
 
1964
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
1965
- static StateResult handle_bogus_comment_state(GumboParser* parser,
1966
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2224
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
2225
+ static StateResult handle_bogus_comment_state (
2226
+ GumboParser* parser,
2227
+ GumboTokenizerState* tokenizer,
2228
+ int c,
2229
+ GumboToken* output
2230
+ ) {
1967
2231
  while (c != '>' && c != -1) {
1968
2232
  if (c == '\0') {
2233
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1969
2234
  c = 0xFFFD;
1970
2235
  }
1971
2236
  append_char_to_temporary_buffer(parser, c);
@@ -1976,29 +2241,48 @@ static StateResult handle_bogus_comment_state(GumboParser* parser,
1976
2241
  return emit_comment(parser, output);
1977
2242
  }
1978
2243
 
1979
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
1980
- static StateResult handle_markup_declaration_state(GumboParser* parser,
1981
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1982
- if (utf8iterator_maybe_consume_match(
1983
- &tokenizer->_input, "--", sizeof("--") - 1, true)) {
2244
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
2245
+ static StateResult handle_markup_declaration_state (
2246
+ GumboParser* parser,
2247
+ GumboTokenizerState* tokenizer,
2248
+ int UNUSED_ARG(c),
2249
+ GumboToken* UNUSED_ARG(output)
2250
+ ) {
2251
+ if (
2252
+ utf8iterator_maybe_consume_match (
2253
+ &tokenizer->_input,
2254
+ "--",
2255
+ sizeof("--") - 1,
2256
+ true
2257
+ )
2258
+ ) {
1984
2259
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
1985
2260
  tokenizer->_reconsume_current_input = true;
1986
- } else if (utf8iterator_maybe_consume_match(
1987
- &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
2261
+ } else if (
2262
+ utf8iterator_maybe_consume_match (
2263
+ &tokenizer->_input,
2264
+ "DOCTYPE",
2265
+ sizeof("DOCTYPE") - 1,
2266
+ false
2267
+ )
2268
+ ) {
1988
2269
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
1989
2270
  tokenizer->_reconsume_current_input = true;
1990
2271
  // If we get here, we know we'll eventually emit a doctype token, so now is
1991
- // the time to initialize the doctype strings. (Not in doctype_state_init,
2272
+ // the time to initialize the doctype strings. (Not in doctype_state_init,
1992
2273
  // since then they'll leak if ownership never gets transferred to the
1993
2274
  // doctype token.)
1994
- tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, "");
1995
- tokenizer->_doc_type_state.public_identifier =
1996
- gumbo_copy_stringz(parser, "");
1997
- tokenizer->_doc_type_state.system_identifier =
1998
- gumbo_copy_stringz(parser, "");
1999
- } else if (tokenizer->_is_current_node_foreign &&
2000
- utf8iterator_maybe_consume_match(
2001
- &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2275
+ tokenizer->_doc_type_state.name = gumbo_strdup("");
2276
+ tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
2277
+ tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
2278
+ } else if (
2279
+ tokenizer->_is_current_node_foreign
2280
+ && utf8iterator_maybe_consume_match (
2281
+ &tokenizer->_input,
2282
+ "[CDATA[", sizeof("[CDATA[") - 1,
2283
+ true
2284
+ )
2285
+ ) {
2002
2286
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2003
2287
  tokenizer->_is_in_cdata = true;
2004
2288
  tokenizer->_reconsume_current_input = true;
@@ -2011,9 +2295,13 @@ static StateResult handle_markup_declaration_state(GumboParser* parser,
2011
2295
  return NEXT_CHAR;
2012
2296
  }
2013
2297
 
2014
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
2015
- static StateResult handle_comment_start_state(GumboParser* parser,
2016
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2298
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
2299
+ static StateResult handle_comment_start_state (
2300
+ GumboParser* parser,
2301
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2302
+ int c,
2303
+ GumboToken* output
2304
+ ) {
2017
2305
  switch (c) {
2018
2306
  case '-':
2019
2307
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
@@ -2040,9 +2328,13 @@ static StateResult handle_comment_start_state(GumboParser* parser,
2040
2328
  }
2041
2329
  }
2042
2330
 
2043
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
2044
- static StateResult handle_comment_start_dash_state(GumboParser* parser,
2045
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2331
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
2332
+ static StateResult handle_comment_start_dash_state (
2333
+ GumboParser* parser,
2334
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2335
+ int c,
2336
+ GumboToken* output
2337
+ ) {
2046
2338
  switch (c) {
2047
2339
  case '-':
2048
2340
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
@@ -2071,9 +2363,13 @@ static StateResult handle_comment_start_dash_state(GumboParser* parser,
2071
2363
  }
2072
2364
  }
2073
2365
 
2074
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
2075
- static StateResult handle_comment_state(GumboParser* parser,
2076
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2366
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-state
2367
+ static StateResult handle_comment_state (
2368
+ GumboParser* parser,
2369
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2370
+ int c,
2371
+ GumboToken* output
2372
+ ) {
2077
2373
  switch (c) {
2078
2374
  case '-':
2079
2375
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
@@ -2093,9 +2389,13 @@ static StateResult handle_comment_state(GumboParser* parser,
2093
2389
  }
2094
2390
  }
2095
2391
 
2096
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
2097
- static StateResult handle_comment_end_dash_state(GumboParser* parser,
2098
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2392
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
2393
+ static StateResult handle_comment_end_dash_state (
2394
+ GumboParser* parser,
2395
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2396
+ int c,
2397
+ GumboToken* output
2398
+ ) {
2099
2399
  switch (c) {
2100
2400
  case '-':
2101
2401
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
@@ -2119,9 +2419,13 @@ static StateResult handle_comment_end_dash_state(GumboParser* parser,
2119
2419
  }
2120
2420
  }
2121
2421
 
2122
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
2123
- static StateResult handle_comment_end_state(GumboParser* parser,
2124
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2422
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
2423
+ static StateResult handle_comment_end_state (
2424
+ GumboParser* parser,
2425
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2426
+ int c,
2427
+ GumboToken* output
2428
+ ) {
2125
2429
  switch (c) {
2126
2430
  case '>':
2127
2431
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
@@ -2158,9 +2462,13 @@ static StateResult handle_comment_end_state(GumboParser* parser,
2158
2462
  }
2159
2463
  }
2160
2464
 
2161
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
2162
- static StateResult handle_comment_end_bang_state(GumboParser* parser,
2163
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2465
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
2466
+ static StateResult handle_comment_end_bang_state (
2467
+ GumboParser* parser,
2468
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2469
+ int c,
2470
+ GumboToken* output
2471
+ ) {
2164
2472
  switch (c) {
2165
2473
  case '-':
2166
2474
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
@@ -2194,9 +2502,13 @@ static StateResult handle_comment_end_bang_state(GumboParser* parser,
2194
2502
  }
2195
2503
  }
2196
2504
 
2197
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
2198
- static StateResult handle_doctype_state(GumboParser* parser,
2199
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2505
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
2506
+ static StateResult handle_doctype_state (
2507
+ GumboParser* parser,
2508
+ GumboTokenizerState* tokenizer,
2509
+ int c,
2510
+ GumboToken* output
2511
+ ) {
2200
2512
  assert(!tokenizer->_temporary_buffer.length);
2201
2513
  switch (c) {
2202
2514
  case '\t':
@@ -2220,9 +2532,13 @@ static StateResult handle_doctype_state(GumboParser* parser,
2220
2532
  }
2221
2533
  }
2222
2534
 
2223
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
2224
- static StateResult handle_before_doctype_name_state(GumboParser* parser,
2225
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2535
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
2536
+ static StateResult handle_before_doctype_name_state (
2537
+ GumboParser* parser,
2538
+ GumboTokenizerState* tokenizer,
2539
+ int c,
2540
+ GumboToken* output
2541
+ ) {
2226
2542
  switch (c) {
2227
2543
  case '\t':
2228
2544
  case '\n':
@@ -2255,21 +2571,25 @@ static StateResult handle_before_doctype_name_state(GumboParser* parser,
2255
2571
  }
2256
2572
  }
2257
2573
 
2258
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
2259
- static StateResult handle_doctype_name_state(GumboParser* parser,
2260
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2574
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
2575
+ static StateResult handle_doctype_name_state (
2576
+ GumboParser* parser,
2577
+ GumboTokenizerState* tokenizer,
2578
+ int c,
2579
+ GumboToken* output
2580
+ ) {
2261
2581
  switch (c) {
2262
2582
  case '\t':
2263
2583
  case '\n':
2264
2584
  case '\f':
2265
2585
  case ' ':
2266
2586
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2267
- gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2587
+ gumbo_free((void*) tokenizer->_doc_type_state.name);
2268
2588
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2269
2589
  return NEXT_CHAR;
2270
2590
  case '>':
2271
2591
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2272
- gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2592
+ gumbo_free((void*) tokenizer->_doc_type_state.name);
2273
2593
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2274
2594
  emit_doctype(parser, output);
2275
2595
  return RETURN_SUCCESS;
@@ -2281,7 +2601,7 @@ static StateResult handle_doctype_name_state(GumboParser* parser,
2281
2601
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2282
2602
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2283
2603
  tokenizer->_doc_type_state.force_quirks = true;
2284
- gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2604
+ gumbo_free((void*) tokenizer->_doc_type_state.name);
2285
2605
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2286
2606
  emit_doctype(parser, output);
2287
2607
  return RETURN_ERROR;
@@ -2293,9 +2613,13 @@ static StateResult handle_doctype_name_state(GumboParser* parser,
2293
2613
  }
2294
2614
  }
2295
2615
 
2296
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
2297
- static StateResult handle_after_doctype_name_state(GumboParser* parser,
2298
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2616
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
2617
+ static StateResult handle_after_doctype_name_state (
2618
+ GumboParser* parser,
2619
+ GumboTokenizerState* tokenizer,
2620
+ int c,
2621
+ GumboToken* output
2622
+ ) {
2299
2623
  switch (c) {
2300
2624
  case '\t':
2301
2625
  case '\n':
@@ -2333,10 +2657,13 @@ static StateResult handle_after_doctype_name_state(GumboParser* parser,
2333
2657
  }
2334
2658
  }
2335
2659
 
2336
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
2337
- static StateResult handle_after_doctype_public_keyword_state(
2338
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2339
- GumboToken* output) {
2660
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
2661
+ static StateResult handle_after_doctype_public_keyword_state (
2662
+ GumboParser* parser,
2663
+ GumboTokenizerState* tokenizer,
2664
+ int c,
2665
+ GumboToken* output
2666
+ ) {
2340
2667
  switch (c) {
2341
2668
  case '\t':
2342
2669
  case '\n':
@@ -2346,13 +2673,13 @@ static StateResult handle_after_doctype_public_keyword_state(
2346
2673
  return NEXT_CHAR;
2347
2674
  case '"':
2348
2675
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2349
- assert(temporary_buffer_equals(parser, ""));
2676
+ assert(temporary_buffer_is_empty(parser));
2350
2677
  gumbo_tokenizer_set_state(
2351
2678
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2352
2679
  return NEXT_CHAR;
2353
2680
  case '\'':
2354
2681
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2355
- assert(temporary_buffer_equals(parser, ""));
2682
+ assert(temporary_buffer_is_empty(parser));
2356
2683
  gumbo_tokenizer_set_state(
2357
2684
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2358
2685
  return NEXT_CHAR;
@@ -2377,9 +2704,13 @@ static StateResult handle_after_doctype_public_keyword_state(
2377
2704
  }
2378
2705
  }
2379
2706
 
2380
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
2381
- static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2382
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2707
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
2708
+ static StateResult handle_before_doctype_public_id_state (
2709
+ GumboParser* parser,
2710
+ GumboTokenizerState* tokenizer,
2711
+ int c,
2712
+ GumboToken* output
2713
+ ) {
2383
2714
  switch (c) {
2384
2715
  case '\t':
2385
2716
  case '\n':
@@ -2387,12 +2718,12 @@ static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2387
2718
  case ' ':
2388
2719
  return NEXT_CHAR;
2389
2720
  case '"':
2390
- assert(temporary_buffer_equals(parser, ""));
2721
+ assert(temporary_buffer_is_empty(parser));
2391
2722
  gumbo_tokenizer_set_state(
2392
2723
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2393
2724
  return NEXT_CHAR;
2394
2725
  case '\'':
2395
- assert(temporary_buffer_equals(parser, ""));
2726
+ assert(temporary_buffer_is_empty(parser));
2396
2727
  gumbo_tokenizer_set_state(
2397
2728
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2398
2729
  return NEXT_CHAR;
@@ -2417,10 +2748,13 @@ static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2417
2748
  }
2418
2749
  }
2419
2750
 
2420
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
2421
- static StateResult handle_doctype_public_id_double_quoted_state(
2422
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2423
- GumboToken* output) {
2751
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
2752
+ static StateResult handle_doctype_public_id_double_quoted_state (
2753
+ GumboParser* parser,
2754
+ GumboTokenizerState* tokenizer,
2755
+ int c,
2756
+ GumboToken* output
2757
+ ) {
2424
2758
  switch (c) {
2425
2759
  case '"':
2426
2760
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
@@ -2450,10 +2784,13 @@ static StateResult handle_doctype_public_id_double_quoted_state(
2450
2784
  }
2451
2785
  }
2452
2786
 
2453
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
2454
- static StateResult handle_doctype_public_id_single_quoted_state(
2455
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2456
- GumboToken* output) {
2787
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
2788
+ static StateResult handle_doctype_public_id_single_quoted_state (
2789
+ GumboParser* parser,
2790
+ GumboTokenizerState* tokenizer,
2791
+ int c,
2792
+ GumboToken* output
2793
+ ) {
2457
2794
  switch (c) {
2458
2795
  case '\'':
2459
2796
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
@@ -2483,9 +2820,13 @@ static StateResult handle_doctype_public_id_single_quoted_state(
2483
2820
  }
2484
2821
  }
2485
2822
 
2486
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
2487
- static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2488
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2823
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
2824
+ static StateResult handle_after_doctype_public_id_state (
2825
+ GumboParser* parser,
2826
+ GumboTokenizerState* tokenizer,
2827
+ int c,
2828
+ GumboToken* output
2829
+ ) {
2489
2830
  switch (c) {
2490
2831
  case '\t':
2491
2832
  case '\n':
@@ -2500,13 +2841,13 @@ static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2500
2841
  return RETURN_SUCCESS;
2501
2842
  case '"':
2502
2843
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2503
- assert(temporary_buffer_equals(parser, ""));
2844
+ assert(temporary_buffer_is_empty(parser));
2504
2845
  gumbo_tokenizer_set_state(
2505
2846
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2506
2847
  return NEXT_CHAR;
2507
2848
  case '\'':
2508
2849
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2509
- assert(temporary_buffer_equals(parser, ""));
2850
+ assert(temporary_buffer_is_empty(parser));
2510
2851
  gumbo_tokenizer_set_state(
2511
2852
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2512
2853
  return NEXT_CHAR;
@@ -2525,10 +2866,13 @@ static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2525
2866
  }
2526
2867
  }
2527
2868
 
2528
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
2529
- static StateResult handle_between_doctype_public_system_id_state(
2530
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2531
- GumboToken* output) {
2869
+ // https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
2870
+ static StateResult handle_between_doctype_public_system_id_state (
2871
+ GumboParser* parser,
2872
+ GumboTokenizerState* tokenizer,
2873
+ int c,
2874
+ GumboToken* output
2875
+ ) {
2532
2876
  switch (c) {
2533
2877
  case '\t':
2534
2878
  case '\n':
@@ -2540,12 +2884,12 @@ static StateResult handle_between_doctype_public_system_id_state(
2540
2884
  emit_doctype(parser, output);
2541
2885
  return RETURN_SUCCESS;
2542
2886
  case '"':
2543
- assert(temporary_buffer_equals(parser, ""));
2887
+ assert(temporary_buffer_is_empty(parser));
2544
2888
  gumbo_tokenizer_set_state(
2545
2889
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2546
2890
  return NEXT_CHAR;
2547
2891
  case '\'':
2548
- assert(temporary_buffer_equals(parser, ""));
2892
+ assert(temporary_buffer_is_empty(parser));
2549
2893
  gumbo_tokenizer_set_state(
2550
2894
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2551
2895
  return NEXT_CHAR;
@@ -2564,10 +2908,13 @@ static StateResult handle_between_doctype_public_system_id_state(
2564
2908
  }
2565
2909
  }
2566
2910
 
2567
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
2568
- static StateResult handle_after_doctype_system_keyword_state(
2569
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2570
- GumboToken* output) {
2911
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
2912
+ static StateResult handle_after_doctype_system_keyword_state (
2913
+ GumboParser* parser,
2914
+ GumboTokenizerState* tokenizer,
2915
+ int c,
2916
+ GumboToken* output
2917
+ ) {
2571
2918
  switch (c) {
2572
2919
  case '\t':
2573
2920
  case '\n':
@@ -2577,13 +2924,13 @@ static StateResult handle_after_doctype_system_keyword_state(
2577
2924
  return NEXT_CHAR;
2578
2925
  case '"':
2579
2926
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2580
- assert(temporary_buffer_equals(parser, ""));
2927
+ assert(temporary_buffer_is_empty(parser));
2581
2928
  gumbo_tokenizer_set_state(
2582
2929
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2583
2930
  return NEXT_CHAR;
2584
2931
  case '\'':
2585
2932
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2586
- assert(temporary_buffer_equals(parser, ""));
2933
+ assert(temporary_buffer_is_empty(parser));
2587
2934
  gumbo_tokenizer_set_state(
2588
2935
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2589
2936
  return NEXT_CHAR;
@@ -2607,9 +2954,13 @@ static StateResult handle_after_doctype_system_keyword_state(
2607
2954
  }
2608
2955
  }
2609
2956
 
2610
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
2611
- static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2612
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2957
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
2958
+ static StateResult handle_before_doctype_system_id_state (
2959
+ GumboParser* parser,
2960
+ GumboTokenizerState* tokenizer,
2961
+ int c,
2962
+ GumboToken* output
2963
+ ) {
2613
2964
  switch (c) {
2614
2965
  case '\t':
2615
2966
  case '\n':
@@ -2617,12 +2968,12 @@ static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2617
2968
  case ' ':
2618
2969
  return NEXT_CHAR;
2619
2970
  case '"':
2620
- assert(temporary_buffer_equals(parser, ""));
2971
+ assert(temporary_buffer_is_empty(parser));
2621
2972
  gumbo_tokenizer_set_state(
2622
2973
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2623
2974
  return NEXT_CHAR;
2624
2975
  case '\'':
2625
- assert(temporary_buffer_equals(parser, ""));
2976
+ assert(temporary_buffer_is_empty(parser));
2626
2977
  gumbo_tokenizer_set_state(
2627
2978
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2628
2979
  return NEXT_CHAR;
@@ -2646,10 +2997,13 @@ static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2646
2997
  }
2647
2998
  }
2648
2999
 
2649
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
2650
- static StateResult handle_doctype_system_id_double_quoted_state(
2651
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2652
- GumboToken* output) {
3000
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
3001
+ static StateResult handle_doctype_system_id_double_quoted_state (
3002
+ GumboParser* parser,
3003
+ GumboTokenizerState* tokenizer,
3004
+ int c,
3005
+ GumboToken* output
3006
+ ) {
2653
3007
  switch (c) {
2654
3008
  case '"':
2655
3009
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
@@ -2679,10 +3033,13 @@ static StateResult handle_doctype_system_id_double_quoted_state(
2679
3033
  }
2680
3034
  }
2681
3035
 
2682
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
2683
- static StateResult handle_doctype_system_id_single_quoted_state(
2684
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2685
- GumboToken* output) {
3036
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
3037
+ static StateResult handle_doctype_system_id_single_quoted_state (
3038
+ GumboParser* parser,
3039
+ GumboTokenizerState* tokenizer,
3040
+ int c,
3041
+ GumboToken* output
3042
+ ) {
2686
3043
  switch (c) {
2687
3044
  case '\'':
2688
3045
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
@@ -2712,9 +3069,13 @@ static StateResult handle_doctype_system_id_single_quoted_state(
2712
3069
  }
2713
3070
  }
2714
3071
 
2715
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
2716
- static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
2717
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
3072
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
3073
+ static StateResult handle_after_doctype_system_id_state (
3074
+ GumboParser* parser,
3075
+ GumboTokenizerState* tokenizer,
3076
+ int c,
3077
+ GumboToken* output
3078
+ ) {
2718
3079
  switch (c) {
2719
3080
  case '\t':
2720
3081
  case '\n':
@@ -2738,9 +3099,13 @@ static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
2738
3099
  }
2739
3100
  }
2740
3101
 
2741
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
2742
- static StateResult handle_bogus_doctype_state(GumboParser* parser,
2743
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
3102
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
3103
+ static StateResult handle_bogus_doctype_state (
3104
+ GumboParser* parser,
3105
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
3106
+ int c,
3107
+ GumboToken* output
3108
+ ) {
2744
3109
  if (c == '>' || c == -1) {
2745
3110
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2746
3111
  emit_doctype(parser, output);
@@ -2749,9 +3114,13 @@ static StateResult handle_bogus_doctype_state(GumboParser* parser,
2749
3114
  return NEXT_CHAR;
2750
3115
  }
2751
3116
 
2752
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
2753
- static StateResult handle_cdata_state(GumboParser* parser,
2754
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
3117
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
3118
+ static StateResult handle_cdata_state (
3119
+ GumboParser* parser,
3120
+ GumboTokenizerState* tokenizer,
3121
+ int c,
3122
+ GumboToken* output
3123
+ ) {
2755
3124
  if (c == -1 || utf8iterator_maybe_consume_match(
2756
3125
  &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2757
3126
  tokenizer->_reconsume_current_input = true;
@@ -2764,50 +3133,83 @@ static StateResult handle_cdata_state(GumboParser* parser,
2764
3133
  }
2765
3134
  }
2766
3135
 
2767
- typedef StateResult (*GumboLexerStateFunction)(
2768
- GumboParser*, GumboTokenizerState*, int, GumboToken*);
2769
-
2770
- static GumboLexerStateFunction dispatch_table[] = {handle_data_state,
2771
- handle_char_ref_in_data_state, handle_rcdata_state,
2772
- handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state,
2773
- handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state,
2774
- handle_tag_name_state, handle_rcdata_lt_state,
2775
- handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state,
2776
- handle_rawtext_lt_state, handle_rawtext_end_tag_open_state,
2777
- handle_rawtext_end_tag_name_state, handle_script_lt_state,
2778
- handle_script_end_tag_open_state, handle_script_end_tag_name_state,
2779
- handle_script_escaped_start_state, handle_script_escaped_start_dash_state,
2780
- handle_script_escaped_state, handle_script_escaped_dash_state,
2781
- handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state,
2782
- handle_script_escaped_end_tag_open_state,
2783
- handle_script_escaped_end_tag_name_state,
2784
- handle_script_double_escaped_start_state,
2785
- handle_script_double_escaped_state, handle_script_double_escaped_dash_state,
2786
- handle_script_double_escaped_dash_dash_state,
2787
- handle_script_double_escaped_lt_state,
2788
- handle_script_double_escaped_end_state, handle_before_attr_name_state,
2789
- handle_attr_name_state, handle_after_attr_name_state,
2790
- handle_before_attr_value_state, handle_attr_value_double_quoted_state,
2791
- handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state,
2792
- handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state,
2793
- handle_self_closing_start_tag_state, handle_bogus_comment_state,
2794
- handle_markup_declaration_state, handle_comment_start_state,
2795
- handle_comment_start_dash_state, handle_comment_state,
2796
- handle_comment_end_dash_state, handle_comment_end_state,
2797
- handle_comment_end_bang_state, handle_doctype_state,
2798
- handle_before_doctype_name_state, handle_doctype_name_state,
2799
- handle_after_doctype_name_state, handle_after_doctype_public_keyword_state,
2800
- handle_before_doctype_public_id_state,
2801
- handle_doctype_public_id_double_quoted_state,
2802
- handle_doctype_public_id_single_quoted_state,
2803
- handle_after_doctype_public_id_state,
2804
- handle_between_doctype_public_system_id_state,
2805
- handle_after_doctype_system_keyword_state,
2806
- handle_before_doctype_system_id_state,
2807
- handle_doctype_system_id_double_quoted_state,
2808
- handle_doctype_system_id_single_quoted_state,
2809
- handle_after_doctype_system_id_state, handle_bogus_doctype_state,
2810
- handle_cdata_state};
3136
+ typedef StateResult (*GumboLexerStateFunction) (
3137
+ GumboParser* parser,
3138
+ GumboTokenizerState* tokenizer,
3139
+ int c,
3140
+ GumboToken* output
3141
+ );
3142
+
3143
+ static GumboLexerStateFunction dispatch_table[] = {
3144
+ handle_data_state,
3145
+ handle_char_ref_in_data_state,
3146
+ handle_rcdata_state,
3147
+ handle_char_ref_in_rcdata_state,
3148
+ handle_rawtext_state,
3149
+ handle_script_state,
3150
+ handle_plaintext_state,
3151
+ handle_tag_open_state,
3152
+ handle_end_tag_open_state,
3153
+ handle_tag_name_state,
3154
+ handle_rcdata_lt_state,
3155
+ handle_rcdata_end_tag_open_state,
3156
+ handle_rcdata_end_tag_name_state,
3157
+ handle_rawtext_lt_state,
3158
+ handle_rawtext_end_tag_open_state,
3159
+ handle_rawtext_end_tag_name_state,
3160
+ handle_script_lt_state,
3161
+ handle_script_end_tag_open_state,
3162
+ handle_script_end_tag_name_state,
3163
+ handle_script_escaped_start_state,
3164
+ handle_script_escaped_start_dash_state,
3165
+ handle_script_escaped_state,
3166
+ handle_script_escaped_dash_state,
3167
+ handle_script_escaped_dash_dash_state,
3168
+ handle_script_escaped_lt_state,
3169
+ handle_script_escaped_end_tag_open_state,
3170
+ handle_script_escaped_end_tag_name_state,
3171
+ handle_script_double_escaped_start_state,
3172
+ handle_script_double_escaped_state,
3173
+ handle_script_double_escaped_dash_state,
3174
+ handle_script_double_escaped_dash_dash_state,
3175
+ handle_script_double_escaped_lt_state,
3176
+ handle_script_double_escaped_end_state,
3177
+ handle_before_attr_name_state,
3178
+ handle_attr_name_state,
3179
+ handle_after_attr_name_state,
3180
+ handle_before_attr_value_state,
3181
+ handle_attr_value_double_quoted_state,
3182
+ handle_attr_value_single_quoted_state,
3183
+ handle_attr_value_unquoted_state,
3184
+ handle_char_ref_in_attr_value_state,
3185
+ handle_after_attr_value_quoted_state,
3186
+ handle_self_closing_start_tag_state,
3187
+ handle_bogus_comment_state,
3188
+ handle_markup_declaration_state,
3189
+ handle_comment_start_state,
3190
+ handle_comment_start_dash_state,
3191
+ handle_comment_state,
3192
+ handle_comment_end_dash_state,
3193
+ handle_comment_end_state,
3194
+ handle_comment_end_bang_state,
3195
+ handle_doctype_state,
3196
+ handle_before_doctype_name_state,
3197
+ handle_doctype_name_state,
3198
+ handle_after_doctype_name_state,
3199
+ handle_after_doctype_public_keyword_state,
3200
+ handle_before_doctype_public_id_state,
3201
+ handle_doctype_public_id_double_quoted_state,
3202
+ handle_doctype_public_id_single_quoted_state,
3203
+ handle_after_doctype_public_id_state,
3204
+ handle_between_doctype_public_system_id_state,
3205
+ handle_after_doctype_system_keyword_state,
3206
+ handle_before_doctype_system_id_state,
3207
+ handle_doctype_system_id_double_quoted_state,
3208
+ handle_doctype_system_id_single_quoted_state,
3209
+ handle_after_doctype_system_id_state,
3210
+ handle_bogus_doctype_state,
3211
+ handle_cdata_state
3212
+ };
2811
3213
 
2812
3214
  bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2813
3215
  // Because of the spec requirements that...
@@ -2819,9 +3221,9 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2819
3221
  // state.
2820
3222
  //
2821
3223
  // ...all state must be held in the GumboTokenizer struct instead of in local
2822
- // variables in this function. That allows us to return from this method with
3224
+ // variables in this function. That allows us to return from this method with
2823
3225
  // a token, and then immediately jump back to the same state with the same
2824
- // input if we need to return a different token. The various emit_* functions
3226
+ // input if we need to return a different token. The various emit_* functions
2825
3227
  // are responsible for changing state (eg. flushing the chardata buffer,
2826
3228
  // reading the next input character) to avoid an infinite loop.
2827
3229
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -2845,10 +3247,9 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2845
3247
  assert(!tokenizer->_temporary_buffer_emit);
2846
3248
  assert(tokenizer->_buffered_emit_char == kGumboNoChar);
2847
3249
  int c = utf8iterator_current(&tokenizer->_input);
2848
- gumbo_debug(
2849
- "Lexing character '%c' (%d) in state %d.\n", c, c, tokenizer->_state);
2850
- StateResult result =
2851
- dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
3250
+ GumboTokenizerEnum state = tokenizer->_state;
3251
+ gumbo_debug("Lexing character '%c' (%d) in state %u.\n", c, c, state);
3252
+ StateResult result = dispatch_table[state](parser, tokenizer, c, output);
2852
3253
  // We need to clear reconsume_current_input before returning to prevent
2853
3254
  // certain infinite loop states.
2854
3255
  bool should_advance = !tokenizer->_reconsume_current_input;
@@ -2866,30 +3267,33 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2866
3267
  }
2867
3268
  }
2868
3269
 
2869
- void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
3270
+ void gumbo_token_destroy(GumboToken* token) {
2870
3271
  if (!token) return;
2871
3272
 
2872
3273
  switch (token->type) {
2873
3274
  case GUMBO_TOKEN_DOCTYPE:
2874
- gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name);
2875
- gumbo_parser_deallocate(
2876
- parser, (void*) token->v.doc_type.public_identifier);
2877
- gumbo_parser_deallocate(
2878
- parser, (void*) token->v.doc_type.system_identifier);
3275
+ gumbo_free((void*) token->v.doc_type.name);
3276
+ gumbo_free((void*) token->v.doc_type.public_identifier);
3277
+ gumbo_free((void*) token->v.doc_type.system_identifier);
2879
3278
  return;
2880
3279
  case GUMBO_TOKEN_START_TAG:
2881
3280
  for (unsigned int i = 0; i < token->v.start_tag.attributes.length; ++i) {
2882
3281
  GumboAttribute* attr = token->v.start_tag.attributes.data[i];
2883
3282
  if (attr) {
2884
3283
  // May have been nulled out if this token was merged with another.
2885
- gumbo_destroy_attribute(parser, attr);
3284
+ gumbo_destroy_attribute(attr);
2886
3285
  }
2887
3286
  }
2888
- gumbo_parser_deallocate(
2889
- parser, (void*) token->v.start_tag.attributes.data);
3287
+ gumbo_free((void*) token->v.start_tag.attributes.data);
3288
+ if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN)
3289
+ gumbo_free(token->v.start_tag.name);
2890
3290
  return;
3291
+ case GUMBO_TOKEN_END_TAG:
3292
+ if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN)
3293
+ gumbo_free(token->v.end_tag.name);
3294
+ break;
2891
3295
  case GUMBO_TOKEN_COMMENT:
2892
- gumbo_parser_deallocate(parser, (void*) token->v.text);
3296
+ gumbo_free((void*) token->v.text);
2893
3297
  return;
2894
3298
  default:
2895
3299
  return;