nokogumbo 1.5.0 → 2.0.0.pre.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +56 -0
  3. data/README.md +146 -22
  4. data/ext/nokogumbo/extconf.rb +116 -0
  5. data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
  6. data/gumbo-parser/src/ascii.c +33 -0
  7. data/gumbo-parser/src/ascii.h +31 -0
  8. data/gumbo-parser/src/attribute.c +26 -28
  9. data/gumbo-parser/src/attribute.h +3 -23
  10. data/gumbo-parser/src/char_ref.c +135 -2351
  11. data/gumbo-parser/src/char_ref.h +13 -29
  12. data/gumbo-parser/src/error.c +215 -133
  13. data/gumbo-parser/src/error.h +34 -49
  14. data/gumbo-parser/src/foreign_attrs.c +104 -0
  15. data/gumbo-parser/src/gumbo.h +506 -304
  16. data/gumbo-parser/src/insertion_mode.h +4 -28
  17. data/gumbo-parser/src/macros.h +91 -0
  18. data/gumbo-parser/src/parser.c +1989 -1431
  19. data/gumbo-parser/src/parser.h +6 -22
  20. data/gumbo-parser/src/replacement.h +33 -0
  21. data/gumbo-parser/src/string_buffer.c +43 -50
  22. data/gumbo-parser/src/string_buffer.h +24 -40
  23. data/gumbo-parser/src/string_piece.c +39 -39
  24. data/gumbo-parser/src/svg_attrs.c +174 -0
  25. data/gumbo-parser/src/svg_tags.c +137 -0
  26. data/gumbo-parser/src/tag.c +186 -59
  27. data/gumbo-parser/src/tag_lookup.c +382 -0
  28. data/gumbo-parser/src/tag_lookup.h +13 -0
  29. data/gumbo-parser/src/token_type.h +1 -25
  30. data/gumbo-parser/src/tokenizer.c +899 -495
  31. data/gumbo-parser/src/tokenizer.h +37 -37
  32. data/gumbo-parser/src/tokenizer_states.h +6 -22
  33. data/gumbo-parser/src/utf8.c +103 -86
  34. data/gumbo-parser/src/utf8.h +37 -41
  35. data/gumbo-parser/src/util.c +48 -38
  36. data/gumbo-parser/src/util.h +10 -40
  37. data/gumbo-parser/src/vector.c +45 -57
  38. data/gumbo-parser/src/vector.h +17 -39
  39. data/lib/nokogumbo.rb +10 -174
  40. data/lib/nokogumbo/html5.rb +250 -0
  41. data/lib/nokogumbo/html5/document.rb +37 -0
  42. data/lib/nokogumbo/html5/document_fragment.rb +46 -0
  43. data/lib/nokogumbo/version.rb +3 -0
  44. data/lib/nokogumbo/xml/node.rb +57 -0
  45. metadata +32 -19
  46. data/ext/nokogumboc/extconf.rb +0 -60
  47. data/gumbo-parser/src/char_ref.rl +0 -2554
  48. data/gumbo-parser/src/string_piece.h +0 -38
  49. data/gumbo-parser/src/tag.in +0 -150
  50. data/gumbo-parser/src/tag_enum.h +0 -153
  51. data/gumbo-parser/src/tag_gperf.h +0 -105
  52. data/gumbo-parser/src/tag_sizes.h +0 -4
  53. data/gumbo-parser/src/tag_strings.h +0 -153
  54. data/gumbo-parser/visualc/include/strings.h +0 -4
  55. data/test-nokogumbo.rb +0 -190
@@ -0,0 +1,13 @@
1
+ #ifndef GUMBO_TAG_LOOKUP_H_
2
+ #define GUMBO_TAG_LOOKUP_H_
3
+
4
+ #include "gumbo.h"
5
+
6
+ typedef struct {
7
+ const char *key;
8
+ const GumboTag tag;
9
+ } TagHashSlot;
10
+
11
+ const TagHashSlot *gumbo_tag_lookup(const char *str, size_t len);
12
+
13
+ #endif // GUMBO_TAG_LOOKUP_H_
@@ -1,26 +1,6 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
1
  #ifndef GUMBO_TOKEN_TYPE_H_
18
2
  #define GUMBO_TOKEN_TYPE_H_
19
3
 
20
- #ifdef __cplusplus
21
- extern "C" {
22
- #endif
23
-
24
4
  // An enum representing the type of token.
25
5
  typedef enum {
26
6
  GUMBO_TOKEN_DOCTYPE,
@@ -34,8 +14,4 @@ typedef enum {
34
14
  GUMBO_TOKEN_EOF
35
15
  } GumboTokenType;
36
16
 
37
- #ifdef __cplusplus
38
- } // extern C
39
- #endif
40
-
41
- #endif // GUMBO_TOKEN_TYPE_H_
17
+ #endif // GUMBO_TOKEN_TYPE_H_
@@ -1,69 +1,68 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Coding conventions specific to this file:
18
- //
19
- // 1. Functions that fill in a token should be named emit_*, and should be
20
- // followed immediately by a return from the tokenizer (true if no error
21
- // occurred, false if an error occurred). Sometimes the emit functions
22
- // themselves return a boolean so that they can be combined with the return
23
- // statement; in this case, they should match this convention.
24
- // 2. Functions that shuffle data from temporaries to final API structures
25
- // should be named finish_*, and be called just before the tokenizer exits the
26
- // state that accumulates the temporary.
27
- // 3. All internal data structures should be kept in an initialized state from
28
- // tokenizer creation onwards, ready to accept input. When a buffer's flushed
29
- // and reset, it should be deallocated and immediately reinitialized.
30
- // 4. Make sure there are appropriate break statements following each state.
31
- // 5. Assertions on the state of the temporary and tag buffers are usually a
32
- // good idea, and should go at the entry point of each state when added.
33
- // 6. Statement order within states goes:
34
- // 1. Add parse errors, if appropriate.
35
- // 2. Call finish_* functions to build up tag state.
36
- // 2. Switch to new state. Set _reconsume flag if appropriate.
37
- // 3. Perform any other temporary buffer manipulation.
38
- // 4. Emit tokens
39
- // 5. Return/break.
40
- // This order ensures that we can verify that every emit is followed by a
41
- // return, ensures that the correct state is recorded with any parse errors, and
42
- // prevents parse error position from being messed up by possible mark/resets in
43
- // temporary buffer manipulation.
44
-
45
- #include "tokenizer.h"
1
+ /*
2
+ Copyright 2010 Google Inc.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ */
16
+
17
+ /*
18
+ Coding conventions specific to this file:
19
+
20
+ 1. Functions that fill in a token should be named emit_*, and should be
21
+ followed immediately by a return from the tokenizer (true if no error
22
+ occurred, false if an error occurred). Sometimes the emit functions
23
+ themselves return a boolean so that they can be combined with the return
24
+ statement; in this case, they should match this convention.
25
+ 2. Functions that shuffle data from temporaries to final API structures
26
+ should be named finish_*, and be called just before the tokenizer exits the
27
+ state that accumulates the temporary.
28
+ 3. All internal data structures should be kept in an initialized state from
29
+ tokenizer creation onwards, ready to accept input. When a buffer's flushed
30
+ and reset, it should be deallocated and immediately reinitialized.
31
+ 4. Make sure there are appropriate break statements following each state.
32
+ 5. Assertions on the state of the temporary and tag buffers are usually a
33
+ good idea, and should go at the entry point of each state when added.
34
+ 6. Statement order within states goes:
35
+ 1. Add parse errors, if appropriate.
36
+ 2. Call finish_* functions to build up tag state.
37
+ 3. Switch to new state. Set _reconsume flag if appropriate.
38
+ 4. Perform any other temporary buffer manipulation.
39
+ 5. Emit tokens
40
+ 6. Return/break.
41
+ This order ensures that we can verify that every emit is followed by
42
+ a return, ensures that the correct state is recorded with any parse
43
+ errors, and prevents parse error position from being messed up by
44
+ possible mark/resets in temporary buffer manipulation.
45
+ */
46
46
 
47
47
  #include <assert.h>
48
- #include <stdbool.h>
49
48
  #include <string.h>
50
-
49
+ #include "tokenizer.h"
50
+ #include "ascii.h"
51
51
  #include "attribute.h"
52
52
  #include "char_ref.h"
53
53
  #include "error.h"
54
54
  #include "gumbo.h"
55
55
  #include "parser.h"
56
56
  #include "string_buffer.h"
57
- #include "string_piece.h"
58
57
  #include "token_type.h"
59
58
  #include "tokenizer_states.h"
60
59
  #include "utf8.h"
61
60
  #include "util.h"
62
61
  #include "vector.h"
63
62
 
64
- // Compared against _script_data_buffer to determine if we're in double-escaped
65
- // script mode.
66
- const GumboStringPiece kScriptTag = {"script", 6};
63
+ // Compared against _script_data_buffer to determine if we're in
64
+ // double-escaped script mode.
65
+ static const GumboStringPiece kScriptTag = {.data = "script", .length = 6};
67
66
 
68
67
  // An enum for the return value of each individual state.
69
68
  typedef enum {
@@ -86,31 +85,35 @@ typedef struct GumboInternalTagState {
86
85
  // the buffer can be re-used for building up attributes.
87
86
  GumboTag _tag;
88
87
 
88
+ // The current tag name. It's set at the same time that _tag is set if _tag
89
+ // is set to GUMBO_TAG_UNKNOWN.
90
+ char *_name;
91
+
89
92
  // The starting location of the text in the buffer.
90
93
  GumboSourcePosition _start_pos;
91
94
 
92
- // The current list of attributes. This is copied (and ownership of its data
93
- // transferred) to the GumboStartTag token upon completion of the tag. New
95
+ // The current list of attributes. This is copied (and ownership of its data
96
+ // transferred) to the GumboStartTag token upon completion of the tag. New
94
97
  // attributes are added as soon as their attribute name state is complete, and
95
98
  // values are filled in by operating on _attributes.data[attributes.length-1].
96
99
  GumboVector /* GumboAttribute */ _attributes;
97
100
 
98
- // If true, the next attribute value to be finished should be dropped. This
101
+ // If true, the next attribute value to be finished should be dropped. This
99
102
  // happens if a duplicate attribute name is encountered - we want to consume
100
103
  // the attribute value, but shouldn't overwrite the existing value.
101
104
  bool _drop_next_attr_value;
102
105
 
103
106
  // The state that caused the tokenizer to switch into a character reference in
104
- // attribute value state. This is used to set the additional allowed
105
- // character, and is switched back to on completion. Initialized as the
107
+ // attribute value state. This is used to set the additional allowed
108
+ // character, and is switched back to on completion. Initialized as the
106
109
  // tokenizer enters the character reference state.
107
110
  GumboTokenizerEnum _attr_value_state;
108
111
 
109
- // The last start tag to have been emitted by the tokenizer. This is
112
+ // The last start tag to have been emitted by the tokenizer. This is
110
113
  // necessary to check for appropriate end tags.
111
114
  GumboTag _last_start_tag;
112
115
 
113
- // If true, then this is a start tag. If false, it's an end tag. This is
116
+ // If true, then this is a start tag. If false, it's an end tag. This is
114
117
  // necessary to generate the appropriate token type at tag-closing time.
115
118
  bool _is_start_tag;
116
119
 
@@ -121,43 +124,43 @@ typedef struct GumboInternalTagState {
121
124
  // This is the main tokenizer state struct, containing all state used in
122
125
  // tokenizing the input stream.
123
126
  typedef struct GumboInternalTokenizerState {
124
- // The current lexer state. Starts in GUMBO_LEX_DATA.
127
+ // The current lexer state. Starts in GUMBO_LEX_DATA.
125
128
  GumboTokenizerEnum _state;
126
129
 
127
130
  // A flag indicating whether the current input character needs to be reconsumed
128
131
  // in another state, or whether the next input character should be read for
129
- // the next iteration of the state loop. This is set when the spec reads
132
+ // the next iteration of the state loop. This is set when the spec reads
130
133
  // "Reconsume the current input character in..."
131
134
  bool _reconsume_current_input;
132
135
 
133
- // A flag indicating whether the current node is a foreign element. This is
136
+ // A flag indicating whether the current node is a foreign element. This is
134
137
  // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
135
138
  // markup declaration state.
136
139
  bool _is_current_node_foreign;
137
140
 
138
- // A flag indicating whether the tokenizer is in a CDATA section. If so, then
141
+ // A flag indicating whether the tokenizer is in a CDATA section. If so, then
139
142
  // text tokens emitted will be GUMBO_TOKEN_CDATA.
140
143
  bool _is_in_cdata;
141
144
 
142
145
  // Certain states (notably character references) may emit two character tokens
143
- // at once, but the contract for lex() fills in only one token at a time. The
146
+ // at once, but the contract for lex() fills in only one token at a time. The
144
147
  // extra character is buffered here, and then this is checked on entry to
145
- // lex(). If a character is stored here, it's immediately emitted and control
146
- // returns from the lexer. kGumboNoChar is used to represent 'no character
148
+ // lex(). If a character is stored here, it's immediately emitted and control
149
+ // returns from the lexer. kGumboNoChar is used to represent 'no character
147
150
  // stored.'
148
151
  //
149
152
  // Note that characters emitted through this mechanism will have their source
150
153
  // position marked as the character under the mark, i.e. multiple characters
151
- // may be emitted with the same position. This is desirable for character
152
- // references, but unsuitable for many other cases. Use the _temporary_buffer
154
+ // may be emitted with the same position. This is desirable for character
155
+ // references, but unsuitable for many other cases. Use the _temporary_buffer
153
156
  // mechanism if the buffered characters must have their original positions in
154
157
  // the document.
155
158
  int _buffered_emit_char;
156
159
 
157
160
  // A temporary buffer to accumulate characters, as described by the "temporary
158
- // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
161
+ // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
159
162
  // way: we record the specific character to go into the buffer, which may
160
- // sometimes be a lowercased version of the actual input character. However,
163
+ // sometimes be a lowercased version of the actual input character. However,
161
164
  // we *also* use utf8iterator_mark() to record the position at tag start.
162
165
  // When we start flushing the temporary buffer, we set _temporary_buffer_emit
163
166
  // to the start of it, and then increment it for each call to the tokenizer.
@@ -167,13 +170,13 @@ typedef struct GumboInternalTokenizerState {
167
170
  GumboStringBuffer _temporary_buffer;
168
171
 
169
172
  // The current cursor position we're emitting from within
170
- // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
173
+ // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
171
174
  const char* _temporary_buffer_emit;
172
175
 
173
176
  // The temporary buffer is also used by the spec to check whether we should
174
177
  // enter the script data double escaped state, but we can't use the same
175
178
  // buffer for both because we have to flush out "<s" as emits while still
176
- // maintaining the context that will eventually become "script". This is a
179
+ // maintaining the context that will eventually become "script". This is a
177
180
  // separate buffer that's used in place of the temporary buffer for states
178
181
  // that may enter the script data double escape start state.
179
182
  GumboStringBuffer _script_data_buffer;
@@ -189,7 +192,7 @@ typedef struct GumboInternalTokenizerState {
189
192
  // Current tag state.
190
193
  GumboTagState _tag_state;
191
194
 
192
- // Doctype state. We use the temporary buffer to accumulate characters (it's
195
+ // Doctype state. We use the temporary buffer to accumulate characters (it's
193
196
  // not used for anything else in the doctype states), and then freshly
194
197
  // allocate the strings in the doctype token, then copy it over on emit.
195
198
  GumboTokenDocType _doc_type_state;
@@ -199,8 +202,10 @@ typedef struct GumboInternalTokenizerState {
199
202
  } GumboTokenizerState;
200
203
 
201
204
  // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
202
- static void tokenizer_add_parse_error(
203
- GumboParser* parser, GumboErrorType type) {
205
+ static void tokenizer_add_parse_error (
206
+ GumboParser* parser,
207
+ GumboErrorType type
208
+ ) {
204
209
  GumboError* error = gumbo_add_error(parser);
205
210
  if (!error) {
206
211
  return;
@@ -309,14 +314,14 @@ static void tokenizer_add_parse_error(
309
314
  }
310
315
 
311
316
  static bool is_alpha(int c) {
312
- // We don't use ISO C isupper/islower functions here because they
313
- // depend upon the program's locale, while the behavior of the HTML5 spec is
314
- // independent of which locale the program is run in.
315
- return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
317
+ // We don't use the ISO C isalpha() function here because it depends
318
+ // on the current locale, whereas the behavior in the HTML5 spec is
319
+ // locale-independent.
320
+ return ((unsigned) c | 32) - 'a' < 26;
316
321
  }
317
322
 
318
323
  static int ensure_lowercase(int c) {
319
- return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
324
+ return gumbo_ascii_tolower(c);
320
325
  }
321
326
 
322
327
  static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
@@ -346,7 +351,7 @@ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
346
351
  // text that will eventually be emitted, it needs to be called a couple of
347
352
  // states before the spec says "Set the temporary buffer to the empty string".
348
353
  // In general, this should be called whenever there's a transition to a
349
- // "less-than sign state". The initial < and possibly / then need to be
354
+ // "less-than sign state". The initial < and possibly / then need to be
350
355
  // appended to the temporary buffer, their presence needs to be accounted for in
351
356
  // states that compare the temporary buffer against a literal value, and
352
357
  // spec stanzas that say "emit a < and / character token along with a character
@@ -356,30 +361,40 @@ static void clear_temporary_buffer(GumboParser* parser) {
356
361
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
357
362
  assert(!tokenizer->_temporary_buffer_emit);
358
363
  utf8iterator_mark(&tokenizer->_input);
359
- gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
364
+ gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
360
365
  // The temporary buffer and script data buffer are the same object in the
361
366
  // spec, so the script data buffer should be cleared as well.
362
- gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
367
+ gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
363
368
  }
364
369
 
365
370
  // Appends a codepoint to the temporary buffer.
366
- static void append_char_to_temporary_buffer(
367
- GumboParser* parser, int codepoint) {
368
- gumbo_string_buffer_append_codepoint(
369
- parser, codepoint, &parser->_tokenizer_state->_temporary_buffer);
371
+ static void append_char_to_temporary_buffer (
372
+ GumboParser* parser,
373
+ int codepoint
374
+ ) {
375
+ gumbo_string_buffer_append_codepoint (
376
+ codepoint,
377
+ &parser->_tokenizer_state->_temporary_buffer
378
+ );
370
379
  }
371
380
 
372
- // Checks to see if the temporary buffer equals a certain string.
373
- // Make sure this remains side-effect free; it's used in assertions.
374
381
  #ifndef NDEBUG
375
- static bool temporary_buffer_equals(GumboParser* parser, const char* text) {
376
- GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
377
- // TODO(jdtang): See if the extra strlen is a performance problem, and replace
378
- // it with an explicit sizeof(literal) if necessary. I don't think it will
379
- // be, as this is only used in a couple of rare states.
380
- int text_len = strlen(text);
381
- return text_len == buffer->length &&
382
- memcmp(buffer->data, text, text_len) == 0;
382
+ static bool temporary_buffer_equals__ (
383
+ const GumboParser* parser,
384
+ const char* text,
385
+ size_t text_len
386
+ ) {
387
+ const GumboStringBuffer* buf = &parser->_tokenizer_state->_temporary_buffer;
388
+ return
389
+ text_len == buf->length
390
+ && memcmp(buf->data, text, text_len) == 0;
391
+ }
392
+
393
+ #define temporary_buffer_equals(parser, text) \
394
+ temporary_buffer_equals__(parser, "" text, sizeof(text) - 1)
395
+
396
+ static bool temporary_buffer_is_empty(const GumboParser* parser) {
397
+ return parser->_tokenizer_state->_temporary_buffer.length == 0;
383
398
  }
384
399
  #endif
385
400
 
@@ -387,9 +402,9 @@ static void doc_type_state_init(GumboParser* parser) {
387
402
  GumboTokenDocType* doc_type_state =
388
403
  &parser->_tokenizer_state->_doc_type_state;
389
404
  // We initialize these to NULL here so that we don't end up leaking memory if
390
- // we never see a doctype token. When we do see a doctype token, we reset
405
+ // we never see a doctype token. When we do see a doctype token, we reset
391
406
  // them to a freshly-allocated empty string so that we can present a uniform
392
- // interface to client code and not make them check for null. Ownership is
407
+ // interface to client code and not make them check for null. Ownership is
393
408
  // transferred to the doctype token when it's emitted.
394
409
  doc_type_state->name = NULL;
395
410
  doc_type_state->public_identifier = NULL;
@@ -408,7 +423,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
408
423
  }
409
424
 
410
425
  // Sets the tag buffer original text and start point to the current iterator
411
- // position. This is necessary because attribute names & values may have
426
+ // position. This is necessary because attribute names & values may have
412
427
  // whitespace preceding them, and so we can't assume that the actual token
413
428
  // starting point was the end of the last tag buffer usage.
414
429
  static void reset_tag_buffer_start_point(GumboParser* parser) {
@@ -423,15 +438,14 @@ static void reset_tag_buffer_start_point(GumboParser* parser) {
423
438
  // and clears the temporary buffer.
424
439
  static void finish_temporary_buffer(GumboParser* parser, const char** output) {
425
440
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
426
- *output =
427
- gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer);
441
+ *output = gumbo_string_buffer_to_string(&tokenizer->_temporary_buffer);
428
442
  clear_temporary_buffer(parser);
429
443
  }
430
444
 
431
445
  // Advances the iterator past the end of the token, and then fills in the
432
- // relevant position fields. It's assumed that after every emit, the tokenizer
446
+ // relevant position fields. It's assumed that after every emit, the tokenizer
433
447
  // will immediately return (letting the tree-construction stage read the filled
434
- // in Token). Thus, it's safe to advance the input stream here, since it will
448
+ // in Token). Thus, it's safe to advance the input stream here, since it will
435
449
  // bypass the advance at the bottom of the state machine loop.
436
450
  //
437
451
  // Since this advances the iterator and resets the current input, make sure to
@@ -450,7 +464,7 @@ static void finish_token(GumboParser* parser, GumboToken* token) {
450
464
  if (token->original_text.length > 0 &&
451
465
  token->original_text.data[token->original_text.length - 1] == '\r') {
452
466
  // The UTF8 iterator will ignore carriage returns in the input stream, which
453
- // means that the next token may start one past a \r character. The pointer
467
+ // means that the next token may start one past a \r character. The pointer
454
468
  // arithmetic above results in that \r being appended to the original text
455
469
  // of the preceding token, so we have to adjust its length here to chop the
456
470
  // \r off.
@@ -463,7 +477,7 @@ static void finish_token(GumboParser* parser, GumboToken* token) {
463
477
  static void finish_doctype_public_id(GumboParser* parser) {
464
478
  GumboTokenDocType* doc_type_state =
465
479
  &parser->_tokenizer_state->_doc_type_state;
466
- gumbo_parser_deallocate(parser, (void*) doc_type_state->public_identifier);
480
+ gumbo_free((void*) doc_type_state->public_identifier);
467
481
  finish_temporary_buffer(parser, &doc_type_state->public_identifier);
468
482
  doc_type_state->has_public_identifier = true;
469
483
  }
@@ -473,7 +487,7 @@ static void finish_doctype_public_id(GumboParser* parser) {
473
487
  static void finish_doctype_system_id(GumboParser* parser) {
474
488
  GumboTokenDocType* doc_type_state =
475
489
  &parser->_tokenizer_state->_doc_type_state;
476
- gumbo_parser_deallocate(parser, (void*) doc_type_state->system_identifier);
490
+ gumbo_free((void*) doc_type_state->system_identifier);
477
491
  finish_temporary_buffer(parser, &doc_type_state->system_identifier);
478
492
  doc_type_state->has_system_identifier = true;
479
493
  }
@@ -495,7 +509,7 @@ static StateResult emit_replacement_char(
495
509
  return RETURN_ERROR;
496
510
  }
497
511
 
498
- // Writes an EOF character token. Always returns RETURN_SUCCESS.
512
+ // Writes an EOF character token. Always returns RETURN_SUCCESS.
499
513
  static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
500
514
  emit_char(parser, -1, output);
501
515
  return RETURN_SUCCESS;
@@ -520,7 +534,9 @@ static void emit_doctype(GumboParser* parser, GumboToken* output) {
520
534
  // Debug-only function that explicitly sets the attribute vector data to NULL so
521
535
  // it can be asserted on tag creation, verifying that there are no memory leaks.
522
536
  static void mark_tag_state_as_empty(GumboTagState* tag_state) {
537
+ UNUSED_IF_NDEBUG(tag_state);
523
538
  #ifndef NDEBUG
539
+ tag_state->_name = NULL;
524
540
  tag_state->_attributes = kGumboEmptyVector;
525
541
  #endif
526
542
  }
@@ -532,6 +548,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
532
548
  if (tag_state->_is_start_tag) {
533
549
  output->type = GUMBO_TOKEN_START_TAG;
534
550
  output->v.start_tag.tag = tag_state->_tag;
551
+ output->v.start_tag.name = tag_state->_name;
535
552
  output->v.start_tag.attributes = tag_state->_attributes;
536
553
  output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
537
554
  tag_state->_last_start_tag = tag_state->_tag;
@@ -540,23 +557,28 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
540
557
  "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
541
558
  } else {
542
559
  output->type = GUMBO_TOKEN_END_TAG;
543
- output->v.end_tag = tag_state->_tag;
560
+ output->v.end_tag.tag = tag_state->_tag;
561
+ output->v.end_tag.name = tag_state->_name;
562
+ output->v.end_tag.is_self_closing = tag_state->_is_self_closing;
544
563
  // In end tags, ownership of the attributes vector is not transferred to the
545
564
  // token, but it's still initialized as normal, so it must be manually
546
- // deallocated. There may also be attributes to destroy, in certain broken
565
+ // deallocated. There may also be attributes to destroy, in certain broken
547
566
  // cases like </div</th> (the "th" is an attribute there).
548
567
  for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
549
- gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
568
+ gumbo_destroy_attribute(tag_state->_attributes.data[i]);
550
569
  }
551
- gumbo_parser_deallocate(parser, tag_state->_attributes.data);
570
+ gumbo_free(tag_state->_attributes.data);
552
571
  mark_tag_state_as_empty(tag_state);
553
572
  gumbo_debug(
554
573
  "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
555
574
  }
556
- gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
575
+ gumbo_string_buffer_destroy(&tag_state->_buffer);
557
576
  finish_token(parser, output);
558
- gumbo_debug("Original text = %.*s.\n", output->original_text.length,
559
- output->original_text.data);
577
+ gumbo_debug (
578
+ "Original text = %.*s.\n",
579
+ (int) output->original_text.length,
580
+ output->original_text.data
581
+ );
560
582
  assert(output->original_text.length >= 2);
561
583
  assert(output->original_text.data[0] == '<');
562
584
  assert(output->original_text.data[output->original_text.length - 1] == '>');
@@ -570,26 +592,36 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
570
592
  static void abandon_current_tag(GumboParser* parser) {
571
593
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
572
594
  for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
573
- gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
595
+ gumbo_destroy_attribute(tag_state->_attributes.data[i]);
574
596
  }
575
- gumbo_parser_deallocate(parser, tag_state->_attributes.data);
597
+ gumbo_free(tag_state->_attributes.data);
576
598
  mark_tag_state_as_empty(tag_state);
577
- gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
599
+ gumbo_string_buffer_destroy(&tag_state->_buffer);
578
600
  gumbo_debug("Abandoning current tag.\n");
579
601
  }
580
602
 
581
- // Wraps the consume_char_ref function to handle its output and make the
582
- // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
603
+ // Wraps the gumbo_consume_char_ref function to handle its output and make the
604
+ // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
583
605
  // error occurred, RETURN_SUCCESS otherwise.
584
- static StateResult emit_char_ref(GumboParser* parser,
585
- int additional_allowed_char, bool is_in_attribute, GumboToken* output) {
606
+ static StateResult emit_char_ref (
607
+ GumboParser* parser,
608
+ int additional_allowed_char,
609
+ bool UNUSED_ARG(is_in_attribute),
610
+ GumboToken* output
611
+ ) {
586
612
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
587
613
  OneOrTwoCodepoints char_ref;
588
- bool status = consume_char_ref(
589
- parser, &tokenizer->_input, additional_allowed_char, false, &char_ref);
614
+ bool status = gumbo_consume_char_ref (
615
+ parser,
616
+ &tokenizer->_input,
617
+ additional_allowed_char,
618
+ false,
619
+ &char_ref
620
+ );
590
621
  if (char_ref.first != kGumboNoChar) {
591
- // consume_char_ref ends with the iterator pointing at the next character,
592
- // so we need to be sure not advance it again before reading the next token.
622
+ // gumbo_consume_char_ref ends with the iterator pointing at the next
623
+ // character, so we need to be sure not advance it again before
624
+ // reading the next token.
593
625
  tokenizer->_reconsume_current_input = true;
594
626
  emit_char(parser, char_ref.first, output);
595
627
  tokenizer->_buffered_emit_char = char_ref.second;
@@ -599,9 +631,9 @@ static StateResult emit_char_ref(GumboParser* parser,
599
631
  return status ? RETURN_SUCCESS : RETURN_ERROR;
600
632
  }
601
633
 
602
- // Emits a comment token. Comments use the temporary buffer to accumulate their
634
+ // Emits a comment token. Comments use the temporary buffer to accumulate their
603
635
  // data, and then it's copied over and released to the 'text' field of the
604
- // GumboToken union. Always returns RETURN_SUCCESS.
636
+ // GumboToken union. Always returns RETURN_SUCCESS.
605
637
  static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
606
638
  output->type = GUMBO_TOKEN_COMMENT;
607
639
  finish_temporary_buffer(parser, &output->v.text);
@@ -626,11 +658,11 @@ static bool maybe_emit_from_temporary_buffer(
626
658
  }
627
659
 
628
660
  assert(*c == utf8iterator_current(&tokenizer->_input));
629
- // emit_char also advances the input stream. We need to do some juggling of
661
+ // emit_char also advances the input stream. We need to do some juggling of
630
662
  // the _reconsume_current_input flag to get the proper behavior when emitting
631
- // previous tokens. Basically, _reconsume_current_input should *never* be set
663
+ // previous tokens. Basically, _reconsume_current_input should *never* be set
632
664
  // when emitting anything from the temporary buffer, since those characters
633
- // have already been advanced past. However, it should be preserved so that
665
+ // have already been advanced past. However, it should be preserved so that
634
666
  // when the *next* character is encountered again, the tokenizer knows not to
635
667
  // advance past it.
636
668
  bool saved_reconsume_state = tokenizer->_reconsume_current_input;
@@ -644,7 +676,7 @@ static bool maybe_emit_from_temporary_buffer(
644
676
  // Sets up the tokenizer to begin flushing the temporary buffer.
645
677
  // This resets the input iterator stream to the start of the last tag, sets up
646
678
  // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
647
- // the first character in it. It returns true if a character was emitted, false
679
+ // the first character in it. It returns true if a character was emitted, false
648
680
  // otherwise.
649
681
  static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
650
682
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -654,32 +686,35 @@ static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
654
686
  return maybe_emit_from_temporary_buffer(parser, output);
655
687
  }
656
688
 
657
- // Appends a codepoint to the current tag buffer. If
689
+ // Appends a codepoint to the current tag buffer. If
658
690
  // reinitilize_position_on_first is set, this also initializes the tag buffer
659
691
  // start point; the only time you would *not* want to pass true for this
660
692
  // parameter is if you want the original_text to include a character (like an
661
693
  // opening quote) that doesn't appear in the value.
662
- static void append_char_to_tag_buffer(
663
- GumboParser* parser, int codepoint, bool reinitilize_position_on_first) {
694
+ static void append_char_to_tag_buffer (
695
+ GumboParser* parser,
696
+ int codepoint,
697
+ bool reinitilize_position_on_first
698
+ ) {
664
699
  GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
665
700
  if (buffer->length == 0 && reinitilize_position_on_first) {
666
701
  reset_tag_buffer_start_point(parser);
667
702
  }
668
- gumbo_string_buffer_append_codepoint(parser, codepoint, buffer);
703
+ gumbo_string_buffer_append_codepoint(codepoint, buffer);
669
704
  }
670
705
 
671
- // (Re-)initialize the tag buffer. This also resets the original_text pointer
706
+ // (Re-)initialize the tag buffer. This also resets the original_text pointer
672
707
  // and _start_pos field to point to the current position.
673
708
  static void initialize_tag_buffer(GumboParser* parser) {
674
709
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
675
710
  GumboTagState* tag_state = &tokenizer->_tag_state;
676
711
 
677
- gumbo_string_buffer_init(parser, &tag_state->_buffer);
712
+ gumbo_string_buffer_init(&tag_state->_buffer);
678
713
  reset_tag_buffer_start_point(parser);
679
714
  }
680
715
 
681
716
  // Initializes the tag_state to start a new tag, keeping track of the opening
682
- // positions and original text. Takes a boolean indicating whether this is a
717
+ // positions and original text. Takes a boolean indicating whether this is a
683
718
  // start or end tag.
684
719
  static void start_new_tag(GumboParser* parser, bool is_start_tag) {
685
720
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -690,14 +725,15 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
690
725
  assert(is_alpha(c));
691
726
 
692
727
  initialize_tag_buffer(parser);
693
- gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
728
+ gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer);
694
729
 
730
+ assert(tag_state->_name == NULL);
695
731
  assert(tag_state->_attributes.data == NULL);
696
732
  // Initial size chosen by statistical analysis of a corpus of 60k webpages.
697
- // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
733
+ // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
698
734
  // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
699
735
  // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
700
- gumbo_vector_init(parser, 1, &tag_state->_attributes);
736
+ gumbo_vector_init(1, &tag_state->_attributes);
701
737
  tag_state->_drop_next_attr_value = false;
702
738
  tag_state->_is_start_tag = is_start_tag;
703
739
  tag_state->_is_self_closing = false;
@@ -708,7 +744,7 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
708
744
  static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
709
745
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
710
746
  GumboTagState* tag_state = &tokenizer->_tag_state;
711
- *output = gumbo_string_buffer_to_string(parser, &tag_state->_buffer);
747
+ *output = gumbo_string_buffer_to_string(&tag_state->_buffer);
712
748
  }
713
749
 
714
750
  // Fills in:
@@ -717,9 +753,12 @@ static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
717
753
  // * The start_pos GumboSourcePosition with the start position of the tag
718
754
  // buffer.
719
755
  // * The end_pos GumboSourcePosition with the current source position.
720
- static void copy_over_original_tag_text(GumboParser* parser,
721
- GumboStringPiece* original_text, GumboSourcePosition* start_pos,
722
- GumboSourcePosition* end_pos) {
756
+ static void copy_over_original_tag_text (
757
+ GumboParser* parser,
758
+ GumboStringPiece* original_text,
759
+ GumboSourcePosition* start_pos,
760
+ GumboSourcePosition* end_pos
761
+ ) {
723
762
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
724
763
  GumboTagState* tag_state = &tokenizer->_tag_state;
725
764
 
@@ -729,7 +768,7 @@ static void copy_over_original_tag_text(GumboParser* parser,
729
768
  if (original_text->data[original_text->length - 1] == '\r') {
730
769
  // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
731
770
  // appended to the end of original text even when it's really the first part
732
- // of the next character. If we detect this situation, shrink the length of
771
+ // of the next character. If we detect this situation, shrink the length of
733
772
  // the original text by 1 to remove the carriage return.
734
773
  --original_text->length;
735
774
  }
@@ -739,8 +778,7 @@ static void copy_over_original_tag_text(GumboParser* parser,
739
778
 
740
779
  // Releases and then re-initializes the tag buffer.
741
780
  static void reinitialize_tag_buffer(GumboParser* parser) {
742
- gumbo_parser_deallocate(
743
- parser, parser->_tokenizer_state->_tag_state._buffer.data);
781
+ gumbo_free(parser->_tokenizer_state->_tag_state._buffer.data);
744
782
  initialize_tag_buffer(parser);
745
783
  }
746
784
 
@@ -750,14 +788,24 @@ static void finish_tag_name(GumboParser* parser) {
750
788
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
751
789
  GumboTagState* tag_state = &tokenizer->_tag_state;
752
790
 
753
- tag_state->_tag =
754
- gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
791
+ const char *data = tag_state->_buffer.data;
792
+ size_t length = tag_state->_buffer.length;
793
+ tag_state->_tag = gumbo_tagn_enum(data, length);
794
+ if (tag_state->_tag == GUMBO_TAG_UNKNOWN) {
795
+ char *name = gumbo_alloc(length + 1);
796
+ memcpy(name, data, length);
797
+ name[length] = 0;
798
+ tag_state->_name = name;
799
+ }
755
800
  reinitialize_tag_buffer(parser);
756
801
  }
757
802
 
758
803
  // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
759
- static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
760
- int original_index, int new_index) {
804
+ static void add_duplicate_attr_error (
805
+ GumboParser* parser,
806
+ int original_index,
807
+ int new_index
808
+ ) {
761
809
  GumboError* error = gumbo_add_error(parser);
762
810
  if (!error) {
763
811
  return;
@@ -773,11 +821,11 @@ static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
773
821
  }
774
822
 
775
823
  // Creates a new attribute in the current tag, copying the current tag buffer to
776
- // the attribute's name. The attribute's value starts out as the empty string
824
+ // the attribute's name. The attribute's value starts out as the empty string
777
825
  // (following the "Boolean attributes" section of the spec) and is only
778
- // overwritten on finish_attribute_value(). If the attribute has already been
826
+ // overwritten on finish_attribute_value(). If the attribute has already been
779
827
  // specified, the new attribute is dropped, a parse error is added, and the
780
- // function returns false. Otherwise, this returns true.
828
+ // function returns false. Otherwise, this returns true.
781
829
  static bool finish_attribute_name(GumboParser* parser) {
782
830
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
783
831
  GumboTagState* tag_state = &tokenizer->_tag_state;
@@ -789,30 +837,43 @@ static bool finish_attribute_name(GumboParser* parser) {
789
837
  GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
790
838
  for (unsigned int i = 0; i < attributes->length; ++i) {
791
839
  GumboAttribute* attr = attributes->data[i];
792
- if (strlen(attr->name) == tag_state->_buffer.length &&
793
- memcmp(attr->name, tag_state->_buffer.data,
794
- tag_state->_buffer.length) == 0) {
840
+ if (
841
+ strlen(attr->name) == tag_state->_buffer.length
842
+ && 0 == memcmp (
843
+ attr->name,
844
+ tag_state->_buffer.data,
845
+ tag_state->_buffer.length
846
+ )
847
+ ) {
795
848
  // Identical attribute; bail.
796
- add_duplicate_attr_error(parser, attr->name, i, attributes->length);
849
+ add_duplicate_attr_error(parser, i, attributes->length);
797
850
  tag_state->_drop_next_attr_value = true;
798
851
  return false;
799
852
  }
800
853
  }
801
854
 
802
- GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
855
+ GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute));
803
856
  attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
804
857
  copy_over_tag_buffer(parser, &attr->name);
805
- copy_over_original_tag_text(
806
- parser, &attr->original_name, &attr->name_start, &attr->name_end);
807
- attr->value = gumbo_copy_stringz(parser, "");
808
- copy_over_original_tag_text(
809
- parser, &attr->original_value, &attr->name_start, &attr->name_end);
810
- gumbo_vector_add(parser, attr, attributes);
858
+ copy_over_original_tag_text (
859
+ parser,
860
+ &attr->original_name,
861
+ &attr->name_start,
862
+ &attr->name_end
863
+ );
864
+ attr->value = gumbo_strdup("");
865
+ copy_over_original_tag_text (
866
+ parser,
867
+ &attr->original_value,
868
+ &attr->name_start,
869
+ &attr->name_end
870
+ );
871
+ gumbo_vector_add(attr, attributes);
811
872
  reinitialize_tag_buffer(parser);
812
873
  return true;
813
874
  }
814
875
 
815
- // Finishes an attribute value. This sets the value of the most recently added
876
+ // Finishes an attribute value. This sets the value of the most recently added
816
877
  // attribute to the current contents of the tag buffer.
817
878
  static void finish_attribute_value(GumboParser* parser) {
818
879
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
@@ -826,7 +887,7 @@ static void finish_attribute_value(GumboParser* parser) {
826
887
 
827
888
  GumboAttribute* attr =
828
889
  tag_state->_attributes.data[tag_state->_attributes.length - 1];
829
- gumbo_parser_deallocate(parser, (void*) attr->value);
890
+ gumbo_free((void*) attr->value);
830
891
  copy_over_tag_buffer(parser, &attr->value);
831
892
  copy_over_original_tag_text(
832
893
  parser, &attr->original_value, &attr->value_start, &attr->value_end);
@@ -842,24 +903,27 @@ static bool is_appropriate_end_tag(GumboParser* parser) {
842
903
  tag_state->_buffer.length);
843
904
  }
844
905
 
845
- void gumbo_tokenizer_state_init(
846
- GumboParser* parser, const char* text, size_t text_length) {
847
- GumboTokenizerState* tokenizer =
848
- gumbo_parser_allocate(parser, sizeof(GumboTokenizerState));
906
+ void gumbo_tokenizer_state_init (
907
+ GumboParser* parser,
908
+ const char* text,
909
+ size_t text_length
910
+ ) {
911
+ GumboTokenizerState* tokenizer = gumbo_alloc(sizeof(GumboTokenizerState));
849
912
  parser->_tokenizer_state = tokenizer;
850
913
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
851
914
  tokenizer->_reconsume_current_input = false;
852
915
  tokenizer->_is_current_node_foreign = false;
853
916
  tokenizer->_is_in_cdata = false;
854
917
  tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
918
+ tokenizer->_tag_state._name = NULL;
855
919
 
856
920
  tokenizer->_buffered_emit_char = kGumboNoChar;
857
- gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
921
+ gumbo_string_buffer_init(&tokenizer->_temporary_buffer);
858
922
  tokenizer->_temporary_buffer_emit = NULL;
859
923
 
860
924
  mark_tag_state_as_empty(&tokenizer->_tag_state);
861
925
 
862
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
926
+ gumbo_string_buffer_init(&tokenizer->_script_data_buffer);
863
927
  tokenizer->_token_start = text;
864
928
  utf8iterator_init(parser, text, text_length, &tokenizer->_input);
865
929
  utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
@@ -871,27 +935,37 @@ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
871
935
  assert(tokenizer->_doc_type_state.name == NULL);
872
936
  assert(tokenizer->_doc_type_state.public_identifier == NULL);
873
937
  assert(tokenizer->_doc_type_state.system_identifier == NULL);
874
- gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
875
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
876
- gumbo_parser_deallocate(parser, tokenizer);
938
+ gumbo_string_buffer_destroy(&tokenizer->_temporary_buffer);
939
+ gumbo_string_buffer_destroy(&tokenizer->_script_data_buffer);
940
+ assert(tokenizer->_tag_state._name == NULL);
941
+ assert(tokenizer->_tag_state._attributes.data == NULL);
942
+ gumbo_free(tokenizer);
877
943
  }
878
944
 
879
945
  void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
880
946
  parser->_tokenizer_state->_state = state;
881
947
  }
882
948
 
883
- void gumbo_tokenizer_set_is_current_node_foreign(
884
- GumboParser* parser, bool is_foreign) {
949
+ void gumbo_tokenizer_set_is_current_node_foreign (
950
+ GumboParser* parser,
951
+ bool is_foreign
952
+ ) {
885
953
  if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
886
- gumbo_debug("Toggling is_current_node_foreign to %s.\n",
887
- is_foreign ? "true" : "false");
954
+ gumbo_debug (
955
+ "Toggling is_current_node_foreign to %s.\n",
956
+ is_foreign ? "true" : "false"
957
+ );
888
958
  }
889
959
  parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
890
960
  }
891
961
 
892
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
893
- static StateResult handle_data_state(GumboParser* parser,
894
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
962
+ // https://html.spec.whatwg.org/multipage/parsing.html#data-state
963
+ static StateResult handle_data_state (
964
+ GumboParser* parser,
965
+ GumboTokenizerState* tokenizer,
966
+ int c,
967
+ GumboToken* output
968
+ ) {
895
969
  switch (c) {
896
970
  case '&':
897
971
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
@@ -914,16 +988,24 @@ static StateResult handle_data_state(GumboParser* parser,
914
988
  }
915
989
  }
916
990
 
917
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
918
- static StateResult handle_char_ref_in_data_state(GumboParser* parser,
919
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
991
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-data-state
992
+ static StateResult handle_char_ref_in_data_state (
993
+ GumboParser* parser,
994
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
995
+ int UNUSED_ARG(c),
996
+ GumboToken* output
997
+ ) {
920
998
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
921
999
  return emit_char_ref(parser, ' ', false, output);
922
1000
  }
923
1001
 
924
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
925
- static StateResult handle_rcdata_state(GumboParser* parser,
926
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1002
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
1003
+ static StateResult handle_rcdata_state (
1004
+ GumboParser* parser,
1005
+ GumboTokenizerState* tokenizer,
1006
+ int c,
1007
+ GumboToken* output
1008
+ ) {
927
1009
  switch (c) {
928
1010
  case '&':
929
1011
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
@@ -943,16 +1025,24 @@ static StateResult handle_rcdata_state(GumboParser* parser,
943
1025
  }
944
1026
  }
945
1027
 
946
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
947
- static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser,
948
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1028
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-rcdata-state
1029
+ static StateResult handle_char_ref_in_rcdata_state (
1030
+ GumboParser* parser,
1031
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1032
+ int UNUSED_ARG(c),
1033
+ GumboToken* output
1034
+ ) {
949
1035
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
950
1036
  return emit_char_ref(parser, ' ', false, output);
951
1037
  }
952
1038
 
953
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
954
- static StateResult handle_rawtext_state(GumboParser* parser,
955
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1039
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
1040
+ static StateResult handle_rawtext_state (
1041
+ GumboParser* parser,
1042
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1043
+ int c,
1044
+ GumboToken* output
1045
+ ) {
956
1046
  switch (c) {
957
1047
  case '<':
958
1048
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
@@ -968,9 +1058,13 @@ static StateResult handle_rawtext_state(GumboParser* parser,
968
1058
  }
969
1059
  }
970
1060
 
971
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
972
- static StateResult handle_script_state(GumboParser* parser,
973
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1061
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
1062
+ static StateResult handle_script_state (
1063
+ GumboParser* parser,
1064
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1065
+ int c,
1066
+ GumboToken* output
1067
+ ) {
974
1068
  switch (c) {
975
1069
  case '<':
976
1070
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
@@ -986,9 +1080,13 @@ static StateResult handle_script_state(GumboParser* parser,
986
1080
  }
987
1081
  }
988
1082
 
989
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
990
- static StateResult handle_plaintext_state(GumboParser* parser,
991
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1083
+ // https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
1084
+ static StateResult handle_plaintext_state (
1085
+ GumboParser* parser,
1086
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1087
+ int c,
1088
+ GumboToken* output
1089
+ ) {
992
1090
  switch (c) {
993
1091
  case '\0':
994
1092
  return emit_replacement_char(parser, output);
@@ -999,9 +1097,13 @@ static StateResult handle_plaintext_state(GumboParser* parser,
999
1097
  }
1000
1098
  }
1001
1099
 
1002
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
1003
- static StateResult handle_tag_open_state(GumboParser* parser,
1004
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1100
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
1101
+ static StateResult handle_tag_open_state (
1102
+ GumboParser* parser,
1103
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1104
+ int c,
1105
+ GumboToken* output
1106
+ ) {
1005
1107
  assert(temporary_buffer_equals(parser, "<"));
1006
1108
  switch (c) {
1007
1109
  case '!':
@@ -1032,9 +1134,13 @@ static StateResult handle_tag_open_state(GumboParser* parser,
1032
1134
  }
1033
1135
  }
1034
1136
 
1035
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
1036
- static StateResult handle_end_tag_open_state(GumboParser* parser,
1037
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1137
+ // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
1138
+ static StateResult handle_end_tag_open_state (
1139
+ GumboParser* parser,
1140
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1141
+ int c,
1142
+ GumboToken* output
1143
+ ) {
1038
1144
  assert(temporary_buffer_equals(parser, "</"));
1039
1145
  switch (c) {
1040
1146
  case '>':
@@ -1059,9 +1165,13 @@ static StateResult handle_end_tag_open_state(GumboParser* parser,
1059
1165
  }
1060
1166
  }
1061
1167
 
1062
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
1063
- static StateResult handle_tag_name_state(GumboParser* parser,
1064
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1168
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
1169
+ static StateResult handle_tag_name_state (
1170
+ GumboParser* parser,
1171
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1172
+ int c,
1173
+ GumboToken* output
1174
+ ) {
1065
1175
  switch (c) {
1066
1176
  case '\t':
1067
1177
  case '\n':
@@ -1093,9 +1203,13 @@ static StateResult handle_tag_name_state(GumboParser* parser,
1093
1203
  }
1094
1204
  }
1095
1205
 
1096
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
1097
- static StateResult handle_rcdata_lt_state(GumboParser* parser,
1098
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1206
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
1207
+ static StateResult handle_rcdata_lt_state (
1208
+ GumboParser* parser,
1209
+ GumboTokenizerState* tokenizer,
1210
+ int c,
1211
+ GumboToken* output
1212
+ ) {
1099
1213
  assert(temporary_buffer_equals(parser, "<"));
1100
1214
  if (c == '/') {
1101
1215
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
@@ -1108,9 +1222,13 @@ static StateResult handle_rcdata_lt_state(GumboParser* parser,
1108
1222
  }
1109
1223
  }
1110
1224
 
1111
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
1112
- static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
1113
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1225
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
1226
+ static StateResult handle_rcdata_end_tag_open_state (
1227
+ GumboParser* parser,
1228
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1229
+ int c,
1230
+ GumboToken* output
1231
+ ) {
1114
1232
  assert(temporary_buffer_equals(parser, "</"));
1115
1233
  if (is_alpha(c)) {
1116
1234
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
@@ -1124,9 +1242,14 @@ static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
1124
1242
  return true;
1125
1243
  }
1126
1244
 
1127
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
1128
- static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
1129
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1245
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
1246
+ static StateResult handle_rcdata_end_tag_name_state (
1247
+ GumboParser* parser,
1248
+ GumboTokenizerState* tokenizer,
1249
+ int c,
1250
+ GumboToken* output
1251
+ ) {
1252
+ UNUSED_IF_NDEBUG(tokenizer);
1130
1253
  assert(tokenizer->_temporary_buffer.length >= 2);
1131
1254
  if (is_alpha(c)) {
1132
1255
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
@@ -1156,9 +1279,13 @@ static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
1156
1279
  return emit_temporary_buffer(parser, output);
1157
1280
  }
1158
1281
 
1159
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
1160
- static StateResult handle_rawtext_lt_state(GumboParser* parser,
1161
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1282
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
1283
+ static StateResult handle_rawtext_lt_state (
1284
+ GumboParser* parser,
1285
+ GumboTokenizerState* tokenizer,
1286
+ int c,
1287
+ GumboToken* output
1288
+ ) {
1162
1289
  assert(temporary_buffer_equals(parser, "<"));
1163
1290
  if (c == '/') {
1164
1291
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
@@ -1171,9 +1298,13 @@ static StateResult handle_rawtext_lt_state(GumboParser* parser,
1171
1298
  }
1172
1299
  }
1173
1300
 
1174
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
1175
- static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
1176
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1301
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
1302
+ static StateResult handle_rawtext_end_tag_open_state (
1303
+ GumboParser* parser,
1304
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1305
+ int c,
1306
+ GumboToken* output
1307
+ ) {
1177
1308
  assert(temporary_buffer_equals(parser, "</"));
1178
1309
  if (is_alpha(c)) {
1179
1310
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
@@ -1186,9 +1317,13 @@ static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
1186
1317
  }
1187
1318
  }
1188
1319
 
1189
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
1190
- static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
1191
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1320
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
1321
+ static StateResult handle_rawtext_end_tag_name_state (
1322
+ GumboParser* parser,
1323
+ GumboTokenizerState* tokenizer,
1324
+ int c,
1325
+ GumboToken* output
1326
+ ) {
1192
1327
  assert(tokenizer->_temporary_buffer.length >= 2);
1193
1328
  gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1194
1329
  tokenizer->_tag_state._buffer.data);
@@ -1221,9 +1356,13 @@ static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
1221
1356
  return emit_temporary_buffer(parser, output);
1222
1357
  }
1223
1358
 
1224
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
1225
- static StateResult handle_script_lt_state(GumboParser* parser,
1226
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1359
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
1360
+ static StateResult handle_script_lt_state (
1361
+ GumboParser* parser,
1362
+ GumboTokenizerState* tokenizer,
1363
+ int c,
1364
+ GumboToken* output
1365
+ ) {
1227
1366
  assert(temporary_buffer_equals(parser, "<"));
1228
1367
  if (c == '/') {
1229
1368
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
@@ -1240,9 +1379,13 @@ static StateResult handle_script_lt_state(GumboParser* parser,
1240
1379
  }
1241
1380
  }
1242
1381
 
1243
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
1244
- static StateResult handle_script_end_tag_open_state(GumboParser* parser,
1245
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1382
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
1383
+ static StateResult handle_script_end_tag_open_state (
1384
+ GumboParser* parser,
1385
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1386
+ int c,
1387
+ GumboToken* output
1388
+ ) {
1246
1389
  assert(temporary_buffer_equals(parser, "</"));
1247
1390
  if (is_alpha(c)) {
1248
1391
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
@@ -1255,9 +1398,14 @@ static StateResult handle_script_end_tag_open_state(GumboParser* parser,
1255
1398
  }
1256
1399
  }
1257
1400
 
1258
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
1259
- static StateResult handle_script_end_tag_name_state(GumboParser* parser,
1260
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1401
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
1402
+ static StateResult handle_script_end_tag_name_state (
1403
+ GumboParser* parser,
1404
+ GumboTokenizerState* tokenizer,
1405
+ int c,
1406
+ GumboToken* output
1407
+ ) {
1408
+ UNUSED_IF_NDEBUG(tokenizer);
1261
1409
  assert(tokenizer->_temporary_buffer.length >= 2);
1262
1410
  if (is_alpha(c)) {
1263
1411
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
@@ -1287,9 +1435,13 @@ static StateResult handle_script_end_tag_name_state(GumboParser* parser,
1287
1435
  return emit_temporary_buffer(parser, output);
1288
1436
  }
1289
1437
 
1290
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
1291
- static StateResult handle_script_escaped_start_state(GumboParser* parser,
1292
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1438
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
1439
+ static StateResult handle_script_escaped_start_state (
1440
+ GumboParser* parser,
1441
+ GumboTokenizerState* tokenizer,
1442
+ int c,
1443
+ GumboToken* output
1444
+ ) {
1293
1445
  if (c == '-') {
1294
1446
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1295
1447
  return emit_current_char(parser, output);
@@ -1300,9 +1452,13 @@ static StateResult handle_script_escaped_start_state(GumboParser* parser,
1300
1452
  }
1301
1453
  }
1302
1454
 
1303
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
1304
- static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
1305
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1455
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
1456
+ static StateResult handle_script_escaped_start_dash_state (
1457
+ GumboParser* parser,
1458
+ GumboTokenizerState* tokenizer,
1459
+ int c,
1460
+ GumboToken* output
1461
+ ) {
1306
1462
  if (c == '-') {
1307
1463
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1308
1464
  return emit_current_char(parser, output);
@@ -1313,9 +1469,13 @@ static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
1313
1469
  }
1314
1470
  }
1315
1471
 
1316
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
1317
- static StateResult handle_script_escaped_state(GumboParser* parser,
1318
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1472
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
1473
+ static StateResult handle_script_escaped_state (
1474
+ GumboParser* parser,
1475
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1476
+ int c,
1477
+ GumboToken* output
1478
+ ) {
1319
1479
  switch (c) {
1320
1480
  case '-':
1321
1481
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
@@ -1335,9 +1495,13 @@ static StateResult handle_script_escaped_state(GumboParser* parser,
1335
1495
  }
1336
1496
  }
1337
1497
 
1338
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
1339
- static StateResult handle_script_escaped_dash_state(GumboParser* parser,
1340
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1498
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
1499
+ static StateResult handle_script_escaped_dash_state (
1500
+ GumboParser* parser,
1501
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1502
+ int c,
1503
+ GumboToken* output
1504
+ ) {
1341
1505
  switch (c) {
1342
1506
  case '-':
1343
1507
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
@@ -1360,9 +1524,13 @@ static StateResult handle_script_escaped_dash_state(GumboParser* parser,
1360
1524
  }
1361
1525
  }
1362
1526
 
1363
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
1364
- static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
1365
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1527
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
1528
+ static StateResult handle_script_escaped_dash_dash_state (
1529
+ GumboParser* parser,
1530
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1531
+ int c,
1532
+ GumboToken* output
1533
+ ) {
1366
1534
  switch (c) {
1367
1535
  case '-':
1368
1536
  return emit_current_char(parser, output);
@@ -1387,9 +1555,13 @@ static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
1387
1555
  }
1388
1556
  }
1389
1557
 
1390
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
1391
- static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1392
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1558
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
1559
+ static StateResult handle_script_escaped_lt_state (
1560
+ GumboParser* parser,
1561
+ GumboTokenizerState* tokenizer,
1562
+ int c,
1563
+ GumboToken* output
1564
+ ) {
1393
1565
  assert(temporary_buffer_equals(parser, "<"));
1394
1566
  assert(!tokenizer->_script_data_buffer.length);
1395
1567
  if (c == '/') {
@@ -1399,8 +1571,10 @@ static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1399
1571
  } else if (is_alpha(c)) {
1400
1572
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1401
1573
  append_char_to_temporary_buffer(parser, c);
1402
- gumbo_string_buffer_append_codepoint(
1403
- parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1574
+ gumbo_string_buffer_append_codepoint (
1575
+ ensure_lowercase(c),
1576
+ &tokenizer->_script_data_buffer
1577
+ );
1404
1578
  return emit_temporary_buffer(parser, output);
1405
1579
  } else {
1406
1580
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
@@ -1408,9 +1582,13 @@ static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1408
1582
  }
1409
1583
  }
1410
1584
 
1411
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
1412
- static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
1413
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1585
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
1586
+ static StateResult handle_script_escaped_end_tag_open_state (
1587
+ GumboParser* parser,
1588
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1589
+ int c,
1590
+ GumboToken* output
1591
+ ) {
1414
1592
  assert(temporary_buffer_equals(parser, "</"));
1415
1593
  if (is_alpha(c)) {
1416
1594
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
@@ -1423,9 +1601,14 @@ static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
1423
1601
  }
1424
1602
  }
1425
1603
 
1426
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
1427
- static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
1428
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1604
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
1605
+ static StateResult handle_script_escaped_end_tag_name_state (
1606
+ GumboParser* parser,
1607
+ GumboTokenizerState* tokenizer,
1608
+ int c,
1609
+ GumboToken* output
1610
+ ) {
1611
+ UNUSED_IF_NDEBUG(tokenizer);
1429
1612
  assert(tokenizer->_temporary_buffer.length >= 2);
1430
1613
  if (is_alpha(c)) {
1431
1614
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
@@ -1455,9 +1638,13 @@ static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
1455
1638
  return emit_temporary_buffer(parser, output);
1456
1639
  }
1457
1640
 
1458
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
1459
- static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1460
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1641
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
1642
+ static StateResult handle_script_double_escaped_start_state (
1643
+ GumboParser* parser,
1644
+ GumboTokenizerState* tokenizer,
1645
+ int c,
1646
+ GumboToken* output
1647
+ ) {
1461
1648
  switch (c) {
1462
1649
  case '\t':
1463
1650
  case '\n':
@@ -1465,16 +1652,22 @@ static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1465
1652
  case ' ':
1466
1653
  case '/':
1467
1654
  case '>':
1468
- gumbo_tokenizer_set_state(
1469
- parser, gumbo_string_equals(&kScriptTag,
1470
- (GumboStringPiece*) &tokenizer->_script_data_buffer)
1471
- ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1472
- : GUMBO_LEX_SCRIPT_ESCAPED);
1655
+ gumbo_tokenizer_set_state (
1656
+ parser,
1657
+ gumbo_string_equals (
1658
+ &kScriptTag,
1659
+ (GumboStringPiece*) &tokenizer->_script_data_buffer
1660
+ )
1661
+ ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1662
+ : GUMBO_LEX_SCRIPT_ESCAPED
1663
+ );
1473
1664
  return emit_current_char(parser, output);
1474
1665
  default:
1475
1666
  if (is_alpha(c)) {
1476
- gumbo_string_buffer_append_codepoint(
1477
- parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1667
+ gumbo_string_buffer_append_codepoint (
1668
+ ensure_lowercase(c),
1669
+ &tokenizer->_script_data_buffer
1670
+ );
1478
1671
  return emit_current_char(parser, output);
1479
1672
  } else {
1480
1673
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
@@ -1484,9 +1677,13 @@ static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1484
1677
  }
1485
1678
  }
1486
1679
 
1487
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
1488
- static StateResult handle_script_double_escaped_state(GumboParser* parser,
1489
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1680
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
1681
+ static StateResult handle_script_double_escaped_state (
1682
+ GumboParser* parser,
1683
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1684
+ int c,
1685
+ GumboToken* output
1686
+ ) {
1490
1687
  switch (c) {
1491
1688
  case '-':
1492
1689
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
@@ -1505,9 +1702,13 @@ static StateResult handle_script_double_escaped_state(GumboParser* parser,
1505
1702
  }
1506
1703
  }
1507
1704
 
1508
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
1509
- static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
1510
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1705
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
1706
+ static StateResult handle_script_double_escaped_dash_state (
1707
+ GumboParser* parser,
1708
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1709
+ int c,
1710
+ GumboToken* output
1711
+ ) {
1511
1712
  switch (c) {
1512
1713
  case '-':
1513
1714
  gumbo_tokenizer_set_state(
@@ -1529,10 +1730,13 @@ static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
1529
1730
  }
1530
1731
  }
1531
1732
 
1532
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
1533
- static StateResult handle_script_double_escaped_dash_dash_state(
1534
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
1535
- GumboToken* output) {
1733
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
1734
+ static StateResult handle_script_double_escaped_dash_dash_state (
1735
+ GumboParser* parser,
1736
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1737
+ int c,
1738
+ GumboToken* output
1739
+ ) {
1536
1740
  switch (c) {
1537
1741
  case '-':
1538
1742
  return emit_current_char(parser, output);
@@ -1555,12 +1759,16 @@ static StateResult handle_script_double_escaped_dash_dash_state(
1555
1759
  }
1556
1760
  }
1557
1761
 
1558
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
1559
- static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
1560
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1762
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
1763
+ static StateResult handle_script_double_escaped_lt_state (
1764
+ GumboParser* parser,
1765
+ GumboTokenizerState* tokenizer,
1766
+ int c,
1767
+ GumboToken* output
1768
+ ) {
1561
1769
  if (c == '/') {
1562
1770
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1563
- gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
1771
+ gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
1564
1772
  return emit_current_char(parser, output);
1565
1773
  } else {
1566
1774
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
@@ -1569,9 +1777,13 @@ static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
1569
1777
  }
1570
1778
  }
1571
1779
 
1572
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
1573
- static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1574
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1780
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
1781
+ static StateResult handle_script_double_escaped_end_state (
1782
+ GumboParser* parser,
1783
+ GumboTokenizerState* tokenizer,
1784
+ int c,
1785
+ GumboToken* output
1786
+ ) {
1575
1787
  switch (c) {
1576
1788
  case '\t':
1577
1789
  case '\n':
@@ -1587,8 +1799,10 @@ static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1587
1799
  return emit_current_char(parser, output);
1588
1800
  default:
1589
1801
  if (is_alpha(c)) {
1590
- gumbo_string_buffer_append_codepoint(
1591
- parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1802
+ gumbo_string_buffer_append_codepoint (
1803
+ ensure_lowercase(c),
1804
+ &tokenizer->_script_data_buffer
1805
+ );
1592
1806
  return emit_current_char(parser, output);
1593
1807
  } else {
1594
1808
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
@@ -1598,9 +1812,13 @@ static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1598
1812
  }
1599
1813
  }
1600
1814
 
1601
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
1602
- static StateResult handle_before_attr_name_state(GumboParser* parser,
1603
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1815
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
1816
+ static StateResult handle_before_attr_name_state (
1817
+ GumboParser* parser,
1818
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1819
+ int c,
1820
+ GumboToken* output
1821
+ ) {
1604
1822
  switch (c) {
1605
1823
  case '\t':
1606
1824
  case '\n':
@@ -1636,9 +1854,13 @@ static StateResult handle_before_attr_name_state(GumboParser* parser,
1636
1854
  }
1637
1855
  }
1638
1856
 
1639
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
1640
- static StateResult handle_attr_name_state(GumboParser* parser,
1641
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1857
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
1858
+ static StateResult handle_attr_name_state (
1859
+ GumboParser* parser,
1860
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1861
+ int c,
1862
+ GumboToken* output
1863
+ ) {
1642
1864
  switch (c) {
1643
1865
  case '\t':
1644
1866
  case '\n':
@@ -1679,9 +1901,13 @@ static StateResult handle_attr_name_state(GumboParser* parser,
1679
1901
  }
1680
1902
  }
1681
1903
 
1682
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
1683
- static StateResult handle_after_attr_name_state(GumboParser* parser,
1684
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1904
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
1905
+ static StateResult handle_after_attr_name_state (
1906
+ GumboParser* parser,
1907
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1908
+ int c,
1909
+ GumboToken* output
1910
+ ) {
1685
1911
  switch (c) {
1686
1912
  case '\t':
1687
1913
  case '\n':
@@ -1719,9 +1945,13 @@ static StateResult handle_after_attr_name_state(GumboParser* parser,
1719
1945
  }
1720
1946
  }
1721
1947
 
1722
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
1723
- static StateResult handle_before_attr_value_state(GumboParser* parser,
1724
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1948
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
1949
+ static StateResult handle_before_attr_value_state (
1950
+ GumboParser* parser,
1951
+ GumboTokenizerState* tokenizer,
1952
+ int c,
1953
+ GumboToken* output
1954
+ ) {
1725
1955
  switch (c) {
1726
1956
  case '\t':
1727
1957
  case '\n':
@@ -1768,9 +1998,13 @@ static StateResult handle_before_attr_value_state(GumboParser* parser,
1768
1998
  }
1769
1999
  }
1770
2000
 
1771
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
1772
- static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
1773
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2001
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
2002
+ static StateResult handle_attr_value_double_quoted_state (
2003
+ GumboParser* parser,
2004
+ GumboTokenizerState* tokenizer,
2005
+ int c,
2006
+ GumboToken* UNUSED_ARG(output)
2007
+ ) {
1774
2008
  switch (c) {
1775
2009
  case '"':
1776
2010
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
@@ -1796,9 +2030,13 @@ static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
1796
2030
  }
1797
2031
  }
1798
2032
 
1799
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
1800
- static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
1801
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2033
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state
2034
+ static StateResult handle_attr_value_single_quoted_state (
2035
+ GumboParser* parser,
2036
+ GumboTokenizerState* tokenizer,
2037
+ int c,
2038
+ GumboToken* UNUSED_ARG(output)
2039
+ ) {
1802
2040
  switch (c) {
1803
2041
  case '\'':
1804
2042
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
@@ -1824,9 +2062,13 @@ static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
1824
2062
  }
1825
2063
  }
1826
2064
 
1827
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
1828
- static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
1829
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2065
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state
2066
+ static StateResult handle_attr_value_unquoted_state (
2067
+ GumboParser* parser,
2068
+ GumboTokenizerState* tokenizer,
2069
+ int c,
2070
+ GumboToken* output
2071
+ ) {
1830
2072
  switch (c) {
1831
2073
  case '\t':
1832
2074
  case '\n':
@@ -1867,9 +2109,13 @@ static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
1867
2109
  }
1868
2110
  }
1869
2111
 
1870
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
1871
- static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1872
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2112
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-attribute-value-state
2113
+ static StateResult handle_char_ref_in_attr_value_state (
2114
+ GumboParser* parser,
2115
+ GumboTokenizerState* tokenizer,
2116
+ int UNUSED_ARG(c),
2117
+ GumboToken* UNUSED_ARG(output)
2118
+ ) {
1873
2119
  OneOrTwoCodepoints char_ref;
1874
2120
  int allowed_char;
1875
2121
  bool is_unquoted = false;
@@ -1893,9 +2139,15 @@ static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1893
2139
 
1894
2140
  // Ignore the status, since we don't have a convenient way of signalling that
1895
2141
  // a parser error has occurred when the error occurs in the middle of a
1896
- // multi-state token. We'd need a flag inside the TokenizerState to do this,
2142
+ // multi-state token. We'd need a flag inside the TokenizerState to do this,
1897
2143
  // but that's a low priority fix.
1898
- consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
2144
+ gumbo_consume_char_ref (
2145
+ parser,
2146
+ &tokenizer->_input,
2147
+ allowed_char,
2148
+ true,
2149
+ &char_ref
2150
+ );
1899
2151
  if (char_ref.first != kGumboNoChar) {
1900
2152
  tokenizer->_reconsume_current_input = true;
1901
2153
  append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
@@ -1909,9 +2161,13 @@ static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1909
2161
  return NEXT_CHAR;
1910
2162
  }
1911
2163
 
1912
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
1913
- static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
1914
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2164
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
2165
+ static StateResult handle_after_attr_value_quoted_state (
2166
+ GumboParser* parser,
2167
+ GumboTokenizerState* tokenizer,
2168
+ int c,
2169
+ GumboToken* output
2170
+ ) {
1915
2171
  finish_attribute_value(parser);
1916
2172
  switch (c) {
1917
2173
  case '\t':
@@ -1940,9 +2196,13 @@ static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
1940
2196
  }
1941
2197
  }
1942
2198
 
1943
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
1944
- static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
1945
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2199
+ // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
2200
+ static StateResult handle_self_closing_start_tag_state (
2201
+ GumboParser* parser,
2202
+ GumboTokenizerState* tokenizer,
2203
+ int c,
2204
+ GumboToken* output
2205
+ ) {
1946
2206
  switch (c) {
1947
2207
  case '>':
1948
2208
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
@@ -1961,11 +2221,16 @@ static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
1961
2221
  }
1962
2222
  }
1963
2223
 
1964
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
1965
- static StateResult handle_bogus_comment_state(GumboParser* parser,
1966
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2224
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
2225
+ static StateResult handle_bogus_comment_state (
2226
+ GumboParser* parser,
2227
+ GumboTokenizerState* tokenizer,
2228
+ int c,
2229
+ GumboToken* output
2230
+ ) {
1967
2231
  while (c != '>' && c != -1) {
1968
2232
  if (c == '\0') {
2233
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1969
2234
  c = 0xFFFD;
1970
2235
  }
1971
2236
  append_char_to_temporary_buffer(parser, c);
@@ -1976,29 +2241,48 @@ static StateResult handle_bogus_comment_state(GumboParser* parser,
1976
2241
  return emit_comment(parser, output);
1977
2242
  }
1978
2243
 
1979
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
1980
- static StateResult handle_markup_declaration_state(GumboParser* parser,
1981
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1982
- if (utf8iterator_maybe_consume_match(
1983
- &tokenizer->_input, "--", sizeof("--") - 1, true)) {
2244
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
2245
+ static StateResult handle_markup_declaration_state (
2246
+ GumboParser* parser,
2247
+ GumboTokenizerState* tokenizer,
2248
+ int UNUSED_ARG(c),
2249
+ GumboToken* UNUSED_ARG(output)
2250
+ ) {
2251
+ if (
2252
+ utf8iterator_maybe_consume_match (
2253
+ &tokenizer->_input,
2254
+ "--",
2255
+ sizeof("--") - 1,
2256
+ true
2257
+ )
2258
+ ) {
1984
2259
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
1985
2260
  tokenizer->_reconsume_current_input = true;
1986
- } else if (utf8iterator_maybe_consume_match(
1987
- &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
2261
+ } else if (
2262
+ utf8iterator_maybe_consume_match (
2263
+ &tokenizer->_input,
2264
+ "DOCTYPE",
2265
+ sizeof("DOCTYPE") - 1,
2266
+ false
2267
+ )
2268
+ ) {
1988
2269
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
1989
2270
  tokenizer->_reconsume_current_input = true;
1990
2271
  // If we get here, we know we'll eventually emit a doctype token, so now is
1991
- // the time to initialize the doctype strings. (Not in doctype_state_init,
2272
+ // the time to initialize the doctype strings. (Not in doctype_state_init,
1992
2273
  // since then they'll leak if ownership never gets transferred to the
1993
2274
  // doctype token.
1994
- tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, "");
1995
- tokenizer->_doc_type_state.public_identifier =
1996
- gumbo_copy_stringz(parser, "");
1997
- tokenizer->_doc_type_state.system_identifier =
1998
- gumbo_copy_stringz(parser, "");
1999
- } else if (tokenizer->_is_current_node_foreign &&
2000
- utf8iterator_maybe_consume_match(
2001
- &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2275
+ tokenizer->_doc_type_state.name = gumbo_strdup("");
2276
+ tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
2277
+ tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
2278
+ } else if (
2279
+ tokenizer->_is_current_node_foreign
2280
+ && utf8iterator_maybe_consume_match (
2281
+ &tokenizer->_input,
2282
+ "[CDATA[", sizeof("[CDATA[") - 1,
2283
+ true
2284
+ )
2285
+ ) {
2002
2286
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2003
2287
  tokenizer->_is_in_cdata = true;
2004
2288
  tokenizer->_reconsume_current_input = true;
@@ -2011,9 +2295,13 @@ static StateResult handle_markup_declaration_state(GumboParser* parser,
2011
2295
  return NEXT_CHAR;
2012
2296
  }
2013
2297
 
2014
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
2015
- static StateResult handle_comment_start_state(GumboParser* parser,
2016
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2298
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
2299
+ static StateResult handle_comment_start_state (
2300
+ GumboParser* parser,
2301
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2302
+ int c,
2303
+ GumboToken* output
2304
+ ) {
2017
2305
  switch (c) {
2018
2306
  case '-':
2019
2307
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
@@ -2040,9 +2328,13 @@ static StateResult handle_comment_start_state(GumboParser* parser,
2040
2328
  }
2041
2329
  }
2042
2330
 
2043
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
2044
- static StateResult handle_comment_start_dash_state(GumboParser* parser,
2045
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2331
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
2332
+ static StateResult handle_comment_start_dash_state (
2333
+ GumboParser* parser,
2334
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2335
+ int c,
2336
+ GumboToken* output
2337
+ ) {
2046
2338
  switch (c) {
2047
2339
  case '-':
2048
2340
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
@@ -2071,9 +2363,13 @@ static StateResult handle_comment_start_dash_state(GumboParser* parser,
2071
2363
  }
2072
2364
  }
2073
2365
 
2074
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
2075
- static StateResult handle_comment_state(GumboParser* parser,
2076
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2366
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-state
2367
+ static StateResult handle_comment_state (
2368
+ GumboParser* parser,
2369
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2370
+ int c,
2371
+ GumboToken* output
2372
+ ) {
2077
2373
  switch (c) {
2078
2374
  case '-':
2079
2375
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
@@ -2093,9 +2389,13 @@ static StateResult handle_comment_state(GumboParser* parser,
2093
2389
  }
2094
2390
  }
2095
2391
 
2096
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
2097
- static StateResult handle_comment_end_dash_state(GumboParser* parser,
2098
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2392
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
2393
+ static StateResult handle_comment_end_dash_state (
2394
+ GumboParser* parser,
2395
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2396
+ int c,
2397
+ GumboToken* output
2398
+ ) {
2099
2399
  switch (c) {
2100
2400
  case '-':
2101
2401
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
@@ -2119,9 +2419,13 @@ static StateResult handle_comment_end_dash_state(GumboParser* parser,
2119
2419
  }
2120
2420
  }
2121
2421
 
2122
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
2123
- static StateResult handle_comment_end_state(GumboParser* parser,
2124
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2422
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
2423
+ static StateResult handle_comment_end_state (
2424
+ GumboParser* parser,
2425
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2426
+ int c,
2427
+ GumboToken* output
2428
+ ) {
2125
2429
  switch (c) {
2126
2430
  case '>':
2127
2431
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
@@ -2158,9 +2462,13 @@ static StateResult handle_comment_end_state(GumboParser* parser,
2158
2462
  }
2159
2463
  }
2160
2464
 
2161
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
2162
- static StateResult handle_comment_end_bang_state(GumboParser* parser,
2163
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2465
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
2466
+ static StateResult handle_comment_end_bang_state (
2467
+ GumboParser* parser,
2468
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2469
+ int c,
2470
+ GumboToken* output
2471
+ ) {
2164
2472
  switch (c) {
2165
2473
  case '-':
2166
2474
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
@@ -2194,9 +2502,13 @@ static StateResult handle_comment_end_bang_state(GumboParser* parser,
2194
2502
  }
2195
2503
  }
2196
2504
 
2197
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
2198
- static StateResult handle_doctype_state(GumboParser* parser,
2199
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2505
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
2506
+ static StateResult handle_doctype_state (
2507
+ GumboParser* parser,
2508
+ GumboTokenizerState* tokenizer,
2509
+ int c,
2510
+ GumboToken* output
2511
+ ) {
2200
2512
  assert(!tokenizer->_temporary_buffer.length);
2201
2513
  switch (c) {
2202
2514
  case '\t':
@@ -2220,9 +2532,13 @@ static StateResult handle_doctype_state(GumboParser* parser,
2220
2532
  }
2221
2533
  }
2222
2534
 
2223
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
2224
- static StateResult handle_before_doctype_name_state(GumboParser* parser,
2225
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2535
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
2536
+ static StateResult handle_before_doctype_name_state (
2537
+ GumboParser* parser,
2538
+ GumboTokenizerState* tokenizer,
2539
+ int c,
2540
+ GumboToken* output
2541
+ ) {
2226
2542
  switch (c) {
2227
2543
  case '\t':
2228
2544
  case '\n':
@@ -2255,21 +2571,25 @@ static StateResult handle_before_doctype_name_state(GumboParser* parser,
2255
2571
  }
2256
2572
  }
2257
2573
 
2258
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
2259
- static StateResult handle_doctype_name_state(GumboParser* parser,
2260
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2574
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
2575
+ static StateResult handle_doctype_name_state (
2576
+ GumboParser* parser,
2577
+ GumboTokenizerState* tokenizer,
2578
+ int c,
2579
+ GumboToken* output
2580
+ ) {
2261
2581
  switch (c) {
2262
2582
  case '\t':
2263
2583
  case '\n':
2264
2584
  case '\f':
2265
2585
  case ' ':
2266
2586
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2267
- gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2587
+ gumbo_free((void*) tokenizer->_doc_type_state.name);
2268
2588
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2269
2589
  return NEXT_CHAR;
2270
2590
  case '>':
2271
2591
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2272
- gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2592
+ gumbo_free((void*) tokenizer->_doc_type_state.name);
2273
2593
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2274
2594
  emit_doctype(parser, output);
2275
2595
  return RETURN_SUCCESS;
@@ -2281,7 +2601,7 @@ static StateResult handle_doctype_name_state(GumboParser* parser,
2281
2601
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2282
2602
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2283
2603
  tokenizer->_doc_type_state.force_quirks = true;
2284
- gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2604
+ gumbo_free((void*) tokenizer->_doc_type_state.name);
2285
2605
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2286
2606
  emit_doctype(parser, output);
2287
2607
  return RETURN_ERROR;
@@ -2293,9 +2613,13 @@ static StateResult handle_doctype_name_state(GumboParser* parser,
2293
2613
  }
2294
2614
  }
2295
2615
 
2296
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
2297
- static StateResult handle_after_doctype_name_state(GumboParser* parser,
2298
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2616
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
2617
+ static StateResult handle_after_doctype_name_state (
2618
+ GumboParser* parser,
2619
+ GumboTokenizerState* tokenizer,
2620
+ int c,
2621
+ GumboToken* output
2622
+ ) {
2299
2623
  switch (c) {
2300
2624
  case '\t':
2301
2625
  case '\n':
@@ -2333,10 +2657,13 @@ static StateResult handle_after_doctype_name_state(GumboParser* parser,
2333
2657
  }
2334
2658
  }
2335
2659
 
2336
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
2337
- static StateResult handle_after_doctype_public_keyword_state(
2338
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2339
- GumboToken* output) {
2660
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
2661
+ static StateResult handle_after_doctype_public_keyword_state (
2662
+ GumboParser* parser,
2663
+ GumboTokenizerState* tokenizer,
2664
+ int c,
2665
+ GumboToken* output
2666
+ ) {
2340
2667
  switch (c) {
2341
2668
  case '\t':
2342
2669
  case '\n':
@@ -2346,13 +2673,13 @@ static StateResult handle_after_doctype_public_keyword_state(
2346
2673
  return NEXT_CHAR;
2347
2674
  case '"':
2348
2675
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2349
- assert(temporary_buffer_equals(parser, ""));
2676
+ assert(temporary_buffer_is_empty(parser));
2350
2677
  gumbo_tokenizer_set_state(
2351
2678
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2352
2679
  return NEXT_CHAR;
2353
2680
  case '\'':
2354
2681
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2355
- assert(temporary_buffer_equals(parser, ""));
2682
+ assert(temporary_buffer_is_empty(parser));
2356
2683
  gumbo_tokenizer_set_state(
2357
2684
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2358
2685
  return NEXT_CHAR;
@@ -2377,9 +2704,13 @@ static StateResult handle_after_doctype_public_keyword_state(
2377
2704
  }
2378
2705
  }
2379
2706
 
2380
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
2381
- static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2382
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2707
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
2708
+ static StateResult handle_before_doctype_public_id_state (
2709
+ GumboParser* parser,
2710
+ GumboTokenizerState* tokenizer,
2711
+ int c,
2712
+ GumboToken* output
2713
+ ) {
2383
2714
  switch (c) {
2384
2715
  case '\t':
2385
2716
  case '\n':
@@ -2387,12 +2718,12 @@ static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2387
2718
  case ' ':
2388
2719
  return NEXT_CHAR;
2389
2720
  case '"':
2390
- assert(temporary_buffer_equals(parser, ""));
2721
+ assert(temporary_buffer_is_empty(parser));
2391
2722
  gumbo_tokenizer_set_state(
2392
2723
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2393
2724
  return NEXT_CHAR;
2394
2725
  case '\'':
2395
- assert(temporary_buffer_equals(parser, ""));
2726
+ assert(temporary_buffer_is_empty(parser));
2396
2727
  gumbo_tokenizer_set_state(
2397
2728
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2398
2729
  return NEXT_CHAR;
@@ -2417,10 +2748,13 @@ static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2417
2748
  }
2418
2749
  }
2419
2750
 
2420
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
2421
- static StateResult handle_doctype_public_id_double_quoted_state(
2422
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2423
- GumboToken* output) {
2751
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
2752
+ static StateResult handle_doctype_public_id_double_quoted_state (
2753
+ GumboParser* parser,
2754
+ GumboTokenizerState* tokenizer,
2755
+ int c,
2756
+ GumboToken* output
2757
+ ) {
2424
2758
  switch (c) {
2425
2759
  case '"':
2426
2760
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
@@ -2450,10 +2784,13 @@ static StateResult handle_doctype_public_id_double_quoted_state(
2450
2784
  }
2451
2785
  }
2452
2786
 
2453
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
2454
- static StateResult handle_doctype_public_id_single_quoted_state(
2455
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2456
- GumboToken* output) {
2787
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
2788
+ static StateResult handle_doctype_public_id_single_quoted_state (
2789
+ GumboParser* parser,
2790
+ GumboTokenizerState* tokenizer,
2791
+ int c,
2792
+ GumboToken* output
2793
+ ) {
2457
2794
  switch (c) {
2458
2795
  case '\'':
2459
2796
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
@@ -2483,9 +2820,13 @@ static StateResult handle_doctype_public_id_single_quoted_state(
2483
2820
  }
2484
2821
  }
2485
2822
 
2486
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
2487
- static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2488
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2823
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
2824
+ static StateResult handle_after_doctype_public_id_state (
2825
+ GumboParser* parser,
2826
+ GumboTokenizerState* tokenizer,
2827
+ int c,
2828
+ GumboToken* output
2829
+ ) {
2489
2830
  switch (c) {
2490
2831
  case '\t':
2491
2832
  case '\n':
@@ -2500,13 +2841,13 @@ static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2500
2841
  return RETURN_SUCCESS;
2501
2842
  case '"':
2502
2843
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2503
- assert(temporary_buffer_equals(parser, ""));
2844
+ assert(temporary_buffer_is_empty(parser));
2504
2845
  gumbo_tokenizer_set_state(
2505
2846
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2506
2847
  return NEXT_CHAR;
2507
2848
  case '\'':
2508
2849
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2509
- assert(temporary_buffer_equals(parser, ""));
2850
+ assert(temporary_buffer_is_empty(parser));
2510
2851
  gumbo_tokenizer_set_state(
2511
2852
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2512
2853
  return NEXT_CHAR;
@@ -2525,10 +2866,13 @@ static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2525
2866
  }
2526
2867
  }
2527
2868
 
2528
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
2529
- static StateResult handle_between_doctype_public_system_id_state(
2530
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2531
- GumboToken* output) {
2869
+ // https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
2870
+ static StateResult handle_between_doctype_public_system_id_state (
2871
+ GumboParser* parser,
2872
+ GumboTokenizerState* tokenizer,
2873
+ int c,
2874
+ GumboToken* output
2875
+ ) {
2532
2876
  switch (c) {
2533
2877
  case '\t':
2534
2878
  case '\n':
@@ -2540,12 +2884,12 @@ static StateResult handle_between_doctype_public_system_id_state(
2540
2884
  emit_doctype(parser, output);
2541
2885
  return RETURN_SUCCESS;
2542
2886
  case '"':
2543
- assert(temporary_buffer_equals(parser, ""));
2887
+ assert(temporary_buffer_is_empty(parser));
2544
2888
  gumbo_tokenizer_set_state(
2545
2889
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2546
2890
  return NEXT_CHAR;
2547
2891
  case '\'':
2548
- assert(temporary_buffer_equals(parser, ""));
2892
+ assert(temporary_buffer_is_empty(parser));
2549
2893
  gumbo_tokenizer_set_state(
2550
2894
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2551
2895
  return NEXT_CHAR;
@@ -2564,10 +2908,13 @@ static StateResult handle_between_doctype_public_system_id_state(
2564
2908
  }
2565
2909
  }
2566
2910
 
2567
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
2568
- static StateResult handle_after_doctype_system_keyword_state(
2569
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2570
- GumboToken* output) {
2911
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
2912
+ static StateResult handle_after_doctype_system_keyword_state (
2913
+ GumboParser* parser,
2914
+ GumboTokenizerState* tokenizer,
2915
+ int c,
2916
+ GumboToken* output
2917
+ ) {
2571
2918
  switch (c) {
2572
2919
  case '\t':
2573
2920
  case '\n':
@@ -2577,13 +2924,13 @@ static StateResult handle_after_doctype_system_keyword_state(
2577
2924
  return NEXT_CHAR;
2578
2925
  case '"':
2579
2926
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2580
- assert(temporary_buffer_equals(parser, ""));
2927
+ assert(temporary_buffer_is_empty(parser));
2581
2928
  gumbo_tokenizer_set_state(
2582
2929
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2583
2930
  return NEXT_CHAR;
2584
2931
  case '\'':
2585
2932
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2586
- assert(temporary_buffer_equals(parser, ""));
2933
+ assert(temporary_buffer_is_empty(parser));
2587
2934
  gumbo_tokenizer_set_state(
2588
2935
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2589
2936
  return NEXT_CHAR;
@@ -2607,9 +2954,13 @@ static StateResult handle_after_doctype_system_keyword_state(
2607
2954
  }
2608
2955
  }
2609
2956
 
2610
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
2611
- static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2612
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2957
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
2958
+ static StateResult handle_before_doctype_system_id_state (
2959
+ GumboParser* parser,
2960
+ GumboTokenizerState* tokenizer,
2961
+ int c,
2962
+ GumboToken* output
2963
+ ) {
2613
2964
  switch (c) {
2614
2965
  case '\t':
2615
2966
  case '\n':
@@ -2617,12 +2968,12 @@ static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2617
2968
  case ' ':
2618
2969
  return NEXT_CHAR;
2619
2970
  case '"':
2620
- assert(temporary_buffer_equals(parser, ""));
2971
+ assert(temporary_buffer_is_empty(parser));
2621
2972
  gumbo_tokenizer_set_state(
2622
2973
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2623
2974
  return NEXT_CHAR;
2624
2975
  case '\'':
2625
- assert(temporary_buffer_equals(parser, ""));
2976
+ assert(temporary_buffer_is_empty(parser));
2626
2977
  gumbo_tokenizer_set_state(
2627
2978
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2628
2979
  return NEXT_CHAR;
@@ -2646,10 +2997,13 @@ static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2646
2997
  }
2647
2998
  }
2648
2999
 
2649
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
2650
- static StateResult handle_doctype_system_id_double_quoted_state(
2651
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2652
- GumboToken* output) {
3000
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
3001
+ static StateResult handle_doctype_system_id_double_quoted_state (
3002
+ GumboParser* parser,
3003
+ GumboTokenizerState* tokenizer,
3004
+ int c,
3005
+ GumboToken* output
3006
+ ) {
2653
3007
  switch (c) {
2654
3008
  case '"':
2655
3009
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
@@ -2679,10 +3033,13 @@ static StateResult handle_doctype_system_id_double_quoted_state(
2679
3033
  }
2680
3034
  }
2681
3035
 
2682
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
2683
- static StateResult handle_doctype_system_id_single_quoted_state(
2684
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2685
- GumboToken* output) {
3036
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
3037
+ static StateResult handle_doctype_system_id_single_quoted_state (
3038
+ GumboParser* parser,
3039
+ GumboTokenizerState* tokenizer,
3040
+ int c,
3041
+ GumboToken* output
3042
+ ) {
2686
3043
  switch (c) {
2687
3044
  case '\'':
2688
3045
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
@@ -2712,9 +3069,13 @@ static StateResult handle_doctype_system_id_single_quoted_state(
2712
3069
  }
2713
3070
  }
2714
3071
 
2715
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
2716
- static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
2717
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
3072
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
3073
+ static StateResult handle_after_doctype_system_id_state (
3074
+ GumboParser* parser,
3075
+ GumboTokenizerState* tokenizer,
3076
+ int c,
3077
+ GumboToken* output
3078
+ ) {
2718
3079
  switch (c) {
2719
3080
  case '\t':
2720
3081
  case '\n':
@@ -2738,9 +3099,13 @@ static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
2738
3099
  }
2739
3100
  }
2740
3101
 
2741
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
2742
- static StateResult handle_bogus_doctype_state(GumboParser* parser,
2743
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
3102
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
3103
+ static StateResult handle_bogus_doctype_state (
3104
+ GumboParser* parser,
3105
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
3106
+ int c,
3107
+ GumboToken* output
3108
+ ) {
2744
3109
  if (c == '>' || c == -1) {
2745
3110
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2746
3111
  emit_doctype(parser, output);
@@ -2749,9 +3114,13 @@ static StateResult handle_bogus_doctype_state(GumboParser* parser,
2749
3114
  return NEXT_CHAR;
2750
3115
  }
2751
3116
 
2752
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
2753
- static StateResult handle_cdata_state(GumboParser* parser,
2754
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
3117
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
3118
+ static StateResult handle_cdata_state (
3119
+ GumboParser* parser,
3120
+ GumboTokenizerState* tokenizer,
3121
+ int c,
3122
+ GumboToken* output
3123
+ ) {
2755
3124
  if (c == -1 || utf8iterator_maybe_consume_match(
2756
3125
  &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2757
3126
  tokenizer->_reconsume_current_input = true;
@@ -2764,50 +3133,83 @@ static StateResult handle_cdata_state(GumboParser* parser,
2764
3133
  }
2765
3134
  }
2766
3135
 
2767
- typedef StateResult (*GumboLexerStateFunction)(
2768
- GumboParser*, GumboTokenizerState*, int, GumboToken*);
2769
-
2770
- static GumboLexerStateFunction dispatch_table[] = {handle_data_state,
2771
- handle_char_ref_in_data_state, handle_rcdata_state,
2772
- handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state,
2773
- handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state,
2774
- handle_tag_name_state, handle_rcdata_lt_state,
2775
- handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state,
2776
- handle_rawtext_lt_state, handle_rawtext_end_tag_open_state,
2777
- handle_rawtext_end_tag_name_state, handle_script_lt_state,
2778
- handle_script_end_tag_open_state, handle_script_end_tag_name_state,
2779
- handle_script_escaped_start_state, handle_script_escaped_start_dash_state,
2780
- handle_script_escaped_state, handle_script_escaped_dash_state,
2781
- handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state,
2782
- handle_script_escaped_end_tag_open_state,
2783
- handle_script_escaped_end_tag_name_state,
2784
- handle_script_double_escaped_start_state,
2785
- handle_script_double_escaped_state, handle_script_double_escaped_dash_state,
2786
- handle_script_double_escaped_dash_dash_state,
2787
- handle_script_double_escaped_lt_state,
2788
- handle_script_double_escaped_end_state, handle_before_attr_name_state,
2789
- handle_attr_name_state, handle_after_attr_name_state,
2790
- handle_before_attr_value_state, handle_attr_value_double_quoted_state,
2791
- handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state,
2792
- handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state,
2793
- handle_self_closing_start_tag_state, handle_bogus_comment_state,
2794
- handle_markup_declaration_state, handle_comment_start_state,
2795
- handle_comment_start_dash_state, handle_comment_state,
2796
- handle_comment_end_dash_state, handle_comment_end_state,
2797
- handle_comment_end_bang_state, handle_doctype_state,
2798
- handle_before_doctype_name_state, handle_doctype_name_state,
2799
- handle_after_doctype_name_state, handle_after_doctype_public_keyword_state,
2800
- handle_before_doctype_public_id_state,
2801
- handle_doctype_public_id_double_quoted_state,
2802
- handle_doctype_public_id_single_quoted_state,
2803
- handle_after_doctype_public_id_state,
2804
- handle_between_doctype_public_system_id_state,
2805
- handle_after_doctype_system_keyword_state,
2806
- handle_before_doctype_system_id_state,
2807
- handle_doctype_system_id_double_quoted_state,
2808
- handle_doctype_system_id_single_quoted_state,
2809
- handle_after_doctype_system_id_state, handle_bogus_doctype_state,
2810
- handle_cdata_state};
3136
+ typedef StateResult (*GumboLexerStateFunction) (
3137
+ GumboParser* parser,
3138
+ GumboTokenizerState* tokenizer,
3139
+ int c,
3140
+ GumboToken* output
3141
+ );
3142
+
3143
+ static GumboLexerStateFunction dispatch_table[] = {
3144
+ handle_data_state,
3145
+ handle_char_ref_in_data_state,
3146
+ handle_rcdata_state,
3147
+ handle_char_ref_in_rcdata_state,
3148
+ handle_rawtext_state,
3149
+ handle_script_state,
3150
+ handle_plaintext_state,
3151
+ handle_tag_open_state,
3152
+ handle_end_tag_open_state,
3153
+ handle_tag_name_state,
3154
+ handle_rcdata_lt_state,
3155
+ handle_rcdata_end_tag_open_state,
3156
+ handle_rcdata_end_tag_name_state,
3157
+ handle_rawtext_lt_state,
3158
+ handle_rawtext_end_tag_open_state,
3159
+ handle_rawtext_end_tag_name_state,
3160
+ handle_script_lt_state,
3161
+ handle_script_end_tag_open_state,
3162
+ handle_script_end_tag_name_state,
3163
+ handle_script_escaped_start_state,
3164
+ handle_script_escaped_start_dash_state,
3165
+ handle_script_escaped_state,
3166
+ handle_script_escaped_dash_state,
3167
+ handle_script_escaped_dash_dash_state,
3168
+ handle_script_escaped_lt_state,
3169
+ handle_script_escaped_end_tag_open_state,
3170
+ handle_script_escaped_end_tag_name_state,
3171
+ handle_script_double_escaped_start_state,
3172
+ handle_script_double_escaped_state,
3173
+ handle_script_double_escaped_dash_state,
3174
+ handle_script_double_escaped_dash_dash_state,
3175
+ handle_script_double_escaped_lt_state,
3176
+ handle_script_double_escaped_end_state,
3177
+ handle_before_attr_name_state,
3178
+ handle_attr_name_state,
3179
+ handle_after_attr_name_state,
3180
+ handle_before_attr_value_state,
3181
+ handle_attr_value_double_quoted_state,
3182
+ handle_attr_value_single_quoted_state,
3183
+ handle_attr_value_unquoted_state,
3184
+ handle_char_ref_in_attr_value_state,
3185
+ handle_after_attr_value_quoted_state,
3186
+ handle_self_closing_start_tag_state,
3187
+ handle_bogus_comment_state,
3188
+ handle_markup_declaration_state,
3189
+ handle_comment_start_state,
3190
+ handle_comment_start_dash_state,
3191
+ handle_comment_state,
3192
+ handle_comment_end_dash_state,
3193
+ handle_comment_end_state,
3194
+ handle_comment_end_bang_state,
3195
+ handle_doctype_state,
3196
+ handle_before_doctype_name_state,
3197
+ handle_doctype_name_state,
3198
+ handle_after_doctype_name_state,
3199
+ handle_after_doctype_public_keyword_state,
3200
+ handle_before_doctype_public_id_state,
3201
+ handle_doctype_public_id_double_quoted_state,
3202
+ handle_doctype_public_id_single_quoted_state,
3203
+ handle_after_doctype_public_id_state,
3204
+ handle_between_doctype_public_system_id_state,
3205
+ handle_after_doctype_system_keyword_state,
3206
+ handle_before_doctype_system_id_state,
3207
+ handle_doctype_system_id_double_quoted_state,
3208
+ handle_doctype_system_id_single_quoted_state,
3209
+ handle_after_doctype_system_id_state,
3210
+ handle_bogus_doctype_state,
3211
+ handle_cdata_state
3212
+ };
2811
3213
 
2812
3214
  bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2813
3215
  // Because of the spec requirements that...
@@ -2819,9 +3221,9 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2819
3221
  // state.
2820
3222
  //
2821
3223
  // ...all state must be held in the GumboTokenizer struct instead of in local
2822
- // variables in this function. That allows us to return from this method with
3224
+ // variables in this function. That allows us to return from this method with
2823
3225
  // a token, and then immediately jump back to the same state with the same
2824
- // input if we need to return a different token. The various emit_* functions
3226
+ // input if we need to return a different token. The various emit_* functions
2825
3227
  // are responsible for changing state (eg. flushing the chardata buffer,
2826
3228
  // reading the next input character) to avoid an infinite loop.
2827
3229
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -2845,10 +3247,9 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2845
3247
  assert(!tokenizer->_temporary_buffer_emit);
2846
3248
  assert(tokenizer->_buffered_emit_char == kGumboNoChar);
2847
3249
  int c = utf8iterator_current(&tokenizer->_input);
2848
- gumbo_debug(
2849
- "Lexing character '%c' (%d) in state %d.\n", c, c, tokenizer->_state);
2850
- StateResult result =
2851
- dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
3250
+ GumboTokenizerEnum state = tokenizer->_state;
3251
+ gumbo_debug("Lexing character '%c' (%d) in state %u.\n", c, c, state);
3252
+ StateResult result = dispatch_table[state](parser, tokenizer, c, output);
2852
3253
  // We need to clear reconsume_current_input before returning to prevent
2853
3254
  // certain infinite loop states.
2854
3255
  bool should_advance = !tokenizer->_reconsume_current_input;
@@ -2866,30 +3267,33 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2866
3267
  }
2867
3268
  }
2868
3269
 
2869
- void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
3270
+ void gumbo_token_destroy(GumboToken* token) {
2870
3271
  if (!token) return;
2871
3272
 
2872
3273
  switch (token->type) {
2873
3274
  case GUMBO_TOKEN_DOCTYPE:
2874
- gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name);
2875
- gumbo_parser_deallocate(
2876
- parser, (void*) token->v.doc_type.public_identifier);
2877
- gumbo_parser_deallocate(
2878
- parser, (void*) token->v.doc_type.system_identifier);
3275
+ gumbo_free((void*) token->v.doc_type.name);
3276
+ gumbo_free((void*) token->v.doc_type.public_identifier);
3277
+ gumbo_free((void*) token->v.doc_type.system_identifier);
2879
3278
  return;
2880
3279
  case GUMBO_TOKEN_START_TAG:
2881
3280
  for (unsigned int i = 0; i < token->v.start_tag.attributes.length; ++i) {
2882
3281
  GumboAttribute* attr = token->v.start_tag.attributes.data[i];
2883
3282
  if (attr) {
2884
3283
  // May have been nulled out if this token was merged with another.
2885
- gumbo_destroy_attribute(parser, attr);
3284
+ gumbo_destroy_attribute(attr);
2886
3285
  }
2887
3286
  }
2888
- gumbo_parser_deallocate(
2889
- parser, (void*) token->v.start_tag.attributes.data);
3287
+ gumbo_free((void*) token->v.start_tag.attributes.data);
3288
+ if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN)
3289
+ gumbo_free(token->v.start_tag.name);
2890
3290
  return;
3291
+ case GUMBO_TOKEN_END_TAG:
3292
+ if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN)
3293
+ gumbo_free(token->v.end_tag.name);
3294
+ break;
2891
3295
  case GUMBO_TOKEN_COMMENT:
2892
- gumbo_parser_deallocate(parser, (void*) token->v.text);
3296
+ gumbo_free((void*) token->v.text);
2893
3297
  return;
2894
3298
  default:
2895
3299
  return;