nokogumbo 1.5.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/README.md +237 -26
  3. data/ext/nokogumbo/extconf.rb +121 -0
  4. data/ext/nokogumbo/nokogumbo.c +793 -0
  5. data/gumbo-parser/src/ascii.c +75 -0
  6. data/gumbo-parser/src/ascii.h +115 -0
  7. data/gumbo-parser/src/attribute.c +26 -28
  8. data/gumbo-parser/src/attribute.h +3 -23
  9. data/gumbo-parser/src/char_ref.c +5972 -6816
  10. data/gumbo-parser/src/char_ref.h +14 -45
  11. data/gumbo-parser/src/error.c +510 -163
  12. data/gumbo-parser/src/error.h +70 -147
  13. data/gumbo-parser/src/foreign_attrs.c +104 -0
  14. data/gumbo-parser/src/gumbo.h +577 -305
  15. data/gumbo-parser/src/insertion_mode.h +4 -28
  16. data/gumbo-parser/src/macros.h +91 -0
  17. data/gumbo-parser/src/parser.c +2922 -2228
  18. data/gumbo-parser/src/parser.h +6 -22
  19. data/gumbo-parser/src/replacement.h +33 -0
  20. data/gumbo-parser/src/string_buffer.c +43 -50
  21. data/gumbo-parser/src/string_buffer.h +24 -40
  22. data/gumbo-parser/src/string_piece.c +39 -39
  23. data/gumbo-parser/src/svg_attrs.c +174 -0
  24. data/gumbo-parser/src/svg_tags.c +137 -0
  25. data/gumbo-parser/src/tag.c +186 -59
  26. data/gumbo-parser/src/tag_lookup.c +382 -0
  27. data/gumbo-parser/src/tag_lookup.h +13 -0
  28. data/gumbo-parser/src/token_buffer.c +79 -0
  29. data/gumbo-parser/src/token_buffer.h +71 -0
  30. data/gumbo-parser/src/token_type.h +1 -25
  31. data/gumbo-parser/src/tokenizer.c +2127 -1561
  32. data/gumbo-parser/src/tokenizer.h +41 -52
  33. data/gumbo-parser/src/tokenizer_states.h +281 -45
  34. data/gumbo-parser/src/utf8.c +98 -123
  35. data/gumbo-parser/src/utf8.h +84 -52
  36. data/gumbo-parser/src/util.c +48 -38
  37. data/gumbo-parser/src/util.h +10 -40
  38. data/gumbo-parser/src/vector.c +45 -57
  39. data/gumbo-parser/src/vector.h +17 -39
  40. data/lib/nokogumbo.rb +11 -173
  41. data/lib/nokogumbo/html5.rb +252 -0
  42. data/lib/nokogumbo/html5/document.rb +53 -0
  43. data/lib/nokogumbo/html5/document_fragment.rb +62 -0
  44. data/lib/nokogumbo/html5/node.rb +72 -0
  45. data/lib/nokogumbo/version.rb +3 -0
  46. metadata +43 -24
  47. data/ext/nokogumboc/extconf.rb +0 -60
  48. data/ext/nokogumboc/nokogumbo.c +0 -295
  49. data/gumbo-parser/src/char_ref.rl +0 -2554
  50. data/gumbo-parser/src/string_piece.h +0 -38
  51. data/gumbo-parser/src/tag.in +0 -150
  52. data/gumbo-parser/src/tag_enum.h +0 -153
  53. data/gumbo-parser/src/tag_gperf.h +0 -105
  54. data/gumbo-parser/src/tag_sizes.h +0 -4
  55. data/gumbo-parser/src/tag_strings.h +0 -153
  56. data/gumbo-parser/visualc/include/strings.h +0 -4
  57. data/test-nokogumbo.rb +0 -190
@@ -1,25 +1,9 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // This contains an implementation of a tokenizer for HTML5. It consumes a
18
- // buffer of UTF-8 characters, and then emits a stream of tokens.
19
-
20
1
  #ifndef GUMBO_TOKENIZER_H_
21
2
  #define GUMBO_TOKENIZER_H_
22
3
 
4
+ // This contains an implementation of a tokenizer for HTML5. It consumes a
5
+ // buffer of UTF-8 characters, and then emits a stream of tokens.
6
+
23
7
  #include <stdbool.h>
24
8
  #include <stddef.h>
25
9
 
@@ -49,11 +33,20 @@ typedef struct GumboInternalTokenDocType {
49
33
  // Struct containing all information pertaining to start tag tokens.
50
34
  typedef struct GumboInternalTokenStartTag {
51
35
  GumboTag tag;
36
+ // NULL unless tag is GUMBO_TAG_UNKNOWN
37
+ char *name;
52
38
  GumboVector /* GumboAttribute */ attributes;
53
39
  bool is_self_closing;
54
40
  } GumboTokenStartTag;
55
41
 
56
- // A data structure representing a single token in the input stream. This
42
+ // Struct containing all information pertaining to end tag tokens.
43
+ typedef struct GumboInternalTokenEndTag {
44
+ GumboTag tag;
45
+ // NULL unless tag is GUMBO_TAG_UNKNOWN
46
+ char *name;
47
+ } GumboTokenEndTag;
48
+
49
+ // A data structure representing a single token in the input stream. This
57
50
  // contains an enum for the type, the source position, a GumboStringPiece
58
51
  // pointing to the original text, and then a union for any parsed data.
59
52
  typedef struct GumboInternalToken {
@@ -63,7 +56,7 @@ typedef struct GumboInternalToken {
63
56
  union {
64
57
  GumboTokenDocType doc_type;
65
58
  GumboTokenStartTag start_tag;
66
- GumboTag end_tag;
59
+ GumboTokenEndTag end_tag;
67
60
  const char* text; // For comments.
68
61
  int character; // For character, whitespace, null, and EOF tokens.
69
62
  } v;
@@ -71,53 +64,49 @@ typedef struct GumboInternalToken {
71
64
 
72
65
  // Initializes the tokenizer state within the GumboParser object, setting up a
73
66
  // parse of the specified text.
74
- void gumbo_tokenizer_state_init(
75
- struct GumboInternalParser* parser, const char* text, size_t text_length);
67
+ void gumbo_tokenizer_state_init (
68
+ struct GumboInternalParser* parser,
69
+ const char* text,
70
+ size_t text_length
71
+ );
76
72
 
77
73
  // Destroys the tokenizer state within the GumboParser object, freeing any
78
74
  // dynamically-allocated structures within it.
79
75
  void gumbo_tokenizer_state_destroy(struct GumboInternalParser* parser);
80
76
 
81
- // Sets the tokenizer state to the specified value. This is needed by some
77
+ // Sets the tokenizer state to the specified value. This is needed by some
82
78
  // parser states, which alter the state of the tokenizer in response to tags
83
79
  // seen.
84
- void gumbo_tokenizer_set_state(
85
- struct GumboInternalParser* parser, GumboTokenizerEnum state);
86
-
87
- // Flags whether the current node is a foreign content element. This is
88
- // necessary for the markup declaration open state, where the tokenizer must be
89
- // aware of the state of the parser to properly tokenize bad comment tags.
90
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state
91
- void gumbo_tokenizer_set_is_current_node_foreign(
92
- struct GumboInternalParser* parser, bool is_foreign);
80
+ void gumbo_tokenizer_set_state (
81
+ struct GumboInternalParser* parser,
82
+ GumboTokenizerEnum state
83
+ );
84
+
85
+ // Flags whether the adjusted current node is a foreign content element. This
86
+ // is necessary for the markup declaration open state, where the tokenizer
87
+ // must be aware of the state of the parser to properly tokenize bad comment
88
+ // tags.
89
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
90
+ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
91
+ struct GumboInternalParser* parser,
92
+ bool is_foreign
93
+ );
93
94
 
94
95
  // Lexes a single token from the specified buffer, filling the output with the
95
- // parsed GumboToken data structure. Returns true for a successful
96
- // tokenization, false if a parse error occurs.
97
- //
98
- // Example:
99
- // struct GumboInternalParser parser;
100
- // GumboToken output;
101
- // gumbo_tokenizer_state_init(&parser, text, strlen(text));
102
- // while (gumbo_lex(&parser, &output)) {
103
- // ...do stuff with output.
104
- // gumbo_token_destroy(&parser, &token);
105
- // }
106
- // gumbo_tokenizer_state_destroy(&parser);
107
- bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
108
-
109
- // Frees the internally-allocated pointers within an GumboToken. Note that this
96
+ // parsed GumboToken data structure.
97
+ void gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
98
+
99
+ // Frees the internally-allocated pointers within a GumboToken. Note that this
110
100
  // doesn't free the token itself, since oftentimes it will be allocated on the
111
- // stack. A simple call to free() (or GumboParser->deallocator, if
112
- // appropriate) can handle that.
101
+ // stack.
113
102
  //
114
103
  // Note that if you are handing over ownership of the internal strings to some
115
104
  // other data structure - for example, a parse tree - these do not need to be
116
105
  // freed.
117
- void gumbo_token_destroy(struct GumboInternalParser* parser, GumboToken* token);
106
+ void gumbo_token_destroy(GumboToken* token);
118
107
 
119
108
  #ifdef __cplusplus
120
109
  }
121
110
  #endif
122
111
 
123
- #endif // GUMBO_TOKENIZER_H_
112
+ #endif // GUMBO_TOKENIZER_H_
@@ -1,103 +1,339 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // This contains the list of states used in the tokenizer. Although at first
1
+ #ifndef GUMBO_TOKENIZER_STATES_H_
2
+ #define GUMBO_TOKENIZER_STATES_H_
3
+
4
+ // This contains the list of states used in the tokenizer. Although at first
18
5
  // glance it seems like these could be kept internal to the tokenizer, several
19
6
  // of the actions in the parser require that it reach into the tokenizer and
20
- // reset the tokenizer state. For that to work, it needs to have the
7
+ // reset the tokenizer state. For that to work, it needs to have the
21
8
  // definitions of individual states available.
22
9
  //
23
10
  // This may also be useful for providing more detailed error messages for parse
24
11
  // errors, as we can match up states and inputs in a table without having to
25
12
  // clutter the tokenizer code with lots of precise error messages.
26
13
 
27
- #ifndef GUMBO_TOKENIZER_STATES_H_
28
- #define GUMBO_TOKENIZER_STATES_H_
29
-
30
14
  // The ordering of this enum is also used to build the dispatch table for the
31
15
  // tokenizer state machine, so if it is changed, be sure to update that too.
32
16
  typedef enum {
17
+ // 12.2.5.1 Data state
18
+ // https://html.spec.whatwg.org/multipage/parsing.html#data-state
33
19
  GUMBO_LEX_DATA,
34
- GUMBO_LEX_CHAR_REF_IN_DATA,
20
+
21
+ // 12.2.5.2 RCDATA state
22
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
35
23
  GUMBO_LEX_RCDATA,
36
- GUMBO_LEX_CHAR_REF_IN_RCDATA,
24
+
25
+ // 12.2.5.3 RAWTEXT state
26
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
37
27
  GUMBO_LEX_RAWTEXT,
38
- GUMBO_LEX_SCRIPT,
28
+
29
+ // 12.2.5.4 Script data state
30
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
31
+ GUMBO_LEX_SCRIPT_DATA,
32
+
33
+ // 12.2.5.5 PLAINTEXT state
34
+ // https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
39
35
  GUMBO_LEX_PLAINTEXT,
36
+
37
+ // 12.2.5.6 Tag open state
38
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
40
39
  GUMBO_LEX_TAG_OPEN,
40
+
41
+ // 12.2.5.7 End tag open state
42
+ // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
41
43
  GUMBO_LEX_END_TAG_OPEN,
44
+
45
+ // 12.2.5.8 Tag name state
46
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
42
47
  GUMBO_LEX_TAG_NAME,
48
+
49
+ // 12.2.5.9 RCDATA less-than sign state
50
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
43
51
  GUMBO_LEX_RCDATA_LT,
52
+
53
+ // 12.2.5.10 RCDATA end tag open state
54
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
44
55
  GUMBO_LEX_RCDATA_END_TAG_OPEN,
56
+
57
+ // 12.2.5.11 RCDATA end tag name state
58
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
45
59
  GUMBO_LEX_RCDATA_END_TAG_NAME,
60
+
61
+ // 12.2.5.12 RAWTEXT less-than sign state
62
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
46
63
  GUMBO_LEX_RAWTEXT_LT,
64
+
65
+ // 12.2.5.13 RAWTEXT end tag open state
66
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
47
67
  GUMBO_LEX_RAWTEXT_END_TAG_OPEN,
68
+
69
+ // 12.2.5.14 RAWTEXT end tag name state
70
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
48
71
  GUMBO_LEX_RAWTEXT_END_TAG_NAME,
49
- GUMBO_LEX_SCRIPT_LT,
50
- GUMBO_LEX_SCRIPT_END_TAG_OPEN,
51
- GUMBO_LEX_SCRIPT_END_TAG_NAME,
52
- GUMBO_LEX_SCRIPT_ESCAPED_START,
53
- GUMBO_LEX_SCRIPT_ESCAPED_START_DASH,
54
- GUMBO_LEX_SCRIPT_ESCAPED,
55
- GUMBO_LEX_SCRIPT_ESCAPED_DASH,
56
- GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH,
57
- GUMBO_LEX_SCRIPT_ESCAPED_LT,
58
- GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN,
59
- GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME,
60
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START,
61
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED,
62
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH,
63
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH,
64
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT,
65
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END,
72
+
73
+ // 12.2.5.15 Script data less-than sign state
74
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
75
+ GUMBO_LEX_SCRIPT_DATA_LT,
76
+
77
+ // 12.2.5.16 Script data end tag open state
78
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
79
+ GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN,
80
+
81
+ // 12.2.5.17 Script data end tag name state
82
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
83
+ GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME,
84
+
85
+ // 12.2.5.18 Script data escape start state
86
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
87
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_START,
88
+
89
+ // 12.2.5.19 Script data escape start dash state
90
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
91
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH,
92
+
93
+ // 12.2.5.20 Script data escaped state
94
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
95
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED,
96
+
97
+ // 12.2.5.21 Script data escaped dash state
98
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
99
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH,
100
+
101
+ // 12.2.5.22 Script data escaped dash dash state
102
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
103
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH,
104
+
105
+ // 12.2.5.23 Script data escaped less-than sign state
106
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
107
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT,
108
+
109
+ // 12.2.5.24 Script data escaped end tag open state
110
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
111
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN,
112
+
113
+ // 12.2.5.25 Script data escaped end tag name state
114
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
115
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME,
116
+
117
+ // 12.2.5.26 Script data double escape start state
118
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
119
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START,
120
+
121
+ // 12.2.5.27 Script data double escaped state
122
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
123
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED,
124
+
125
+ // 12.2.5.28 Script data double escaped dash state
126
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
127
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH,
128
+
129
+ // 12.2.5.29 Script data double escaped dash dash state
130
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
131
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,
132
+
133
+ // 12.2.5.30 Script data double escaped less-than sign state
134
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
135
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT,
136
+
137
+ // 12.2.5.31 Script data double escape end state (XXX: spec bug with the
138
+ // name?)
139
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
140
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END,
141
+
142
+ // 12.2.5.32 Before attribute name state
143
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
66
144
  GUMBO_LEX_BEFORE_ATTR_NAME,
145
+
146
+ // 12.2.5.33 Attribute name state
147
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
67
148
  GUMBO_LEX_ATTR_NAME,
149
+
150
+ // 12.2.5.34 After attribute name state
151
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
68
152
  GUMBO_LEX_AFTER_ATTR_NAME,
153
+
154
+ // 12.2.5.35 Before attribute value state
155
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
69
156
  GUMBO_LEX_BEFORE_ATTR_VALUE,
157
+
158
+ // 12.2.5.36 Attribute value (double-quoted) state
159
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
70
160
  GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED,
161
+
162
+ // 12.2.5.37 Attribute value (single-quoted) state
163
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
71
164
  GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED,
165
+
166
+ // 12.2.5.38 Attribute value (unquoted) state
167
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
72
168
  GUMBO_LEX_ATTR_VALUE_UNQUOTED,
73
- GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE,
169
+
170
+ // 12.2.5.39 After attribute value (quoted) state
171
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
74
172
  GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED,
173
+
174
+ // 12.2.5.40 Self-closing start tag state
175
+ // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
75
176
  GUMBO_LEX_SELF_CLOSING_START_TAG,
177
+
178
+ // 12.2.5.41 Bogus comment state
179
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
76
180
  GUMBO_LEX_BOGUS_COMMENT,
77
- GUMBO_LEX_MARKUP_DECLARATION,
181
+
182
+ // 12.2.5.42 Markup declaration open state
183
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
184
+ GUMBO_LEX_MARKUP_DECLARATION_OPEN,
185
+
186
+ // 12.2.5.43 Comment start state
187
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
78
188
  GUMBO_LEX_COMMENT_START,
189
+
190
+ // 12.2.5.44 Comment start dash state
191
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
79
192
  GUMBO_LEX_COMMENT_START_DASH,
193
+
194
+ // 12.2.5.45 Comment state
195
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-state
80
196
  GUMBO_LEX_COMMENT,
197
+
198
+ // 12.2.5.46 Comment less-than sign state
199
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
200
+ GUMBO_LEX_COMMENT_LT,
201
+
202
+ // 12.2.5.47 Comment less-than sign bang state
203
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
204
+ GUMBO_LEX_COMMENT_LT_BANG,
205
+
206
+ // 12.2.5.48 Comment less-than sign bang dash state
207
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
208
+ GUMBO_LEX_COMMENT_LT_BANG_DASH,
209
+
210
+ // 12.2.5.49 Comment less-than sign bang dash dash state
211
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
212
+ GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH,
213
+
214
+ // 12.2.5.50 Comment end dash state
215
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
81
216
  GUMBO_LEX_COMMENT_END_DASH,
217
+
218
+ // 12.2.5.51 Comment end state
219
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
82
220
  GUMBO_LEX_COMMENT_END,
221
+
222
+ // 12.2.5.52 Comment end bang state
223
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
83
224
  GUMBO_LEX_COMMENT_END_BANG,
225
+
226
+ // 12.2.5.53 DOCTYPE state
227
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
84
228
  GUMBO_LEX_DOCTYPE,
229
+
230
+ // 12.2.5.54 Before DOCTYPE name state
231
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
85
232
  GUMBO_LEX_BEFORE_DOCTYPE_NAME,
233
+
234
+ // 12.2.5.55 DOCTYPE name state
235
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
86
236
  GUMBO_LEX_DOCTYPE_NAME,
237
+
238
+ // 12.2.5.56 After DOCTYPE name state
239
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
87
240
  GUMBO_LEX_AFTER_DOCTYPE_NAME,
241
+
242
+ // 12.2.5.57 After DOCTYPE public keyword state
243
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
88
244
  GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD,
245
+
246
+ // 12.2.5.58 Before DOCTYPE public identifier state
247
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
89
248
  GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID,
249
+
250
+ // 12.2.5.59 DOCTYPE public identifier (double-quoted) state
251
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
90
252
  GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED,
253
+
254
+ // 12.2.5.60 DOCTYPE public identifier (single-quoted) state
255
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
91
256
  GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED,
257
+
258
+ // 12.2.5.61 After DOCTYPE public identifier state
259
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
92
260
  GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID,
261
+
262
+ // 12.2.5.62 Between DOCTYPE public and system identifiers state
263
+ // https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
93
264
  GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID,
265
+
266
+ // 12.2.5.63 After DOCTYPE system keyword state
267
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
94
268
  GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD,
269
+
270
+ // 12.2.5.64 Before DOCTYPE system identifier state
271
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
95
272
  GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID,
273
+
274
+ // 12.2.5.65 DOCTYPE system identifier (double-quoted) state
275
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
96
276
  GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED,
277
+
278
+ // 12.2.5.66 DOCTYPE system identifier (single-quoted) state
279
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
97
280
  GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED,
281
+
282
+ // 12.2.5.67 After DOCTYPE system identifier state
283
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
98
284
  GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID,
285
+
286
+ // 12.2.5.68 Bogus DOCTYPE state
287
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
99
288
  GUMBO_LEX_BOGUS_DOCTYPE,
100
- GUMBO_LEX_CDATA
289
+
290
+ // 12.2.5.69 CDATA section state
291
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
292
+ GUMBO_LEX_CDATA_SECTION,
293
+
294
+ // 12.2.5.70 CDATA section bracket state
295
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
296
+ GUMBO_LEX_CDATA_SECTION_BRACKET,
297
+
298
+ // 12.2.5.71 CDATA section end state
299
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
300
+ GUMBO_LEX_CDATA_SECTION_END,
301
+
302
+ // 12.2.5.72 Character reference state
303
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
304
+ GUMBO_LEX_CHARACTER_REFERENCE,
305
+
306
+ // 12.2.5.73 Named character reference state
307
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
308
+ GUMBO_LEX_NAMED_CHARACTER_REFERENCE,
309
+
310
+ // 12.2.5.74 Ambiguous ampersand state
311
+ // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
312
+ GUMBO_LEX_AMBIGUOUS_AMPERSAND,
313
+
314
+ // 12.2.5.75 Numeric character reference state
315
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
316
+ GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE,
317
+
318
+ // 12.2.5.76 Hexadecimal character reference start state
319
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
320
+ GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START,
321
+
322
+ // 12.2.5.77 Decimal character reference start state
323
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
324
+ GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START,
325
+
326
+ // 12.2.5.78 Hexadecimal character reference state
327
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
328
+ GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE,
329
+
330
+ // 12.2.5.79 Decimal character reference state
331
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
332
+ GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE,
333
+
334
+ // 12.2.5.80 Numeric character reference end state
335
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
336
+ GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END
101
337
  } GumboTokenizerEnum;
102
338
 
103
- #endif // GUMBO_TOKENIZER_STATES_H_
339
+ #endif // GUMBO_TOKENIZER_STATES_H_