nokogumbo 2.0.1 → 2.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/nokogumbo/extconf.rb +1 -1
- data/ext/nokogumbo/nokogumbo.c +1 -0
- data/gumbo-parser/src/error.c +17 -8
- data/gumbo-parser/src/gumbo.h +8 -0
- data/gumbo-parser/src/parser.c +473 -480
- data/gumbo-parser/src/tokenizer.c +12 -25
- data/gumbo-parser/src/tokenizer.h +2 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +3 -4
@@ -20,10 +20,7 @@
|
|
20
20
|
Coding conventions specific to this file:
|
21
21
|
|
22
22
|
1. Functions that fill in a token should be named emit_*, and should be
|
23
|
-
followed immediately by a return from the tokenizer (true if no error
|
24
|
-
occurred, false if an error occurred). Sometimes the emit functions
|
25
|
-
themselves return a boolean so that they can be combined with the return
|
26
|
-
statement; in this case, they should match this convention.
|
23
|
+
followed immediately by a return from the tokenizer.
|
27
24
|
2. Functions that shuffle data from temporaries to final API structures
|
28
25
|
should be named finish_*, and be called just before the tokenizer exits the
|
29
26
|
state that accumulates the temporary.
|
@@ -141,10 +138,6 @@ typedef struct GumboInternalTokenizerState {
|
|
141
138
|
// text tokens emitted will be GUMBO_TOKEN_CDATA.
|
142
139
|
bool _is_in_cdata;
|
143
140
|
|
144
|
-
// A flag indicating whether the tokenizer has seen a parse error since the
|
145
|
-
// last token was emitted.
|
146
|
-
bool _parse_error;
|
147
|
-
|
148
141
|
// Certain states (notably character references) may emit two character tokens
|
149
142
|
// at once, but the contract for lex() fills in only one token at a time. The
|
150
143
|
// extra character is buffered here, and then this is checked on entry to
|
@@ -207,7 +200,6 @@ static void tokenizer_add_parse_error (
|
|
207
200
|
GumboErrorType type
|
208
201
|
) {
|
209
202
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
210
|
-
tokenizer->_parse_error = true;
|
211
203
|
GumboError* error = gumbo_add_error(parser);
|
212
204
|
if (!error) {
|
213
205
|
return;
|
@@ -228,7 +220,6 @@ static void tokenizer_add_char_ref_error (
|
|
228
220
|
int codepoint
|
229
221
|
) {
|
230
222
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
231
|
-
tokenizer->_parse_error = true;
|
232
223
|
GumboError* error = gumbo_add_error(parser);
|
233
224
|
if (!error)
|
234
225
|
return;
|
@@ -248,7 +239,6 @@ static void tokenizer_add_token_parse_error (
|
|
248
239
|
GumboErrorType type
|
249
240
|
) {
|
250
241
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
251
|
-
tokenizer->_parse_error = true;
|
252
242
|
GumboError* error = gumbo_add_error(parser);
|
253
243
|
if (!error)
|
254
244
|
return;
|
@@ -732,7 +722,10 @@ static void copy_over_original_tag_text (
|
|
732
722
|
original_text->data = tag_state->_original_text;
|
733
723
|
original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
|
734
724
|
tag_state->_original_text;
|
735
|
-
if (original_text->data[original_text->length - 1] == '\r') {
|
725
|
+
if (
|
726
|
+
original_text->length
|
727
|
+
&& original_text->data[original_text->length - 1] == '\r'
|
728
|
+
) {
|
736
729
|
// Since \r is skipped by the UTF-8 iterator, it can sometimes end up
|
737
730
|
// appended to the end of original text even when it's really the first part
|
738
731
|
// of the next character. If we detect this situation, shrink the length of
|
@@ -770,7 +763,6 @@ static void finish_tag_name(GumboParser* parser) {
|
|
770
763
|
// Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
|
771
764
|
static void add_duplicate_attr_error(GumboParser* parser) {
|
772
765
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
773
|
-
tokenizer->_parse_error = true;
|
774
766
|
GumboError* error = gumbo_add_error(parser);
|
775
767
|
if (!error) {
|
776
768
|
return;
|
@@ -788,9 +780,8 @@ static void add_duplicate_attr_error(GumboParser* parser) {
|
|
788
780
|
// the attribute's name. The attribute's value starts out as the empty string
|
789
781
|
// (following the "Boolean attributes" section of the spec) and is only
|
790
782
|
// overwritten on finish_attribute_value(). If the attribute has already been
|
791
|
-
// specified, the new attribute is dropped
|
792
|
-
|
793
|
-
static bool finish_attribute_name(GumboParser* parser) {
|
783
|
+
// specified, the new attribute is dropped and a parse error is added
|
784
|
+
static void finish_attribute_name(GumboParser* parser) {
|
794
785
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
795
786
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
796
787
|
// May've been set by a previous attribute without a value; reset it here.
|
@@ -813,7 +804,7 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
813
804
|
add_duplicate_attr_error(parser);
|
814
805
|
reinitialize_tag_buffer(parser);
|
815
806
|
tag_state->_drop_next_attr_value = true;
|
816
|
-
return false;
|
807
|
+
return;
|
817
808
|
}
|
818
809
|
}
|
819
810
|
|
@@ -835,7 +826,6 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
835
826
|
);
|
836
827
|
gumbo_vector_add(attr, attributes);
|
837
828
|
reinitialize_tag_buffer(parser);
|
838
|
-
return true;
|
839
829
|
}
|
840
830
|
|
841
831
|
// Finishes an attribute value. This sets the value of the most recently added
|
@@ -881,7 +871,6 @@ void gumbo_tokenizer_state_init (
|
|
881
871
|
tokenizer->_reconsume_current_input = false;
|
882
872
|
tokenizer->_is_adjusted_current_node_foreign = false;
|
883
873
|
tokenizer->_is_in_cdata = false;
|
884
|
-
tokenizer->_parse_error = false;
|
885
874
|
tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
|
886
875
|
tokenizer->_tag_state._name = NULL;
|
887
876
|
|
@@ -3373,7 +3362,7 @@ static GumboLexerStateFunction dispatch_table[] = {
|
|
3373
3362
|
[GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
|
3374
3363
|
};
|
3375
3364
|
|
3376
|
-
bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
3365
|
+
void gumbo_lex(GumboParser* parser, GumboToken* output) {
|
3377
3366
|
// Because of the spec requirements that...
|
3378
3367
|
//
|
3379
3368
|
// 1. Tokens be handled immediately by the parser upon emission.
|
@@ -3398,15 +3387,13 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
3398
3387
|
// isn't consumed twice.
|
3399
3388
|
tokenizer->_reconsume_current_input = false;
|
3400
3389
|
tokenizer->_buffered_emit_char = kGumboNoChar;
|
3401
|
-
return true;
|
3390
|
+
return;
|
3402
3391
|
}
|
3403
3392
|
|
3404
3393
|
if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
|
3405
|
-
|
3406
|
-
return true;
|
3394
|
+
return;
|
3407
3395
|
}
|
3408
3396
|
|
3409
|
-
tokenizer->_parse_error = false;
|
3410
3397
|
while (1) {
|
3411
3398
|
assert(!tokenizer->_resume_pos);
|
3412
3399
|
assert(tokenizer->_buffered_emit_char == kGumboNoChar);
|
@@ -3420,7 +3407,7 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
3420
3407
|
tokenizer->_reconsume_current_input = false;
|
3421
3408
|
|
3422
3409
|
if (result == EMIT_TOKEN)
|
3423
|
-
return
|
3410
|
+
return;
|
3424
3411
|
|
3425
3412
|
if (should_advance) {
|
3426
3413
|
utf8iterator_next(&tokenizer->_input);
|
@@ -93,19 +93,8 @@ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
|
|
93
93
|
);
|
94
94
|
|
95
95
|
// Lexes a single token from the specified buffer, filling the output with the
|
96
|
-
// parsed GumboToken data structure.
|
97
|
-
|
98
|
-
//
|
99
|
-
// Example:
|
100
|
-
// struct GumboInternalParser parser;
|
101
|
-
// GumboToken output;
|
102
|
-
// gumbo_tokenizer_state_init(&parser, text, strlen(text));
|
103
|
-
// while (gumbo_lex(&parser, &output)) {
|
104
|
-
// ...do stuff with output.
|
105
|
-
// gumbo_token_destroy(&token);
|
106
|
-
// }
|
107
|
-
// gumbo_tokenizer_state_destroy(&parser);
|
108
|
-
bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
|
96
|
+
// parsed GumboToken data structure.
|
97
|
+
void gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
|
109
98
|
|
110
99
|
// Frees the internally-allocated pointers within a GumboToken. Note that this
|
111
100
|
// doesn't free the token itself, since oftentimes it will be allocated on the
|
data/lib/nokogumbo/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.1
|
4
|
+
version: 2.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2019-11-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -109,8 +109,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
111
|
requirements: []
|
112
|
-
|
113
|
-
rubygems_version: 2.7.7
|
112
|
+
rubygems_version: 3.0.6
|
114
113
|
signing_key:
|
115
114
|
specification_version: 4
|
116
115
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|