nokogumbo 2.0.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/nokogumbo/extconf.rb +1 -1
- data/ext/nokogumbo/nokogumbo.c +1 -0
- data/gumbo-parser/src/error.c +17 -8
- data/gumbo-parser/src/gumbo.h +8 -0
- data/gumbo-parser/src/parser.c +473 -480
- data/gumbo-parser/src/tokenizer.c +12 -25
- data/gumbo-parser/src/tokenizer.h +2 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +3 -4
@@ -20,10 +20,7 @@
|
|
20
20
|
Coding conventions specific to this file:
|
21
21
|
|
22
22
|
1. Functions that fill in a token should be named emit_*, and should be
|
23
|
-
followed immediately by a return from the tokenizer (true if no error
|
24
|
-
occurred, false if an error occurred). Sometimes the emit functions
|
25
|
-
themselves return a boolean so that they can be combined with the return
|
26
|
-
statement; in this case, they should match this convention.
|
23
|
+
followed immediately by a return from the tokenizer.
|
27
24
|
2. Functions that shuffle data from temporaries to final API structures
|
28
25
|
should be named finish_*, and be called just before the tokenizer exits the
|
29
26
|
state that accumulates the temporary.
|
@@ -141,10 +138,6 @@ typedef struct GumboInternalTokenizerState {
|
|
141
138
|
// text tokens emitted will be GUMBO_TOKEN_CDATA.
|
142
139
|
bool _is_in_cdata;
|
143
140
|
|
144
|
-
// A flag indicating whether the tokenizer has seen a parse error since the
|
145
|
-
// last token was emitted.
|
146
|
-
bool _parse_error;
|
147
|
-
|
148
141
|
// Certain states (notably character references) may emit two character tokens
|
149
142
|
// at once, but the contract for lex() fills in only one token at a time. The
|
150
143
|
// extra character is buffered here, and then this is checked on entry to
|
@@ -207,7 +200,6 @@ static void tokenizer_add_parse_error (
|
|
207
200
|
GumboErrorType type
|
208
201
|
) {
|
209
202
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
210
|
-
tokenizer->_parse_error = true;
|
211
203
|
GumboError* error = gumbo_add_error(parser);
|
212
204
|
if (!error) {
|
213
205
|
return;
|
@@ -228,7 +220,6 @@ static void tokenizer_add_char_ref_error (
|
|
228
220
|
int codepoint
|
229
221
|
) {
|
230
222
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
231
|
-
tokenizer->_parse_error = true;
|
232
223
|
GumboError* error = gumbo_add_error(parser);
|
233
224
|
if (!error)
|
234
225
|
return;
|
@@ -248,7 +239,6 @@ static void tokenizer_add_token_parse_error (
|
|
248
239
|
GumboErrorType type
|
249
240
|
) {
|
250
241
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
251
|
-
tokenizer->_parse_error = true;
|
252
242
|
GumboError* error = gumbo_add_error(parser);
|
253
243
|
if (!error)
|
254
244
|
return;
|
@@ -732,7 +722,10 @@ static void copy_over_original_tag_text (
|
|
732
722
|
original_text->data = tag_state->_original_text;
|
733
723
|
original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
|
734
724
|
tag_state->_original_text;
|
735
|
-
if (original_text->data[original_text->length - 1] == '\r') {
|
725
|
+
if (
|
726
|
+
original_text->length
|
727
|
+
&& original_text->data[original_text->length - 1] == '\r'
|
728
|
+
) {
|
736
729
|
// Since \r is skipped by the UTF-8 iterator, it can sometimes end up
|
737
730
|
// appended to the end of original text even when it's really the first part
|
738
731
|
// of the next character. If we detect this situation, shrink the length of
|
@@ -770,7 +763,6 @@ static void finish_tag_name(GumboParser* parser) {
|
|
770
763
|
// Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
|
771
764
|
static void add_duplicate_attr_error(GumboParser* parser) {
|
772
765
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
773
|
-
tokenizer->_parse_error = true;
|
774
766
|
GumboError* error = gumbo_add_error(parser);
|
775
767
|
if (!error) {
|
776
768
|
return;
|
@@ -788,9 +780,8 @@ static void add_duplicate_attr_error(GumboParser* parser) {
|
|
788
780
|
// the attribute's name. The attribute's value starts out as the empty string
|
789
781
|
// (following the "Boolean attributes" section of the spec) and is only
|
790
782
|
// overwritten on finish_attribute_value(). If the attribute has already been
|
791
|
-
// specified, the new attribute is dropped
|
792
|
-
|
793
|
-
static bool finish_attribute_name(GumboParser* parser) {
|
783
|
+
// specified, the new attribute is dropped and a parse error is added
|
784
|
+
static void finish_attribute_name(GumboParser* parser) {
|
794
785
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
795
786
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
796
787
|
// May've been set by a previous attribute without a value; reset it here.
|
@@ -813,7 +804,7 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
813
804
|
add_duplicate_attr_error(parser);
|
814
805
|
reinitialize_tag_buffer(parser);
|
815
806
|
tag_state->_drop_next_attr_value = true;
|
816
|
-
return false;
|
807
|
+
return;
|
817
808
|
}
|
818
809
|
}
|
819
810
|
|
@@ -835,7 +826,6 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
835
826
|
);
|
836
827
|
gumbo_vector_add(attr, attributes);
|
837
828
|
reinitialize_tag_buffer(parser);
|
838
|
-
return true;
|
839
829
|
}
|
840
830
|
|
841
831
|
// Finishes an attribute value. This sets the value of the most recently added
|
@@ -881,7 +871,6 @@ void gumbo_tokenizer_state_init (
|
|
881
871
|
tokenizer->_reconsume_current_input = false;
|
882
872
|
tokenizer->_is_adjusted_current_node_foreign = false;
|
883
873
|
tokenizer->_is_in_cdata = false;
|
884
|
-
tokenizer->_parse_error = false;
|
885
874
|
tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
|
886
875
|
tokenizer->_tag_state._name = NULL;
|
887
876
|
|
@@ -3373,7 +3362,7 @@ static GumboLexerStateFunction dispatch_table[] = {
|
|
3373
3362
|
[GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
|
3374
3363
|
};
|
3375
3364
|
|
3376
|
-
|
3365
|
+
void gumbo_lex(GumboParser* parser, GumboToken* output) {
|
3377
3366
|
// Because of the spec requirements that...
|
3378
3367
|
//
|
3379
3368
|
// 1. Tokens be handled immediately by the parser upon emission.
|
@@ -3398,15 +3387,13 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
3398
3387
|
// isn't consumed twice.
|
3399
3388
|
tokenizer->_reconsume_current_input = false;
|
3400
3389
|
tokenizer->_buffered_emit_char = kGumboNoChar;
|
3401
|
-
return
|
3390
|
+
return;
|
3402
3391
|
}
|
3403
3392
|
|
3404
3393
|
if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
|
3405
|
-
|
3406
|
-
return true;
|
3394
|
+
return;
|
3407
3395
|
}
|
3408
3396
|
|
3409
|
-
tokenizer->_parse_error = false;
|
3410
3397
|
while (1) {
|
3411
3398
|
assert(!tokenizer->_resume_pos);
|
3412
3399
|
assert(tokenizer->_buffered_emit_char == kGumboNoChar);
|
@@ -3420,7 +3407,7 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
3420
3407
|
tokenizer->_reconsume_current_input = false;
|
3421
3408
|
|
3422
3409
|
if (result == EMIT_TOKEN)
|
3423
|
-
return
|
3410
|
+
return;
|
3424
3411
|
|
3425
3412
|
if (should_advance) {
|
3426
3413
|
utf8iterator_next(&tokenizer->_input);
|
@@ -93,19 +93,8 @@ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
|
|
93
93
|
);
|
94
94
|
|
95
95
|
// Lexes a single token from the specified buffer, filling the output with the
|
96
|
-
// parsed GumboToken data structure.
|
97
|
-
|
98
|
-
//
|
99
|
-
// Example:
|
100
|
-
// struct GumboInternalParser parser;
|
101
|
-
// GumboToken output;
|
102
|
-
// gumbo_tokenizer_state_init(&parser, text, strlen(text));
|
103
|
-
// while (gumbo_lex(&parser, &output)) {
|
104
|
-
// ...do stuff with output.
|
105
|
-
// gumbo_token_destroy(&token);
|
106
|
-
// }
|
107
|
-
// gumbo_tokenizer_state_destroy(&parser);
|
108
|
-
bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
|
96
|
+
// parsed GumboToken data structure.
|
97
|
+
void gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
|
109
98
|
|
110
99
|
// Frees the internally-allocated pointers within a GumboToken. Note that this
|
111
100
|
// doesn't free the token itself, since oftentimes it will be allocated on the
|
data/lib/nokogumbo/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.1
|
4
|
+
version: 2.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2019-11-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -109,8 +109,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
111
|
requirements: []
|
112
|
-
|
113
|
-
rubygems_version: 2.7.7
|
112
|
+
rubygems_version: 3.0.6
|
114
113
|
signing_key:
|
115
114
|
specification_version: 4
|
116
115
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|