nokogumbo 2.0.0 → 2.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +20 -4
- data/ext/nokogumbo/extconf.rb +50 -27
- data/ext/nokogumbo/nokogumbo.c +63 -14
- data/gumbo-parser/src/error.c +17 -8
- data/gumbo-parser/src/gumbo.h +27 -0
- data/gumbo-parser/src/parser.c +476 -480
- data/gumbo-parser/src/tokenizer.c +24 -27
- data/gumbo-parser/src/tokenizer.h +2 -13
- data/gumbo-parser/src/utf8.c +5 -0
- data/gumbo-parser/src/utf8.h +1 -0
- data/lib/nokogumbo.rb +22 -9
- data/lib/nokogumbo/html5.rb +15 -14
- data/lib/nokogumbo/html5/document.rb +7 -2
- data/lib/nokogumbo/html5/document_fragment.rb +2 -1
- data/lib/nokogumbo/version.rb +1 -1
- metadata +3 -4
data/gumbo-parser/src/tokenizer.c
CHANGED
@@ -20,10 +20,7 @@
|
|
20
20
|
Coding conventions specific to this file:
|
21
21
|
|
22
22
|
1. Functions that fill in a token should be named emit_*, and should be
|
23
|
-
followed immediately by a return from the tokenizer (true if no error
|
24
|
-
occurred, false if an error occurred). Sometimes the emit functions
|
25
|
-
themselves return a boolean so that they can be combined with the return
|
26
|
-
statement; in this case, they should match this convention.
|
23
|
+
followed immediately by a return from the tokenizer.
|
27
24
|
2. Functions that shuffle data from temporaries to final API structures
|
28
25
|
should be named finish_*, and be called just before the tokenizer exits the
|
29
26
|
state that accumulates the temporary.
|
@@ -141,10 +138,6 @@ typedef struct GumboInternalTokenizerState {
|
|
141
138
|
// text tokens emitted will be GUMBO_TOKEN_CDATA.
|
142
139
|
bool _is_in_cdata;
|
143
140
|
|
144
|
-
// A flag indicating whether the tokenizer has seen a parse error since the
|
145
|
-
// last token was emitted.
|
146
|
-
bool _parse_error;
|
147
|
-
|
148
141
|
// Certain states (notably character references) may emit two character tokens
|
149
142
|
// at once, but the contract for lex() fills in only one token at a time. The
|
150
143
|
// extra character is buffered here, and then this is checked on entry to
|
@@ -207,7 +200,6 @@ static void tokenizer_add_parse_error (
|
|
207
200
|
GumboErrorType type
|
208
201
|
) {
|
209
202
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
210
|
-
tokenizer->_parse_error = true;
|
211
203
|
GumboError* error = gumbo_add_error(parser);
|
212
204
|
if (!error) {
|
213
205
|
return;
|
@@ -228,7 +220,6 @@ static void tokenizer_add_char_ref_error (
|
|
228
220
|
int codepoint
|
229
221
|
) {
|
230
222
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
231
|
-
tokenizer->_parse_error = true;
|
232
223
|
GumboError* error = gumbo_add_error(parser);
|
233
224
|
if (!error)
|
234
225
|
return;
|
@@ -248,7 +239,6 @@ static void tokenizer_add_token_parse_error (
|
|
248
239
|
GumboErrorType type
|
249
240
|
) {
|
250
241
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
251
|
-
tokenizer->_parse_error = true;
|
252
242
|
GumboError* error = gumbo_add_error(parser);
|
253
243
|
if (!error)
|
254
244
|
return;
|
@@ -732,7 +722,10 @@ static void copy_over_original_tag_text (
|
|
732
722
|
original_text->data = tag_state->_original_text;
|
733
723
|
original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
|
734
724
|
tag_state->_original_text;
|
735
|
-
if (
|
725
|
+
if (
|
726
|
+
original_text->length
|
727
|
+
&& original_text->data[original_text->length - 1] == '\r'
|
728
|
+
) {
|
736
729
|
// Since \r is skipped by the UTF-8 iterator, it can sometimes end up
|
737
730
|
// appended to the end of original text even when it's really the first part
|
738
731
|
// of the next character. If we detect this situation, shrink the length of
|
@@ -770,7 +763,6 @@ static void finish_tag_name(GumboParser* parser) {
|
|
770
763
|
// Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
|
771
764
|
static void add_duplicate_attr_error(GumboParser* parser) {
|
772
765
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
773
|
-
tokenizer->_parse_error = true;
|
774
766
|
GumboError* error = gumbo_add_error(parser);
|
775
767
|
if (!error) {
|
776
768
|
return;
|
@@ -788,17 +780,26 @@ static void add_duplicate_attr_error(GumboParser* parser) {
|
|
788
780
|
// the attribute's name. The attribute's value starts out as the empty string
|
789
781
|
// (following the "Boolean attributes" section of the spec) and is only
|
790
782
|
// overwritten on finish_attribute_value(). If the attribute has already been
|
791
|
-
// specified, the new attribute is dropped
|
792
|
-
|
793
|
-
static bool finish_attribute_name(GumboParser* parser) {
|
783
|
+
// specified, the new attribute is dropped and a parse error is added
|
784
|
+
static void finish_attribute_name(GumboParser* parser) {
|
794
785
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
795
786
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
787
|
+
GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
|
788
|
+
|
789
|
+
int max_attributes = parser->_options->max_attributes;
|
790
|
+
if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
|
791
|
+
parser->_output->status = GUMBO_STATUS_TOO_MANY_ATTRIBUTES;
|
792
|
+
gumbo_debug("Attributes limit exceeded.\n");
|
793
|
+
reinitialize_tag_buffer(parser);
|
794
|
+
tag_state->_drop_next_attr_value = true;
|
795
|
+
return;
|
796
|
+
}
|
797
|
+
|
796
798
|
// May've been set by a previous attribute without a value; reset it here.
|
797
799
|
tag_state->_drop_next_attr_value = false;
|
798
800
|
assert(tag_state->_attributes.data);
|
799
801
|
assert(tag_state->_attributes.capacity);
|
800
802
|
|
801
|
-
GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
|
802
803
|
for (unsigned int i = 0; i < attributes->length; ++i) {
|
803
804
|
GumboAttribute* attr = attributes->data[i];
|
804
805
|
if (
|
@@ -813,7 +814,7 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
813
814
|
add_duplicate_attr_error(parser);
|
814
815
|
reinitialize_tag_buffer(parser);
|
815
816
|
tag_state->_drop_next_attr_value = true;
|
816
|
-
return false;
|
817
|
+
return;
|
817
818
|
}
|
818
819
|
}
|
819
820
|
|
@@ -835,7 +836,6 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
835
836
|
);
|
836
837
|
gumbo_vector_add(attr, attributes);
|
837
838
|
reinitialize_tag_buffer(parser);
|
838
|
-
return true;
|
839
839
|
}
|
840
840
|
|
841
841
|
// Finishes an attribute value. This sets the value of the most recently added
|
@@ -881,7 +881,6 @@ void gumbo_tokenizer_state_init (
|
|
881
881
|
tokenizer->_reconsume_current_input = false;
|
882
882
|
tokenizer->_is_adjusted_current_node_foreign = false;
|
883
883
|
tokenizer->_is_in_cdata = false;
|
884
|
-
tokenizer->_parse_error = false;
|
885
884
|
tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
|
886
885
|
tokenizer->_tag_state._name = NULL;
|
887
886
|
|
@@ -891,9 +890,9 @@ void gumbo_tokenizer_state_init (
|
|
891
890
|
|
892
891
|
mark_tag_state_as_empty(&tokenizer->_tag_state);
|
893
892
|
|
894
|
-
tokenizer->_token_start = text;
|
895
893
|
utf8iterator_init(parser, text, text_length, &tokenizer->_input);
|
896
894
|
utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
|
895
|
+
tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
|
897
896
|
doc_type_state_init(parser);
|
898
897
|
}
|
899
898
|
|
@@ -3373,7 +3372,7 @@ static GumboLexerStateFunction dispatch_table[] = {
|
|
3373
3372
|
[GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
|
3374
3373
|
};
|
3375
3374
|
|
3376
|
-
|
3375
|
+
void gumbo_lex(GumboParser* parser, GumboToken* output) {
|
3377
3376
|
// Because of the spec requirements that...
|
3378
3377
|
//
|
3379
3378
|
// 1. Tokens be handled immediately by the parser upon emission.
|
@@ -3398,15 +3397,13 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
3398
3397
|
// isn't consumed twice.
|
3399
3398
|
tokenizer->_reconsume_current_input = false;
|
3400
3399
|
tokenizer->_buffered_emit_char = kGumboNoChar;
|
3401
|
-
return true;
|
3400
|
+
return;
|
3402
3401
|
}
|
3403
3402
|
|
3404
3403
|
if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
|
3405
|
-
|
3406
|
-
return true;
|
3404
|
+
return;
|
3407
3405
|
}
|
3408
3406
|
|
3409
|
-
tokenizer->_parse_error = false;
|
3410
3407
|
while (1) {
|
3411
3408
|
assert(!tokenizer->_resume_pos);
|
3412
3409
|
assert(tokenizer->_buffered_emit_char == kGumboNoChar);
|
@@ -3420,7 +3417,7 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
3420
3417
|
tokenizer->_reconsume_current_input = false;
|
3421
3418
|
|
3422
3419
|
if (result == EMIT_TOKEN)
|
3423
|
-
return
|
3420
|
+
return;
|
3424
3421
|
|
3425
3422
|
if (should_advance) {
|
3426
3423
|
utf8iterator_next(&tokenizer->_input);
|
data/gumbo-parser/src/tokenizer.h
CHANGED
@@ -93,19 +93,8 @@ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
|
|
93
93
|
);
|
94
94
|
|
95
95
|
// Lexes a single token from the specified buffer, filling the output with the
|
96
|
-
// parsed GumboToken data structure.
|
97
|
-
|
98
|
-
//
|
99
|
-
// Example:
|
100
|
-
// struct GumboInternalParser parser;
|
101
|
-
// GumboToken output;
|
102
|
-
// gumbo_tokenizer_state_init(&parser, text, strlen(text));
|
103
|
-
// while (gumbo_lex(&parser, &output)) {
|
104
|
-
// ...do stuff with output.
|
105
|
-
// gumbo_token_destroy(&token);
|
106
|
-
// }
|
107
|
-
// gumbo_tokenizer_state_destroy(&parser);
|
108
|
-
bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
|
96
|
+
// parsed GumboToken data structure.
|
97
|
+
void gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
|
109
98
|
|
110
99
|
// Frees the internally-allocated pointers within a GumboToken. Note that this
|
111
100
|
// doesn't free the token itself, since oftentimes it will be allocated on the
|
data/gumbo-parser/src/utf8.c
CHANGED
@@ -193,6 +193,11 @@ void utf8iterator_init (
|
|
193
193
|
iter->_pos.offset = 0;
|
194
194
|
iter->_parser = parser;
|
195
195
|
read_char(iter);
|
196
|
+
if (iter->_current == kUtf8BomChar) {
|
197
|
+
iter->_start += iter->_width;
|
198
|
+
iter->_pos.offset += iter->_width;
|
199
|
+
read_char(iter);
|
200
|
+
}
|
196
201
|
}
|
197
202
|
|
198
203
|
void utf8iterator_next(Utf8Iterator* iter) {
|
data/gumbo-parser/src/utf8.h
CHANGED
data/lib/nokogumbo.rb
CHANGED
@@ -1,14 +1,27 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
-
require 'nokogumbo/version'
|
3
|
-
require 'nokogumbo/html5'
|
4
2
|
|
5
|
-
|
3
|
+
if ((defined?(Nokogiri::HTML5) && Nokogiri::HTML5.respond_to?(:parse)) &&
|
4
|
+
(defined?(Nokogiri::Gumbo) && Nokogiri::Gumbo.respond_to?(:parse)) &&
|
5
|
+
!(ENV.key?("NOKOGUMBO_IGNORE_NOKOGIRI_HTML5") && ENV["NOKOGUMBO_IGNORE_NOKOGIRI_HTML5"] != "false"))
|
6
|
+
|
7
|
+
warn "NOTE: nokogumbo: Using Nokogiri::HTML5 provided by Nokogiri. See https://github.com/sparklemotion/nokogiri/issues/2205 for more information."
|
8
|
+
|
9
|
+
::Nokogumbo = ::Nokogiri::Gumbo
|
10
|
+
else
|
11
|
+
require 'nokogumbo/html5'
|
12
|
+
require 'nokogumbo/nokogumbo'
|
6
13
|
|
7
|
-
module Nokogumbo
|
8
|
-
|
9
|
-
|
14
|
+
module Nokogumbo
|
15
|
+
# The default maximum number of attributes per element.
|
16
|
+
DEFAULT_MAX_ATTRIBUTES = 400
|
10
17
|
|
11
|
-
|
12
|
-
|
13
|
-
|
18
|
+
# The default maximum number of errors for parsing a document or a fragment.
|
19
|
+
DEFAULT_MAX_ERRORS = 0
|
20
|
+
|
21
|
+
# The default maximum depth of the DOM tree produced by parsing a document
|
22
|
+
# or fragment.
|
23
|
+
DEFAULT_MAX_TREE_DEPTH = 400
|
24
|
+
end
|
14
25
|
end
|
26
|
+
|
27
|
+
require 'nokogumbo/version'
|
data/lib/nokogumbo/html5.rb
CHANGED
@@ -19,7 +19,7 @@ module Nokogiri
|
|
19
19
|
|
20
20
|
# Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
|
21
21
|
def self.parse(string, url = nil, encoding = nil, **options, &block)
|
22
|
-
Document.parse(string, url, encoding, options, &block)
|
22
|
+
Document.parse(string, url, encoding, **options, &block)
|
23
23
|
end
|
24
24
|
|
25
25
|
# Parse a fragment from +string+. Convenience method for
|
@@ -92,19 +92,20 @@ module Nokogiri
|
|
92
92
|
if encoding.nil?
|
93
93
|
string = string.read
|
94
94
|
else
|
95
|
-
|
95
|
+
string = string.read(encoding: encoding)
|
96
96
|
end
|
97
97
|
else
|
98
98
|
# Otherwise the string has the given encoding.
|
99
|
-
|
99
|
+
string = string.to_s
|
100
|
+
if encoding
|
100
101
|
string = string.dup
|
101
102
|
string.force_encoding(encoding)
|
102
103
|
end
|
103
104
|
end
|
104
105
|
|
105
|
-
# convert to UTF-8
|
106
|
-
if string.
|
107
|
-
string = reencode(string
|
106
|
+
# convert to UTF-8
|
107
|
+
if string.encoding != Encoding::UTF_8
|
108
|
+
string = reencode(string)
|
108
109
|
end
|
109
110
|
string
|
110
111
|
end
|
@@ -123,18 +124,17 @@ module Nokogiri
|
|
123
124
|
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
|
124
125
|
#
|
125
126
|
def self.reencode(body, content_type=nil)
|
126
|
-
return body unless body.respond_to? :encoding
|
127
|
-
|
128
127
|
if body.encoding == Encoding::ASCII_8BIT
|
129
128
|
encoding = nil
|
130
129
|
|
131
130
|
# look for a Byte Order Mark (BOM)
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
131
|
+
initial_bytes = body[0..2].bytes
|
132
|
+
if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
|
133
|
+
encoding = Encoding::UTF_8
|
134
|
+
elsif initial_bytes[0..1] == [0xFE, 0xFF]
|
135
|
+
encoding = Encoding::UTF_16BE
|
136
|
+
elsif initial_bytes[0..1] == [0xFF, 0xFE]
|
137
|
+
encoding = Encoding::UTF_16LE
|
138
138
|
end
|
139
139
|
|
140
140
|
# look for a charset in a content-encoding header
|
@@ -154,6 +154,7 @@ module Nokogiri
|
|
154
154
|
encoding ||= Encoding::ISO_8859_1
|
155
155
|
|
156
156
|
# change the encoding to match the detected or inferred encoding
|
157
|
+
body = body.dup
|
157
158
|
begin
|
158
159
|
body.force_encoding(encoding)
|
159
160
|
rescue ArgumentError
|
data/lib/nokogumbo/html5/document.rb
CHANGED
@@ -12,6 +12,9 @@ module Nokogiri
|
|
12
12
|
if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
|
13
13
|
url ||= string_or_io.path
|
14
14
|
end
|
15
|
+
unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
|
16
|
+
raise ArgumentError.new("not a string or IO object")
|
17
|
+
end
|
15
18
|
do_parse(string_or_io, url, encoding, options)
|
16
19
|
end
|
17
20
|
|
@@ -21,7 +24,8 @@ module Nokogiri
|
|
21
24
|
end
|
22
25
|
|
23
26
|
def self.read_memory(string, url = nil, encoding = nil, **options)
|
24
|
-
|
27
|
+
raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
|
28
|
+
do_parse(string, url, encoding, options)
|
25
29
|
end
|
26
30
|
|
27
31
|
def fragment(tags = nil)
|
@@ -37,9 +41,10 @@ module Nokogiri
|
|
37
41
|
private
|
38
42
|
def self.do_parse(string_or_io, url, encoding, options)
|
39
43
|
string = HTML5.read_and_encode(string_or_io, encoding)
|
44
|
+
max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
|
40
45
|
max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
|
41
46
|
max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
|
42
|
-
doc = Nokogumbo.parse(string
|
47
|
+
doc = Nokogumbo.parse(string, url, max_attributes, max_errors, max_depth)
|
43
48
|
doc.encoding = 'UTF-8'
|
44
49
|
doc
|
45
50
|
end
|
data/lib/nokogumbo/html5/document_fragment.rb
CHANGED
@@ -12,10 +12,11 @@ module Nokogiri
|
|
12
12
|
self.errors = []
|
13
13
|
return self unless tags
|
14
14
|
|
15
|
+
max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
|
15
16
|
max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
|
16
17
|
max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
|
17
18
|
tags = Nokogiri::HTML5.read_and_encode(tags, nil)
|
18
|
-
Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
|
19
|
+
Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
|
19
20
|
end
|
20
21
|
|
21
22
|
def serialize(options = {}, &block)
|
data/lib/nokogumbo/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.0
|
4
|
+
version: 2.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2021-03-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -109,8 +109,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
111
|
requirements: []
|
112
|
-
|
113
|
-
rubygems_version: 2.7.6
|
112
|
+
rubygems_version: 3.1.4
|
114
113
|
signing_key:
|
115
114
|
specification_version: 4
|
116
115
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|