nokogumbo 2.0.0 → 2.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +20 -4
- data/ext/nokogumbo/extconf.rb +50 -27
- data/ext/nokogumbo/nokogumbo.c +63 -14
- data/gumbo-parser/src/error.c +17 -8
- data/gumbo-parser/src/gumbo.h +27 -0
- data/gumbo-parser/src/parser.c +476 -480
- data/gumbo-parser/src/tokenizer.c +24 -27
- data/gumbo-parser/src/tokenizer.h +2 -13
- data/gumbo-parser/src/utf8.c +5 -0
- data/gumbo-parser/src/utf8.h +1 -0
- data/lib/nokogumbo.rb +22 -9
- data/lib/nokogumbo/html5.rb +15 -14
- data/lib/nokogumbo/html5/document.rb +7 -2
- data/lib/nokogumbo/html5/document_fragment.rb +2 -1
- data/lib/nokogumbo/version.rb +1 -1
- metadata +3 -4
@@ -20,10 +20,7 @@
|
|
20
20
|
Coding conventions specific to this file:
|
21
21
|
|
22
22
|
1. Functions that fill in a token should be named emit_*, and should be
|
23
|
-
followed immediately by a return from the tokenizer
|
24
|
-
occurred, false if an error occurred). Sometimes the emit functions
|
25
|
-
themselves return a boolean so that they can be combined with the return
|
26
|
-
statement; in this case, they should match this convention.
|
23
|
+
followed immediately by a return from the tokenizer.
|
27
24
|
2. Functions that shuffle data from temporaries to final API structures
|
28
25
|
should be named finish_*, and be called just before the tokenizer exits the
|
29
26
|
state that accumulates the temporary.
|
@@ -141,10 +138,6 @@ typedef struct GumboInternalTokenizerState {
|
|
141
138
|
// text tokens emitted will be GUMBO_TOKEN_CDATA.
|
142
139
|
bool _is_in_cdata;
|
143
140
|
|
144
|
-
// A flag indicating whether the tokenizer has seen a parse error since the
|
145
|
-
// last token was emitted.
|
146
|
-
bool _parse_error;
|
147
|
-
|
148
141
|
// Certain states (notably character references) may emit two character tokens
|
149
142
|
// at once, but the contract for lex() fills in only one token at a time. The
|
150
143
|
// extra character is buffered here, and then this is checked on entry to
|
@@ -207,7 +200,6 @@ static void tokenizer_add_parse_error (
|
|
207
200
|
GumboErrorType type
|
208
201
|
) {
|
209
202
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
210
|
-
tokenizer->_parse_error = true;
|
211
203
|
GumboError* error = gumbo_add_error(parser);
|
212
204
|
if (!error) {
|
213
205
|
return;
|
@@ -228,7 +220,6 @@ static void tokenizer_add_char_ref_error (
|
|
228
220
|
int codepoint
|
229
221
|
) {
|
230
222
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
231
|
-
tokenizer->_parse_error = true;
|
232
223
|
GumboError* error = gumbo_add_error(parser);
|
233
224
|
if (!error)
|
234
225
|
return;
|
@@ -248,7 +239,6 @@ static void tokenizer_add_token_parse_error (
|
|
248
239
|
GumboErrorType type
|
249
240
|
) {
|
250
241
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
251
|
-
tokenizer->_parse_error = true;
|
252
242
|
GumboError* error = gumbo_add_error(parser);
|
253
243
|
if (!error)
|
254
244
|
return;
|
@@ -732,7 +722,10 @@ static void copy_over_original_tag_text (
|
|
732
722
|
original_text->data = tag_state->_original_text;
|
733
723
|
original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
|
734
724
|
tag_state->_original_text;
|
735
|
-
if (
|
725
|
+
if (
|
726
|
+
original_text->length
|
727
|
+
&& original_text->data[original_text->length - 1] == '\r'
|
728
|
+
) {
|
736
729
|
// Since \r is skipped by the UTF-8 iterator, it can sometimes end up
|
737
730
|
// appended to the end of original text even when it's really the first part
|
738
731
|
// of the next character. If we detect this situation, shrink the length of
|
@@ -770,7 +763,6 @@ static void finish_tag_name(GumboParser* parser) {
|
|
770
763
|
// Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
|
771
764
|
static void add_duplicate_attr_error(GumboParser* parser) {
|
772
765
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
773
|
-
tokenizer->_parse_error = true;
|
774
766
|
GumboError* error = gumbo_add_error(parser);
|
775
767
|
if (!error) {
|
776
768
|
return;
|
@@ -788,17 +780,26 @@ static void add_duplicate_attr_error(GumboParser* parser) {
|
|
788
780
|
// the attribute's name. The attribute's value starts out as the empty string
|
789
781
|
// (following the "Boolean attributes" section of the spec) and is only
|
790
782
|
// overwritten on finish_attribute_value(). If the attribute has already been
|
791
|
-
// specified, the new attribute is dropped
|
792
|
-
|
793
|
-
static bool finish_attribute_name(GumboParser* parser) {
|
783
|
+
// specified, the new attribute is dropped and a parse error is added
|
784
|
+
static void finish_attribute_name(GumboParser* parser) {
|
794
785
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
795
786
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
787
|
+
GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
|
788
|
+
|
789
|
+
int max_attributes = parser->_options->max_attributes;
|
790
|
+
if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
|
791
|
+
parser->_output->status = GUMBO_STATUS_TOO_MANY_ATTRIBUTES;
|
792
|
+
gumbo_debug("Attributes limit exceeded.\n");
|
793
|
+
reinitialize_tag_buffer(parser);
|
794
|
+
tag_state->_drop_next_attr_value = true;
|
795
|
+
return;
|
796
|
+
}
|
797
|
+
|
796
798
|
// May've been set by a previous attribute without a value; reset it here.
|
797
799
|
tag_state->_drop_next_attr_value = false;
|
798
800
|
assert(tag_state->_attributes.data);
|
799
801
|
assert(tag_state->_attributes.capacity);
|
800
802
|
|
801
|
-
GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
|
802
803
|
for (unsigned int i = 0; i < attributes->length; ++i) {
|
803
804
|
GumboAttribute* attr = attributes->data[i];
|
804
805
|
if (
|
@@ -813,7 +814,7 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
813
814
|
add_duplicate_attr_error(parser);
|
814
815
|
reinitialize_tag_buffer(parser);
|
815
816
|
tag_state->_drop_next_attr_value = true;
|
816
|
-
return
|
817
|
+
return;
|
817
818
|
}
|
818
819
|
}
|
819
820
|
|
@@ -835,7 +836,6 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
835
836
|
);
|
836
837
|
gumbo_vector_add(attr, attributes);
|
837
838
|
reinitialize_tag_buffer(parser);
|
838
|
-
return true;
|
839
839
|
}
|
840
840
|
|
841
841
|
// Finishes an attribute value. This sets the value of the most recently added
|
@@ -881,7 +881,6 @@ void gumbo_tokenizer_state_init (
|
|
881
881
|
tokenizer->_reconsume_current_input = false;
|
882
882
|
tokenizer->_is_adjusted_current_node_foreign = false;
|
883
883
|
tokenizer->_is_in_cdata = false;
|
884
|
-
tokenizer->_parse_error = false;
|
885
884
|
tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
|
886
885
|
tokenizer->_tag_state._name = NULL;
|
887
886
|
|
@@ -891,9 +890,9 @@ void gumbo_tokenizer_state_init (
|
|
891
890
|
|
892
891
|
mark_tag_state_as_empty(&tokenizer->_tag_state);
|
893
892
|
|
894
|
-
tokenizer->_token_start = text;
|
895
893
|
utf8iterator_init(parser, text, text_length, &tokenizer->_input);
|
896
894
|
utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
|
895
|
+
tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
|
897
896
|
doc_type_state_init(parser);
|
898
897
|
}
|
899
898
|
|
@@ -3373,7 +3372,7 @@ static GumboLexerStateFunction dispatch_table[] = {
|
|
3373
3372
|
[GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
|
3374
3373
|
};
|
3375
3374
|
|
3376
|
-
|
3375
|
+
void gumbo_lex(GumboParser* parser, GumboToken* output) {
|
3377
3376
|
// Because of the spec requirements that...
|
3378
3377
|
//
|
3379
3378
|
// 1. Tokens be handled immediately by the parser upon emission.
|
@@ -3398,15 +3397,13 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
3398
3397
|
// isn't consumed twice.
|
3399
3398
|
tokenizer->_reconsume_current_input = false;
|
3400
3399
|
tokenizer->_buffered_emit_char = kGumboNoChar;
|
3401
|
-
return
|
3400
|
+
return;
|
3402
3401
|
}
|
3403
3402
|
|
3404
3403
|
if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
|
3405
|
-
|
3406
|
-
return true;
|
3404
|
+
return;
|
3407
3405
|
}
|
3408
3406
|
|
3409
|
-
tokenizer->_parse_error = false;
|
3410
3407
|
while (1) {
|
3411
3408
|
assert(!tokenizer->_resume_pos);
|
3412
3409
|
assert(tokenizer->_buffered_emit_char == kGumboNoChar);
|
@@ -3420,7 +3417,7 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
3420
3417
|
tokenizer->_reconsume_current_input = false;
|
3421
3418
|
|
3422
3419
|
if (result == EMIT_TOKEN)
|
3423
|
-
return
|
3420
|
+
return;
|
3424
3421
|
|
3425
3422
|
if (should_advance) {
|
3426
3423
|
utf8iterator_next(&tokenizer->_input);
|
@@ -93,19 +93,8 @@ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
|
|
93
93
|
);
|
94
94
|
|
95
95
|
// Lexes a single token from the specified buffer, filling the output with the
|
96
|
-
// parsed GumboToken data structure.
|
97
|
-
|
98
|
-
//
|
99
|
-
// Example:
|
100
|
-
// struct GumboInternalParser parser;
|
101
|
-
// GumboToken output;
|
102
|
-
// gumbo_tokenizer_state_init(&parser, text, strlen(text));
|
103
|
-
// while (gumbo_lex(&parser, &output)) {
|
104
|
-
// ...do stuff with output.
|
105
|
-
// gumbo_token_destroy(&token);
|
106
|
-
// }
|
107
|
-
// gumbo_tokenizer_state_destroy(&parser);
|
108
|
-
bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
|
96
|
+
// parsed GumboToken data structure.
|
97
|
+
void gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
|
109
98
|
|
110
99
|
// Frees the internally-allocated pointers within a GumboToken. Note that this
|
111
100
|
// doesn't free the token itself, since oftentimes it will be allocated on the
|
data/gumbo-parser/src/utf8.c
CHANGED
@@ -193,6 +193,11 @@ void utf8iterator_init (
|
|
193
193
|
iter->_pos.offset = 0;
|
194
194
|
iter->_parser = parser;
|
195
195
|
read_char(iter);
|
196
|
+
if (iter->_current == kUtf8BomChar) {
|
197
|
+
iter->_start += iter->_width;
|
198
|
+
iter->_pos.offset += iter->_width;
|
199
|
+
read_char(iter);
|
200
|
+
}
|
196
201
|
}
|
197
202
|
|
198
203
|
void utf8iterator_next(Utf8Iterator* iter) {
|
data/gumbo-parser/src/utf8.h
CHANGED
data/lib/nokogumbo.rb
CHANGED
@@ -1,14 +1,27 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
-
require 'nokogumbo/version'
|
3
|
-
require 'nokogumbo/html5'
|
4
2
|
|
5
|
-
|
3
|
+
if ((defined?(Nokogiri::HTML5) && Nokogiri::HTML5.respond_to?(:parse)) &&
|
4
|
+
(defined?(Nokogiri::Gumbo) && Nokogiri::Gumbo.respond_to?(:parse)) &&
|
5
|
+
!(ENV.key?("NOKOGUMBO_IGNORE_NOKOGIRI_HTML5") && ENV["NOKOGUMBO_IGNORE_NOKOGIRI_HTML5"] != "false"))
|
6
|
+
|
7
|
+
warn "NOTE: nokogumbo: Using Nokogiri::HTML5 provided by Nokogiri. See https://github.com/sparklemotion/nokogiri/issues/2205 for more information."
|
8
|
+
|
9
|
+
::Nokogumbo = ::Nokogiri::Gumbo
|
10
|
+
else
|
11
|
+
require 'nokogumbo/html5'
|
12
|
+
require 'nokogumbo/nokogumbo'
|
6
13
|
|
7
|
-
module Nokogumbo
|
8
|
-
|
9
|
-
|
14
|
+
module Nokogumbo
|
15
|
+
# The default maximum number of attributes per element.
|
16
|
+
DEFAULT_MAX_ATTRIBUTES = 400
|
10
17
|
|
11
|
-
|
12
|
-
|
13
|
-
|
18
|
+
# The default maximum number of errors for parsing a document or a fragment.
|
19
|
+
DEFAULT_MAX_ERRORS = 0
|
20
|
+
|
21
|
+
# The default maximum depth of the DOM tree produced by parsing a document
|
22
|
+
# or fragment.
|
23
|
+
DEFAULT_MAX_TREE_DEPTH = 400
|
24
|
+
end
|
14
25
|
end
|
26
|
+
|
27
|
+
require 'nokogumbo/version'
|
data/lib/nokogumbo/html5.rb
CHANGED
@@ -19,7 +19,7 @@ module Nokogiri
|
|
19
19
|
|
20
20
|
# Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
|
21
21
|
def self.parse(string, url = nil, encoding = nil, **options, &block)
|
22
|
-
Document.parse(string, url, encoding, options, &block)
|
22
|
+
Document.parse(string, url, encoding, **options, &block)
|
23
23
|
end
|
24
24
|
|
25
25
|
# Parse a fragment from +string+. Convenience method for
|
@@ -92,19 +92,20 @@ module Nokogiri
|
|
92
92
|
if encoding.nil?
|
93
93
|
string = string.read
|
94
94
|
else
|
95
|
-
|
95
|
+
string = string.read(encoding: encoding)
|
96
96
|
end
|
97
97
|
else
|
98
98
|
# Otherwise the string has the given encoding.
|
99
|
-
|
99
|
+
string = string.to_s
|
100
|
+
if encoding
|
100
101
|
string = string.dup
|
101
102
|
string.force_encoding(encoding)
|
102
103
|
end
|
103
104
|
end
|
104
105
|
|
105
|
-
# convert to UTF-8
|
106
|
-
if string.
|
107
|
-
string = reencode(string
|
106
|
+
# convert to UTF-8
|
107
|
+
if string.encoding != Encoding::UTF_8
|
108
|
+
string = reencode(string)
|
108
109
|
end
|
109
110
|
string
|
110
111
|
end
|
@@ -123,18 +124,17 @@ module Nokogiri
|
|
123
124
|
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
|
124
125
|
#
|
125
126
|
def self.reencode(body, content_type=nil)
|
126
|
-
return body unless body.respond_to? :encoding
|
127
|
-
|
128
127
|
if body.encoding == Encoding::ASCII_8BIT
|
129
128
|
encoding = nil
|
130
129
|
|
131
130
|
# look for a Byte Order Mark (BOM)
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
131
|
+
initial_bytes = body[0..2].bytes
|
132
|
+
if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
|
133
|
+
encoding = Encoding::UTF_8
|
134
|
+
elsif initial_bytes[0..1] == [0xFE, 0xFF]
|
135
|
+
encoding = Encoding::UTF_16BE
|
136
|
+
elsif initial_bytes[0..1] == [0xFF, 0xFE]
|
137
|
+
encoding = Encoding::UTF_16LE
|
138
138
|
end
|
139
139
|
|
140
140
|
# look for a charset in a content-encoding header
|
@@ -154,6 +154,7 @@ module Nokogiri
|
|
154
154
|
encoding ||= Encoding::ISO_8859_1
|
155
155
|
|
156
156
|
# change the encoding to match the detected or inferred encoding
|
157
|
+
body = body.dup
|
157
158
|
begin
|
158
159
|
body.force_encoding(encoding)
|
159
160
|
rescue ArgumentError
|
@@ -12,6 +12,9 @@ module Nokogiri
|
|
12
12
|
if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
|
13
13
|
url ||= string_or_io.path
|
14
14
|
end
|
15
|
+
unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
|
16
|
+
raise ArgumentError.new("not a string or IO object")
|
17
|
+
end
|
15
18
|
do_parse(string_or_io, url, encoding, options)
|
16
19
|
end
|
17
20
|
|
@@ -21,7 +24,8 @@ module Nokogiri
|
|
21
24
|
end
|
22
25
|
|
23
26
|
def self.read_memory(string, url = nil, encoding = nil, **options)
|
24
|
-
|
27
|
+
raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
|
28
|
+
do_parse(string, url, encoding, options)
|
25
29
|
end
|
26
30
|
|
27
31
|
def fragment(tags = nil)
|
@@ -37,9 +41,10 @@ module Nokogiri
|
|
37
41
|
private
|
38
42
|
def self.do_parse(string_or_io, url, encoding, options)
|
39
43
|
string = HTML5.read_and_encode(string_or_io, encoding)
|
44
|
+
max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
|
40
45
|
max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
|
41
46
|
max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
|
42
|
-
doc = Nokogumbo.parse(string
|
47
|
+
doc = Nokogumbo.parse(string, url, max_attributes, max_errors, max_depth)
|
43
48
|
doc.encoding = 'UTF-8'
|
44
49
|
doc
|
45
50
|
end
|
@@ -12,10 +12,11 @@ module Nokogiri
|
|
12
12
|
self.errors = []
|
13
13
|
return self unless tags
|
14
14
|
|
15
|
+
max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
|
15
16
|
max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
|
16
17
|
max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
|
17
18
|
tags = Nokogiri::HTML5.read_and_encode(tags, nil)
|
18
|
-
Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
|
19
|
+
Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
|
19
20
|
end
|
20
21
|
|
21
22
|
def serialize(options = {}, &block)
|
data/lib/nokogumbo/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.0
|
4
|
+
version: 2.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2021-03-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -109,8 +109,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
111
|
requirements: []
|
112
|
-
|
113
|
-
rubygems_version: 2.7.6
|
112
|
+
rubygems_version: 3.1.4
|
114
113
|
signing_key:
|
115
114
|
specification_version: 4
|
116
115
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|