nokogumbo 2.0.0.pre.alpha → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +81 -10
- data/ext/nokogumbo/extconf.rb +6 -1
- data/ext/nokogumbo/nokogumbo.c +579 -233
- data/gumbo-parser/src/ascii.c +42 -0
- data/gumbo-parser/src/ascii.h +91 -7
- data/gumbo-parser/src/char_ref.c +5973 -4601
- data/gumbo-parser/src/char_ref.h +13 -28
- data/gumbo-parser/src/error.c +376 -120
- data/gumbo-parser/src/error.h +63 -125
- data/gumbo-parser/src/gumbo.h +47 -4
- data/gumbo-parser/src/parser.c +849 -709
- data/gumbo-parser/src/string_buffer.c +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/tokenizer.c +1426 -1261
- data/gumbo-parser/src/tokenizer.h +5 -5
- data/gumbo-parser/src/tokenizer_states.h +275 -23
- data/gumbo-parser/src/utf8.c +12 -59
- data/gumbo-parser/src/utf8.h +51 -16
- data/lib/nokogumbo.rb +0 -1
- data/lib/nokogumbo/html5.rb +2 -1
- data/lib/nokogumbo/html5/document.rb +12 -1
- data/lib/nokogumbo/html5/document_fragment.rb +35 -20
- data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +16 -9
- data/CHANGELOG.md +0 -56
data/gumbo-parser/src/utf8.h
CHANGED
@@ -30,7 +30,8 @@ struct GumboInternalError;
|
|
30
30
|
struct GumboInternalParser;
|
31
31
|
|
32
32
|
// Unicode replacement char.
|
33
|
-
|
33
|
+
#define kUtf8ReplacementChar 0xFFFD
|
34
|
+
#define kUtf8MaxChar 0x10FFFF
|
34
35
|
|
35
36
|
typedef struct GumboInternalUtf8Iterator {
|
36
37
|
// Points at the start of the code point most recently read into 'current'.
|
@@ -60,9 +61,23 @@ typedef struct GumboInternalUtf8Iterator {
|
|
60
61
|
struct GumboInternalParser* _parser;
|
61
62
|
} Utf8Iterator;
|
62
63
|
|
63
|
-
// Returns true if this Unicode code point is
|
64
|
-
|
65
|
-
|
64
|
+
// Returns true if this Unicode code point is a surrogate.
|
65
|
+
CONST_FN static inline bool utf8_is_surrogate(int c) {
|
66
|
+
return c >= 0xD800 && c <= 0xDFFF;
|
67
|
+
}
|
68
|
+
|
69
|
+
// Returns true if this Unicode code point is a noncharacter.
|
70
|
+
CONST_FN static inline bool utf8_is_noncharacter(int c) {
|
71
|
+
return
|
72
|
+
(c >= 0xFDD0 && c <= 0xFDEF)
|
73
|
+
|| ((c & 0xFFFF) == 0xFFFE)
|
74
|
+
|| ((c & 0xFFFF) == 0xFFFF);
|
75
|
+
}
|
76
|
+
|
77
|
+
// Returns true if this Unicode code point is a control.
|
78
|
+
CONST_FN static inline bool utf8_is_control(int c) {
|
79
|
+
return ((unsigned int)c < 0x1Fu) || (c >= 0x7F && c <= 0x9F);
|
80
|
+
}
|
66
81
|
|
67
82
|
// Initializes a new Utf8Iterator from the given byte buffer. The source does
|
68
83
|
// not have to be NUL-terminated, but the length must be passed in explicitly.
|
@@ -77,20 +92,47 @@ void utf8iterator_init (
|
|
77
92
|
void utf8iterator_next(Utf8Iterator* iter);
|
78
93
|
|
79
94
|
// Returns the current code point as an integer.
|
80
|
-
int utf8iterator_current(const Utf8Iterator* iter)
|
95
|
+
static inline int utf8iterator_current(const Utf8Iterator* iter) {
|
96
|
+
return iter->_current;
|
97
|
+
}
|
81
98
|
|
82
99
|
// Retrieves and fills the output parameter with the current source position.
|
83
|
-
void utf8iterator_get_position(
|
84
|
-
|
100
|
+
static inline void utf8iterator_get_position (
|
101
|
+
const Utf8Iterator* iter,
|
102
|
+
GumboSourcePosition* output
|
103
|
+
) {
|
104
|
+
*output = iter->_pos;
|
105
|
+
}
|
106
|
+
|
107
|
+
// Retrieves the marked position.
|
108
|
+
static inline GumboSourcePosition utf8iterator_get_mark_position (
|
109
|
+
const Utf8Iterator* iter
|
110
|
+
) {
|
111
|
+
return iter->_mark_pos;
|
112
|
+
}
|
85
113
|
|
86
114
|
// Retrieves a character pointer to the start of the current character.
|
87
|
-
const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter)
|
115
|
+
static inline const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
|
116
|
+
return iter->_start;
|
117
|
+
}
|
118
|
+
|
119
|
+
// Retrieves the width of the current character.
|
120
|
+
static inline size_t utf8iterator_get_width(const Utf8Iterator* iter) {
|
121
|
+
return iter->_width;
|
122
|
+
}
|
88
123
|
|
89
124
|
// Retrieves a character pointer to 1 past the end of the buffer. This is
|
90
125
|
// necessary for certain state machines and string comparisons that would like
|
91
126
|
// to look directly for ASCII text in the buffer without going through the
|
92
127
|
// decoder.
|
93
|
-
const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter)
|
128
|
+
static inline const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
|
129
|
+
return iter->_end;
|
130
|
+
}
|
131
|
+
|
132
|
+
// Retrieves a character pointer to the marked position.
|
133
|
+
static inline const char* utf8iterator_get_mark_pointer(const Utf8Iterator* iter) {
|
134
|
+
return iter->_mark;
|
135
|
+
}
|
94
136
|
|
95
137
|
// If the upcoming text in the buffer matches the specified prefix (which has
|
96
138
|
// length 'length'), consume it and return true. Otherwise, return false with
|
@@ -114,13 +156,6 @@ void utf8iterator_mark(Utf8Iterator* iter);
|
|
114
156
|
// Returns the current input stream position to the mark.
|
115
157
|
void utf8iterator_reset(Utf8Iterator* iter);
|
116
158
|
|
117
|
-
// Sets the position and original text fields of an error to the value at the
|
118
|
-
// mark.
|
119
|
-
void utf8iterator_fill_error_at_mark (
|
120
|
-
Utf8Iterator* iter,
|
121
|
-
struct GumboInternalError* error
|
122
|
-
);
|
123
|
-
|
124
159
|
#ifdef __cplusplus
|
125
160
|
}
|
126
161
|
#endif
|
data/lib/nokogumbo.rb
CHANGED
data/lib/nokogumbo/html5.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'nokogumbo/html5/document'
|
2
2
|
require 'nokogumbo/html5/document_fragment'
|
3
|
+
require 'nokogumbo/html5/node'
|
3
4
|
|
4
5
|
module Nokogiri
|
5
6
|
# Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
|
@@ -167,7 +168,7 @@ module Nokogiri
|
|
167
168
|
case current_node.type
|
168
169
|
when XML::Node::ELEMENT_NODE
|
169
170
|
ns = current_node.namespace
|
170
|
-
ns_uri = ns.nil? ? nil : ns.
|
171
|
+
ns_uri = ns.nil? ? nil : ns.href
|
171
172
|
# XXX(sfc): attach namespaces to all nodes, even html?
|
172
173
|
if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
|
173
174
|
tagname = current_node.name
|
@@ -3,6 +3,7 @@ module Nokogiri
|
|
3
3
|
class Document < Nokogiri::HTML::Document
|
4
4
|
def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
|
5
5
|
yield options if block_given?
|
6
|
+
string_or_io = '' unless string_or_io
|
6
7
|
|
7
8
|
if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != 'ASCII-8BIT'
|
8
9
|
encoding ||= string_or_io.encoding.name
|
@@ -15,7 +16,7 @@ module Nokogiri
|
|
15
16
|
end
|
16
17
|
|
17
18
|
def self.read_io(io, url = nil, encoding = nil, **options)
|
18
|
-
raise ArgumentError.new("io object doesn't respond to :read") unless io.
|
19
|
+
raise ArgumentError.new("io object doesn't respond to :read") unless io.respond_to?(:read)
|
19
20
|
do_parse(io, url, encoding, options)
|
20
21
|
end
|
21
22
|
|
@@ -23,6 +24,16 @@ module Nokogiri
|
|
23
24
|
do_parse(string.to_s, url, encoding, options)
|
24
25
|
end
|
25
26
|
|
27
|
+
def fragment(tags = nil)
|
28
|
+
DocumentFragment.new(self, tags, self.root)
|
29
|
+
end
|
30
|
+
|
31
|
+
def to_xml(options = {}, &block)
|
32
|
+
# Bypass XML::Document#to_xml which doesn't add
|
33
|
+
# XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
|
34
|
+
XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
|
35
|
+
end
|
36
|
+
|
26
37
|
private
|
27
38
|
def self.do_parse(string_or_io, url, encoding, options)
|
28
39
|
string = HTML5.read_and_encode(string_or_io, encoding)
|
@@ -3,29 +3,19 @@ require 'nokogiri'
|
|
3
3
|
module Nokogiri
|
4
4
|
module HTML5
|
5
5
|
class DocumentFragment < Nokogiri::HTML::DocumentFragment
|
6
|
+
attr_accessor :document
|
7
|
+
attr_accessor :errors
|
8
|
+
|
6
9
|
# Create a document fragment.
|
7
10
|
def initialize(doc, tags = nil, ctx = nil, options = {})
|
11
|
+
self.document = doc
|
12
|
+
self.errors = []
|
8
13
|
return self unless tags
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
# Copied from Nokogiri's document_fragment.rb and labled "a horrible
|
15
|
-
# hack."
|
16
|
-
if tags.strip =~ /^<body/i
|
17
|
-
path = "/html/body"
|
18
|
-
else
|
19
|
-
path = "/html/body/node()"
|
20
|
-
end
|
21
|
-
# Add 2 for <html> and <body>.
|
22
|
-
max_depth = (options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH) + 2
|
23
|
-
options = options.dup
|
24
|
-
options[:max_tree_depth] = max_depth
|
25
|
-
temp_doc = HTML5.parse("<!DOCTYPE html><html><body>#{tags}", options)
|
26
|
-
temp_doc.xpath(path).each { |child| child.parent = self }
|
27
|
-
self.errors = temp_doc.errors
|
28
|
-
end
|
14
|
+
|
15
|
+
max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
|
16
|
+
max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
|
17
|
+
tags = Nokogiri::HTML5.read_and_encode(tags, nil)
|
18
|
+
Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
|
29
19
|
end
|
30
20
|
|
31
21
|
def serialize(options = {}, &block)
|
@@ -41,6 +31,31 @@ module Nokogiri
|
|
41
31
|
doc.encoding = 'UTF-8'
|
42
32
|
new(doc, tags, nil, options)
|
43
33
|
end
|
34
|
+
|
35
|
+
def extract_params params # :nodoc:
|
36
|
+
handler = params.find do |param|
|
37
|
+
![Hash, String, Symbol].include?(param.class)
|
38
|
+
end
|
39
|
+
params -= [handler] if handler
|
40
|
+
|
41
|
+
hashes = []
|
42
|
+
while Hash === params.last || params.last.nil?
|
43
|
+
hashes << params.pop
|
44
|
+
break if params.empty?
|
45
|
+
end
|
46
|
+
ns, binds = hashes.reverse
|
47
|
+
|
48
|
+
ns ||=
|
49
|
+
begin
|
50
|
+
ns = Hash.new
|
51
|
+
children.each { |child| ns.merge!(child.namespaces) }
|
52
|
+
ns
|
53
|
+
end
|
54
|
+
|
55
|
+
[params, handler, ns, binds]
|
56
|
+
end
|
57
|
+
|
44
58
|
end
|
45
59
|
end
|
46
60
|
end
|
61
|
+
# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
@@ -1,57 +1,72 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
|
3
3
|
module Nokogiri
|
4
|
-
|
5
|
-
|
6
|
-
class Node
|
4
|
+
module HTML5
|
5
|
+
module Node
|
7
6
|
# HTML elements can have attributes that contain colons.
|
8
7
|
# Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
|
9
8
|
# and tries to create an attribute in a namespace. This is especially
|
10
9
|
# annoying with attribute names like xml:lang since libxml2 will
|
11
10
|
# actually create the xml namespace if it doesn't exist already.
|
12
|
-
|
11
|
+
def add_child_node_and_reparent_attrs(node)
|
12
|
+
return super(node) unless document.is_a?(HTML5::Document)
|
13
|
+
# I'm not sure what this method is supposed to do. Reparenting
|
14
|
+
# namespaces is handled by libxml2, including child namespaces which
|
15
|
+
# this method wouldn't handle.
|
16
|
+
# https://github.com/sparklemotion/nokogiri/issues/1790
|
13
17
|
add_child_node(node)
|
14
|
-
node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
+
#node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
|
19
|
+
# attr.remove
|
20
|
+
# ns = attr.namespace
|
21
|
+
# a["#{ns.prefix}:#{attr.name}"] = attr.value
|
22
|
+
#end
|
18
23
|
end
|
19
24
|
|
20
25
|
def inner_html(options = {})
|
26
|
+
return super(options) unless document.is_a?(HTML5::Document)
|
21
27
|
result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? "\n" : ""
|
22
28
|
result << children.map { |child| child.to_html(options) }.join
|
23
29
|
result
|
24
30
|
end
|
25
31
|
|
26
32
|
def write_to(io, *options)
|
33
|
+
return super(io, *options) unless document.is_a?(HTML5::Document)
|
27
34
|
options = options.first.is_a?(Hash) ? options.shift : {}
|
28
35
|
encoding = options[:encoding] || options[0]
|
29
36
|
if Nokogiri.jruby?
|
30
37
|
save_options = options[:save_with] || options[1]
|
31
38
|
indent_times = options[:indent] || 0
|
32
39
|
else
|
33
|
-
save_options = options[:save_with] || options[1] || SaveOptions::FORMAT
|
40
|
+
save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
|
34
41
|
indent_times = options[:indent] || 2
|
35
42
|
end
|
36
43
|
indent_string = (options[:indent_text] || ' ') * indent_times
|
37
44
|
|
38
|
-
config = SaveOptions.new(save_options.to_i)
|
45
|
+
config = XML::Node::SaveOptions.new(save_options.to_i)
|
39
46
|
yield config if block_given?
|
40
47
|
|
41
48
|
config_options = config.options
|
42
|
-
if (config_options & (SaveOptions::AS_XML | SaveOptions::AS_XHTML) != 0)
|
49
|
+
if (config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0)
|
43
50
|
# Use Nokogiri's serializing code.
|
44
51
|
native_write_to(io, encoding, indent_string, config_options)
|
45
52
|
else
|
46
53
|
# Serialize including the current node.
|
47
54
|
encoding ||= document.encoding || Encoding::UTF_8
|
48
55
|
internal_ops = {
|
49
|
-
trailing_nl: config_options & SaveOptions::FORMAT != 0,
|
50
56
|
preserve_newline: options[:preserve_newline] || false
|
51
57
|
}
|
52
|
-
HTML5.serialize_node_internal(self, io, encoding,
|
58
|
+
HTML5.serialize_node_internal(self, io, encoding, internal_ops)
|
53
59
|
end
|
54
60
|
end
|
61
|
+
|
62
|
+
def fragment(tags)
|
63
|
+
return super(tags) unless document.is_a?(HTML5::Document)
|
64
|
+
DocumentFragment.new(document, tags, self)
|
65
|
+
end
|
55
66
|
end
|
67
|
+
# Monkey patch
|
68
|
+
XML::Node.prepend(HTML5::Node)
|
56
69
|
end
|
57
70
|
end
|
71
|
+
|
72
|
+
# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
data/lib/nokogumbo/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.0
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
@@ -9,22 +9,28 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-10-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '1.8'
|
18
21
|
- - ">="
|
19
22
|
- !ruby/object:Gem::Version
|
20
|
-
version:
|
23
|
+
version: 1.8.4
|
21
24
|
type: :runtime
|
22
25
|
prerelease: false
|
23
26
|
version_requirements: !ruby/object:Gem::Requirement
|
24
27
|
requirements:
|
28
|
+
- - "~>"
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.8'
|
25
31
|
- - ">="
|
26
32
|
- !ruby/object:Gem::Version
|
27
|
-
version:
|
33
|
+
version: 1.8.4
|
28
34
|
description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
|
29
35
|
access the result as a Nokogiri parsed document.
|
30
36
|
email:
|
@@ -35,7 +41,6 @@ extensions:
|
|
35
41
|
- ext/nokogumbo/extconf.rb
|
36
42
|
extra_rdoc_files: []
|
37
43
|
files:
|
38
|
-
- CHANGELOG.md
|
39
44
|
- LICENSE.txt
|
40
45
|
- README.md
|
41
46
|
- ext/nokogumbo/extconf.rb
|
@@ -63,6 +68,8 @@ files:
|
|
63
68
|
- gumbo-parser/src/tag.c
|
64
69
|
- gumbo-parser/src/tag_lookup.c
|
65
70
|
- gumbo-parser/src/tag_lookup.h
|
71
|
+
- gumbo-parser/src/token_buffer.c
|
72
|
+
- gumbo-parser/src/token_buffer.h
|
66
73
|
- gumbo-parser/src/token_type.h
|
67
74
|
- gumbo-parser/src/tokenizer.c
|
68
75
|
- gumbo-parser/src/tokenizer.h
|
@@ -77,8 +84,8 @@ files:
|
|
77
84
|
- lib/nokogumbo/html5.rb
|
78
85
|
- lib/nokogumbo/html5/document.rb
|
79
86
|
- lib/nokogumbo/html5/document_fragment.rb
|
87
|
+
- lib/nokogumbo/html5/node.rb
|
80
88
|
- lib/nokogumbo/version.rb
|
81
|
-
- lib/nokogumbo/xml/node.rb
|
82
89
|
homepage: https://github.com/rubys/nokogumbo/#readme
|
83
90
|
licenses:
|
84
91
|
- Apache-2.0
|
@@ -95,12 +102,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
95
102
|
requirements:
|
96
103
|
- - ">="
|
97
104
|
- !ruby/object:Gem::Version
|
98
|
-
version: '
|
105
|
+
version: '2.1'
|
99
106
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
107
|
requirements:
|
101
|
-
- - "
|
108
|
+
- - ">="
|
102
109
|
- !ruby/object:Gem::Version
|
103
|
-
version:
|
110
|
+
version: '0'
|
104
111
|
requirements: []
|
105
112
|
rubyforge_project:
|
106
113
|
rubygems_version: 2.7.6
|
data/CHANGELOG.md
DELETED
@@ -1,56 +0,0 @@
|
|
1
|
-
# Changelog
|
2
|
-
|
3
|
-
All notable changes to Nokogumbo will be documented in this file.
|
4
|
-
|
5
|
-
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
6
|
-
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
7
|
-
|
8
|
-
## [Unreleased]
|
9
|
-
### Added
|
10
|
-
- Experimental support for errors (it was supported in 1.5.0 but
|
11
|
-
undocumented).
|
12
|
-
- Added proper HTML5 serialization.
|
13
|
-
- Added option `:max_tree_depth` to control the maximum parse tree depth.
|
14
|
-
|
15
|
-
### Changed
|
16
|
-
- Integrated [Gumbo parser](https://github.com/google/gumbo-parser) into
|
17
|
-
Nokogumbo. A system version will not be used.
|
18
|
-
- The undocumented (but publicly mentioned) `:max_parse_errors` renamed to `:max_errors`;
|
19
|
-
`:max_parse_errors` is deprecated and will go away
|
20
|
-
- The various `#parse` and `#fragment` (and `Nokogiri.HTML5`) methods return
|
21
|
-
`Nokogiri::HTML5::Document` and `Nokogiri::HTML5::DocumentFragment` classes
|
22
|
-
rather than `Nokogiri::HTML::Document` and
|
23
|
-
`Nokogiri::HTML::DocumentFragment`.
|
24
|
-
- Changed the top-level API to more closely match Nokogiri's while maintaining
|
25
|
-
backwards compatibility. The new APIs are
|
26
|
-
* `Nokogiri::HTML5(html, url = nil, encoding = nil, **options, &block)`
|
27
|
-
* `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options, &block)`
|
28
|
-
* `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options, &block)`
|
29
|
-
* `Nokogiri::HTML5.fragment(html, encoding = nil, **options)`
|
30
|
-
* `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options)`
|
31
|
-
In all cases, `html` can be a string or an `IO` object (something that
|
32
|
-
responds to `#read`). The `url` parameter is entirely for error reporting,
|
33
|
-
as in Nokogiri. The `encoding` parameter only signals what encoding `html`
|
34
|
-
should have on input; the output `Document` or `DocumentFragment` will be in
|
35
|
-
UTF-8. Currently, the only options supported is `:max_errors` which controls
|
36
|
-
the maximum number of reported by `#errors`.
|
37
|
-
|
38
|
-
### Deprecated
|
39
|
-
- `:max_parse_errors`; use `:max_errors`
|
40
|
-
|
41
|
-
### Removed
|
42
|
-
|
43
|
-
### Fixed
|
44
|
-
- Fixed documents failing to serialize (via `to_html`) if they contain certain
|
45
|
-
`meta` elements that set the `charset`.
|
46
|
-
- Documents are now properly marked as UTF-8 after parsing.
|
47
|
-
- Fixed `Nokogiri::HTML5.fragment` reporting an error due to a missing
|
48
|
-
`<!DOCTYPE html>`.
|
49
|
-
- Fixed crash when input contains U+0000 NULL bytes and error reporting is
|
50
|
-
enabled.
|
51
|
-
|
52
|
-
### Security
|
53
|
-
- The most recent, released version of Gumbo has a [potential security
|
54
|
-
issue](https://github.com/google/gumbo-parser/pull/375) that could result in
|
55
|
-
a cross-site scripting vulnerability. This has been fixed by integrating
|
56
|
-
Gumbo into Nokogumbo.
|