nokogumbo 2.0.0.pre.alpha → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +81 -10
- data/ext/nokogumbo/extconf.rb +6 -1
- data/ext/nokogumbo/nokogumbo.c +579 -233
- data/gumbo-parser/src/ascii.c +42 -0
- data/gumbo-parser/src/ascii.h +91 -7
- data/gumbo-parser/src/char_ref.c +5973 -4601
- data/gumbo-parser/src/char_ref.h +13 -28
- data/gumbo-parser/src/error.c +376 -120
- data/gumbo-parser/src/error.h +63 -125
- data/gumbo-parser/src/gumbo.h +47 -4
- data/gumbo-parser/src/parser.c +849 -709
- data/gumbo-parser/src/string_buffer.c +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/tokenizer.c +1426 -1261
- data/gumbo-parser/src/tokenizer.h +5 -5
- data/gumbo-parser/src/tokenizer_states.h +275 -23
- data/gumbo-parser/src/utf8.c +12 -59
- data/gumbo-parser/src/utf8.h +51 -16
- data/lib/nokogumbo.rb +0 -1
- data/lib/nokogumbo/html5.rb +2 -1
- data/lib/nokogumbo/html5/document.rb +12 -1
- data/lib/nokogumbo/html5/document_fragment.rb +35 -20
- data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +16 -9
- data/CHANGELOG.md +0 -56
data/gumbo-parser/src/utf8.h
CHANGED
@@ -30,7 +30,8 @@ struct GumboInternalError;
|
|
30
30
|
struct GumboInternalParser;
|
31
31
|
|
32
32
|
// Unicode replacement char.
|
33
|
-
|
33
|
+
#define kUtf8ReplacementChar 0xFFFD
|
34
|
+
#define kUtf8MaxChar 0x10FFFF
|
34
35
|
|
35
36
|
typedef struct GumboInternalUtf8Iterator {
|
36
37
|
// Points at the start of the code point most recently read into 'current'.
|
@@ -60,9 +61,23 @@ typedef struct GumboInternalUtf8Iterator {
|
|
60
61
|
struct GumboInternalParser* _parser;
|
61
62
|
} Utf8Iterator;
|
62
63
|
|
63
|
-
// Returns true if this Unicode code point is
|
64
|
-
|
65
|
-
|
64
|
+
// Returns true if this Unicode code point is a surrogate.
|
65
|
+
CONST_FN static inline bool utf8_is_surrogate(int c) {
|
66
|
+
return c >= 0xD800 && c <= 0xDFFF;
|
67
|
+
}
|
68
|
+
|
69
|
+
// Returns true if this Unicode code point is a noncharacter.
|
70
|
+
CONST_FN static inline bool utf8_is_noncharacter(int c) {
|
71
|
+
return
|
72
|
+
(c >= 0xFDD0 && c <= 0xFDEF)
|
73
|
+
|| ((c & 0xFFFF) == 0xFFFE)
|
74
|
+
|| ((c & 0xFFFF) == 0xFFFF);
|
75
|
+
}
|
76
|
+
|
77
|
+
// Returns true if this Unicode code point is a control.
|
78
|
+
CONST_FN static inline bool utf8_is_control(int c) {
|
79
|
+
return ((unsigned int)c < 0x1Fu) || (c >= 0x7F && c <= 0x9F);
|
80
|
+
}
|
66
81
|
|
67
82
|
// Initializes a new Utf8Iterator from the given byte buffer. The source does
|
68
83
|
// not have to be NUL-terminated, but the length must be passed in explicitly.
|
@@ -77,20 +92,47 @@ void utf8iterator_init (
|
|
77
92
|
void utf8iterator_next(Utf8Iterator* iter);
|
78
93
|
|
79
94
|
// Returns the current code point as an integer.
|
80
|
-
int utf8iterator_current(const Utf8Iterator* iter)
|
95
|
+
static inline int utf8iterator_current(const Utf8Iterator* iter) {
|
96
|
+
return iter->_current;
|
97
|
+
}
|
81
98
|
|
82
99
|
// Retrieves and fills the output parameter with the current source position.
|
83
|
-
void utf8iterator_get_position(
|
84
|
-
|
100
|
+
static inline void utf8iterator_get_position (
|
101
|
+
const Utf8Iterator* iter,
|
102
|
+
GumboSourcePosition* output
|
103
|
+
) {
|
104
|
+
*output = iter->_pos;
|
105
|
+
}
|
106
|
+
|
107
|
+
// Retrieves the marked position.
|
108
|
+
static inline GumboSourcePosition utf8iterator_get_mark_position (
|
109
|
+
const Utf8Iterator* iter
|
110
|
+
) {
|
111
|
+
return iter->_mark_pos;
|
112
|
+
}
|
85
113
|
|
86
114
|
// Retrieves a character pointer to the start of the current character.
|
87
|
-
const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter)
|
115
|
+
static inline const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
|
116
|
+
return iter->_start;
|
117
|
+
}
|
118
|
+
|
119
|
+
// Retrieves the width of the current character.
|
120
|
+
static inline size_t utf8iterator_get_width(const Utf8Iterator* iter) {
|
121
|
+
return iter->_width;
|
122
|
+
}
|
88
123
|
|
89
124
|
// Retrieves a character pointer to 1 past the end of the buffer. This is
|
90
125
|
// necessary for certain state machines and string comparisons that would like
|
91
126
|
// to look directly for ASCII text in the buffer without going through the
|
92
127
|
// decoder.
|
93
|
-
const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter)
|
128
|
+
static inline const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
|
129
|
+
return iter->_end;
|
130
|
+
}
|
131
|
+
|
132
|
+
// Retrieves a character pointer to the marked position.
|
133
|
+
static inline const char* utf8iterator_get_mark_pointer(const Utf8Iterator* iter) {
|
134
|
+
return iter->_mark;
|
135
|
+
}
|
94
136
|
|
95
137
|
// If the upcoming text in the buffer matches the specified prefix (which has
|
96
138
|
// length 'length'), consume it and return true. Otherwise, return false with
|
@@ -114,13 +156,6 @@ void utf8iterator_mark(Utf8Iterator* iter);
|
|
114
156
|
// Returns the current input stream position to the mark.
|
115
157
|
void utf8iterator_reset(Utf8Iterator* iter);
|
116
158
|
|
117
|
-
// Sets the position and original text fields of an error to the value at the
|
118
|
-
// mark.
|
119
|
-
void utf8iterator_fill_error_at_mark (
|
120
|
-
Utf8Iterator* iter,
|
121
|
-
struct GumboInternalError* error
|
122
|
-
);
|
123
|
-
|
124
159
|
#ifdef __cplusplus
|
125
160
|
}
|
126
161
|
#endif
|
data/lib/nokogumbo.rb
CHANGED
data/lib/nokogumbo/html5.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'nokogumbo/html5/document'
|
2
2
|
require 'nokogumbo/html5/document_fragment'
|
3
|
+
require 'nokogumbo/html5/node'
|
3
4
|
|
4
5
|
module Nokogiri
|
5
6
|
# Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
|
@@ -167,7 +168,7 @@ module Nokogiri
|
|
167
168
|
case current_node.type
|
168
169
|
when XML::Node::ELEMENT_NODE
|
169
170
|
ns = current_node.namespace
|
170
|
-
ns_uri = ns.nil? ? nil : ns.
|
171
|
+
ns_uri = ns.nil? ? nil : ns.href
|
171
172
|
# XXX(sfc): attach namespaces to all nodes, even html?
|
172
173
|
if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
|
173
174
|
tagname = current_node.name
|
@@ -3,6 +3,7 @@ module Nokogiri
|
|
3
3
|
class Document < Nokogiri::HTML::Document
|
4
4
|
def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
|
5
5
|
yield options if block_given?
|
6
|
+
string_or_io = '' unless string_or_io
|
6
7
|
|
7
8
|
if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != 'ASCII-8BIT'
|
8
9
|
encoding ||= string_or_io.encoding.name
|
@@ -15,7 +16,7 @@ module Nokogiri
|
|
15
16
|
end
|
16
17
|
|
17
18
|
def self.read_io(io, url = nil, encoding = nil, **options)
|
18
|
-
raise ArgumentError.new("io object doesn't respond to :read") unless io.
|
19
|
+
raise ArgumentError.new("io object doesn't respond to :read") unless io.respond_to?(:read)
|
19
20
|
do_parse(io, url, encoding, options)
|
20
21
|
end
|
21
22
|
|
@@ -23,6 +24,16 @@ module Nokogiri
|
|
23
24
|
do_parse(string.to_s, url, encoding, options)
|
24
25
|
end
|
25
26
|
|
27
|
+
def fragment(tags = nil)
|
28
|
+
DocumentFragment.new(self, tags, self.root)
|
29
|
+
end
|
30
|
+
|
31
|
+
def to_xml(options = {}, &block)
|
32
|
+
# Bypass XML::Document#to_xml which doesn't add
|
33
|
+
# XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
|
34
|
+
XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
|
35
|
+
end
|
36
|
+
|
26
37
|
private
|
27
38
|
def self.do_parse(string_or_io, url, encoding, options)
|
28
39
|
string = HTML5.read_and_encode(string_or_io, encoding)
|
@@ -3,29 +3,19 @@ require 'nokogiri'
|
|
3
3
|
module Nokogiri
|
4
4
|
module HTML5
|
5
5
|
class DocumentFragment < Nokogiri::HTML::DocumentFragment
|
6
|
+
attr_accessor :document
|
7
|
+
attr_accessor :errors
|
8
|
+
|
6
9
|
# Create a document fragment.
|
7
10
|
def initialize(doc, tags = nil, ctx = nil, options = {})
|
11
|
+
self.document = doc
|
12
|
+
self.errors = []
|
8
13
|
return self unless tags
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
# Copied from Nokogiri's document_fragment.rb and labled "a horrible
|
15
|
-
# hack."
|
16
|
-
if tags.strip =~ /^<body/i
|
17
|
-
path = "/html/body"
|
18
|
-
else
|
19
|
-
path = "/html/body/node()"
|
20
|
-
end
|
21
|
-
# Add 2 for <html> and <body>.
|
22
|
-
max_depth = (options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH) + 2
|
23
|
-
options = options.dup
|
24
|
-
options[:max_tree_depth] = max_depth
|
25
|
-
temp_doc = HTML5.parse("<!DOCTYPE html><html><body>#{tags}", options)
|
26
|
-
temp_doc.xpath(path).each { |child| child.parent = self }
|
27
|
-
self.errors = temp_doc.errors
|
28
|
-
end
|
14
|
+
|
15
|
+
max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
|
16
|
+
max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
|
17
|
+
tags = Nokogiri::HTML5.read_and_encode(tags, nil)
|
18
|
+
Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
|
29
19
|
end
|
30
20
|
|
31
21
|
def serialize(options = {}, &block)
|
@@ -41,6 +31,31 @@ module Nokogiri
|
|
41
31
|
doc.encoding = 'UTF-8'
|
42
32
|
new(doc, tags, nil, options)
|
43
33
|
end
|
34
|
+
|
35
|
+
def extract_params params # :nodoc:
|
36
|
+
handler = params.find do |param|
|
37
|
+
![Hash, String, Symbol].include?(param.class)
|
38
|
+
end
|
39
|
+
params -= [handler] if handler
|
40
|
+
|
41
|
+
hashes = []
|
42
|
+
while Hash === params.last || params.last.nil?
|
43
|
+
hashes << params.pop
|
44
|
+
break if params.empty?
|
45
|
+
end
|
46
|
+
ns, binds = hashes.reverse
|
47
|
+
|
48
|
+
ns ||=
|
49
|
+
begin
|
50
|
+
ns = Hash.new
|
51
|
+
children.each { |child| ns.merge!(child.namespaces) }
|
52
|
+
ns
|
53
|
+
end
|
54
|
+
|
55
|
+
[params, handler, ns, binds]
|
56
|
+
end
|
57
|
+
|
44
58
|
end
|
45
59
|
end
|
46
60
|
end
|
61
|
+
# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
@@ -1,57 +1,72 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
|
3
3
|
module Nokogiri
|
4
|
-
|
5
|
-
|
6
|
-
class Node
|
4
|
+
module HTML5
|
5
|
+
module Node
|
7
6
|
# HTML elements can have attributes that contain colons.
|
8
7
|
# Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
|
9
8
|
# and tries to create an attribute in a namespace. This is especially
|
10
9
|
# annoying with attribute names like xml:lang since libxml2 will
|
11
10
|
# actually create the xml namespace if it doesn't exist already.
|
12
|
-
|
11
|
+
def add_child_node_and_reparent_attrs(node)
|
12
|
+
return super(node) unless document.is_a?(HTML5::Document)
|
13
|
+
# I'm not sure what this method is supposed to do. Reparenting
|
14
|
+
# namespaces is handled by libxml2, including child namespaces which
|
15
|
+
# this method wouldn't handle.
|
16
|
+
# https://github.com/sparklemotion/nokogiri/issues/1790
|
13
17
|
add_child_node(node)
|
14
|
-
node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
+
#node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
|
19
|
+
# attr.remove
|
20
|
+
# ns = attr.namespace
|
21
|
+
# a["#{ns.prefix}:#{attr.name}"] = attr.value
|
22
|
+
#end
|
18
23
|
end
|
19
24
|
|
20
25
|
def inner_html(options = {})
|
26
|
+
return super(options) unless document.is_a?(HTML5::Document)
|
21
27
|
result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? "\n" : ""
|
22
28
|
result << children.map { |child| child.to_html(options) }.join
|
23
29
|
result
|
24
30
|
end
|
25
31
|
|
26
32
|
def write_to(io, *options)
|
33
|
+
return super(io, *options) unless document.is_a?(HTML5::Document)
|
27
34
|
options = options.first.is_a?(Hash) ? options.shift : {}
|
28
35
|
encoding = options[:encoding] || options[0]
|
29
36
|
if Nokogiri.jruby?
|
30
37
|
save_options = options[:save_with] || options[1]
|
31
38
|
indent_times = options[:indent] || 0
|
32
39
|
else
|
33
|
-
save_options = options[:save_with] || options[1] || SaveOptions::FORMAT
|
40
|
+
save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
|
34
41
|
indent_times = options[:indent] || 2
|
35
42
|
end
|
36
43
|
indent_string = (options[:indent_text] || ' ') * indent_times
|
37
44
|
|
38
|
-
config = SaveOptions.new(save_options.to_i)
|
45
|
+
config = XML::Node::SaveOptions.new(save_options.to_i)
|
39
46
|
yield config if block_given?
|
40
47
|
|
41
48
|
config_options = config.options
|
42
|
-
if (config_options & (SaveOptions::AS_XML | SaveOptions::AS_XHTML) != 0)
|
49
|
+
if (config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0)
|
43
50
|
# Use Nokogiri's serializing code.
|
44
51
|
native_write_to(io, encoding, indent_string, config_options)
|
45
52
|
else
|
46
53
|
# Serialize including the current node.
|
47
54
|
encoding ||= document.encoding || Encoding::UTF_8
|
48
55
|
internal_ops = {
|
49
|
-
trailing_nl: config_options & SaveOptions::FORMAT != 0,
|
50
56
|
preserve_newline: options[:preserve_newline] || false
|
51
57
|
}
|
52
|
-
HTML5.serialize_node_internal(self, io, encoding,
|
58
|
+
HTML5.serialize_node_internal(self, io, encoding, internal_ops)
|
53
59
|
end
|
54
60
|
end
|
61
|
+
|
62
|
+
def fragment(tags)
|
63
|
+
return super(tags) unless document.is_a?(HTML5::Document)
|
64
|
+
DocumentFragment.new(document, tags, self)
|
65
|
+
end
|
55
66
|
end
|
67
|
+
# Monkey patch
|
68
|
+
XML::Node.prepend(HTML5::Node)
|
56
69
|
end
|
57
70
|
end
|
71
|
+
|
72
|
+
# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
data/lib/nokogumbo/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.0
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
@@ -9,22 +9,28 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-10-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '1.8'
|
18
21
|
- - ">="
|
19
22
|
- !ruby/object:Gem::Version
|
20
|
-
version:
|
23
|
+
version: 1.8.4
|
21
24
|
type: :runtime
|
22
25
|
prerelease: false
|
23
26
|
version_requirements: !ruby/object:Gem::Requirement
|
24
27
|
requirements:
|
28
|
+
- - "~>"
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.8'
|
25
31
|
- - ">="
|
26
32
|
- !ruby/object:Gem::Version
|
27
|
-
version:
|
33
|
+
version: 1.8.4
|
28
34
|
description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
|
29
35
|
access the result as a Nokogiri parsed document.
|
30
36
|
email:
|
@@ -35,7 +41,6 @@ extensions:
|
|
35
41
|
- ext/nokogumbo/extconf.rb
|
36
42
|
extra_rdoc_files: []
|
37
43
|
files:
|
38
|
-
- CHANGELOG.md
|
39
44
|
- LICENSE.txt
|
40
45
|
- README.md
|
41
46
|
- ext/nokogumbo/extconf.rb
|
@@ -63,6 +68,8 @@ files:
|
|
63
68
|
- gumbo-parser/src/tag.c
|
64
69
|
- gumbo-parser/src/tag_lookup.c
|
65
70
|
- gumbo-parser/src/tag_lookup.h
|
71
|
+
- gumbo-parser/src/token_buffer.c
|
72
|
+
- gumbo-parser/src/token_buffer.h
|
66
73
|
- gumbo-parser/src/token_type.h
|
67
74
|
- gumbo-parser/src/tokenizer.c
|
68
75
|
- gumbo-parser/src/tokenizer.h
|
@@ -77,8 +84,8 @@ files:
|
|
77
84
|
- lib/nokogumbo/html5.rb
|
78
85
|
- lib/nokogumbo/html5/document.rb
|
79
86
|
- lib/nokogumbo/html5/document_fragment.rb
|
87
|
+
- lib/nokogumbo/html5/node.rb
|
80
88
|
- lib/nokogumbo/version.rb
|
81
|
-
- lib/nokogumbo/xml/node.rb
|
82
89
|
homepage: https://github.com/rubys/nokogumbo/#readme
|
83
90
|
licenses:
|
84
91
|
- Apache-2.0
|
@@ -95,12 +102,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
95
102
|
requirements:
|
96
103
|
- - ">="
|
97
104
|
- !ruby/object:Gem::Version
|
98
|
-
version: '
|
105
|
+
version: '2.1'
|
99
106
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
107
|
requirements:
|
101
|
-
- - "
|
108
|
+
- - ">="
|
102
109
|
- !ruby/object:Gem::Version
|
103
|
-
version:
|
110
|
+
version: '0'
|
104
111
|
requirements: []
|
105
112
|
rubyforge_project:
|
106
113
|
rubygems_version: 2.7.6
|
data/CHANGELOG.md
DELETED
@@ -1,56 +0,0 @@
|
|
1
|
-
# Changelog
|
2
|
-
|
3
|
-
All notable changes to Nokogumbo will be documented in this file.
|
4
|
-
|
5
|
-
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
6
|
-
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
7
|
-
|
8
|
-
## [Unreleased]
|
9
|
-
### Added
|
10
|
-
- Experimental support for errors (it was supported in 1.5.0 but
|
11
|
-
undocumented).
|
12
|
-
- Added proper HTML5 serialization.
|
13
|
-
- Added option `:max_tree_depth` to control the maximum parse tree depth.
|
14
|
-
|
15
|
-
### Changed
|
16
|
-
- Integrated [Gumbo parser](https://github.com/google/gumbo-parser) into
|
17
|
-
Nokogumbo. A system version will not be used.
|
18
|
-
- The undocumented (but publicly mentioned) `:max_parse_errors` was renamed to `:max_errors`;
|
19
|
-
`:max_parse_errors` is deprecated and will go away
|
20
|
-
- The various `#parse` and `#fragment` (and `Nokogiri.HTML5`) methods return
|
21
|
-
`Nokogiri::HTML5::Document` and `Nokogiri::HTML5::DocumentFragment` classes
|
22
|
-
rather than `Nokogiri::HTML::Document` and
|
23
|
-
`Nokogiri::HTML::DocumentFragment`.
|
24
|
-
- Changed the top-level API to more closely match Nokogiri's while maintaining
|
25
|
-
backwards compatibility. The new APIs are
|
26
|
-
* `Nokogiri::HTML5(html, url = nil, encoding = nil, **options, &block)`
|
27
|
-
* `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options, &block)`
|
28
|
-
* `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options, &block)`
|
29
|
-
* `Nokogiri::HTML5.fragment(html, encoding = nil, **options)`
|
30
|
-
* `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options)`
|
31
|
-
In all cases, `html` can be a string or an `IO` object (something that
|
32
|
-
responds to `#read`). The `url` parameter is entirely for error reporting,
|
33
|
-
as in Nokogiri. The `encoding` parameter only signals what encoding `html`
|
34
|
-
should have on input; the output `Document` or `DocumentFragment` will be in
|
35
|
-
UTF-8. Currently, the only option supported is `:max_errors`, which controls
|
36
|
-
the maximum number of errors reported by `#errors`.
|
37
|
-
|
38
|
-
### Deprecated
|
39
|
-
- `:max_parse_errors`; use `:max_errors`
|
40
|
-
|
41
|
-
### Removed
|
42
|
-
|
43
|
-
### Fixed
|
44
|
-
- Fixed documents failing to serialize (via `to_html`) if they contain certain
|
45
|
-
`meta` elements that set the `charset`.
|
46
|
-
- Documents are now properly marked as UTF-8 after parsing.
|
47
|
-
- Fixed `Nokogiri::HTML5.fragment` reporting an error due to a missing
|
48
|
-
`<!DOCTYPE html>`.
|
49
|
-
- Fixed crash when input contains U+0000 NULL bytes and error reporting is
|
50
|
-
enabled.
|
51
|
-
|
52
|
-
### Security
|
53
|
-
- The most recent, released version of Gumbo has a [potential security
|
54
|
-
issue](https://github.com/google/gumbo-parser/pull/375) that could result in
|
55
|
-
a cross-site scripting vulnerability. This has been fixed by integrating
|
56
|
-
Gumbo into Nokogumbo.
|