nokogiri 1.16.8-x86_64-darwin → 1.17.0-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +11 -21
- data/README.md +4 -0
- data/dependencies.yml +6 -6
- data/ext/nokogiri/extconf.rb +191 -137
- data/ext/nokogiri/gumbo.c +69 -53
- data/ext/nokogiri/html4_document.c +10 -4
- data/ext/nokogiri/html4_element_description.c +18 -18
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +48 -58
- data/ext/nokogiri/html4_sax_push_parser.c +25 -24
- data/ext/nokogiri/include/libexslt/exsltconfig.h +3 -3
- data/ext/nokogiri/include/libxml2/libxml/HTMLparser.h +12 -19
- data/ext/nokogiri/include/libxml2/libxml/c14n.h +1 -12
- data/ext/nokogiri/include/libxml2/libxml/debugXML.h +1 -1
- data/ext/nokogiri/include/libxml2/libxml/encoding.h +9 -0
- data/ext/nokogiri/include/libxml2/libxml/entities.h +12 -1
- data/ext/nokogiri/include/libxml2/libxml/hash.h +19 -0
- data/ext/nokogiri/include/libxml2/libxml/list.h +2 -2
- data/ext/nokogiri/include/libxml2/libxml/nanohttp.h +17 -0
- data/ext/nokogiri/include/libxml2/libxml/parser.h +60 -54
- data/ext/nokogiri/include/libxml2/libxml/parserInternals.h +9 -1
- data/ext/nokogiri/include/libxml2/libxml/pattern.h +6 -0
- data/ext/nokogiri/include/libxml2/libxml/tree.h +32 -12
- data/ext/nokogiri/include/libxml2/libxml/uri.h +11 -0
- data/ext/nokogiri/include/libxml2/libxml/valid.h +29 -2
- data/ext/nokogiri/include/libxml2/libxml/xinclude.h +7 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlIO.h +21 -4
- data/ext/nokogiri/include/libxml2/libxml/xmlerror.h +14 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlexports.h +111 -15
- data/ext/nokogiri/include/libxml2/libxml/xmlmemory.h +8 -45
- data/ext/nokogiri/include/libxml2/libxml/xmlreader.h +2 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlsave.h +5 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlunicode.h +165 -1
- data/ext/nokogiri/include/libxml2/libxml/xmlversion.h +7 -171
- data/ext/nokogiri/include/libxml2/libxml/xmlwriter.h +1 -0
- data/ext/nokogiri/include/libxml2/libxml/xpath.h +4 -0
- data/ext/nokogiri/include/libxslt/xsltInternals.h +3 -0
- data/ext/nokogiri/include/libxslt/xsltconfig.h +4 -37
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +9 -2
- data/ext/nokogiri/nokogiri.h +18 -33
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_cdata.c +2 -10
- data/ext/nokogiri/xml_comment.c +3 -8
- data/ext/nokogiri/xml_document.c +163 -156
- data/ext/nokogiri/xml_document_fragment.c +10 -25
- data/ext/nokogiri/xml_dtd.c +1 -1
- data/ext/nokogiri/xml_element_content.c +9 -9
- data/ext/nokogiri/xml_encoding_handler.c +4 -4
- data/ext/nokogiri/xml_namespace.c +6 -6
- data/ext/nokogiri/xml_node.c +130 -104
- data/ext/nokogiri/xml_node_set.c +46 -44
- data/ext/nokogiri/xml_reader.c +54 -58
- data/ext/nokogiri/xml_relax_ng.c +35 -56
- data/ext/nokogiri/xml_sax_parser.c +156 -88
- data/ext/nokogiri/xml_sax_parser_context.c +213 -131
- data/ext/nokogiri/xml_sax_push_parser.c +68 -49
- data/ext/nokogiri/xml_schema.c +50 -85
- data/ext/nokogiri/xml_syntax_error.c +19 -11
- data/ext/nokogiri/xml_text.c +2 -4
- data/ext/nokogiri/xml_xpath_context.c +2 -2
- data/ext/nokogiri/xslt_stylesheet.c +8 -8
- data/lib/nokogiri/3.0/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.1/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.2/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.3/nokogiri.bundle +0 -0
- data/lib/nokogiri/class_resolver.rb +1 -1
- data/lib/nokogiri/css/node.rb +6 -2
- data/lib/nokogiri/css/parser.rb +6 -4
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +6 -66
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -4
- data/lib/nokogiri/css/tokenizer.rex +9 -8
- data/lib/nokogiri/css/xpath_visitor.rb +42 -6
- data/lib/nokogiri/css.rb +86 -20
- data/lib/nokogiri/decorators/slop.rb +3 -5
- data/lib/nokogiri/encoding_handler.rb +2 -2
- data/lib/nokogiri/html4/document.rb +44 -23
- data/lib/nokogiri/html4/document_fragment.rb +124 -12
- data/lib/nokogiri/html4/encoding_reader.rb +1 -1
- data/lib/nokogiri/html4/sax/parser.rb +23 -38
- data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
- data/lib/nokogiri/html4.rb +9 -14
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +61 -30
- data/lib/nokogiri/html5/document_fragment.rb +130 -20
- data/lib/nokogiri/html5/node.rb +4 -4
- data/lib/nokogiri/html5.rb +114 -72
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/xml/builder.rb +8 -1
- data/lib/nokogiri/xml/document.rb +70 -26
- data/lib/nokogiri/xml/document_fragment.rb +84 -13
- data/lib/nokogiri/xml/node.rb +82 -11
- data/lib/nokogiri/xml/node_set.rb +9 -7
- data/lib/nokogiri/xml/parse_options.rb +1 -1
- data/lib/nokogiri/xml/pp/node.rb +6 -1
- data/lib/nokogiri/xml/reader.rb +46 -13
- data/lib/nokogiri/xml/relax_ng.rb +57 -20
- data/lib/nokogiri/xml/sax/document.rb +174 -83
- data/lib/nokogiri/xml/sax/parser.rb +115 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
- data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
- data/lib/nokogiri/xml/sax.rb +48 -0
- data/lib/nokogiri/xml/schema.rb +112 -45
- data/lib/nokogiri/xml/searchable.rb +6 -8
- data/lib/nokogiri/xml/syntax_error.rb +22 -0
- data/lib/nokogiri/xml.rb +13 -24
- data/lib/nokogiri/xslt.rb +3 -9
- data/lib/xsd/xmlparser/nokogiri.rb +3 -4
- metadata +8 -4
- data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
@@ -3,13 +3,83 @@
|
|
3
3
|
module Nokogiri
|
4
4
|
module HTML4
|
5
5
|
class DocumentFragment < Nokogiri::XML::DocumentFragment
|
6
|
-
|
7
|
-
#
|
8
|
-
|
6
|
+
#
|
7
|
+
# :call-seq:
|
8
|
+
# parse(input) { |options| ... } → HTML4::DocumentFragment
|
9
|
+
# parse(input, encoding:, options:) { |options| ... } → HTML4::DocumentFragment
|
10
|
+
#
|
11
|
+
# Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment. This
|
12
|
+
# method creates a new, empty HTML4::Document to contain the fragment.
|
13
|
+
#
|
14
|
+
# [Required Parameters]
|
15
|
+
# - +input+ (String | IO) The content to be parsed.
|
16
|
+
#
|
17
|
+
# [Optional Keyword Arguments]
|
18
|
+
# - +encoding:+ (String) The name of the encoding that should be used when processing the
|
19
|
+
# document. When not provided, the encoding will be determined based on the document
|
20
|
+
# content.
|
21
|
+
#
|
22
|
+
# - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
|
23
|
+
# behaviors during parsing. See ParseOptions for more information. The default value is
|
24
|
+
# +ParseOptions::DEFAULT_HTML+.
|
25
|
+
#
|
26
|
+
# [Yields]
|
27
|
+
# If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
|
28
|
+
# can be configured before parsing. See ParseOptions for more information.
|
29
|
+
#
|
30
|
+
# [Returns] HTML4::DocumentFragment
|
31
|
+
#
|
32
|
+
# *Example:* Parsing a string
|
33
|
+
#
|
34
|
+
# fragment = HTML4::DocumentFragment.parse("<div>Hello World</div>")
|
35
|
+
#
|
36
|
+
# *Example:* Parsing an IO
|
37
|
+
#
|
38
|
+
# fragment = File.open("fragment.html") do |file|
|
39
|
+
# HTML4::DocumentFragment.parse(file)
|
40
|
+
# end
|
41
|
+
#
|
42
|
+
# *Example:* Specifying encoding
|
43
|
+
#
|
44
|
+
# fragment = HTML4::DocumentFragment.parse(input, encoding: "EUC-JP")
|
45
|
+
#
|
46
|
+
# *Example:* Setting parse options dynamically
|
47
|
+
#
|
48
|
+
# HTML4::DocumentFragment.parse("<div>Hello World") do |options|
|
49
|
+
# options.huge.pedantic
|
50
|
+
# end
|
51
|
+
#
|
52
|
+
def self.parse(
|
53
|
+
input,
|
54
|
+
encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
|
55
|
+
encoding: encoding_, options: options_,
|
56
|
+
&block
|
57
|
+
)
|
58
|
+
# TODO: this method should take a context node.
|
9
59
|
doc = HTML4::Document.new
|
10
60
|
|
11
|
-
|
12
|
-
|
61
|
+
if input.respond_to?(:read)
|
62
|
+
# Handle IO-like objects (IO, File, StringIO, etc.)
|
63
|
+
# The _read_ method of these objects doesn't accept an +encoding+ parameter.
|
64
|
+
# Encoding is usually set when the IO object is created or opened,
|
65
|
+
# or by using the _set_encoding_ method.
|
66
|
+
#
|
67
|
+
# 1. If +encoding+ is provided and the object supports _set_encoding_,
|
68
|
+
# set the encoding before reading.
|
69
|
+
# 2. Read the content from the IO-like object.
|
70
|
+
#
|
71
|
+
# Note: After reading, the content's encoding will be:
|
72
|
+
# - The encoding set by _set_encoding_ if it was called
|
73
|
+
# - The default encoding of the IO object otherwise
|
74
|
+
#
|
75
|
+
# For StringIO specifically, _set_encoding_ affects only the internal string,
|
76
|
+
# not how the data is read out.
|
77
|
+
input.set_encoding(encoding) if encoding && input.respond_to?(:set_encoding)
|
78
|
+
input = input.read
|
79
|
+
end
|
80
|
+
|
81
|
+
encoding ||= if input.respond_to?(:encoding)
|
82
|
+
encoding = input.encoding
|
13
83
|
if encoding == ::Encoding::ASCII_8BIT
|
14
84
|
"UTF-8"
|
15
85
|
else
|
@@ -21,29 +91,71 @@ module Nokogiri
|
|
21
91
|
|
22
92
|
doc.encoding = encoding
|
23
93
|
|
24
|
-
new(doc,
|
94
|
+
new(doc, input, options: options, &block)
|
25
95
|
end
|
26
96
|
|
27
|
-
|
28
|
-
|
97
|
+
#
|
98
|
+
# :call-seq:
|
99
|
+
# new(document) { |options| ... } → HTML4::DocumentFragment
|
100
|
+
# new(document, input) { |options| ... } → HTML4::DocumentFragment
|
101
|
+
# new(document, input, context:, options:) { |options| ... } → HTML4::DocumentFragment
|
102
|
+
#
|
103
|
+
# Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment.
|
104
|
+
#
|
105
|
+
# 💡 It's recommended to use either HTML4::DocumentFragment.parse or XML::Node#parse rather
|
106
|
+
# than call this method directly.
|
107
|
+
#
|
108
|
+
# [Required Parameters]
|
109
|
+
# - +document+ (HTML4::Document) The parent document to associate the returned fragment with.
|
110
|
+
#
|
111
|
+
# [Optional Parameters]
|
112
|
+
# - +input+ (String) The content to be parsed.
|
113
|
+
#
|
114
|
+
# [Optional Keyword Arguments]
|
115
|
+
# - +context:+ (Nokogiri::XML::Node) The <b>context node</b> for the subtree created. See
|
116
|
+
# below for more information.
|
117
|
+
#
|
118
|
+
# - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
|
119
|
+
# behaviors during parsing. See ParseOptions for more information. The default value is
|
120
|
+
# +ParseOptions::DEFAULT_HTML+.
|
121
|
+
#
|
122
|
+
# [Yields]
|
123
|
+
# If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
|
124
|
+
# can be configured before parsing. See ParseOptions for more information.
|
125
|
+
#
|
126
|
+
# [Returns] HTML4::DocumentFragment
|
127
|
+
#
|
128
|
+
# === Context \Node
|
129
|
+
#
|
130
|
+
# If a context node is specified using +context:+, then the fragment will be created by
|
131
|
+
# calling XML::Node#parse on that node, so the parser will behave as if that Node is the
|
132
|
+
# parent of the fragment subtree.
|
133
|
+
#
|
134
|
+
def initialize(
|
135
|
+
document, input = nil,
|
136
|
+
context_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
|
137
|
+
context: context_, options: options_
|
138
|
+
) # rubocop:disable Lint/MissingSuper
|
139
|
+
return self unless input
|
29
140
|
|
30
141
|
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
|
142
|
+
@parse_options = options
|
31
143
|
yield options if block_given?
|
32
144
|
|
33
|
-
if
|
145
|
+
if context
|
34
146
|
preexisting_errors = document.errors.dup
|
35
|
-
node_set =
|
147
|
+
node_set = context.parse("<div>#{input}</div>", options)
|
36
148
|
node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
|
37
149
|
self.errors = document.errors - preexisting_errors
|
38
150
|
else
|
39
151
|
# This is a horrible hack, but I don't care
|
40
|
-
path = if /^\s*?<body/i.match?(
|
152
|
+
path = if /^\s*?<body/i.match?(input)
|
41
153
|
"/html/body"
|
42
154
|
else
|
43
155
|
"/html/body/node()"
|
44
156
|
end
|
45
157
|
|
46
|
-
temp_doc = HTML4::Document.parse("<html><body>#{
|
158
|
+
temp_doc = HTML4::Document.parse("<html><body>#{input}", nil, document.encoding, options)
|
47
159
|
temp_doc.xpath(path).each { |child| child.parent = self }
|
48
160
|
self.errors = temp_doc.errors
|
49
161
|
end
|
@@ -3,60 +3,45 @@
|
|
3
3
|
module Nokogiri
|
4
4
|
module HTML4
|
5
5
|
###
|
6
|
-
# Nokogiri
|
6
|
+
# Nokogiri provides a SAX parser to process HTML4 which will provide HTML recovery
|
7
|
+
# ("autocorrection") features.
|
7
8
|
#
|
8
9
|
# See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
|
9
10
|
#
|
10
11
|
# For more information on SAX parsers, see Nokogiri::XML::SAX
|
12
|
+
#
|
11
13
|
module SAX
|
12
14
|
###
|
13
|
-
# This
|
15
|
+
# This parser is a SAX style parser that reads its input as it deems necessary. The parser
|
16
|
+
# takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an HTML input, sends
|
17
|
+
# messages to the Nokogiri::XML::SAX::Document.
|
18
|
+
#
|
19
|
+
# ⚠ This is an HTML4 parser and so may not support some HTML5 features and behaviors.
|
14
20
|
#
|
15
21
|
# Here is a basic usage example:
|
16
22
|
#
|
17
|
-
# class
|
23
|
+
# class MyHandler < Nokogiri::XML::SAX::Document
|
18
24
|
# def start_element name, attributes = []
|
19
25
|
# puts "found a #{name}"
|
20
26
|
# end
|
21
27
|
# end
|
22
28
|
#
|
23
|
-
# parser = Nokogiri::HTML4::SAX::Parser.new(
|
24
|
-
#
|
29
|
+
# parser = Nokogiri::HTML4::SAX::Parser.new(MyHandler.new)
|
30
|
+
#
|
31
|
+
# # Hand an IO object to the parser, which will read the HTML from the IO.
|
32
|
+
# File.open(path_to_html) do |f|
|
33
|
+
# parser.parse(f)
|
34
|
+
# end
|
35
|
+
#
|
36
|
+
# For more information on \SAX parsers, see Nokogiri::XML::SAX or the parent class
|
37
|
+
# Nokogiri::XML::SAX::Parser.
|
38
|
+
#
|
39
|
+
# Also see Nokogiri::XML::SAX::Document for the available events.
|
25
40
|
#
|
26
|
-
# For more information on SAX parsers, see Nokogiri::XML::SAX
|
27
41
|
class Parser < Nokogiri::XML::SAX::Parser
|
28
|
-
|
29
|
-
#
|
30
|
-
|
31
|
-
raise TypeError unless String === data
|
32
|
-
return if data.empty?
|
33
|
-
|
34
|
-
ctx = ParserContext.memory(data, encoding)
|
35
|
-
yield ctx if block_given?
|
36
|
-
ctx.parse_with(self)
|
37
|
-
end
|
38
|
-
|
39
|
-
###
|
40
|
-
# Parse given +io+
|
41
|
-
def parse_io(io, encoding = "UTF-8")
|
42
|
-
check_encoding(encoding)
|
43
|
-
@encoding = encoding
|
44
|
-
ctx = ParserContext.io(io, ENCODINGS[encoding])
|
45
|
-
yield ctx if block_given?
|
46
|
-
ctx.parse_with(self)
|
47
|
-
end
|
48
|
-
|
49
|
-
###
|
50
|
-
# Parse a file with +filename+
|
51
|
-
def parse_file(filename, encoding = "UTF-8")
|
52
|
-
raise ArgumentError unless filename
|
53
|
-
raise Errno::ENOENT unless File.exist?(filename)
|
54
|
-
raise Errno::EISDIR if File.directory?(filename)
|
55
|
-
|
56
|
-
ctx = ParserContext.file(filename, encoding)
|
57
|
-
yield ctx if block_given?
|
58
|
-
ctx.parse_with(self)
|
59
|
-
end
|
42
|
+
# This class inherits its behavior from Nokogiri::XML::SAX::Parser, but note that the superclass
|
43
|
+
# uses Nokogiri::ClassResolver to use HTML4::SAX::ParserContext as the context class for
|
44
|
+
# this class, which is where the real behavioral differences are implemented.
|
60
45
|
end
|
61
46
|
end
|
62
47
|
end
|
@@ -4,16 +4,11 @@ module Nokogiri
|
|
4
4
|
module HTML4
|
5
5
|
module SAX
|
6
6
|
###
|
7
|
-
# Context
|
8
|
-
#
|
7
|
+
# Context object to invoke the HTML4 SAX parser on the SAX::Document handler.
|
8
|
+
#
|
9
|
+
# 💡 This class is usually not instantiated by the user. Use Nokogiri::HTML4::SAX::Parser
|
10
|
+
# instead.
|
9
11
|
class ParserContext < Nokogiri::XML::SAX::ParserContext
|
10
|
-
def self.new(thing, encoding = "UTF-8")
|
11
|
-
if [:read, :close].all? { |x| thing.respond_to?(x) }
|
12
|
-
super
|
13
|
-
else
|
14
|
-
memory(thing, encoding)
|
15
|
-
end
|
16
|
-
end
|
17
12
|
end
|
18
13
|
end
|
19
14
|
end
|
data/lib/nokogiri/html4.rb
CHANGED
@@ -3,12 +3,9 @@
|
|
3
3
|
|
4
4
|
module Nokogiri
|
5
5
|
class << self
|
6
|
-
#
|
7
|
-
|
8
|
-
|
9
|
-
# Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
|
10
|
-
def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
|
11
|
-
Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
|
6
|
+
# Convenience method for Nokogiri::HTML4::Document.parse
|
7
|
+
def HTML4(...)
|
8
|
+
Nokogiri::HTML4::Document.parse(...)
|
12
9
|
end
|
13
10
|
end
|
14
11
|
|
@@ -18,16 +15,14 @@ module Nokogiri
|
|
18
15
|
# for parsing HTML.
|
19
16
|
module HTML4
|
20
17
|
class << self
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
Document.parse(input, url, encoding, options, &block)
|
18
|
+
# Convenience method for Nokogiri::HTML4::Document.parse
|
19
|
+
def parse(...)
|
20
|
+
Document.parse(...)
|
25
21
|
end
|
26
22
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
HTML4::DocumentFragment.parse(string, encoding, options, &block)
|
23
|
+
# Convenience method for Nokogiri::HTML4::DocumentFragment.parse
|
24
|
+
def fragment(...)
|
25
|
+
HTML4::DocumentFragment.parse(...)
|
31
26
|
end
|
32
27
|
end
|
33
28
|
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML5
|
5
|
+
###
|
6
|
+
# Nokogiri HTML5 builder is used for building HTML documents. It is very similar to the
|
7
|
+
# Nokogiri::XML::Builder. In fact, you should go read the documentation for
|
8
|
+
# Nokogiri::XML::Builder before reading this documentation.
|
9
|
+
#
|
10
|
+
# The construction behavior is identical to HTML4::Builder, but HTML5 documents implement the
|
11
|
+
# [HTML5 standard's serialization
|
12
|
+
# algorithm](https://www.w3.org/TR/2008/WD-html5-20080610/serializing.html).
|
13
|
+
#
|
14
|
+
# == Synopsis:
|
15
|
+
#
|
16
|
+
# Create an HTML5 document with a body that has an onload attribute, and a
|
17
|
+
# span tag with a class of "bold" that has content of "Hello world".
|
18
|
+
#
|
19
|
+
# builder = Nokogiri::HTML5::Builder.new do |doc|
|
20
|
+
# doc.html {
|
21
|
+
# doc.body(:onload => 'some_func();') {
|
22
|
+
# doc.span.bold {
|
23
|
+
# doc.text "Hello world"
|
24
|
+
# }
|
25
|
+
# }
|
26
|
+
# }
|
27
|
+
# end
|
28
|
+
# puts builder.to_html
|
29
|
+
#
|
30
|
+
# The HTML5 builder inherits from the XML builder, so make sure to read the
|
31
|
+
# Nokogiri::XML::Builder documentation.
|
32
|
+
class Builder < Nokogiri::XML::Builder
|
33
|
+
###
|
34
|
+
# Convert the builder to HTML
|
35
|
+
def to_html
|
36
|
+
@doc.to_html
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -43,41 +43,69 @@ module Nokogiri
|
|
43
43
|
|
44
44
|
# Get the parser's quirks mode value. See HTML5::QuirksMode.
|
45
45
|
#
|
46
|
-
# This method returns
|
46
|
+
# This method returns +nil+ if the parser was not invoked (e.g., Nokogiri::HTML5::Document.new).
|
47
47
|
#
|
48
48
|
# Since v1.14.0
|
49
49
|
attr_reader :quirks_mode
|
50
50
|
|
51
51
|
class << self
|
52
52
|
# :call-seq:
|
53
|
-
# parse(input)
|
54
|
-
# parse(input, url
|
55
|
-
# parse(input,
|
53
|
+
# parse(input) { |options| ... } → HTML5::Document
|
54
|
+
# parse(input, url:, encoding:) { |options| ... } → HTML5::Document
|
55
|
+
# parse(input, **options) → HTML5::Document
|
56
56
|
#
|
57
|
-
# Parse HTML5
|
57
|
+
# Parse \HTML input with a parser compliant with the HTML5 spec. This method uses the
|
58
|
+
# encoding of +input+ if it can be determined, or else falls back to the +encoding:+
|
59
|
+
# parameter.
|
58
60
|
#
|
59
|
-
# [Parameters]
|
60
|
-
# - +input+
|
61
|
-
# IO, or StringIO.
|
61
|
+
# [Required Parameters]
|
62
|
+
# - +input+ (String | IO) the \HTML content to be parsed.
|
62
63
|
#
|
63
|
-
#
|
64
|
+
# [Optional Parameters]
|
65
|
+
# - +url:+ (String) the base URI of the document.
|
64
66
|
#
|
65
|
-
#
|
66
|
-
#
|
67
|
+
# [Optional Keyword Arguments]
|
68
|
+
# - +encoding:+ (Encoding) The name of the encoding that should be used when processing the
|
69
|
+
# document. When not provided, the encoding will be determined based on the document
|
70
|
+
# content.
|
67
71
|
#
|
68
|
-
# - +
|
69
|
-
#
|
70
|
-
# +:max_tree_depth+ and +:max_attributes+, described at Nokogiri::HTML5.
|
72
|
+
# - +max_errors:+ (Integer) The maximum number of parse errors to record. (default
|
73
|
+
# +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0)
|
71
74
|
#
|
72
|
-
#
|
73
|
-
# Nokogiri::
|
75
|
+
# - +max_tree_depth:+ (Integer) The maximum depth of the parse tree. (default
|
76
|
+
# +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+)
|
74
77
|
#
|
75
|
-
# - +
|
76
|
-
# Nokogiri::
|
78
|
+
# - +max_attributes:+ (Integer) The maximum number of attributes allowed on an
|
79
|
+
# element. (default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+)
|
80
|
+
#
|
81
|
+
# - +parse_noscript_content_as_text:+ (Boolean) Whether to parse the content of +noscript+
|
82
|
+
# elements as text. (default +false+)
|
83
|
+
#
|
84
|
+
# See rdoc-ref:HTML5@Parsing+options for a complete description of these parsing options.
|
85
|
+
#
|
86
|
+
# [Yields]
|
87
|
+
# If present, the block will be passed a Hash object to modify with parse options before the
|
88
|
+
# input is parsed. See rdoc-ref:HTML5@Parsing+options for a list of available options.
|
89
|
+
#
|
90
|
+
# ⚠ Note that +url:+ and +encoding:+ cannot be set by the configuration block.
|
77
91
|
#
|
78
92
|
# [Returns] Nokogiri::HTML5::Document
|
79
93
|
#
|
80
|
-
|
94
|
+
# *Example:* Parse a string with a specific encoding and custom max errors limit.
|
95
|
+
#
|
96
|
+
# Nokogiri::HTML5::Document.parse(socket, encoding: "ISO-8859-1", max_errors: 10)
|
97
|
+
#
|
98
|
+
# *Example:* Parse a string setting the +:parse_noscript_content_as_text+ option using the
|
99
|
+
# configuration block parameter.
|
100
|
+
#
|
101
|
+
# Nokogiri::HTML5::Document.parse(input) { |c| c[:parse_noscript_content_as_text] = true }
|
102
|
+
#
|
103
|
+
def parse(
|
104
|
+
string_or_io,
|
105
|
+
url_ = nil, encoding_ = nil,
|
106
|
+
url: url_, encoding: encoding_,
|
107
|
+
**options, &block
|
108
|
+
)
|
81
109
|
yield options if block
|
82
110
|
string_or_io = "" unless string_or_io
|
83
111
|
|
@@ -92,35 +120,37 @@ module Nokogiri
|
|
92
120
|
raise ArgumentError, "not a string or IO object"
|
93
121
|
end
|
94
122
|
|
95
|
-
do_parse(string_or_io, url, encoding, options)
|
123
|
+
do_parse(string_or_io, url, encoding, **options)
|
96
124
|
end
|
97
125
|
|
98
126
|
# Create a new document from an IO object.
|
99
127
|
#
|
100
128
|
# 💡 Most users should prefer Document.parse to this method.
|
101
|
-
def read_io(io,
|
129
|
+
def read_io(io, url_ = nil, encoding_ = nil, url: url_, encoding: encoding_, **options)
|
102
130
|
raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
|
103
131
|
|
104
|
-
do_parse(io, url, encoding, options)
|
132
|
+
do_parse(io, url, encoding, **options)
|
105
133
|
end
|
106
134
|
|
107
135
|
# Create a new document from a String.
|
108
136
|
#
|
109
137
|
# 💡 Most users should prefer Document.parse to this method.
|
110
|
-
def read_memory(string,
|
138
|
+
def read_memory(string, url_ = nil, encoding_ = nil, url: url_, encoding: encoding_, **options)
|
111
139
|
raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
|
112
140
|
|
113
|
-
do_parse(string, url, encoding, options)
|
141
|
+
do_parse(string, url, encoding, **options)
|
114
142
|
end
|
115
143
|
|
116
144
|
private
|
117
145
|
|
118
|
-
def do_parse(string_or_io, url, encoding, options)
|
146
|
+
def do_parse(string_or_io, url, encoding, **options)
|
119
147
|
string = HTML5.read_and_encode(string_or_io, encoding)
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
148
|
+
|
149
|
+
options[:max_attributes] ||= Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
|
150
|
+
options[:max_errors] ||= options.delete(:max_parse_errors) || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
|
151
|
+
options[:max_tree_depth] ||= Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
|
152
|
+
|
153
|
+
doc = Nokogiri::Gumbo.parse(string, url, self, **options)
|
124
154
|
doc.encoding = "UTF-8"
|
125
155
|
doc
|
126
156
|
end
|
@@ -142,7 +172,8 @@ module Nokogiri
|
|
142
172
|
# - +markup+ (String) The HTML5 markup fragment to be parsed
|
143
173
|
#
|
144
174
|
# [Returns]
|
145
|
-
# Nokogiri::HTML5::DocumentFragment. This object's children will be empty if
|
175
|
+
# Nokogiri::HTML5::DocumentFragment. This object's children will be empty if +markup+ is not
|
176
|
+
# passed, is empty, or is +nil+.
|
146
177
|
#
|
147
178
|
def fragment(markup = nil)
|
148
179
|
DocumentFragment.new(self, markup)
|