nokogumbo 1.5.0 → 2.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +237 -26
- data/ext/nokogumbo/extconf.rb +121 -0
- data/ext/nokogumbo/nokogumbo.c +793 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +5972 -6816
- data/gumbo-parser/src/char_ref.h +14 -45
- data/gumbo-parser/src/error.c +510 -163
- data/gumbo-parser/src/error.h +70 -147
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +577 -305
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +2922 -2228
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +2127 -1561
- data/gumbo-parser/src/tokenizer.h +41 -52
- data/gumbo-parser/src/tokenizer_states.h +281 -45
- data/gumbo-parser/src/utf8.c +98 -123
- data/gumbo-parser/src/utf8.h +84 -52
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +11 -173
- data/lib/nokogumbo/html5.rb +252 -0
- data/lib/nokogumbo/html5/document.rb +53 -0
- data/lib/nokogumbo/html5/document_fragment.rb +62 -0
- data/lib/nokogumbo/html5/node.rb +72 -0
- data/lib/nokogumbo/version.rb +3 -0
- metadata +43 -24
- data/ext/nokogumboc/extconf.rb +0 -60
- data/ext/nokogumboc/nokogumbo.c +0 -295
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
@@ -0,0 +1,252 @@
|
|
1
|
+
require 'nokogumbo/html5/document'
|
2
|
+
require 'nokogumbo/html5/document_fragment'
|
3
|
+
require 'nokogumbo/html5/node'
|
4
|
+
|
5
|
+
module Nokogiri
|
6
|
+
# Convenience entry point for parsing an HTML 5 document.
#
# Every argument, keyword option and the optional block are handed on
# unchanged to Nokogiri::HTML5::Document.parse, whose result is returned.
def self.HTML5(string_or_io, url = nil, encoding = nil, **options, &block)
  html5_document = Nokogiri::HTML5::Document
  html5_document.parse(string_or_io, url, encoding, **options, &block)
end
|
10
|
+
|
11
|
+
module HTML5
|
12
|
+
# HTML uses the XHTML namespace.
|
13
|
+
HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml'.freeze
|
14
|
+
MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML'.freeze
|
15
|
+
SVG_NAMESPACE = 'http://www.w3.org/2000/svg'.freeze
|
16
|
+
XLINK_NAMESPACE = 'http://www.w3.org/1999/xlink'.freeze
|
17
|
+
XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'.freeze
|
18
|
+
XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/'.freeze
|
19
|
+
|
20
|
+
# Parse an HTML 5 document from +string+.
#
# Shorthand for Document.parse: positional arguments, keyword options and
# the optional block are forwarded as given and the parsed result is returned.
def self.parse(string, url = nil, encoding = nil, **options, &block)
  document_class = Document
  document_class.parse(string, url, encoding, **options, &block)
end
|
24
|
+
|
25
|
+
# Parse an HTML 5 fragment from +string+.
#
# Shorthand for DocumentFragment.parse. The keyword options are collected
# into a Hash and passed on as the third positional argument.
def self.fragment(string, encoding = nil, **options)
  fragment_class = DocumentFragment
  fragment_class.parse(string, encoding, options)
end
|
30
|
+
|
31
|
+
# Fetch and parse a HTML document from the web, following redirects,
# handling https, and determining the character encoding using HTML5
# rules. +uri+ may be a +String+ or a +URI+. +options+ contains
# http headers and special options. Everything which is not a
# special option is considered a header. Special options include:
#  * :follow_limit => number of redirects which are followed (default 10)
#  * :basic_auth => [username, password]
#
# Any option whose name matches a Net::HTTP writer (for example
# :read_timeout or :verify_mode) is applied to the connection instead of
# being sent as a header.
#
# For backwards compatibility +options+ may also be a bare number, which is
# treated as the :follow_limit (deprecated).
#
# Returns the parsed document. The Net::HTTP response object is stored on
# it and exposed through a +response+ reader. Non-success responses, and
# redirects beyond the limit, raise via Net::HTTPResponse#value.
def self.get(uri, options={})
  # Normalise the deprecated numeric form first so that every later use of
  # +options+ (iteration, merge on redirect) sees a Hash.
  options = {:follow_limit => options} if Numeric === options
  headers = options.clone
  # Always remove the key, so a nil limit cannot leak out as a header.
  limit = (headers.delete(:follow_limit) || 10).to_i

  require 'net/http'
  uri = URI(uri) unless URI === uri

  http = Net::HTTP.new(uri.host, uri.port)

  # TLS / SSL support
  http.use_ssl = true if uri.scheme == 'https'

  # Pass through Net::HTTP override values, which currently include:
  #   :ca_file, :ca_path, :cert, :cert_store, :ciphers,
  #   :close_on_empty_response, :continue_timeout, :key, :open_timeout,
  #   :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
  #   :verify_callback, :verify_depth, :verify_mode
  options.each do |key, _value|
    http.send "#{key}=", headers.delete(key) if http.respond_to? "#{key}="
  end

  request = Net::HTTP::Get.new(uri.request_uri)

  # basic authentication
  auth = headers.delete(:basic_auth)
  auth ||= [uri.user, uri.password] if uri.user && uri.password
  request.basic_auth auth.first, auth.last if auth

  # remaining options are treated as headers
  headers.each {|key, value| request[key.to_s] = value.to_s}

  response = http.request(request)

  case response
  when Net::HTTPSuccess
    # Hand the options over as keywords. Passing the Hash positionally
    # would bind it to the +url+ parameter of parse on Ruby 3. Only
    # symbol keys can be parser options; string keys are HTTP headers.
    parse_options = options.select { |key, _value| key.is_a?(Symbol) }
    doc = parse(reencode(response.body, response['content-type']), nil, nil, **parse_options)
    doc.instance_variable_set('@response', response)
    doc.class.send(:attr_reader, :response)
    doc
  when Net::HTTPRedirection
    # Out of redirects: #value raises for any non-2xx response.
    response.value if limit <= 1
    location = URI.join(uri, response['location'])
    # NOTE(review): headers and :basic_auth are forwarded to the redirect
    # target as-is, even when it is a different host.
    get(location, options.merge(:follow_limit => limit-1))
  else
    response.value
  end
end
|
86
|
+
|
87
|
+
private
|
88
|
+
|
89
|
+
# Obtain the markup as a UTF-8 String.
#
# +string+ is either an object responding to +read+ (an IO, StringIO, ...)
# or something convertible with +to_str+. +encoding+ names the encoding
# the bytes are in, or is nil when unknown.
#
# When an encoding is given the data is tagged with it (the caller's own
# String is never modified). Anything that is not already UTF-8 is then
# converted by +reencode+. Returns the resulting String.
def self.read_and_encode(string, encoding)
  if string.respond_to?(:read)
    # IO#read and StringIO#read take (length, buffer), not an encoding
    # keyword, so read everything first and tag the result afterwards.
    string = string.read
    string = string.dup if encoding && string.frozen?
  else
    # Otherwise the string has the given encoding.
    string = string.to_str
    # Work on a copy so the caller's String keeps its encoding.
    string = string.dup if encoding
  end
  string.force_encoding(encoding) if encoding

  # convert to UTF-8
  string = reencode(string) if string.encoding != Encoding::UTF_8
  string
end
|
112
|
+
|
113
|
+
# Charset sniffing is a complex and controversial topic that understandably
# isn't done _by default_ by the Ruby Net::HTTP library. This being said,
# it is a very real problem for consumers of HTML as the default for HTML
# is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser
# *only* supports utf-8.
#
# Accordingly, Nokogiri::HTML::Document.parse provides limited encoding
# detection. Following this lead, Nokogiri::HTML5 attempts to do likewise,
# while attempting to more closely follow the HTML5 standard.
#
# http://bugs.ruby-lang.org/issues/2567
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
#
# +body+ is the markup; +content_type+ is an optional Content-Type header
# value. A body tagged with a real encoding is simply transcoded. A binary
# (ASCII-8BIT) body has its encoding guessed, in order, from a byte order
# mark, the header's charset, a <meta> charset in the first 1024 bytes,
# and finally ISO-8859-1. Returns a UTF-8 String; +body+ is not modified.
def self.reencode(body, content_type=nil)
  # Nothing to sniff when the string already knows what it is.
  return body.encode(Encoding::UTF_8) unless body.encoding == Encoding::ASCII_8BIT

  # 1. Byte Order Mark (BOM)
  bom = body[0..2].bytes
  encoding =
    if bom[0..2] == [0xEF, 0xBB, 0xBF]
      Encoding::UTF_8
    elsif bom[0..1] == [0xFE, 0xFF]
      Encoding::UTF_16BE
    elsif bom[0..1] == [0xFF, 0xFE]
      Encoding::UTF_16LE
    end

  # 2. charset parameter of the Content-Type header
  encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1] if content_type

  # 3. charset in a meta tag within the first 1024 bytes, comments ignored
  unless encoding
    head = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
    head.scan(/<meta.*?>/m) do |meta|
      encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
    end
  end

  # 4. the official default encoding for HTML
  encoding ||= Encoding::ISO_8859_1

  # Tag a copy with the chosen encoding; an unknown encoding name makes
  # force_encoding raise ArgumentError, in which case use the default.
  decoded = body.dup
  begin
    decoded.force_encoding(encoding)
  rescue ArgumentError
    decoded.force_encoding(Encoding::ISO_8859_1)
  end

  decoded.encode(Encoding::UTF_8)
end
|
167
|
+
|
168
|
+
# Write the HTML serialization of +current_node+ and its descendants to +io+.
#
# +io+ only needs to support <<. +encoding+ is handed to escape_text for
# attribute values and ordinary text. +options+ is a Hash; only
# :preserve_newline is consulted here.
#
# Raises a RuntimeError for node types that have no branch below.
def self.serialize_node_internal(current_node, io, encoding, options)
  case current_node.type
  when XML::Node::ELEMENT_NODE
    element_ns = current_node.namespace
    element_uri = element_ns.nil? ? nil : element_ns.href
    # XXX(sfc): attach namespaces to all nodes, even html?
    # Elements without a namespace, or in the HTML, MathML or SVG
    # namespaces, are written with their bare name; others get a prefix.
    unprefixed = element_uri.nil? ||
                 [HTML_NAMESPACE, MATHML_NAMESPACE, SVG_NAMESPACE].include?(element_uri)
    tagname = unprefixed ? current_node.name : "#{element_ns.prefix}:#{current_node.name}"

    io << '<' << tagname
    current_node.attribute_nodes.each do |attr|
      attr_ns = attr.namespace
      attr_name =
        if attr_ns.nil?
          attr.name
        else
          case attr_ns.href
          when XML_NAMESPACE
            'xml:' + attr.name.sub(/^[^:]*:/, '')
          when XMLNS_NAMESPACE
            local_name = attr.name.sub(/^[^:]*:/, '')
            local_name == 'xmlns' ? 'xmlns' : 'xmlns:' + local_name
          when XLINK_NAMESPACE
            'xlink:' + attr.name.sub(/^[^:]*:/, '')
          else
            "#{attr_ns.prefix}:#{attr.name}"
          end
        end
      io << ' ' << attr_name << '="' << escape_text(attr.content, encoding, true) << '"'
    end
    io << '>'

    # Elements in this list get neither children nor an end tag.
    void_elements = %w[area base basefont bgsound br col embed frame hr img input keygen
                       link meta param source track wbr]
    unless void_elements.include?(current_node.name)
      io << "\n" if options[:preserve_newline] && prepend_newline?(current_node)
      current_node.children.each do |child|
        # XXX(sfc): Templates handled specially?
        serialize_node_internal(child, io, encoding, options)
      end
      io << '</' << tagname << '>'
    end
  when XML::Node::TEXT_NODE
    parent = current_node.parent
    # Text directly inside these elements is written verbatim.
    raw_text_parents = %w[style script xmp iframe noembed noframes plaintext noscript]
    if parent.element? && raw_text_parents.include?(parent.name)
      io << current_node.content
    else
      io << escape_text(current_node.content, encoding, false)
    end
  when XML::Node::CDATA_SECTION_NODE
    io << '<![CDATA[' << current_node.content << ']]>'
  when XML::Node::COMMENT_NODE
    io << '<!--' << current_node.content << '-->'
  when XML::Node::PI_NODE
    io << '<?' << current_node.content << '>'
  when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
    io << '<!DOCTYPE ' << current_node.name << '>'
  when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
    # Containers contribute no markup of their own.
    current_node.children.each do |child|
      serialize_node_internal(child, io, encoding, options)
    end
  else
    raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
  end
end
|
233
|
+
|
234
|
+
# Escape +text+ for inclusion in serialized HTML and transcode it.
#
# In attribute mode (+attribute_mode+ truthy) the characters &, U+00A0 and
# " are replaced by &amp;, &nbsp; and &quot;. Otherwise &, U+00A0, < and >
# are replaced by &amp;, &nbsp;, &lt; and &gt;.
#
# The escaped text is then encoded to +encoding+. Characters that the
# target encoding cannot represent are written as hexadecimal numeric
# character references. Returns the new String; +text+ is not modified.
def self.escape_text(text, encoding, attribute_mode)
  text =
    if attribute_mode
      text.gsub(/[&\u00a0"]/,
                '&' => '&amp;', "\u00a0" => '&nbsp;', '"' => '&quot;')
    else
      text.gsub(/[&\u00a0<>]/,
                '&' => '&amp;', "\u00a0" => '&nbsp;', '<' => '&lt;', '>' => '&gt;')
    end
  # Not part of the standard
  text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
end
|
245
|
+
|
246
|
+
# Whether an extra newline must be written right after the start tag of
# +node+ so that a leading newline in its content is not lost.
#
# True only for pre, textarea and listing elements whose first child is a
# text node whose content starts with "\n".
def self.prepend_newline?(node)
  return false unless %w[pre textarea listing].include?(node.name)

  kids = node.children
  return false if kids.empty?

  leading = kids[0]
  leading.text? && leading.content.start_with?("\n")
end
|
251
|
+
end
|
252
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Nokogiri
|
2
|
+
module HTML5
|
3
|
+
# An HTML 5 document, built by the Nokogumbo parser.
class Document < Nokogiri::HTML::Document
  # Parse +string_or_io+ (a String-like or IO-like object; nil is treated
  # as an empty string) into a document.
  #
  # A block, when given, is yielded the options Hash before parsing.
  # +encoding+ defaults to the input's own encoding unless that is
  # ASCII-8BIT; +url+ defaults to the +path+ of an IO-like input.
  #
  # Raises ArgumentError when the input responds to neither +read+ nor
  # +to_str+.
  def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
    yield options if block_given?
    string_or_io ||= ''

    if string_or_io.respond_to?(:encoding)
      source_encoding = string_or_io.encoding.name
      encoding ||= source_encoding unless source_encoding == 'ASCII-8BIT'
    end

    readable = string_or_io.respond_to?(:read)
    url ||= string_or_io.path if readable && string_or_io.respond_to?(:path)

    unless readable || string_or_io.respond_to?(:to_str)
      raise ArgumentError, "not a string or IO object"
    end

    do_parse(string_or_io, url, encoding, options)
  end

  # Parse from +io+, which must respond to +read+.
  def self.read_io(io, url = nil, encoding = nil, **options)
    unless io.respond_to?(:read)
      raise ArgumentError, "io object doesn't respond to :read"
    end

    do_parse(io, url, encoding, options)
  end

  # Parse from +string+, which must respond to +to_str+.
  def self.read_memory(string, url = nil, encoding = nil, **options)
    unless string.respond_to?(:to_str)
      raise ArgumentError, "string object doesn't respond to :to_str"
    end

    do_parse(string, url, encoding, options)
  end

  # Build a DocumentFragment owned by this document from +tags+, using the
  # document root as the context node.
  def fragment(tags = nil)
    DocumentFragment.new(self, tags, self.root)
  end

  # Serialize through XML::Node#to_xml rather than XML::Document#to_xml,
  # which doesn't add XML::Node::SaveOptions::AS_XML like the former does.
  def to_xml(options = {}, &block)
    node_to_xml = XML::Node.instance_method(:to_xml)
    node_to_xml.bind(self).call(options, &block)
  end

  private

  # Shared implementation behind parse, read_io and read_memory: obtain
  # UTF-8 text, resolve the parser limits from +options+ (falling back to
  # the Nokogumbo defaults) and run the parser.
  def self.do_parse(string_or_io, url, encoding, options)
    utf8_source = HTML5.read_and_encode(string_or_io, encoding)
    attribute_limit = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
    error_limit = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
    depth_limit = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH

    parsed = Nokogumbo.parse(utf8_source, url, attribute_limit, error_limit, depth_limit)
    parsed.encoding = 'UTF-8'
    parsed
  end
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML5
|
5
|
+
# An HTML 5 document fragment, built by the Nokogumbo parser.
class DocumentFragment < Nokogiri::HTML::DocumentFragment
  # The document this fragment belongs to.
  attr_accessor :document
  # Parse errors recorded for this fragment; starts out as an empty Array.
  attr_accessor :errors

  # Create a document fragment.
  #
  # +doc+ is the owning document. When +tags+ is given it is converted to
  # UTF-8 and parsed into this fragment with +ctx+ as the context node.
  # +options+ may carry :max_attributes, :max_errors (or the alternative
  # name :max_parse_errors, as accepted when parsing whole documents) and
  # :max_tree_depth; missing values fall back to the Nokogumbo defaults.
  def initialize(doc, tags = nil, ctx = nil, options = {})
    self.document = doc
    self.errors = []
    return self unless tags

    max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
    # Accept :max_parse_errors as well, matching Document parsing.
    max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
    max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
    tags = Nokogiri::HTML5.read_and_encode(tags, nil)
    Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
  end

  def serialize(options = {}, &block)
    # Bypass XML::Document.serialize which doesn't support options even
    # though XML::Node.serialize does!
    XML::Node.instance_method(:serialize).bind(self).call(options, &block)
  end

  # Parse a document fragment from +tags+ into a new fragment that belongs
  # to a fresh, empty UTF-8 HTML5::Document. +encoding+ names the encoding
  # of +tags+ (nil when unknown); +options+ is passed on to #initialize.
  def self.parse(tags, encoding = nil, options = {})
    doc = HTML5::Document.new
    tags = HTML5.read_and_encode(tags, encoding)
    doc.encoding = 'UTF-8'
    new(doc, tags, nil, options)
  end

  # Split +params+ into [remaining params, handler, namespaces, bindings].
  # The handler is the first parameter that is not a Hash, String or
  # Symbol. Trailing Hash/nil parameters supply namespaces and bindings;
  # without namespaces, those of the fragment's children are merged.
  def extract_params params # :nodoc:
    handler = params.find do |param|
      ![Hash, String, Symbol].include?(param.class)
    end
    params -= [handler] if handler

    hashes = []
    while Hash === params.last || params.last.nil?
      hashes << params.pop
      break if params.empty?
    end
    ns, binds = hashes.reverse

    ns ||=
      begin
        ns = Hash.new
        children.each { |child| ns.merge!(child.namespaces) }
        ns
      end

    [params, handler, ns, binds]
  end

end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML5
|
5
|
+
# Overrides for XML::Node methods. Each override applies only to nodes
# whose document is an HTML5::Document; every other node is passed to the
# original implementation through +super+.
module Node
  # HTML elements can have attributes that contain colons.
  # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
  # and tries to create an attribute in a namespace. This is especially
  # annoying with attribute names like xml:lang since libxml2 will
  # actually create the xml namespace if it doesn't exist already.
  def add_child_node_and_reparent_attrs(node)
    return super(node) unless document.is_a?(HTML5::Document)
    # I'm not sure what this method is supposed to do. Reparenting
    # namespaces is handled by libxml2, including child namespaces which
    # this method wouldn't handle.
    # https://github.com/sparklemotion/nokogiri/issues/1790
    add_child_node(node)
    #node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
    #  attr.remove
    #  ns = attr.namespace
    #  a["#{ns.prefix}:#{attr.name}"] = attr.value
    #end
  end

  # Serialize the children of this node to a String of HTML.
  # With options[:preserve_newline] set, a newline is emitted first when
  # HTML5.prepend_newline? says this node needs one.
  def inner_html(options = {})
    return super(options) unless document.is_a?(HTML5::Document)
    result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? "\n" : ""
    result << children.map { |child| child.to_html(options) }.join
    result
  end

  # Serialize this node to +io+.
  #
  # Options are given either as a Hash (:encoding, :save_with, :indent,
  # :indent_text, :preserve_newline) or positionally as
  # (encoding, save_options). A block, when given, is yielded the
  # XML::Node::SaveOptions object so it can be adjusted.
  #
  # When the save options request XML or XHTML output the node is written
  # by native_write_to; otherwise HTML5.serialize_node_internal is used.
  def write_to(io, *options)
    return super(io, *options) unless document.is_a?(HTML5::Document)

    # Keep the Hash form and the positional form apart. Folding both into
    # one variable made the positional arguments unreachable.
    if options.first.is_a?(Hash)
      opts = options.first
      positional = []
    else
      opts = {}
      positional = options
    end

    encoding = opts[:encoding] || positional[0]
    if Nokogiri.jruby?
      save_options = opts[:save_with] || positional[1]
      indent_times = opts[:indent] || 0
    else
      save_options = opts[:save_with] || positional[1] || XML::Node::SaveOptions::FORMAT
      indent_times = opts[:indent] || 2
    end
    indent_string = (opts[:indent_text] || ' ') * indent_times

    config = XML::Node::SaveOptions.new(save_options.to_i)
    yield config if block_given?

    config_options = config.options
    xml_flags = XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML
    if (config_options & xml_flags) != 0
      # Use Nokogiri's serializing code.
      native_write_to(io, encoding, indent_string, config_options)
    else
      # Serialize including the current node.
      encoding ||= document.encoding || Encoding::UTF_8
      internal_ops = {
        preserve_newline: opts[:preserve_newline] || false
      }
      HTML5.serialize_node_internal(self, io, encoding, internal_ops)
    end
  end

  # Build a DocumentFragment from +tags+ in this node's document, with
  # this node as the context node.
  def fragment(tags)
    return super(tags) unless document.is_a?(HTML5::Document)
    DocumentFragment.new(document, tags, self)
  end
end
|
67
|
+
# Monkey patch
|
68
|
+
XML::Node.prepend(HTML5::Node)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
metadata
CHANGED
@@ -1,62 +1,75 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
8
|
-
|
8
|
+
- Stephen Checkoway
|
9
|
+
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date:
|
12
|
+
date: 2020-11-22 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: nokogiri
|
15
16
|
requirement: !ruby/object:Gem::Requirement
|
16
17
|
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '1.8'
|
17
21
|
- - ">="
|
18
22
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
23
|
+
version: 1.8.4
|
20
24
|
type: :runtime
|
21
25
|
prerelease: false
|
22
26
|
version_requirements: !ruby/object:Gem::Requirement
|
23
27
|
requirements:
|
28
|
+
- - "~>"
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.8'
|
24
31
|
- - ">="
|
25
32
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
33
|
+
version: 1.8.4
|
27
34
|
description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
|
28
35
|
access the result as a Nokogiri parsed document.
|
29
|
-
email:
|
36
|
+
email:
|
37
|
+
- rubys@intertwingly.net
|
38
|
+
- s@pahtak.org
|
30
39
|
executables: []
|
31
40
|
extensions:
|
32
|
-
- ext/
|
41
|
+
- ext/nokogumbo/extconf.rb
|
33
42
|
extra_rdoc_files: []
|
34
43
|
files:
|
35
44
|
- LICENSE.txt
|
36
45
|
- README.md
|
37
|
-
- ext/
|
38
|
-
- ext/
|
46
|
+
- ext/nokogumbo/extconf.rb
|
47
|
+
- ext/nokogumbo/nokogumbo.c
|
48
|
+
- gumbo-parser/src/ascii.c
|
49
|
+
- gumbo-parser/src/ascii.h
|
39
50
|
- gumbo-parser/src/attribute.c
|
40
51
|
- gumbo-parser/src/attribute.h
|
41
52
|
- gumbo-parser/src/char_ref.c
|
42
53
|
- gumbo-parser/src/char_ref.h
|
43
|
-
- gumbo-parser/src/char_ref.rl
|
44
54
|
- gumbo-parser/src/error.c
|
45
55
|
- gumbo-parser/src/error.h
|
56
|
+
- gumbo-parser/src/foreign_attrs.c
|
46
57
|
- gumbo-parser/src/gumbo.h
|
47
58
|
- gumbo-parser/src/insertion_mode.h
|
59
|
+
- gumbo-parser/src/macros.h
|
48
60
|
- gumbo-parser/src/parser.c
|
49
61
|
- gumbo-parser/src/parser.h
|
62
|
+
- gumbo-parser/src/replacement.h
|
50
63
|
- gumbo-parser/src/string_buffer.c
|
51
64
|
- gumbo-parser/src/string_buffer.h
|
52
65
|
- gumbo-parser/src/string_piece.c
|
53
|
-
- gumbo-parser/src/
|
66
|
+
- gumbo-parser/src/svg_attrs.c
|
67
|
+
- gumbo-parser/src/svg_tags.c
|
54
68
|
- gumbo-parser/src/tag.c
|
55
|
-
- gumbo-parser/src/
|
56
|
-
- gumbo-parser/src/
|
57
|
-
- gumbo-parser/src/
|
58
|
-
- gumbo-parser/src/
|
59
|
-
- gumbo-parser/src/tag_strings.h
|
69
|
+
- gumbo-parser/src/tag_lookup.c
|
70
|
+
- gumbo-parser/src/tag_lookup.h
|
71
|
+
- gumbo-parser/src/token_buffer.c
|
72
|
+
- gumbo-parser/src/token_buffer.h
|
60
73
|
- gumbo-parser/src/token_type.h
|
61
74
|
- gumbo-parser/src/tokenizer.c
|
62
75
|
- gumbo-parser/src/tokenizer.h
|
@@ -67,14 +80,21 @@ files:
|
|
67
80
|
- gumbo-parser/src/util.h
|
68
81
|
- gumbo-parser/src/vector.c
|
69
82
|
- gumbo-parser/src/vector.h
|
70
|
-
- gumbo-parser/visualc/include/strings.h
|
71
83
|
- lib/nokogumbo.rb
|
72
|
-
-
|
84
|
+
- lib/nokogumbo/html5.rb
|
85
|
+
- lib/nokogumbo/html5/document.rb
|
86
|
+
- lib/nokogumbo/html5/document_fragment.rb
|
87
|
+
- lib/nokogumbo/html5/node.rb
|
88
|
+
- lib/nokogumbo/version.rb
|
73
89
|
homepage: https://github.com/rubys/nokogumbo/#readme
|
74
90
|
licenses:
|
75
91
|
- Apache-2.0
|
76
|
-
metadata:
|
77
|
-
|
92
|
+
metadata:
|
93
|
+
bug_tracker_uri: https://github.com/rubys/nokogumbo/issues
|
94
|
+
changelog_uri: https://github.com/rubys/nokogumbo/blob/master/CHANGELOG.md
|
95
|
+
homepage_uri: https://github.com/rubys/nokogumbo/#readme
|
96
|
+
source_code_uri: https://github.com/rubys/nokogumbo
|
97
|
+
post_install_message:
|
78
98
|
rdoc_options: []
|
79
99
|
require_paths:
|
80
100
|
- lib
|
@@ -82,16 +102,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
82
102
|
requirements:
|
83
103
|
- - ">="
|
84
104
|
- !ruby/object:Gem::Version
|
85
|
-
version: '
|
105
|
+
version: '2.1'
|
86
106
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
107
|
requirements:
|
88
108
|
- - ">="
|
89
109
|
- !ruby/object:Gem::Version
|
90
110
|
version: '0'
|
91
111
|
requirements: []
|
92
|
-
|
93
|
-
|
94
|
-
signing_key:
|
112
|
+
rubygems_version: 3.1.2
|
113
|
+
signing_key:
|
95
114
|
specification_version: 4
|
96
115
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|
97
116
|
test_files: []
|