nokogumbo 1.5.0 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +237 -26
- data/ext/nokogumbo/extconf.rb +121 -0
- data/ext/nokogumbo/nokogumbo.c +793 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +5972 -6816
- data/gumbo-parser/src/char_ref.h +14 -45
- data/gumbo-parser/src/error.c +510 -163
- data/gumbo-parser/src/error.h +70 -147
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +577 -305
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +2922 -2228
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +2127 -1561
- data/gumbo-parser/src/tokenizer.h +41 -52
- data/gumbo-parser/src/tokenizer_states.h +281 -45
- data/gumbo-parser/src/utf8.c +98 -123
- data/gumbo-parser/src/utf8.h +84 -52
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +11 -173
- data/lib/nokogumbo/html5.rb +252 -0
- data/lib/nokogumbo/html5/document.rb +53 -0
- data/lib/nokogumbo/html5/document_fragment.rb +62 -0
- data/lib/nokogumbo/html5/node.rb +72 -0
- data/lib/nokogumbo/version.rb +3 -0
- metadata +43 -24
- data/ext/nokogumboc/extconf.rb +0 -60
- data/ext/nokogumboc/nokogumbo.c +0 -295
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
@@ -0,0 +1,252 @@
|
|
1
|
+
require 'nokogumbo/html5/document'
|
2
|
+
require 'nokogumbo/html5/document_fragment'
|
3
|
+
require 'nokogumbo/html5/node'
|
4
|
+
|
5
|
+
module Nokogiri
|
6
|
+
# Shorthand for parsing an HTML5 document: every argument, keyword option and
# the optional block are handed unchanged to Nokogiri::HTML5::Document.parse,
# whose result is returned.
def self.HTML5(string_or_io, url = nil, encoding = nil, **options, &block)
  document_class = Nokogiri::HTML5::Document
  document_class.parse(string_or_io, url, encoding, **options, &block)
end
|
10
|
+
|
11
|
+
module HTML5
|
12
|
+
# Namespace URIs that the serializer compares element and attribute
# namespaces against. HTML elements use the XHTML namespace.
HTML_NAMESPACE   = 'http://www.w3.org/1999/xhtml'.freeze
MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML'.freeze
SVG_NAMESPACE    = 'http://www.w3.org/2000/svg'.freeze
XLINK_NAMESPACE  = 'http://www.w3.org/1999/xlink'.freeze
XML_NAMESPACE    = 'http://www.w3.org/XML/1998/namespace'.freeze
XMLNS_NAMESPACE  = 'http://www.w3.org/2000/xmlns/'.freeze
|
19
|
+
|
20
|
+
# Parse an HTML5 document from +string+. This is a pass-through: positional
# arguments, keyword options and the block all go to Document.parse, and its
# return value is returned as-is.
def self.parse(string, url = nil, encoding = nil, **options, &block)
  Document.parse(string, url, encoding, **options, &block)
end
|
24
|
+
|
25
|
+
# Parse a fragment from +string+. The keyword options are collected into a
# Hash and passed, together with +string+ and +encoding+, as the third
# positional argument of DocumentFragment.parse.
def self.fragment(string, encoding = nil, **options)
  DocumentFragment.parse(string, encoding, options)
end
|
30
|
+
|
31
|
+
# Fetch and parse an HTML document from the web, following redirects,
# handling https, and determining the character encoding using HTML5
# rules.
#
# +uri+ may be a +String+ or a +URI+. +options+ contains HTTP headers and
# special options. Everything which is not a special option is sent as a
# request header. Special options include:
#  * :follow_limit => number of redirects which are followed (default 10)
#  * :basic_auth => [username, password]
#  * any key for which Net::HTTP has a setter (e.g. :open_timeout)
#  * :max_attributes, :max_errors, :max_parse_errors, :max_tree_depth =>
#    handed to the parser as keyword options
#
# A bare number is still accepted in place of +options+ (deprecated) and is
# used as the :follow_limit.
#
# Returns the parsed document, with the Net::HTTP response available through
# +doc.response+. Non-success responses, and redirects once the limit is
# used up, are raised through Net::HTTPResponse#value.
def self.get(uri, options = {})
  # Deprecated call style get(uri, limit): normalize first so everything
  # below can rely on +options+ being a Hash (it is iterated and merged).
  options = { :follow_limit => options } if Numeric === options

  headers = options.clone
  limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10

  # Parser limits are neither request headers nor Net::HTTP settings; pull
  # them out so they are only given to the parser.
  parser_options = {}
  [:max_attributes, :max_errors, :max_parse_errors, :max_tree_depth].each do |key|
    parser_options[key] = headers.delete(key) if headers.key?(key)
  end

  require 'net/http'
  uri = URI(uri) unless URI === uri

  http = Net::HTTP.new(uri.host, uri.port)

  # TLS / SSL support
  http.use_ssl = true if uri.scheme == 'https'

  # Pass through Net::HTTP override values, which currently include:
  #   :ca_file, :ca_path, :cert, :cert_store, :ciphers,
  #   :close_on_empty_response, :continue_timeout, :key, :open_timeout,
  #   :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
  #   :verify_callback, :verify_depth, :verify_mode
  options.each_key do |key|
    setter = "#{key}="
    http.send(setter, headers.delete(key)) if http.respond_to?(setter)
  end

  request = Net::HTTP::Get.new(uri.request_uri)

  # basic authentication: explicit option first, then credentials in the URI
  auth = headers.delete(:basic_auth)
  auth ||= [uri.user, uri.password] if uri.user && uri.password
  request.basic_auth auth.first, auth.last if auth

  # remaining options are treated as headers
  headers.each { |key, value| request[key.to_s] = value.to_s }

  response = http.request(request)

  case response
  when Net::HTTPSuccess
    # Pass the parser limits as real keyword arguments. Passing the whole
    # options Hash positionally would bind it to parse's +url+ parameter on
    # Ruby 3.
    doc = parse(reencode(response.body, response['content-type']), **parser_options)
    doc.instance_variable_set('@response', response)
    doc.class.send(:attr_reader, :response)
    doc
  when Net::HTTPRedirection
    # Out of redirects: let Net::HTTP raise for the redirect response.
    response.value if limit <= 1
    location = URI.join(uri, response['location'])
    # NOTE(review): the original options (including :basic_auth and custom
    # headers) are forwarded unchanged to the redirect target, even when it
    # is on a different host -- confirm that this is intended.
    get(location, options.merge(:follow_limit => limit - 1))
  else
    response.value
  end
end
|
86
|
+
|
87
|
+
private
|
88
|
+
|
89
|
+
# Return the contents of +string+ as a UTF-8 String.
#
# string::   either an object responding to +read+ (its whole content is
#            read) or an object responding to +to_str+.
# encoding:: encoding the data is in, or nil to keep the encoding the data
#            already carries.
#
# Data that is not tagged as UTF-8 afterwards is converted with reencode.
# The argument itself is never modified; a copy is relabelled instead.
def self.read_and_encode(string, encoding)
  if string.respond_to?(:read)
    # IO#read and StringIO#read take (length, buffer) and raise TypeError
    # when handed an +encoding:+ option, so read first and label the
    # result below.
    string = string.read
  else
    string = string.to_str
  end

  # The data is declared to be in +encoding+: relabel a copy of it.
  string = string.dup.force_encoding(encoding) if encoding

  # convert to UTF-8
  string = reencode(string) if string.encoding != Encoding::UTF_8
  string
end
|
112
|
+
|
113
|
+
# Charset sniffing is a complex and controversial topic that understandably
# isn't done _by default_ by the Ruby Net::HTTP library. This being said,
# it is a very real problem for consumers of HTML as the default for HTML
# is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser
# *only* supports utf-8.
#
# Accordingly, Nokogiri::HTML::Document.parse provides limited encoding
# detection. Following this lead, Nokogiri::HTML5 attempts to do likewise,
# while attempting to more closely follow the HTML5 standard.
#
# http://bugs.ruby-lang.org/issues/2567
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
#
# body::         String to convert. Sniffing only happens when it is tagged
#                as binary (ASCII-8BIT); otherwise its encoding is trusted.
# content_type:: optional Content-Type header value to search for a charset.
#
# Sniffing order: byte order mark, Content-Type charset, <meta> charset in
# the first 1024 bytes, then ISO-8859-1. Charset labels Ruby does not
# recognize are skipped so that the next source still gets a chance.
#
# Returns a UTF-8 encoded String; +body+ itself is not modified.
def self.reencode(body, content_type=nil)
  if body.encoding == Encoding::ASCII_8BIT
    # Map a charset label to an Encoding; nil when the label is missing,
    # empty or unknown to Ruby.
    lookup = lambda do |label|
      begin
        label && Encoding.find(label)
      rescue ArgumentError
        nil
      end
    end

    encoding = nil

    # look for a Byte Order Mark (BOM)
    initial_bytes = body[0..2].bytes
    if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
      encoding = Encoding::UTF_8
    elsif initial_bytes[0..1] == [0xFE, 0xFF]
      encoding = Encoding::UTF_16BE
    elsif initial_bytes[0..1] == [0xFF, 0xFE]
      encoding = Encoding::UTF_16LE
    end

    # look for a charset in the Content-Type header
    if content_type
      encoding ||= lookup.call(content_type[/charset=["']?(.*?)($|["';\s])/i, 1])
    end

    # look for a charset in a meta tag in the first 1024 bytes
    unless encoding
      data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
      data.scan(/<meta.*?>/m).each do |meta|
        encoding ||= lookup.call(meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1])
      end
    end

    # if all else fails, default to the official default encoding for HTML
    encoding ||= Encoding::ISO_8859_1

    # change the encoding to match the detected or inferred encoding
    body = body.dup.force_encoding(encoding)
  end

  body.encode(Encoding::UTF_8)
end
|
167
|
+
|
168
|
+
# Append the HTML serialization of +current_node+ and, recursively, of its
# children to +io+.
#
# current_node:: node to serialize; dispatch is on +current_node.type+.
# io::           target that supports <<.
# encoding::     passed to escape_text for text and attribute values.
# options::      Hash; only :preserve_newline is read here.
#
# Raises RuntimeError for node types that are not handled below.
def self.serialize_node_internal(current_node, io, encoding, options)
  case current_node.type
  when XML::Node::ELEMENT_NODE
    ns = current_node.namespace
    ns_uri = ns.nil? ? nil : ns.href
    # XXX(sfc): attach namespaces to all nodes, even html?
    if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
      tagname = current_node.name
    else
      tagname = "#{ns.prefix}:#{current_node.name}"
    end
    io << '<' << tagname
    current_node.attribute_nodes.each do |attr|
      attr_ns = attr.namespace
      if attr_ns.nil?
        attr_name = attr.name
      else
        # Kept in its own local so the element's ns_uri is not overwritten.
        attr_ns_uri = attr_ns.href
        # Attribute name with any "prefix:" part removed.
        local_name = attr.name.sub(/^[^:]*:/, '')
        if attr_ns_uri == XML_NAMESPACE
          attr_name = 'xml:' + local_name
        elsif attr_ns_uri == XMLNS_NAMESPACE && local_name == 'xmlns'
          attr_name = 'xmlns'
        elsif attr_ns_uri == XMLNS_NAMESPACE
          attr_name = 'xmlns:' + local_name
        elsif attr_ns_uri == XLINK_NAMESPACE
          attr_name = 'xlink:' + local_name
        else
          attr_name = "#{attr_ns.prefix}:#{attr.name}"
        end
      end
      io << ' ' << attr_name << '="' << escape_text(attr.content, encoding, true) << '"'
    end
    io << '>'
    # Elements in this list get neither children nor an end tag.
    if !%w[area base basefont bgsound br col embed frame hr img input keygen
           link meta param source track wbr].include?(current_node.name)
      io << "\n" if options[:preserve_newline] && prepend_newline?(current_node)
      current_node.children.each do |child|
        # XXX(sfc): Templates handled specially?
        serialize_node_internal(child, io, encoding, options)
      end
      io << '</' << tagname << '>'
    end
  when XML::Node::TEXT_NODE
    parent = current_node.parent
    # Text inside these elements is written verbatim, without escaping.
    if parent.element? && %w[style script xmp iframe noembed noframes plaintext noscript].include?(parent.name)
      io << current_node.content
    else
      io << escape_text(current_node.content, encoding, false)
    end
  when XML::Node::CDATA_SECTION_NODE
    io << '<![CDATA[' << current_node.content << ']]>'
  when XML::Node::COMMENT_NODE
    io << '<!--' << current_node.content << '-->'
  when XML::Node::PI_NODE
    # The HTML serialization algorithm emits "<?" target " " data ">".
    # The node's name is the target; content alone would drop it.
    io << '<?' << current_node.name << ' ' << current_node.content << '>'
  when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
    io << '<!DOCTYPE ' << current_node.name << '>'
  when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
    current_node.children.each do |child|
      serialize_node_internal(child, io, encoding, options)
    end
  else
    raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
  end
end
|
233
|
+
|
234
|
+
# Escape +text+ for inclusion in serialized HTML and convert it to
# +encoding+.
#
# text::           String to escape.
# encoding::       target encoding for the result.
# attribute_mode:: true for attribute values (escapes & , no-break space
#                  and "), false for text content (escapes & , no-break
#                  space, < and >).
#
# Each special character must map to its character reference; mapping a
# character to itself would leave markup-significant characters unescaped
# in the output.
def self.escape_text(text, encoding, attribute_mode)
  if attribute_mode
    text = text.gsub(/[&\u00a0"]/,
                     '&' => '&amp;', "\u00a0" => '&nbsp;', '"' => '&quot;')
  else
    text = text.gsub(/[&\u00a0<>]/,
                     '&' => '&amp;', "\u00a0" => '&nbsp;', '<' => '&lt;', '>' => '&gt;')
  end
  # Not part of the standard: characters the target encoding cannot
  # represent are written as hexadecimal character references.
  text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
end
|
245
|
+
|
246
|
+
# True when +node+ is named pre, textarea or listing, has at least one
# child, and that first child is a text node whose content begins with a
# newline. Callers use this to decide whether to emit an extra "\n".
def self.prepend_newline?(node)
  candidate = %w[pre textarea listing].include?(node.name) && !node.children.empty?
  if candidate
    leading = node.children[0]
    leading.text? && leading.content.start_with?("\n")
  else
    false
  end
end
|
251
|
+
end
|
252
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Nokogiri
|
2
|
+
module HTML5
|
3
|
+
# HTML5 document: a Nokogiri::HTML::Document subclass whose parsing is
# delegated to Nokogumbo.parse.
class Document < Nokogiri::HTML::Document
  # Parse +string_or_io+ (a String-like or readable object; nil is treated
  # as an empty string) into a document.
  #
  # If a block is given it is yielded the options Hash before parsing.
  # +encoding+ defaults to the input's own encoding unless that is binary;
  # +url+ defaults to the input's +path+ when it is readable and has one.
  #
  # Raises ArgumentError when the input responds to neither +read+ nor
  # +to_str+.
  def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
    yield options if block_given?
    string_or_io ||= ''

    if string_or_io.respond_to?(:encoding)
      source_encoding = string_or_io.encoding.name
      encoding ||= source_encoding if source_encoding != 'ASCII-8BIT'
    end

    readable = string_or_io.respond_to?(:read)
    url ||= string_or_io.path if readable && string_or_io.respond_to?(:path)

    if !readable && !string_or_io.respond_to?(:to_str)
      raise ArgumentError, "not a string or IO object"
    end

    do_parse(string_or_io, url, encoding, options)
  end

  # Parse from an object that must respond to +read+.
  def self.read_io(io, url = nil, encoding = nil, **options)
    unless io.respond_to?(:read)
      raise ArgumentError, "io object doesn't respond to :read"
    end
    do_parse(io, url, encoding, options)
  end

  # Parse from an object that must respond to +to_str+.
  def self.read_memory(string, url = nil, encoding = nil, **options)
    unless string.respond_to?(:to_str)
      raise ArgumentError, "string object doesn't respond to :to_str"
    end
    do_parse(string, url, encoding, options)
  end

  # Build a DocumentFragment for +tags+ in this document, using the root
  # node as the third (context) argument.
  def fragment(tags = nil)
    DocumentFragment.new(self, tags, root)
  end

  def to_xml(options = {}, &block)
    # Bypass XML::Document#to_xml which doesn't add
    # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
    node_to_xml = XML::Node.instance_method(:to_xml)
    node_to_xml.bind(self).call(options, &block)
  end

  # NOTE: a bare +private+ does not apply to methods defined with
  # +def self.+, so do_parse remains publicly callable.
  private

  # Convert the input to UTF-8, resolve the parser limits from +options+
  # (falling back to the Nokogumbo defaults) and run the parser. The
  # resulting document's encoding is set to UTF-8.
  def self.do_parse(string_or_io, url, encoding, options)
    utf8_source = HTML5.read_and_encode(string_or_io, encoding)
    attribute_limit = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
    error_limit = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
    depth_limit = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
    parsed = Nokogumbo.parse(utf8_source, url, attribute_limit, error_limit, depth_limit)
    parsed.encoding = 'UTF-8'
    parsed
  end
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML5
|
5
|
+
# HTML5 document fragment: a Nokogiri::HTML::DocumentFragment subclass whose
# content is parsed by Nokogumbo.fragment.
class DocumentFragment < Nokogiri::HTML::DocumentFragment
  attr_accessor :document
  attr_accessor :errors

  # Create a document fragment.
  #
  # doc::     document the fragment belongs to.
  # tags::    markup to parse; when nil the fragment is left empty.
  # ctx::     context passed through to Nokogumbo.fragment.
  # options:: Hash of parser limits: :max_attributes, :max_errors (or its
  #           alias :max_parse_errors, as accepted by Document parsing) and
  #           :max_tree_depth. Missing values use the Nokogumbo defaults.
  def initialize(doc, tags = nil, ctx = nil, options = {})
    self.document = doc
    self.errors = []
    return self unless tags

    max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
    # Accept :max_parse_errors too, so fragment and document parsing honor
    # the same option names.
    max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
    max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
    tags = Nokogiri::HTML5.read_and_encode(tags, nil)
    Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
  end

  def serialize(options = {}, &block)
    # Bypass XML::Document.serialize which doesn't support options even
    # though XML::Node.serialize does!
    XML::Node.instance_method(:serialize).bind(self).call(options, &block)
  end

  # Parse a document fragment from +tags+ inside a fresh HTML5::Document
  # whose encoding is set to UTF-8. Returns the new DocumentFragment.
  def self.parse(tags, encoding = nil, options = {})
    doc = HTML5::Document.new
    tags = HTML5.read_and_encode(tags, encoding)
    doc.encoding = 'UTF-8'
    new(doc, tags, nil, options)
  end

  # Split +params+ into [remaining params, handler, namespaces, bindings].
  #
  # The handler is the first parameter that is not a Hash, String or
  # Symbol. Trailing Hash/nil parameters are popped off and read, in their
  # original order, as namespaces and bindings. When no namespaces are
  # given, the namespaces of all children are merged and used instead.
  def extract_params params # :nodoc:
    handler = params.find do |param|
      ![Hash, String, Symbol].include?(param.class)
    end
    params -= [handler] if handler

    hashes = []
    while Hash === params.last || params.last.nil?
      hashes << params.pop
      break if params.empty?
    end
    ns, binds = hashes.reverse

    ns ||=
      begin
        ns = Hash.new
        children.each { |child| ns.merge!(child.namespaces) }
        ns
      end

    [params, handler, ns, binds]
  end

end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML5
|
5
|
+
# HTML5-aware overrides for XML::Node methods. Every method falls back to
# the original implementation (+super+) unless the node's document is an
# HTML5::Document.
module Node
  # For HTML5 documents, add +node+ with add_child_node only and skip the
  # attribute reparenting step.
  #
  # Background from the original author: HTML elements can have attributes
  # that contain colons. Nokogiri::XML::Node#[]= treats names with colons
  # as a prefixed QName and tries to create an attribute in a namespace.
  # This is especially annoying with attribute names like xml:lang since
  # libxml2 will actually create the xml namespace if it doesn't exist
  # already.
  def add_child_node_and_reparent_attrs(node)
    return super(node) unless document.is_a?(HTML5::Document)
    # I'm not sure what this method is supposed to do. Reparenting
    # namespaces is handled by libxml2, including child namespaces which
    # this method wouldn't handle.
    # https://github.com/sparklemotion/nokogiri/issues/1790
    add_child_node(node)
    #node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
    #  attr.remove
    #  ns = attr.namespace
    #  a["#{ns.prefix}:#{attr.name}"] = attr.value
    #end
  end

  # Concatenated to_html output of all children. With
  # options[:preserve_newline], a leading "\n" is added when
  # HTML5.prepend_newline? says so for this node.
  def inner_html(options = {})
    return super(options) unless document.is_a?(HTML5::Document)
    result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? "\n" : ""
    result << children.map { |child| child.to_html(options) }.join
    result
  end

  # Serialize this node to +io+.
  #
  # Arguments after +io+ are an optional options Hash (:encoding,
  # :save_with, :indent, :indent_text, :preserve_newline), optionally
  # followed by the legacy positional values encoding and save options.
  # If a block is given it is yielded the SaveOptions object. When the
  # resulting save options request AS_XML or AS_XHTML, Nokogiri's native
  # writer is used; otherwise the HTML5 serializer is.
  def write_to(io, *options)
    return super(io, *options) unless document.is_a?(HTML5::Document)
    # Keep the options Hash and the positional arguments apart. Reusing one
    # variable for both would turn options[0] / options[1] into Hash
    # lookups that always miss, losing the positional values.
    opts = options.first.is_a?(Hash) ? options.shift : {}
    encoding = opts[:encoding] || options[0]
    if Nokogiri.jruby?
      save_options = opts[:save_with] || options[1]
      indent_times = opts[:indent] || 0
    else
      save_options = opts[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
      indent_times = opts[:indent] || 2
    end
    indent_string = (opts[:indent_text] || ' ') * indent_times

    config = XML::Node::SaveOptions.new(save_options.to_i)
    yield config if block_given?

    config_options = config.options
    if (config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0)
      # Use Nokogiri's serializing code.
      native_write_to(io, encoding, indent_string, config_options)
    else
      # Serialize including the current node.
      encoding ||= document.encoding || Encoding::UTF_8
      internal_ops = {
        preserve_newline: opts[:preserve_newline] || false
      }
      HTML5.serialize_node_internal(self, io, encoding, internal_ops)
    end
  end

  # Build an HTML5 DocumentFragment from +tags+ with this node as the
  # context argument.
  def fragment(tags)
    return super(tags) unless document.is_a?(HTML5::Document)
    DocumentFragment.new(document, tags, self)
  end
end
|
67
|
+
# Monkey patch
|
68
|
+
XML::Node.prepend(HTML5::Node)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
metadata
CHANGED
@@ -1,62 +1,75 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
8
|
-
|
8
|
+
- Stephen Checkoway
|
9
|
+
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date:
|
12
|
+
date: 2020-11-22 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: nokogiri
|
15
16
|
requirement: !ruby/object:Gem::Requirement
|
16
17
|
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '1.8'
|
17
21
|
- - ">="
|
18
22
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
23
|
+
version: 1.8.4
|
20
24
|
type: :runtime
|
21
25
|
prerelease: false
|
22
26
|
version_requirements: !ruby/object:Gem::Requirement
|
23
27
|
requirements:
|
28
|
+
- - "~>"
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.8'
|
24
31
|
- - ">="
|
25
32
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
33
|
+
version: 1.8.4
|
27
34
|
description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
|
28
35
|
access the result as a Nokogiri parsed document.
|
29
|
-
email:
|
36
|
+
email:
|
37
|
+
- rubys@intertwingly.net
|
38
|
+
- s@pahtak.org
|
30
39
|
executables: []
|
31
40
|
extensions:
|
32
|
-
- ext/
|
41
|
+
- ext/nokogumbo/extconf.rb
|
33
42
|
extra_rdoc_files: []
|
34
43
|
files:
|
35
44
|
- LICENSE.txt
|
36
45
|
- README.md
|
37
|
-
- ext/
|
38
|
-
- ext/
|
46
|
+
- ext/nokogumbo/extconf.rb
|
47
|
+
- ext/nokogumbo/nokogumbo.c
|
48
|
+
- gumbo-parser/src/ascii.c
|
49
|
+
- gumbo-parser/src/ascii.h
|
39
50
|
- gumbo-parser/src/attribute.c
|
40
51
|
- gumbo-parser/src/attribute.h
|
41
52
|
- gumbo-parser/src/char_ref.c
|
42
53
|
- gumbo-parser/src/char_ref.h
|
43
|
-
- gumbo-parser/src/char_ref.rl
|
44
54
|
- gumbo-parser/src/error.c
|
45
55
|
- gumbo-parser/src/error.h
|
56
|
+
- gumbo-parser/src/foreign_attrs.c
|
46
57
|
- gumbo-parser/src/gumbo.h
|
47
58
|
- gumbo-parser/src/insertion_mode.h
|
59
|
+
- gumbo-parser/src/macros.h
|
48
60
|
- gumbo-parser/src/parser.c
|
49
61
|
- gumbo-parser/src/parser.h
|
62
|
+
- gumbo-parser/src/replacement.h
|
50
63
|
- gumbo-parser/src/string_buffer.c
|
51
64
|
- gumbo-parser/src/string_buffer.h
|
52
65
|
- gumbo-parser/src/string_piece.c
|
53
|
-
- gumbo-parser/src/
|
66
|
+
- gumbo-parser/src/svg_attrs.c
|
67
|
+
- gumbo-parser/src/svg_tags.c
|
54
68
|
- gumbo-parser/src/tag.c
|
55
|
-
- gumbo-parser/src/
|
56
|
-
- gumbo-parser/src/
|
57
|
-
- gumbo-parser/src/
|
58
|
-
- gumbo-parser/src/
|
59
|
-
- gumbo-parser/src/tag_strings.h
|
69
|
+
- gumbo-parser/src/tag_lookup.c
|
70
|
+
- gumbo-parser/src/tag_lookup.h
|
71
|
+
- gumbo-parser/src/token_buffer.c
|
72
|
+
- gumbo-parser/src/token_buffer.h
|
60
73
|
- gumbo-parser/src/token_type.h
|
61
74
|
- gumbo-parser/src/tokenizer.c
|
62
75
|
- gumbo-parser/src/tokenizer.h
|
@@ -67,14 +80,21 @@ files:
|
|
67
80
|
- gumbo-parser/src/util.h
|
68
81
|
- gumbo-parser/src/vector.c
|
69
82
|
- gumbo-parser/src/vector.h
|
70
|
-
- gumbo-parser/visualc/include/strings.h
|
71
83
|
- lib/nokogumbo.rb
|
72
|
-
-
|
84
|
+
- lib/nokogumbo/html5.rb
|
85
|
+
- lib/nokogumbo/html5/document.rb
|
86
|
+
- lib/nokogumbo/html5/document_fragment.rb
|
87
|
+
- lib/nokogumbo/html5/node.rb
|
88
|
+
- lib/nokogumbo/version.rb
|
73
89
|
homepage: https://github.com/rubys/nokogumbo/#readme
|
74
90
|
licenses:
|
75
91
|
- Apache-2.0
|
76
|
-
metadata:
|
77
|
-
|
92
|
+
metadata:
|
93
|
+
bug_tracker_uri: https://github.com/rubys/nokogumbo/issues
|
94
|
+
changelog_uri: https://github.com/rubys/nokogumbo/blob/master/CHANGELOG.md
|
95
|
+
homepage_uri: https://github.com/rubys/nokogumbo/#readme
|
96
|
+
source_code_uri: https://github.com/rubys/nokogumbo
|
97
|
+
post_install_message:
|
78
98
|
rdoc_options: []
|
79
99
|
require_paths:
|
80
100
|
- lib
|
@@ -82,16 +102,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
82
102
|
requirements:
|
83
103
|
- - ">="
|
84
104
|
- !ruby/object:Gem::Version
|
85
|
-
version: '
|
105
|
+
version: '2.1'
|
86
106
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
107
|
requirements:
|
88
108
|
- - ">="
|
89
109
|
- !ruby/object:Gem::Version
|
90
110
|
version: '0'
|
91
111
|
requirements: []
|
92
|
-
|
93
|
-
|
94
|
-
signing_key:
|
112
|
+
rubygems_version: 3.1.2
|
113
|
+
signing_key:
|
95
114
|
specification_version: 4
|
96
115
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|
97
116
|
test_files: []
|