nokogiri 1.11.0.rc1-x86-linux
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +7 -0
- data/LICENSE-DEPENDENCIES.md +1614 -0
- data/LICENSE.md +9 -0
- data/README.md +200 -0
- data/bin/nokogiri +118 -0
- data/dependencies.yml +74 -0
- data/ext/nokogiri/depend +358 -0
- data/ext/nokogiri/extconf.rb +695 -0
- data/ext/nokogiri/html_document.c +170 -0
- data/ext/nokogiri/html_document.h +10 -0
- data/ext/nokogiri/html_element_description.c +279 -0
- data/ext/nokogiri/html_element_description.h +10 -0
- data/ext/nokogiri/html_entity_lookup.c +32 -0
- data/ext/nokogiri/html_entity_lookup.h +8 -0
- data/ext/nokogiri/html_sax_parser_context.c +116 -0
- data/ext/nokogiri/html_sax_parser_context.h +11 -0
- data/ext/nokogiri/html_sax_push_parser.c +87 -0
- data/ext/nokogiri/html_sax_push_parser.h +9 -0
- data/ext/nokogiri/nokogiri.c +147 -0
- data/ext/nokogiri/nokogiri.h +122 -0
- data/ext/nokogiri/xml_attr.c +103 -0
- data/ext/nokogiri/xml_attr.h +9 -0
- data/ext/nokogiri/xml_attribute_decl.c +70 -0
- data/ext/nokogiri/xml_attribute_decl.h +9 -0
- data/ext/nokogiri/xml_cdata.c +62 -0
- data/ext/nokogiri/xml_cdata.h +9 -0
- data/ext/nokogiri/xml_comment.c +69 -0
- data/ext/nokogiri/xml_comment.h +9 -0
- data/ext/nokogiri/xml_document.c +617 -0
- data/ext/nokogiri/xml_document.h +23 -0
- data/ext/nokogiri/xml_document_fragment.c +48 -0
- data/ext/nokogiri/xml_document_fragment.h +10 -0
- data/ext/nokogiri/xml_dtd.c +202 -0
- data/ext/nokogiri/xml_dtd.h +10 -0
- data/ext/nokogiri/xml_element_content.c +123 -0
- data/ext/nokogiri/xml_element_content.h +10 -0
- data/ext/nokogiri/xml_element_decl.c +69 -0
- data/ext/nokogiri/xml_element_decl.h +9 -0
- data/ext/nokogiri/xml_encoding_handler.c +79 -0
- data/ext/nokogiri/xml_encoding_handler.h +8 -0
- data/ext/nokogiri/xml_entity_decl.c +110 -0
- data/ext/nokogiri/xml_entity_decl.h +10 -0
- data/ext/nokogiri/xml_entity_reference.c +52 -0
- data/ext/nokogiri/xml_entity_reference.h +9 -0
- data/ext/nokogiri/xml_io.c +61 -0
- data/ext/nokogiri/xml_io.h +11 -0
- data/ext/nokogiri/xml_libxml2_hacks.c +112 -0
- data/ext/nokogiri/xml_libxml2_hacks.h +12 -0
- data/ext/nokogiri/xml_namespace.c +111 -0
- data/ext/nokogiri/xml_namespace.h +14 -0
- data/ext/nokogiri/xml_node.c +1773 -0
- data/ext/nokogiri/xml_node.h +13 -0
- data/ext/nokogiri/xml_node_set.c +486 -0
- data/ext/nokogiri/xml_node_set.h +12 -0
- data/ext/nokogiri/xml_processing_instruction.c +56 -0
- data/ext/nokogiri/xml_processing_instruction.h +9 -0
- data/ext/nokogiri/xml_reader.c +668 -0
- data/ext/nokogiri/xml_reader.h +10 -0
- data/ext/nokogiri/xml_relax_ng.c +161 -0
- data/ext/nokogiri/xml_relax_ng.h +9 -0
- data/ext/nokogiri/xml_sax_parser.c +310 -0
- data/ext/nokogiri/xml_sax_parser.h +39 -0
- data/ext/nokogiri/xml_sax_parser_context.c +262 -0
- data/ext/nokogiri/xml_sax_parser_context.h +10 -0
- data/ext/nokogiri/xml_sax_push_parser.c +159 -0
- data/ext/nokogiri/xml_sax_push_parser.h +9 -0
- data/ext/nokogiri/xml_schema.c +205 -0
- data/ext/nokogiri/xml_schema.h +9 -0
- data/ext/nokogiri/xml_syntax_error.c +64 -0
- data/ext/nokogiri/xml_syntax_error.h +13 -0
- data/ext/nokogiri/xml_text.c +52 -0
- data/ext/nokogiri/xml_text.h +9 -0
- data/ext/nokogiri/xml_xpath_context.c +298 -0
- data/ext/nokogiri/xml_xpath_context.h +10 -0
- data/ext/nokogiri/xslt_stylesheet.c +266 -0
- data/ext/nokogiri/xslt_stylesheet.h +14 -0
- data/lib/nokogiri.rb +127 -0
- data/lib/nokogiri/2.4/nokogiri.so +0 -0
- data/lib/nokogiri/2.5/nokogiri.so +0 -0
- data/lib/nokogiri/2.6/nokogiri.so +0 -0
- data/lib/nokogiri/2.7/nokogiri.so +0 -0
- data/lib/nokogiri/css.rb +28 -0
- data/lib/nokogiri/css/node.rb +53 -0
- data/lib/nokogiri/css/parser.rb +751 -0
- data/lib/nokogiri/css/parser.y +272 -0
- data/lib/nokogiri/css/parser_extras.rb +92 -0
- data/lib/nokogiri/css/syntax_error.rb +8 -0
- data/lib/nokogiri/css/tokenizer.rb +154 -0
- data/lib/nokogiri/css/tokenizer.rex +55 -0
- data/lib/nokogiri/css/xpath_visitor.rb +232 -0
- data/lib/nokogiri/decorators/slop.rb +43 -0
- data/lib/nokogiri/html.rb +38 -0
- data/lib/nokogiri/html/builder.rb +36 -0
- data/lib/nokogiri/html/document.rb +336 -0
- data/lib/nokogiri/html/document_fragment.rb +50 -0
- data/lib/nokogiri/html/element_description.rb +24 -0
- data/lib/nokogiri/html/element_description_defaults.rb +672 -0
- data/lib/nokogiri/html/entity_lookup.rb +14 -0
- data/lib/nokogiri/html/sax/parser.rb +63 -0
- data/lib/nokogiri/html/sax/parser_context.rb +17 -0
- data/lib/nokogiri/html/sax/push_parser.rb +37 -0
- data/lib/nokogiri/jruby/dependencies.rb +20 -0
- data/lib/nokogiri/syntax_error.rb +5 -0
- data/lib/nokogiri/version.rb +149 -0
- data/lib/nokogiri/xml.rb +76 -0
- data/lib/nokogiri/xml/attr.rb +15 -0
- data/lib/nokogiri/xml/attribute_decl.rb +19 -0
- data/lib/nokogiri/xml/builder.rb +447 -0
- data/lib/nokogiri/xml/cdata.rb +12 -0
- data/lib/nokogiri/xml/character_data.rb +8 -0
- data/lib/nokogiri/xml/document.rb +280 -0
- data/lib/nokogiri/xml/document_fragment.rb +161 -0
- data/lib/nokogiri/xml/dtd.rb +33 -0
- data/lib/nokogiri/xml/element_content.rb +37 -0
- data/lib/nokogiri/xml/element_decl.rb +14 -0
- data/lib/nokogiri/xml/entity_decl.rb +20 -0
- data/lib/nokogiri/xml/entity_reference.rb +19 -0
- data/lib/nokogiri/xml/namespace.rb +14 -0
- data/lib/nokogiri/xml/node.rb +916 -0
- data/lib/nokogiri/xml/node/save_options.rb +62 -0
- data/lib/nokogiri/xml/node_set.rb +372 -0
- data/lib/nokogiri/xml/notation.rb +7 -0
- data/lib/nokogiri/xml/parse_options.rb +121 -0
- data/lib/nokogiri/xml/pp.rb +3 -0
- data/lib/nokogiri/xml/pp/character_data.rb +19 -0
- data/lib/nokogiri/xml/pp/node.rb +57 -0
- data/lib/nokogiri/xml/processing_instruction.rb +9 -0
- data/lib/nokogiri/xml/reader.rb +116 -0
- data/lib/nokogiri/xml/relax_ng.rb +33 -0
- data/lib/nokogiri/xml/sax.rb +5 -0
- data/lib/nokogiri/xml/sax/document.rb +172 -0
- data/lib/nokogiri/xml/sax/parser.rb +123 -0
- data/lib/nokogiri/xml/sax/parser_context.rb +17 -0
- data/lib/nokogiri/xml/sax/push_parser.rb +61 -0
- data/lib/nokogiri/xml/schema.rb +64 -0
- data/lib/nokogiri/xml/searchable.rb +231 -0
- data/lib/nokogiri/xml/syntax_error.rb +71 -0
- data/lib/nokogiri/xml/text.rb +10 -0
- data/lib/nokogiri/xml/xpath.rb +11 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +12 -0
- data/lib/nokogiri/xml/xpath_context.rb +17 -0
- data/lib/nokogiri/xslt.rb +57 -0
- data/lib/nokogiri/xslt/stylesheet.rb +26 -0
- data/lib/xsd/xmlparser/nokogiri.rb +103 -0
- metadata +482 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
module Nokogiri
|
2
|
+
module CSS
|
3
|
+
class Tokenizer # :nodoc:
|
4
|
+
|
5
|
+
macro
|
6
|
+
nl \n|\r\n|\r|\f
|
7
|
+
w [\s]*
|
8
|
+
nonascii [^\0-\177]
|
9
|
+
num -?([0-9]+|[0-9]*\.[0-9]+)
|
10
|
+
unicode \\[0-9A-Fa-f]{1,6}(\r\n|[\s])?
|
11
|
+
|
12
|
+
escape {unicode}|\\[^\n\r\f0-9A-Fa-f]
|
13
|
+
nmchar [_A-Za-z0-9-]|{nonascii}|{escape}
|
14
|
+
nmstart [_A-Za-z]|{nonascii}|{escape}
|
15
|
+
ident [-@]?({nmstart})({nmchar})*
|
16
|
+
name ({nmchar})+
|
17
|
+
string1 "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
|
18
|
+
string2 '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
|
19
|
+
string {string1}|{string2}
|
20
|
+
|
21
|
+
rule
|
22
|
+
|
23
|
+
# [:state] pattern [actions]
# Every action below returns a two-element array: the token type followed by
# the matched text.
# NOTE(review): the dedicated "has" rule is listed ahead of the generic
# FUNCTION rule; this presumably relies on rules being tried in listed order.
# Confirm against the Rexical documentation before reordering anything.
|
24
|
+
|
25
|
+
has\({w} { [:HAS, text] }
|
26
|
+
{ident}\({w} { [:FUNCTION, text] }
|
27
|
+
{ident} { [:IDENT, text] }
|
28
|
+
\#{name} { [:HASH, text] }
|
29
|
+
{w}~={w} { [:INCLUDES, text] }
|
30
|
+
{w}\|={w} { [:DASHMATCH, text] }
|
31
|
+
{w}\^={w} { [:PREFIXMATCH, text] }
|
32
|
+
{w}\$={w} { [:SUFFIXMATCH, text] }
|
33
|
+
{w}\*={w} { [:SUBSTRINGMATCH, text] }
|
34
|
+
{w}!={w} { [:NOT_EQUAL, text] }
|
35
|
+
{w}={w} { [:EQUAL, text] }
|
36
|
+
{w}\) { [:RPAREN, text] }
|
37
|
+
\[{w} { [:LSQUARE, text] }
|
38
|
+
{w}\] { [:RSQUARE, text] }
|
39
|
+
{w}\+{w} { [:PLUS, text] }
|
40
|
+
{w}>{w} { [:GREATER, text] }
|
41
|
+
{w},{w} { [:COMMA, text] }
|
42
|
+
{w}~{w} { [:TILDE, text] }
|
43
|
+
\:not\({w} { [:NOT, text] }
|
44
|
+
{num} { [:NUMBER, text] }
|
45
|
+
{w}\/\/{w} { [:DOUBLESLASH, text] }
|
46
|
+
{w}\/{w} { [:SLASH, text] }
|
47
|
+
|
48
|
+
U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})? {[:UNICODE_RANGE, text] }
|
49
|
+
|
50
|
+
[\s]+ { [:S, text] }
|
51
|
+
{string} { [:STRING, text] }
|
52
|
+
# Fallback: any other single character is returned with the character itself
# as its token type.
. { [text, text] }
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,232 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module Nokogiri
|
3
|
+
module CSS
|
4
|
+
# Turns a parsed CSS selector tree into an XPath expression string.
#
# Every +visit_*+ method takes one selector node and returns the XPath
# fragment for it. Child nodes are rendered by calling +accept+ on them with
# this visitor, so the methods recurse through the tree.
class XPathVisitor # :nodoc:
  # Render a functional pseudo-class such as "nth-child(" or "contains(".
  #
  # +node.value.first+ is the function name including its opening paren and
  # +node.value[1]+ is the first argument. If the visitor responds to
  # +visit_function_<name>+ that method takes over, which lets subclasses add
  # their own functions. Unknown functions are emitted as an XPath function
  # call with "." prepended to the argument list.
  def visit_function(node)
    name = node.value.first
    msg = :"visit_function_#{name.gsub(/[(]/, '')}"
    return send(msg, node) if respond_to?(msg)

    arg = node.value[1]
    case name
    when /^text\(/
      'child::text()'
    when /^self\(/
      "self::#{arg}"
    when /^eq\(/
      "position() = #{arg}"
    when /^(nth|nth-of-type)\(/
      nth_expression?(arg) ? nth(arg) : "position() = #{arg}"
    when /^nth-child\(/
      if nth_expression?(arg)
        nth(arg, :child => true)
      else
        "count(preceding-sibling::*) = #{arg.to_i - 1}"
      end
    when /^nth-last-of-type\(/
      if nth_expression?(arg)
        nth(arg, :last => true)
      else
        index = arg.to_i - 1
        index == 0 ? "position() = last()" : "position() = last() - #{index}"
      end
    when /^nth-last-child\(/
      if nth_expression?(arg)
        nth(arg, :last => true, :child => true)
      else
        "count(following-sibling::*) = #{arg.to_i - 1}"
      end
    when /^(first|first-of-type)\(/
      "position() = 1"
    when /^(last|last-of-type)\(/
      "position() = last()"
    when /^contains\(/
      "contains(., #{arg})"
    when /^gt\(/
      "position() > #{arg}"
    when /^only-child\(/
      "last() = 1"
    when /^comment\(/
      "comment()"
    when /^has\(/
      # A nil left-hand side means the argument starts with a combinator,
      # e.g. "has(> a)", "has(~ a)", "has(+ a)"; it then supplies its own axis.
      is_direct = arg.value[0].nil?
      ".#{'//' unless is_direct}#{arg.accept(self)}"
    else
      args = ['.'] + node.value[1..-1]
      "#{name}#{args.join(', ')})"
    end
  end

  # Render ":not(...)". A bare element name is tested on the self axis.
  def visit_not(node)
    child = node.value.first
    if child.type == :ELEMENT_NAME
      "not(self::#{child.accept(self)})"
    else
      "not(#{child.accept(self)})"
    end
  end

  # Render "#some-id" as an @id comparison (the leading "#" is dropped).
  def visit_id(node)
    id = node.value.first[/^#(.*)$/, 1]
    "@id = '#{id}'"
  end

  # Render an attribute selector such as [href], [href='x'] or [href^='x'].
  #
  # +node.value+ is either [target] or [target, operator, value]. The target
  # is prefixed with "@" unless it is a function call or already names an
  # axis ("::"). Values are quoted when they are not already quoted, and a
  # quoted value that contains its own quote character is rebuilt with
  # concat() because XPath 1.0 string literals have no escape syntax.
  def visit_attribute_condition(node)
    target = node.value.first
    prefix = (target.type == :FUNCTION || target.value.first =~ /::/) ? '' : '@'

    # Support non-standard css where the attribute is already written as "@name".
    attribute = (prefix + target.accept(self)).gsub(/^@@/, '@')

    return attribute unless node.value.length == 3

    value = node.value.last
    value = "'#{value}'" if value !~ /^['"]/

    if (value[0] == value[-1]) && %q{"'}.include?(value[0])
      str_value = value[1..-2]
      if str_value.include?(value[0])
        value = 'concat("' + str_value.split('"', -1).join(%q{", '"', "}) + '", "")'
      end
    end

    case node.value[1]
    when :equal
      "#{attribute} = #{value}"
    when :not_equal
      "#{attribute} != #{value}"
    when :substring_match
      "contains(#{attribute}, #{value})"
    when :prefix_match
      "starts-with(#{attribute}, #{value})"
    when :dash_match
      "#{attribute} = #{value} or starts-with(#{attribute}, concat(#{value}, '-'))"
    when :includes
      "contains(concat(\" \", #{attribute}, \" \"),concat(\" \", #{value}, \" \"))"
    when :suffix_match
      "substring(#{attribute}, string-length(#{attribute}) - " \
        "string-length(#{value}) + 1, string-length(#{value})) = #{value}"
    else
      "#{attribute} #{node.value[1]} #{value}"
    end
  end

  # Render a pseudo-class. Functional pseudo-classes are delegated to the
  # function node; otherwise a +visit_pseudo_class_<name>+ method is used if
  # the visitor responds to it, then the built-in table, and finally the
  # name is emitted as an XPath function applied to ".".
  def visit_pseudo_class(node)
    first = node.value.first
    return first.accept(self) if first.is_a?(Nokogiri::CSS::Node) && first.type == :FUNCTION

    msg = :"visit_pseudo_class_#{first.gsub(/[(]/, '')}"
    return send(msg, node) if respond_to?(msg)

    case first
    when "first" then "position() = 1"
    when "first-child" then "count(preceding-sibling::*) = 0"
    when "last" then "position() = last()"
    when "last-child" then "count(following-sibling::*) = 0"
    when "first-of-type" then "position() = 1"
    when "last-of-type" then "position() = last()"
    when "only-child" then "count(preceding-sibling::*) = 0 and count(following-sibling::*) = 0"
    when "only-of-type" then "last() = 1"
    when "empty" then "not(node())"
    when "parent" then "node()"
    when "root" then "not(parent::*)"
    else
      "#{first}(.)"
    end
  end

  # Render ".name" as a whitespace-delimited token test on @class.
  def visit_class_condition(node)
    "contains(concat(' ', normalize-space(@class), ' '), ' #{node.value.first} ')"
  end

  # Join two conditions that apply to the same element. An *-of-type
  # pseudo-class on the right closes the current predicate and opens a new
  # one ("][") ; everything else is joined with "and".
  def visit_combinator(node)
    left = node.value.first
    right = node.value.last
    joiner = is_of_type_pseudo_class?(right) ? "][" : " and "
    "#{left.accept(self) if left}#{joiner}#{right.accept(self)}"
  end

  # Define visit_direct_adjacent_selector, visit_following_selector,
  # visit_descendant_selector and visit_child_selector. Each joins the
  # (optional) left node and the right node with the listed XPath step.
  {
    'direct_adjacent_selector' => "/following-sibling::*[1]/self::",
    'following_selector' => "/following-sibling::",
    'descendant_selector' => '//',
    'child_selector' => '/',
  }.each do |selector, step|
    define_method(:"visit_#{selector}") do |node|
      "#{node.value.first.accept(self) if node.value.first}#{step}#{node.value.last.accept(self)}"
    end
  end

  # Render "selector[condition]".
  def visit_conditional_selector(node)
    "#{node.value.first.accept(self)}[#{node.value.last.accept(self)}]"
  end

  # An element name is emitted unchanged.
  def visit_element_name(node)
    node.value.first
  end

  # Entry point: render +node+ with this visitor.
  def accept(node)
    node.accept(self)
  end

  private

  # True when +arg+ is a parsed an+b expression rather than a plain value.
  def nth_expression?(arg)
    arg.is_a?(Nokogiri::CSS::Node) && arg.type == :NTH
  end

  # Render an an+b expression.
  #
  # options[:child] counts siblings instead of using position();
  # options[:last] counts from the end. Raises ArgumentError unless the node
  # holds exactly four tokens.
  def nth(node, options = {})
    unless node.value.size == 4
      raise ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}"
    end

    a, b = read_a_and_positive_b(node.value)
    position = if options[:child]
      options[:last] ? "(count(following-sibling::*) + 1)" : "(count(preceding-sibling::*) + 1)"
    else
      options[:last] ? "(last()-position()+1)" : "position()"
    end

    # With a zero step the formula is the constant b. Emitting "mod 0" here
    # would yield NaN in XPath and the predicate could never be true.
    return "#{position} = #{b}" if a.zero?

    if b.zero?
      "(#{position} mod #{a}) = 0"
    else
      compare = a < 0 ? "<=" : ">="
      if a.abs == 1
        "#{position} #{compare} #{b}"
      else
        "(#{position} #{compare} #{b}) and (((#{position}-#{b}) mod #{a.abs}) = 0)"
      end
    end
  end

  # Extract [a, b] from the four an+b tokens [a, "n", operator, b].
  #
  # For "an-b" the offset is folded into the range of the step so the modulo
  # test in #nth still lines up. With a zero step there is nothing to fold
  # (and nothing to divide by), so the offset is simply negated.
  # Raises ArgumentError when the operator is neither "+" nor "-".
  def read_a_and_positive_b(values)
    op = values[2]
    unless ["+", "-"].include?(op)
      raise ArgumentError, "expected an+b node to have either + or - as the operator, but is #{op.inspect}"
    end

    a = values[0].to_i
    offset = values[3].to_i
    b = if op == "+"
      offset
    elsif a.zero?
      -offset
    else
      a - (offset % a)
    end
    [a, b]
  end

  # Truthy when +node+ is a pseudo-class (plain or functional) whose name
  # contains nth-/first-/last-/only-of-type; nil otherwise.
  def is_of_type_pseudo_class?(node)
    return unless node.type == :PSEUDO_CLASS

    head = node.value[0]
    name = (head.is_a?(Nokogiri::CSS::Node) && head.type == :FUNCTION) ? head.value[0] : head
    name =~ /(nth|first|last|only)-of-type(\()?/
  end
end
|
231
|
+
end
|
232
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module Nokogiri
|
3
|
+
module Decorators
|
4
|
+
###
# The Slop decorator implements method_missing so that plain method calls
# may be used instead of XPath or CSS queries. See Nokogiri.Slop
module Slop
  # The default XPath search context for Slop
  XPATH_PREFIX = "./"

  ###
  # Look for nodes named +name+. See Nokogiri.Slop
  #
  # * With no arguments, runs the XPath "./name" (one leading underscore in
  #   +name+ is dropped).
  # * With a Hash, uses hash[:css] as a CSS suffix for +name+, or hash[:xpath]
  #   (a condition or list of conditions joined with "and") as an XPath
  #   predicate.
  # * With any other argument, treats it as a CSS suffix and converts
  #   "name<arg>" to XPath with the CSS parser cache turned off.
  #
  # Returns the single match, or the whole list when several nodes match.
  # When nothing matches, or a Hash carries neither :css nor :xpath, the call
  # is passed on to +super+.
  def method_missing(name, *args, &block)
    list = nil
    query = args.first

    if args.empty?
      list = xpath("#{XPATH_PREFIX}#{name.to_s.sub(/^_/, '')}")
    elsif query.is_a?(Hash)
      if query[:css]
        list = css("#{name}#{query[:css]}")
      elsif query[:xpath]
        conds = Array(query[:xpath]).join(' and ')
        list = xpath("#{XPATH_PREFIX}#{name}[#{conds}]")
      end
      # A Hash with neither key leaves list nil; handled below.
    else
      CSS::Parser.without_cache do
        list = xpath(
          *CSS.xpath_for("#{name}#{query}", :prefix => XPATH_PREFIX)
        )
      end
    end

    # Previously a nil list blew up on list.empty? with a NoMethodError about
    # nil; treat it like "no match" so the error names the missing method.
    return super if list.nil? || list.empty?

    list.length == 1 ? list.first : list
  end

  # True when the XPath "./name" (one leading underscore dropped) matches at
  # least one node, mirroring the no-argument form of #method_missing.
  def respond_to_missing?(name, include_private = false)
    list = xpath("#{XPATH_PREFIX}#{name.to_s.sub(/^_/, '')}")

    !list.empty?
  end
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'nokogiri/html/entity_lookup'
|
3
|
+
require 'nokogiri/html/document'
|
4
|
+
require 'nokogiri/html/document_fragment'
|
5
|
+
require 'nokogiri/html/sax/parser_context'
|
6
|
+
require 'nokogiri/html/sax/parser'
|
7
|
+
require 'nokogiri/html/sax/push_parser'
|
8
|
+
require 'nokogiri/html/element_description'
|
9
|
+
require 'nokogiri/html/element_description_defaults'
|
10
|
+
|
11
|
+
module Nokogiri
|
12
|
+
class << self
  ###
  # Parse HTML. Shorthand that forwards every argument, and the optional
  # block, to Nokogiri::HTML::Document.parse and returns its result.
  def HTML(thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
    document_class = Nokogiri::HTML::Document
    document_class.parse(thing, url, encoding, options, &block)
  end
end
|
19
|
+
|
20
|
+
module HTML
  class << self
    ###
    # Parse HTML. Shorthand that forwards every argument, and the optional
    # block, to Nokogiri::HTML::Document.parse.
    def parse(thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
      Document.parse(thing, url, encoding, options, &block)
    end

    ####
    # Parse an HTML fragment from +string+, using +encoding+ when given, by
    # delegating to HTML::DocumentFragment.parse.
    def fragment(string, encoding = nil)
      HTML::DocumentFragment.parse(string, encoding)
    end
  end

  # Instance of Nokogiri::HTML::EntityLookup
  NamedCharacters = EntityLookup.new
end
|
38
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module Nokogiri
|
3
|
+
module HTML
|
4
|
+
###
|
5
|
+
# Nokogiri HTML builder is used for building HTML documents. It is very
|
6
|
+
# similar to the Nokogiri::XML::Builder. In fact, you should go read the
|
7
|
+
# documentation for Nokogiri::XML::Builder before reading this
|
8
|
+
# documentation.
|
9
|
+
#
|
10
|
+
# == Synopsis:
|
11
|
+
#
|
12
|
+
# Create an HTML document with a body that has an onload attribute, and a
|
13
|
+
# span tag with a class of "bold" that has content of "Hello world".
|
14
|
+
#
|
15
|
+
# builder = Nokogiri::HTML::Builder.new do |doc|
|
16
|
+
# doc.html {
|
17
|
+
# doc.body(:onload => 'some_func();') {
|
18
|
+
# doc.span.bold {
|
19
|
+
# doc.text "Hello world"
|
20
|
+
# }
|
21
|
+
# }
|
22
|
+
# }
|
23
|
+
# end
|
24
|
+
# puts builder.to_html
|
25
|
+
#
|
26
|
+
# The HTML builder inherits from the XML builder, so make sure to read the
|
27
|
+
# Nokogiri::XML::Builder documentation.
|
28
|
+
class Builder < Nokogiri::XML::Builder
|
29
|
+
###
|
30
|
+
# Convert the builder to HTML by calling +to_html+ on the document held
# in @doc and returning the result.
|
31
|
+
def to_html
|
32
|
+
@doc.to_html
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,336 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module Nokogiri
|
3
|
+
module HTML
|
4
|
+
class Document < Nokogiri::XML::Document
  ###
  # Get the meta tag encoding for this document. A <meta charset> tag is
  # consulted first, then a <meta http-equiv="Content-Type"> tag. If
  # neither is present, nil is returned.
  def meta_encoding
    if (meta = at('//meta[@charset]'))
      meta[:charset]
    elsif (meta = meta_content_type)
      meta['content'][/charset\s*=\s*([\w-]+)/i, 1]
    end
  end

  ###
  # Set the meta tag encoding for this document.
  #
  # If a meta encoding tag is already present, its content is
  # replaced with the given text.
  #
  # Otherwise, this method tries to create one at an appropriate
  # place supplying head and/or html elements as necessary, which
  # is inside a head element if any, and before any text node or
  # content element (typically <body>) if any.
  #
  # The result when trying to set an encoding that is different
  # from the document encoding is undefined.
  #
  # Beware in CRuby, that libxml2 automatically inserts a meta tag
  # into a head element.
  def meta_encoding=(encoding)
    if (meta = meta_content_type)
      meta['content'] = 'text/html; charset=%s' % encoding
      return encoding
    end

    if (meta = at('//meta[@charset]'))
      meta['charset'] = encoding
      return encoding
    end

    # No existing declaration: build one in the form that suits the doctype.
    meta = XML::Node.new('meta', self)
    if (dtd = internal_subset) && dtd.html5_dtd?
      meta['charset'] = encoding
    else
      meta['http-equiv'] = 'Content-Type'
      meta['content'] = 'text/html; charset=%s' % encoding
    end

    if (head = at('//head'))
      head.prepend_child(meta)
    else
      set_metadata_element(meta)
    end
    encoding
  end

  # Find the first <meta> element that has a non-empty content attribute
  # and whose http-equiv attribute is "Content-Type" (case-insensitive).
  # Returns nil when there is none.
  def meta_content_type
    xpath('//meta[@http-equiv and boolean(@content)]').find { |node|
      node['http-equiv'] =~ /\AContent-Type\z/i
    }
  end
  private :meta_content_type

  ###
  # Get the title string of this document. Return nil if there is
  # no title tag.
  def title
    (node = at('//title')) && node.inner_text
  end

  ###
  # Set the title string of this document.
  #
  # If a title element is already present, its content is replaced
  # with the given text.
  #
  # Otherwise, this method tries to create one at an appropriate
  # place supplying head and/or html elements as necessary, which
  # is inside a head element if any, right after a meta
  # encoding/charset tag if any, and before any text node or
  # content element (typically <body>) if any.
  def title=(text)
    tnode = XML::Text.new(text, self)
    if (title = at('//title'))
      title.children = tnode
      return text
    end

    title = XML::Node.new('title', self) << tnode
    if (head = at('//head'))
      head << title
    elsif (meta = at('//meta[@charset]') || meta_content_type)
      # better put after charset declaration
      meta.add_next_sibling(title)
    else
      set_metadata_element(title)
    end
    text
  end

  # Insert +element+ (a meta or title node) where document metadata
  # belongs: in the existing <head>; else in a new <head> at the top of
  # <html>; else before the first element or text child of the document;
  # else in a freshly created <html><head>.
  def set_metadata_element(element)
    if (head = at('//head'))
      head << element
    elsif (html = at('//html'))
      head = html.prepend_child(XML::Node.new('head', self))
      head.prepend_child(element)
    elsif (first = children.find { |node|
      case node
      when XML::Element, XML::Text
        true
      end
    })
      # We reach here only if the underlying document model
      # allows <html>/<head> elements to be omitted and does not
      # automatically supply them.
      first.add_previous_sibling(element)
    else
      html = add_child(XML::Node.new('html', self))
      head = html.add_child(XML::Node.new('head', self))
      head.prepend_child(element)
    end
  end
  private :set_metadata_element

  ####
  # Serialize Node using +options+. Save options can also be set using a
  # block. See SaveOptions.
  #
  # These two statements are equivalent:
  #
  #   node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
  #
  # or
  #
  #   node.serialize(:encoding => 'UTF-8') do |config|
  #     config.format.as_xml
  #   end
  #
  # When +options+ has no :save_with entry, DEFAULT_HTML is stored into the
  # given hash before it is handed to the superclass.
  def serialize(options = {})
    options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
    super
  end

  ####
  # Create a DocumentFragment from +tags+, using this document's root as
  # the context node.
  def fragment(tags = nil)
    DocumentFragment.new(self, tags, root)
  end

  class << self
    ###
    # Parse HTML. +string_or_io+ may be a String, or any object that
    # responds to _read_ and _close_ such as an IO, or StringIO.
    # +url+ is resource where this document is located. +encoding+ is the
    # encoding that should be used when processing the document. +options+
    # is a number that sets options in the parser, such as
    # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
    # Nokogiri::XML::ParseOptions.
    #
    # When a block is given it is yielded the ParseOptions object so the
    # caller can adjust it before parsing starts.
    def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
      options = Nokogiri::XML::ParseOptions.new(options) if options.is_a?(Integer)
      # Give the options to the user
      yield options if block_given?

      # A string's own encoding is used unless it is binary.
      if string_or_io.respond_to?(:encoding)
        unless string_or_io.encoding.name == "ASCII-8BIT"
          encoding ||= string_or_io.encoding.name
        end
      end

      if string_or_io.respond_to?(:read)
        url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
        unless encoding
          # Libxml2's parser has poor support for encoding
          # detection.  First, it does not recognize the HTML5
          # style meta charset declaration.  Secondly, even if it
          # successfully detects an encoding hint, it does not
          # re-decode or re-parse the preceding part which may be
          # garbled.
          #
          # EncodingReader aims to perform advanced encoding
          # detection beyond what Libxml2 does, and to emulate
          # rewinding of a stream and make Libxml2 redo parsing
          # from the start when an encoding hint is found.
          string_or_io = EncodingReader.new(string_or_io)
          begin
            return read_io(string_or_io, url, encoding, options.to_i)
          rescue EncodingFound => e
            encoding = e.found_encoding
          end
        end
        return read_io(string_or_io, url, encoding, options.to_i)
      end

      # read_memory pukes on empty docs
      if string_or_io.nil? || string_or_io.empty?
        return encoding ? new.tap { |i| i.encoding = encoding } : new
      end

      encoding ||= EncodingReader.detect_encoding(string_or_io)

      read_memory(string_or_io, url, encoding, options.to_i)
    end
  end

  # Raised by EncodingReader#read to abort a parse once an encoding hint
  # has been found; carries the detected name in +found_encoding+.
  class EncodingFound < StandardError # :nodoc:
    attr_reader :found_encoding

    def initialize(encoding)
      @found_encoding = encoding
      super("encoding found: %s" % encoding)
    end
  end

  # IO wrapper that sniffs the first chunk read for an encoding
  # declaration and replays that chunk on the following reads.
  class EncodingReader # :nodoc:
    # SAX handler that records the encoding declared by <meta> elements.
    class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
      attr_reader :encoding

      def initialize
        @encoding = nil
        super()
      end

      # Record the charset from <meta charset=...> and from
      # <meta http-equiv="Content-Type" content="...; charset=...">.
      # When one element carries both, the http-equiv form wins because
      # it is evaluated last.
      def start_element(name, attrs = [])
        return unless name == 'meta'

        attr = Hash[attrs]

        charset = attr['charset']
        @encoding = charset if charset

        http_equiv = attr['http-equiv']
        content = attr['content']
        return unless http_equiv && http_equiv.match(/\AContent-Type\z/i) && content

        # Matched case-insensitively, in line with Document#meta_encoding,
        # so that "CHARSET=utf-8" is recognised too.
        m = content.match(/;\s*charset\s*=\s*([\w-]+)/i)
        @encoding = m[1] if m
      end
    end

    # SAXHandler variant that stops parsing by throwing +jumptag+: with
    # the encoding as soon as one is seen, or with nil once one of the
    # listed content elements is reached.
    class JumpSAXHandler < SAXHandler
      def initialize(jumptag)
        @jumptag = jumptag
        super()
      end

      def start_element(name, attrs = [])
        super
        throw @jumptag, @encoding if @encoding
        throw @jumptag, nil if name =~ /\A(?:div|h1|img|p|br)\z/
      end
    end

    # Return the encoding declared in +chunk+ (an XML declaration or a
    # meta tag), or nil when none is found.
    def self.detect_encoding(chunk)
      if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
        return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
      end

      if (m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/))
        return Nokogiri.XML(m[1]).encoding
      end

      if Nokogiri.jruby?
        if (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i))
          return m[4]
        end

        catch(:encoding_found) {
          Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
          nil
        }
      else
        handler = SAXHandler.new
        parser = Nokogiri::HTML::SAX::PushParser.new(handler)
        begin
          parser << chunk
        rescue StandardError
          # Best effort: the original one-line rescue modifier swallowed
          # every StandardError (not only Nokogiri::SyntaxError), and
          # whatever the handler saw before the failure is still used.
        end
        handler.encoding
      end
    end

    # True when JRUBY_VERSION, with its dots removed and read as an
    # integer, is below 165.
    def self.is_jruby_without_fix?
      JRUBY_VERSION.split('.').join.to_i < 165
    end

    # Detection path used when is_jruby_without_fix? is true. Parser
    # errors are swallowed and reported as "no encoding found" (nil).
    def self.detect_encoding_for_jruby_without_fix(chunk)
      if (m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/))
        return Nokogiri.XML(m[1]).encoding
      end

      if (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i))
        return m[4]
      end

      # NOTE(review): the handler is given the tag as a String while catch
      # uses a Symbol; this looks deliberate for these JRuby versions, so
      # it is left untouched - confirm before changing.
      catch(:encoding_found) {
        Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
        nil
      }
    rescue Nokogiri::SyntaxError, RuntimeError
      # Ignore parser errors that nokogiri may raise
      nil
    end

    def initialize(io)
      @io = io
      @firstchunk = nil
      @encoding_found = nil
    end

    # This method is used by the C extension so that
    # Nokogiri::HTML::Document#read_io() does not leak memory when
    # EncodingFound is raised.
    attr_reader :encoding_found

    # Read up to +len+ bytes. Calling without +len+ is not supported.
    #
    # On the very first call the chunk read from the wrapped IO is scanned
    # for an encoding hint; if one is found EncodingFound is raised and the
    # chunk is kept so that the retried parse sees it again. Returns nil at
    # end of input.
    def read(len)
      unless @firstchunk
        @firstchunk = @io.read(len)
        return nil unless @firstchunk

        # This implementation expects that the first call from
        # htmlReadIO() is made with a length long enough (~1KB) to
        # achieve advanced encoding detection.
        if (encoding = EncodingReader.detect_encoding(@firstchunk))
          # The first chunk is stored for the next read in retry.
          raise @encoding_found = EncodingFound.new(encoding)
        end
      end
      @encoding_found = nil

      # Serve buffered bytes first, then top up from the underlying IO.
      ret = @firstchunk.slice!(0, len)
      remaining = len - ret.length
      if remaining > 0
        rest = @io.read(remaining)
        ret << rest if rest
      end
      ret.empty? ? nil : ret
    end
  end
end
|
335
|
+
end
|
336
|
+
end
|