nokogiri 1.11.0.rc1-x86-linux

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (145) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE-DEPENDENCIES.md +1614 -0
  3. data/LICENSE.md +9 -0
  4. data/README.md +200 -0
  5. data/bin/nokogiri +118 -0
  6. data/dependencies.yml +74 -0
  7. data/ext/nokogiri/depend +358 -0
  8. data/ext/nokogiri/extconf.rb +695 -0
  9. data/ext/nokogiri/html_document.c +170 -0
  10. data/ext/nokogiri/html_document.h +10 -0
  11. data/ext/nokogiri/html_element_description.c +279 -0
  12. data/ext/nokogiri/html_element_description.h +10 -0
  13. data/ext/nokogiri/html_entity_lookup.c +32 -0
  14. data/ext/nokogiri/html_entity_lookup.h +8 -0
  15. data/ext/nokogiri/html_sax_parser_context.c +116 -0
  16. data/ext/nokogiri/html_sax_parser_context.h +11 -0
  17. data/ext/nokogiri/html_sax_push_parser.c +87 -0
  18. data/ext/nokogiri/html_sax_push_parser.h +9 -0
  19. data/ext/nokogiri/nokogiri.c +147 -0
  20. data/ext/nokogiri/nokogiri.h +122 -0
  21. data/ext/nokogiri/xml_attr.c +103 -0
  22. data/ext/nokogiri/xml_attr.h +9 -0
  23. data/ext/nokogiri/xml_attribute_decl.c +70 -0
  24. data/ext/nokogiri/xml_attribute_decl.h +9 -0
  25. data/ext/nokogiri/xml_cdata.c +62 -0
  26. data/ext/nokogiri/xml_cdata.h +9 -0
  27. data/ext/nokogiri/xml_comment.c +69 -0
  28. data/ext/nokogiri/xml_comment.h +9 -0
  29. data/ext/nokogiri/xml_document.c +617 -0
  30. data/ext/nokogiri/xml_document.h +23 -0
  31. data/ext/nokogiri/xml_document_fragment.c +48 -0
  32. data/ext/nokogiri/xml_document_fragment.h +10 -0
  33. data/ext/nokogiri/xml_dtd.c +202 -0
  34. data/ext/nokogiri/xml_dtd.h +10 -0
  35. data/ext/nokogiri/xml_element_content.c +123 -0
  36. data/ext/nokogiri/xml_element_content.h +10 -0
  37. data/ext/nokogiri/xml_element_decl.c +69 -0
  38. data/ext/nokogiri/xml_element_decl.h +9 -0
  39. data/ext/nokogiri/xml_encoding_handler.c +79 -0
  40. data/ext/nokogiri/xml_encoding_handler.h +8 -0
  41. data/ext/nokogiri/xml_entity_decl.c +110 -0
  42. data/ext/nokogiri/xml_entity_decl.h +10 -0
  43. data/ext/nokogiri/xml_entity_reference.c +52 -0
  44. data/ext/nokogiri/xml_entity_reference.h +9 -0
  45. data/ext/nokogiri/xml_io.c +61 -0
  46. data/ext/nokogiri/xml_io.h +11 -0
  47. data/ext/nokogiri/xml_libxml2_hacks.c +112 -0
  48. data/ext/nokogiri/xml_libxml2_hacks.h +12 -0
  49. data/ext/nokogiri/xml_namespace.c +111 -0
  50. data/ext/nokogiri/xml_namespace.h +14 -0
  51. data/ext/nokogiri/xml_node.c +1773 -0
  52. data/ext/nokogiri/xml_node.h +13 -0
  53. data/ext/nokogiri/xml_node_set.c +486 -0
  54. data/ext/nokogiri/xml_node_set.h +12 -0
  55. data/ext/nokogiri/xml_processing_instruction.c +56 -0
  56. data/ext/nokogiri/xml_processing_instruction.h +9 -0
  57. data/ext/nokogiri/xml_reader.c +668 -0
  58. data/ext/nokogiri/xml_reader.h +10 -0
  59. data/ext/nokogiri/xml_relax_ng.c +161 -0
  60. data/ext/nokogiri/xml_relax_ng.h +9 -0
  61. data/ext/nokogiri/xml_sax_parser.c +310 -0
  62. data/ext/nokogiri/xml_sax_parser.h +39 -0
  63. data/ext/nokogiri/xml_sax_parser_context.c +262 -0
  64. data/ext/nokogiri/xml_sax_parser_context.h +10 -0
  65. data/ext/nokogiri/xml_sax_push_parser.c +159 -0
  66. data/ext/nokogiri/xml_sax_push_parser.h +9 -0
  67. data/ext/nokogiri/xml_schema.c +205 -0
  68. data/ext/nokogiri/xml_schema.h +9 -0
  69. data/ext/nokogiri/xml_syntax_error.c +64 -0
  70. data/ext/nokogiri/xml_syntax_error.h +13 -0
  71. data/ext/nokogiri/xml_text.c +52 -0
  72. data/ext/nokogiri/xml_text.h +9 -0
  73. data/ext/nokogiri/xml_xpath_context.c +298 -0
  74. data/ext/nokogiri/xml_xpath_context.h +10 -0
  75. data/ext/nokogiri/xslt_stylesheet.c +266 -0
  76. data/ext/nokogiri/xslt_stylesheet.h +14 -0
  77. data/lib/nokogiri.rb +127 -0
  78. data/lib/nokogiri/2.4/nokogiri.so +0 -0
  79. data/lib/nokogiri/2.5/nokogiri.so +0 -0
  80. data/lib/nokogiri/2.6/nokogiri.so +0 -0
  81. data/lib/nokogiri/2.7/nokogiri.so +0 -0
  82. data/lib/nokogiri/css.rb +28 -0
  83. data/lib/nokogiri/css/node.rb +53 -0
  84. data/lib/nokogiri/css/parser.rb +751 -0
  85. data/lib/nokogiri/css/parser.y +272 -0
  86. data/lib/nokogiri/css/parser_extras.rb +92 -0
  87. data/lib/nokogiri/css/syntax_error.rb +8 -0
  88. data/lib/nokogiri/css/tokenizer.rb +154 -0
  89. data/lib/nokogiri/css/tokenizer.rex +55 -0
  90. data/lib/nokogiri/css/xpath_visitor.rb +232 -0
  91. data/lib/nokogiri/decorators/slop.rb +43 -0
  92. data/lib/nokogiri/html.rb +38 -0
  93. data/lib/nokogiri/html/builder.rb +36 -0
  94. data/lib/nokogiri/html/document.rb +336 -0
  95. data/lib/nokogiri/html/document_fragment.rb +50 -0
  96. data/lib/nokogiri/html/element_description.rb +24 -0
  97. data/lib/nokogiri/html/element_description_defaults.rb +672 -0
  98. data/lib/nokogiri/html/entity_lookup.rb +14 -0
  99. data/lib/nokogiri/html/sax/parser.rb +63 -0
  100. data/lib/nokogiri/html/sax/parser_context.rb +17 -0
  101. data/lib/nokogiri/html/sax/push_parser.rb +37 -0
  102. data/lib/nokogiri/jruby/dependencies.rb +20 -0
  103. data/lib/nokogiri/syntax_error.rb +5 -0
  104. data/lib/nokogiri/version.rb +149 -0
  105. data/lib/nokogiri/xml.rb +76 -0
  106. data/lib/nokogiri/xml/attr.rb +15 -0
  107. data/lib/nokogiri/xml/attribute_decl.rb +19 -0
  108. data/lib/nokogiri/xml/builder.rb +447 -0
  109. data/lib/nokogiri/xml/cdata.rb +12 -0
  110. data/lib/nokogiri/xml/character_data.rb +8 -0
  111. data/lib/nokogiri/xml/document.rb +280 -0
  112. data/lib/nokogiri/xml/document_fragment.rb +161 -0
  113. data/lib/nokogiri/xml/dtd.rb +33 -0
  114. data/lib/nokogiri/xml/element_content.rb +37 -0
  115. data/lib/nokogiri/xml/element_decl.rb +14 -0
  116. data/lib/nokogiri/xml/entity_decl.rb +20 -0
  117. data/lib/nokogiri/xml/entity_reference.rb +19 -0
  118. data/lib/nokogiri/xml/namespace.rb +14 -0
  119. data/lib/nokogiri/xml/node.rb +916 -0
  120. data/lib/nokogiri/xml/node/save_options.rb +62 -0
  121. data/lib/nokogiri/xml/node_set.rb +372 -0
  122. data/lib/nokogiri/xml/notation.rb +7 -0
  123. data/lib/nokogiri/xml/parse_options.rb +121 -0
  124. data/lib/nokogiri/xml/pp.rb +3 -0
  125. data/lib/nokogiri/xml/pp/character_data.rb +19 -0
  126. data/lib/nokogiri/xml/pp/node.rb +57 -0
  127. data/lib/nokogiri/xml/processing_instruction.rb +9 -0
  128. data/lib/nokogiri/xml/reader.rb +116 -0
  129. data/lib/nokogiri/xml/relax_ng.rb +33 -0
  130. data/lib/nokogiri/xml/sax.rb +5 -0
  131. data/lib/nokogiri/xml/sax/document.rb +172 -0
  132. data/lib/nokogiri/xml/sax/parser.rb +123 -0
  133. data/lib/nokogiri/xml/sax/parser_context.rb +17 -0
  134. data/lib/nokogiri/xml/sax/push_parser.rb +61 -0
  135. data/lib/nokogiri/xml/schema.rb +64 -0
  136. data/lib/nokogiri/xml/searchable.rb +231 -0
  137. data/lib/nokogiri/xml/syntax_error.rb +71 -0
  138. data/lib/nokogiri/xml/text.rb +10 -0
  139. data/lib/nokogiri/xml/xpath.rb +11 -0
  140. data/lib/nokogiri/xml/xpath/syntax_error.rb +12 -0
  141. data/lib/nokogiri/xml/xpath_context.rb +17 -0
  142. data/lib/nokogiri/xslt.rb +57 -0
  143. data/lib/nokogiri/xslt/stylesheet.rb +26 -0
  144. data/lib/xsd/xmlparser/nokogiri.rb +103 -0
  145. metadata +482 -0
@@ -0,0 +1,55 @@
1
+ module Nokogiri
2
+ module CSS
3
+ class Tokenizer # :nodoc:
4
+
5
+ macro
6
+ nl \n|\r\n|\r|\f
7
+ w [\s]*
8
+ nonascii [^\0-\177]
9
+ num -?([0-9]+|[0-9]*\.[0-9]+)
10
+ unicode \\[0-9A-Fa-f]{1,6}(\r\n|[\s])?
11
+
12
+ escape {unicode}|\\[^\n\r\f0-9A-Fa-f]
13
+ nmchar [_A-Za-z0-9-]|{nonascii}|{escape}
14
+ nmstart [_A-Za-z]|{nonascii}|{escape}
15
+ ident [-@]?({nmstart})({nmchar})*
16
+ name ({nmchar})+
17
+ string1 "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
18
+ string2 '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
19
+ string {string1}|{string2}
20
+
21
+ rule
22
+
23
+ # [:state] pattern [actions]
24
+
25
+ has\({w} { [:HAS, text] }
26
+ {ident}\({w} { [:FUNCTION, text] }
27
+ {ident} { [:IDENT, text] }
28
+ \#{name} { [:HASH, text] }
29
+ {w}~={w} { [:INCLUDES, text] }
30
+ {w}\|={w} { [:DASHMATCH, text] }
31
+ {w}\^={w} { [:PREFIXMATCH, text] }
32
+ {w}\$={w} { [:SUFFIXMATCH, text] }
33
+ {w}\*={w} { [:SUBSTRINGMATCH, text] }
34
+ {w}!={w} { [:NOT_EQUAL, text] }
35
+ {w}={w} { [:EQUAL, text] }
36
+ {w}\) { [:RPAREN, text] }
37
+ \[{w} { [:LSQUARE, text] }
38
+ {w}\] { [:RSQUARE, text] }
39
+ {w}\+{w} { [:PLUS, text] }
40
+ {w}>{w} { [:GREATER, text] }
41
+ {w},{w} { [:COMMA, text] }
42
+ {w}~{w} { [:TILDE, text] }
43
+ \:not\({w} { [:NOT, text] }
44
+ {num} { [:NUMBER, text] }
45
+ {w}\/\/{w} { [:DOUBLESLASH, text] }
46
+ {w}\/{w} { [:SLASH, text] }
47
+
48
+ U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})? {[:UNICODE_RANGE, text] }
49
+
50
+ [\s]+ { [:S, text] }
51
+ {string} { [:STRING, text] }
52
+ . { [text, text] }
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,232 @@
1
+ # frozen_string_literal: true
2
+ module Nokogiri
3
+ module CSS
4
+ class XPathVisitor # :nodoc:
5
# Translate a CSS function node (for example "nth-child(" or "contains(")
# into the XPath fragment it stands for.
#
# If the visitor responds to visit_function_<name>, that method is used
# instead of the built-in translations.  Functions that are not recognised
# are emitted as a call of the same name with "." prepended to the
# argument list.
def visit_function node
  handler = :"visit_function_#{node.value.first.gsub(/[(]/, '')}"
  return self.send(handler, node) if self.respond_to?(handler)

  fn = node.value.first
  arg = node.value[1]
  # Checked lazily: only the nth-style functions need to know whether the
  # argument is an an+b expression node.
  nth_arg = lambda { arg.is_a?(Nokogiri::CSS::Node) && arg.type == :NTH }

  case fn
  when /^text\(/ then 'child::text()'
  when /^self\(/ then "self::#{arg}"
  when /^eq\(/ then "position() = #{arg}"
  when /^(nth|nth-of-type)\(/
    nth_arg.call ? nth(arg) : "position() = #{arg}"
  when /^nth-child\(/
    if nth_arg.call
      nth(arg, :child => true)
    else
      "count(preceding-sibling::*) = #{arg.to_i-1}"
    end
  when /^nth-last-of-type\(/
    if nth_arg.call
      nth(arg, :last => true)
    else
      offset = arg.to_i - 1
      offset == 0 ? "position() = last()" : "position() = last() - #{offset}"
    end
  when /^nth-last-child\(/
    if nth_arg.call
      nth(arg, :last => true, :child => true)
    else
      "count(following-sibling::*) = #{arg.to_i-1}"
    end
  when /^(first|first-of-type)\(/ then "position() = 1"
  when /^(last|last-of-type)\(/ then "position() = last()"
  when /^contains\(/ then "contains(., #{arg})"
  when /^gt\(/ then "position() > #{arg}"
  when /^only-child\(/ then "last() = 1"
  when /^comment\(/ then "comment()"
  when /^has\(/
    # A nil first value marks a leading combinator,
    # e.g. "has(> a)", "has(~ a)", "has(+ a)"
    descend = arg.value[0].nil? ? "" : "//"
    ".#{descend}#{arg.accept(self)}"
  else
    args = ['.'] + node.value[1..-1]
    "#{fn}#{args.join(', ')})"
  end
end
62
+
63
# Build the XPath for a :not() selector.  An element-name argument is
# tested on the self axis; any other argument is negated as-is.
def visit_not node
  inner = node.value.first
  on_self_axis = (:ELEMENT_NAME == inner.type)
  expr = inner.accept(self)
  on_self_axis ? "not(self::#{expr})" : "not(#{expr})"
end
71
+
72
# Build the XPath test for an id selector.  The leading "#" of the token
# is dropped; a token that does not start with "#" yields an empty id.
def visit_id node
  id_name = node.value.first[/^#(.*)$/, 1]
  "@id = '#{id_name}'"
end
76
+
77
# Build the XPath for an attribute condition such as [a], [a=b] or [a~=b].
#
# node.value is either [attribute] or [attribute, operator, value].  The
# attribute gets an "@" prefix unless it is a function node or its first
# value already names an axis (contains "::").
def visit_attribute_condition node
  target = node.value.first
  bare = (target.type == :FUNCTION) || (target.value.first =~ /::/)
  attribute = (bare ? '' : '@') + target.accept(self)

  # Support non-standard css
  attribute = attribute.gsub(/^@@/, '@')

  return attribute unless node.value.length == 3

  value = node.value.last
  value = "'#{value}'" if value !~ /^['"]/

  # A quoted value that contains its own quote character cannot be written
  # as one XPath literal, so it is rebuilt as a concat() of pieces.
  quote = value[0]
  if quote == value[-1] && %q{"'}.include?(quote)
    inner = value[1..-2]
    if inner.include?(quote)
      pieces = inner.split('"', -1)
      value = 'concat("' + pieces.join(%q{", '"', "}) + '", "")'
    end
  end

  operator = node.value[1]
  case operator
  when :equal then attribute + " = " + value.to_s
  when :not_equal then attribute + " != " + value.to_s
  when :substring_match then "contains(#{attribute}, #{value})"
  when :prefix_match then "starts-with(#{attribute}, #{value})"
  when :dash_match
    "#{attribute} = #{value} or starts-with(#{attribute}, concat(#{value}, '-'))"
  when :includes
    "contains(concat(\" \", #{attribute}, \" \"),concat(\" \", #{value}, \" \"))"
  when :suffix_match
    "substring(#{attribute}, string-length(#{attribute}) - " +
      "string-length(#{value}) + 1, string-length(#{value})) = #{value}"
  else
    attribute + " #{operator} " + value.to_s
  end
end
120
+
121
# Build the XPath for a pseudo-class.
#
# A function-style pseudo-class is delegated to the function node itself.
# Otherwise a visit_pseudo_class_<name> method on the visitor is preferred
# when present, then the built-in table, and finally the name is emitted
# as an XPath function applied to ".".
def visit_pseudo_class node
  head = node.value.first
  if head.is_a?(Nokogiri::CSS::Node) && head.type == :FUNCTION
    return head.accept(self)
  end

  handler = :"visit_pseudo_class_#{head.gsub(/[(]/, '')}"
  return self.send(handler, node) if self.respond_to?(handler)

  builtin = {
    "first" => "position() = 1",
    "first-child" => "count(preceding-sibling::*) = 0",
    "last" => "position() = last()",
    "last-child" => "count(following-sibling::*) = 0",
    "first-of-type" => "position() = 1",
    "last-of-type" => "position() = last()",
    "only-child" => "count(preceding-sibling::*) = 0 and count(following-sibling::*) = 0",
    "only-of-type" => "last() = 1",
    "empty" => "not(node())",
    "parent" => "node()",
    "root" => "not(parent::*)",
  }
  builtin.fetch(head) { head + "(.)" }
end
145
+
146
# Build the XPath test for a class selector: the class name must appear as
# a whole, space-delimited word in the normalised @class attribute.
def visit_class_condition node
  class_name = node.value.first
  "contains(concat(' ', normalize-space(@class), ' '), ' #{class_name} ')"
end
149
+
150
# Join two conditions.  When the right-hand side is an *-of-type
# pseudo-class it is placed in its own predicate ("][" closes the current
# one and opens the next); otherwise the two are joined with " and ".
def visit_combinator node
  left = node.value.first
  right = node.value.last
  joiner = is_of_type_pseudo_class?(right) ? "][" : " and "
  "#{left.accept(self) if left}#{joiner}#{right.accept(self)}"
end
157
+
158
# Generate one visit_<selector> method per structural selector.  Each
# generated method joins the (optional) left-hand expression and the
# right-hand expression with the XPath step listed for it here.
{
  'direct_adjacent_selector' => "/following-sibling::*[1]/self::",
  'following_selector' => "/following-sibling::",
  'descendant_selector' => '//',
  'child_selector' => '/',
}.each_pair do |selector, axis|
  class_eval <<-RUBY
    def visit_#{selector} node
      "\#{node.value.first.accept(self) if node.value.first}#{axis}\#{node.value.last.accept(self)}"
    end
  RUBY
end
170
+
171
# Build "<subject>[<condition>]" from the first and last child nodes.
def visit_conditional_selector node
  subject = node.value.first.accept(self)
  condition = node.value.last.accept(self)
  subject + '[' + condition + ']'
end
175
+
176
+ def visit_element_name node
177
+ node.value.first
178
+ end
179
+
180
+ def accept node
181
+ node.accept(self)
182
+ end
183
+
184
+ private
185
# Build the XPath position test for an an+b expression node.
#
# options[:child] counts siblings instead of using position();
# options[:last] counts from the end.  Raises ArgumentError unless the
# node carries exactly four tokens.
def nth node, options={}
  unless node.value.size == 4
    raise ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}"
  end

  a, b = read_a_and_positive_b node.value
  from_end = options[:last]
  position =
    if options[:child]
      from_end ? "(count(following-sibling::*) + 1)" : "(count(preceding-sibling::*) + 1)"
    else
      from_end ? "(last()-position()+1)" : "position()"
    end

  return "(#{position} mod #{a}) = 0" if b.zero?

  compare = a < 0 ? "<=" : ">="
  return "#{position} #{compare} #{b}" if a.abs == 1

  "(#{position} #{compare} #{b}) and (((#{position}-#{b}) mod #{a.abs}) = 0)"
end
206
+
207
# Read a and b from the four tokens of an an+b expression.
#
# values[0] is a, values[2] the operator and values[3] the offset.  For
# "an-b" the offset is rewritten as a - (b mod a).  Raises ArgumentError
# for any operator other than "+" or "-".
def read_a_and_positive_b values
  op = values[2]
  unless op == "+" || op == "-"
    raise ArgumentError, "expected an+b node to have either + or - as the operator, but is #{op.inspect}"
  end

  a = values[0].to_i
  b = values[3].to_i
  b = a - (b % a) if op == "-"
  [a, b]
end
220
+
221
# Truthy when +node+ is a pseudo-class whose name (or, for a function-style
# pseudo-class, whose function name) contains nth-, first-, last- or
# only-of-type.  Returns the match index, or nil otherwise.
def is_of_type_pseudo_class? node
  return unless node.type == :PSEUDO_CLASS

  head = node.value[0]
  head = head.value[0] if head.is_a?(Nokogiri::CSS::Node) && head.type == :FUNCTION
  head =~ /(nth|first|last|only)-of-type(\()?/
end
230
+ end
231
+ end
232
+ end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
module Nokogiri
  module Decorators
    ###
    # The Slop decorator implements method_missing so that methods may be
    # used instead of XPath or CSS.  See Nokogiri.Slop
    module Slop
      # The default XPath search context for Slop
      XPATH_PREFIX = "./"

      ###
      # Look for child nodes named +name+.  See Nokogiri.Slop
      #
      # With no arguments the name (minus one leading underscore) is used as
      # an XPath step.  A Hash argument may carry :css (appended to the
      # name and searched with #css) or :xpath (one or more conditions
      # joined with " and ").  Any other argument is appended to the name
      # and treated as a CSS selector.
      #
      # Returns the single matching node, or the whole list when several
      # match.  Falls through to +super+ (normally raising NoMethodError for
      # +name+) when nothing matches.
      def method_missing name, *args, &block
        if args.empty?
          list = xpath("#{XPATH_PREFIX}#{name.to_s.sub(/^_/, '')}")
        elsif args.first.is_a? Hash
          hash = args.first
          if hash[:css]
            list = css("#{name}#{hash[:css]}")
          elsif hash[:xpath]
            conds = Array(hash[:xpath]).join(' and ')
            list = xpath("#{XPATH_PREFIX}#{name}[#{conds}]")
          end
        else
          CSS::Parser.without_cache do
            list = xpath(
              *CSS.xpath_for("#{name}#{args.first}", :prefix => XPATH_PREFIX)
            )
          end
        end

        # +list+ is still nil when a Hash with neither :css nor :xpath was
        # given; treat that like "nothing found" so the caller gets a
        # NoMethodError for +name+ rather than one for nil.
        super if list.nil? || list.empty?
        list.length == 1 ? list.first : list
      end

      ###
      # True when an XPath search for +name+ (minus one leading underscore)
      # finds at least one node.
      def respond_to_missing? name, include_private = false
        list = xpath("#{XPATH_PREFIX}#{name.to_s.sub(/^_/, '')}")

        !list.empty?
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+ require 'nokogiri/html/entity_lookup'
3
+ require 'nokogiri/html/document'
4
+ require 'nokogiri/html/document_fragment'
5
+ require 'nokogiri/html/sax/parser_context'
6
+ require 'nokogiri/html/sax/parser'
7
+ require 'nokogiri/html/sax/push_parser'
8
+ require 'nokogiri/html/element_description'
9
+ require 'nokogiri/html/element_description_defaults'
10
+
11
+ module Nokogiri
12
+ class << self
13
+ ###
14
+ # Parse HTML. Convenience method for Nokogiri::HTML::Document.parse
15
+ def HTML thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block
16
+ Nokogiri::HTML::Document.parse(thing, url, encoding, options, &block)
17
+ end
18
+ end
19
+
20
+ module HTML
21
+ class << self
22
+ ###
23
+ # Parse HTML. Convenience method for Nokogiri::HTML::Document.parse
24
+ def parse thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block
25
+ Document.parse(thing, url, encoding, options, &block)
26
+ end
27
+
28
+ ####
29
+ # Parse a fragment from +string+ into a NodeSet.
30
+ def fragment string, encoding = nil
31
+ HTML::DocumentFragment.parse string, encoding
32
+ end
33
+ end
34
+
35
+ # Instance of Nokogiri::HTML::EntityLookup
36
+ NamedCharacters = EntityLookup.new
37
+ end
38
+ end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+ module Nokogiri
3
+ module HTML
4
+ ###
5
+ # Nokogiri HTML builder is used for building HTML documents. It is very
6
+ # similar to the Nokogiri::XML::Builder. In fact, you should go read the
7
+ # documentation for Nokogiri::XML::Builder before reading this
8
+ # documentation.
9
+ #
10
+ # == Synopsis:
11
+ #
12
+ # Create an HTML document with a body that has an onload attribute, and a
13
+ # span tag with a class of "bold" that has content of "Hello world".
14
+ #
15
+ # builder = Nokogiri::HTML::Builder.new do |doc|
16
+ # doc.html {
17
+ # doc.body(:onload => 'some_func();') {
18
+ # doc.span.bold {
19
+ # doc.text "Hello world"
20
+ # }
21
+ # }
22
+ # }
23
+ # end
24
+ # puts builder.to_html
25
+ #
26
+ # The HTML builder inherits from the XML builder, so make sure to read the
27
+ # Nokogiri::XML::Builder documentation.
28
+ class Builder < Nokogiri::XML::Builder
29
+ ###
30
+ # Convert the builder to HTML
31
+ def to_html
32
+ @doc.to_html
33
+ end
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,336 @@
1
+ # frozen_string_literal: true
2
+ module Nokogiri
3
+ module HTML
4
+ class Document < Nokogiri::XML::Document
5
+ ###
6
+ # Get the meta tag encoding for this document. If there is no meta tag,
7
+ # then nil is returned.
8
# Return the encoding declared by a meta tag: the charset attribute of a
# <meta charset> tag if there is one, otherwise the charset named in the
# content of the Content-Type meta tag.  Returns nil when neither exists.
def meta_encoding
  if (charset_tag = at('//meta[@charset]'))
    charset_tag[:charset]
  elsif (content_type_tag = meta_content_type)
    content_type_tag['content'][/charset\s*=\s*([\w-]+)/i, 1]
  end
end
16
+
17
+ ###
18
+ # Set the meta tag encoding for this document.
19
+ #
20
+ # If a meta encoding tag is already present, its content is
21
+ # replaced with the given text.
22
+ #
23
+ # Otherwise, this method tries to create one at an appropriate
24
+ # place supplying head and/or html elements as necessary, which
25
+ # is inside a head element if any, and before any text node or
26
+ # content element (typically <body>) if any.
27
+ #
28
+ # The result when trying to set an encoding that is different
29
+ # from the document encoding is undefined.
30
+ #
31
+ # Beware in CRuby, that libxml2 automatically inserts a meta tag
32
+ # into a head element.
33
# Declare +encoding+ in a meta tag.
#
# An existing Content-Type meta tag is rewritten first, then an existing
# <meta charset> tag.  Otherwise a new meta tag is built (charset form when
# the internal subset reports an HTML5 DTD, http-equiv form if not) and put
# at the front of <head>, or handed to set_metadata_element when there is
# no <head>.
def meta_encoding= encoding
  if (content_type_tag = meta_content_type)
    content_type_tag['content'] = 'text/html; charset=%s' % encoding
    return encoding
  end

  if (charset_tag = at('//meta[@charset]'))
    charset_tag['charset'] = encoding
    return encoding
  end

  meta = XML::Node.new('meta', self)
  dtd = internal_subset
  if dtd && dtd.html5_dtd?
    meta['charset'] = encoding
  else
    meta['http-equiv'] = 'Content-Type'
    meta['content'] = 'text/html; charset=%s' % encoding
  end

  if (head = at('//head'))
    head.prepend_child(meta)
  else
    set_metadata_element(meta)
  end
  encoding
end
58
+
59
# Find the first meta tag that has content and whose http-equiv is
# "Content-Type" (compared case-insensitively); nil when there is none.
def meta_content_type
  candidates = xpath('//meta[@http-equiv and boolean(@content)]')
  candidates.find do |tag|
    tag['http-equiv'] =~ /\AContent-Type\z/i
  end
end
private :meta_content_type
65
+
66
+ ###
67
+ # Get the title string of this document. Return nil if there is
68
+ # no title tag.
69
# Return the inner text of the first title element, or nil when the
# document has none.
def title
  title_node = at('//title')
  title_node && title_node.inner_text
end
72
+
73
+ ###
74
+ # Set the title string of this document.
75
+ #
76
+ # If a title element is already present, its content is replaced
77
+ # with the given text.
78
+ #
79
+ # Otherwise, this method tries to create one at an appropriate
80
+ # place supplying head and/or html elements as necessary, which
81
+ # is inside a head element if any, right after a meta
82
+ # encoding/charset tag if any, and before any text node or
83
+ # content element (typically <body>) if any.
84
# Set the document title to +text+ and return +text+.
#
# An existing title element has its children replaced.  Otherwise a new
# title element is appended to <head> if present, placed right after a
# charset / Content-Type meta tag if present, or handed to
# set_metadata_element.
def title=(text)
  tnode = XML::Text.new(text, self)
  existing = at('//title')
  if existing
    existing.children = tnode
    return text
  end

  fresh = XML::Node.new('title', self) << tnode
  if (head = at('//head'))
    head << fresh
  elsif (meta = at('//meta[@charset]') || meta_content_type)
    # better put after charset declaration
    meta.add_next_sibling(fresh)
  else
    set_metadata_element(fresh)
  end
  text
end
103
+
104
# Put a metadata +element+ (such as meta or title) in a suitable place:
# appended to <head>; else in a new <head> prepended to <html>; else before
# the first element or text child of the document; else inside a freshly
# created <html><head>.
def set_metadata_element(element)
  if (head = at('//head'))
    head << element
  elsif (html = at('//html'))
    head = html.prepend_child(XML::Node.new('head', self))
    head.prepend_child(element)
  elsif (anchor = children.find { |child| XML::Element === child || XML::Text === child })
    # We reach here only if the underlying document model
    # allows <html>/<head> elements to be omitted and does not
    # automatically supply them.
    anchor.add_previous_sibling(element)
  else
    html = add_child(XML::Node.new('html', self))
    head = html.add_child(XML::Node.new('head', self))
    head.prepend_child(element)
  end
end
private :set_metadata_element
128
+
129
+ ####
130
+ # Serialize Node using +options+. Save options can also be set using a
131
+ # block. See SaveOptions.
132
+ #
133
+ # These two statements are equivalent:
134
+ #
135
+ # node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
136
+ #
137
+ # or
138
+ #
139
+ # node.serialize(:encoding => 'UTF-8') do |config|
140
+ # config.format.as_xml
141
+ # end
142
+ #
143
# Serialize with +options+, defaulting :save_with to
# XML::Node::SaveOptions::DEFAULT_HTML when the caller did not set it.
#
# The default is applied to a copy of +options+, so the caller's hash is
# left untouched and a frozen hash is accepted.  The copy (and any block)
# is passed on to the superclass implementation.
def serialize options = {}
  options = options.dup
  options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
  super
end
147
+
148
+ ####
149
+ # Create a Nokogiri::XML::DocumentFragment from +tags+
150
+ def fragment tags = nil
151
+ DocumentFragment.new(self, tags, self.root)
152
+ end
153
+
154
+ class << self
155
+ ###
156
+ # Parse HTML. +string_or_io+ may be a String, or any object that
157
+ # responds to _read_ and _close_ such as an IO, or StringIO.
158
+ # +url+ is the resource where this document is located. +encoding+ is the
159
+ # encoding that should be used when processing the document. +options+
160
+ # is a number that sets options in the parser, such as
161
+ # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
162
+ # Nokogiri::XML::ParseOptions.
163
# Parse HTML from a String or an IO-like object (anything responding to
# +read+).
#
# Integer +options+ are wrapped in Nokogiri::XML::ParseOptions and yielded
# to the block, if one is given, before parsing.  When no +encoding+ is
# passed, the input's own non-binary encoding is used, then encoding
# detection via EncodingReader.  Nil or empty input yields a new empty
# document.
def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
  options = Nokogiri::XML::ParseOptions.new(options) if options.is_a?(Integer)
  # Give the options to the user
  yield options if block_given?

  if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != "ASCII-8BIT"
    encoding ||= string_or_io.encoding.name
  end

  if string_or_io.respond_to?(:read)
    url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
    unless encoding
      # Libxml2's parser has poor support for encoding
      # detection.  First, it does not recognize the HTML5
      # style meta charset declaration.  Secondly, even if it
      # successfully detects an encoding hint, it does not
      # re-decode or re-parse the preceding part which may be
      # garbled.
      #
      # EncodingReader aims to perform advanced encoding
      # detection beyond what Libxml2 does, and to emulate
      # rewinding of a stream and make Libxml2 redo parsing
      # from the start when an encoding hint is found.
      string_or_io = EncodingReader.new(string_or_io)
      begin
        return read_io(string_or_io, url, encoding, options.to_i)
      rescue EncodingFound => found
        encoding = found.found_encoding
      end
    end
    return read_io(string_or_io, url, encoding, options.to_i)
  end

  # read_memory pukes on empty docs
  if string_or_io.nil? || string_or_io.empty?
    blank = new
    blank.encoding = encoding if encoding
    return blank
  end

  encoding ||= EncodingReader.detect_encoding(string_or_io)

  read_memory(string_or_io, url, encoding, options.to_i)
end
208
+ end
209
+
210
# Raised by EncodingReader#read to report a detected encoding; the
# encoding is available as #found_encoding.
class EncodingFound < StandardError # :nodoc:
  attr_reader :found_encoding

  def initialize(encoding)
    super("encoding found: %s" % encoding)
    @found_encoding = encoding
  end
end
218
+
219
+ class EncodingReader # :nodoc:
220
# SAX handler that records the encoding declared by meta tags, either via
# a charset attribute or via the content of an http-equiv Content-Type
# tag.  The last declaration seen wins.
class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
  attr_reader :encoding

  def initialize
    @encoding = nil
    super()
  end

  def start_element(name, attrs = [])
    return unless name == 'meta'
    attr = Hash[attrs]

    charset = attr['charset']
    @encoding = charset if charset

    http_equiv = attr['http-equiv']
    return unless http_equiv && http_equiv.match(/\AContent-Type\z/i)

    content = attr['content']
    return unless content

    m = content.match(/;\s*charset\s*=\s*([\w-]+)/)
    @encoding = m[1] if m
  end
end
240
+
241
# SAXHandler variant that leaves the parse early by throwing to +jumptag+:
# with the encoding as soon as one is known, or with nil once a div, h1,
# img, p or br element starts.
class JumpSAXHandler < SAXHandler
  def initialize(jumptag)
    @jumptag = jumptag
    super()
  end

  def start_element(name, attrs = [])
    super
    if @encoding
      throw @jumptag, @encoding
    elsif name =~ /\A(?:div|h1|img|p|br)\z/
      throw @jumptag, nil
    end
  end
end
253
+
254
# Guess the encoding declared in +chunk+.
#
# A leading XML declaration is consulted first.  After that, JRuby uses a
# regexp over meta tags followed by a SAX parse that jumps out early, and
# other platforms feed the chunk to a push parser and read what the
# SAXHandler recorded.  Returns nil when nothing is found.
def self.detect_encoding(chunk)
  if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
    return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
  end

  xml_decl = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)
  return Nokogiri.XML(xml_decl[1]).encoding if xml_decl

  if Nokogiri.jruby?
    meta = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
    return meta[4] if meta

    catch(:encoding_found) do
      Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
      nil
    end
  else
    handler = SAXHandler.new
    parser = Nokogiri::HTML::SAX::PushParser.new(handler)
    begin
      parser << chunk
    rescue StandardError
      # Errors while pushing the chunk are ignored; whatever the handler
      # recorded before the failure is still returned.
    end
    handler.encoding
  end
end
275
+
276
# True when the digits of JRUBY_VERSION, read with the dots removed, form a
# number below 165.
def self.is_jruby_without_fix?
  JRUBY_VERSION.delete('.').to_i < 165
end
279
+
280
# Encoding detection used when is_jruby_without_fix? is true: XML
# declaration first, then a regexp over meta tags, then a SAX parse that
# jumps out early.  Nokogiri::SyntaxError and RuntimeError are swallowed
# and reported as nil.
def self.detect_encoding_for_jruby_without_fix(chunk)
  if (xml_decl = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/))
    return Nokogiri.XML(xml_decl[1]).encoding
  end

  if (meta = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i))
    return meta[4]
  end

  catch(:encoding_found) do
    Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
    nil
  end
rescue Nokogiri::SyntaxError, RuntimeError
  # Ignore parser errors that nokogiri may raise
  nil
end
295
+
296
+ def initialize(io)
297
+ @io = io
298
+ @firstchunk = nil
299
+ @encoding_found = nil
300
+ end
301
+
302
+ # This method is used by the C extension so that
303
+ # Nokogiri::HTML::Document#read_io() does not leak memory when
304
+ # EncodingFound is raised.
305
+ attr_reader :encoding_found
306
+
307
# Read up to +len+ bytes, serving the buffered first chunk before reading
# further from the wrapped IO.  Returns nil when nothing is left.
#
# On the very first call the chunk just read is checked for an encoding
# declaration; if one is found, EncodingFound is raised (and remembered in
# @encoding_found) while the chunk stays buffered for the next read.
def read(len)
  # no support for a call without len

  unless @firstchunk
    @firstchunk = @io.read(len)
    return nil unless @firstchunk

    # This implementation expects that the first call from
    # htmlReadIO() is made with a length long enough (~1KB) to
    # achieve advanced encoding detection.
    detected = EncodingReader.detect_encoding(@firstchunk)
    if detected
      # The first chunk is stored for the next read in retry.
      @encoding_found = EncodingFound.new(detected)
      raise @encoding_found
    end
  end
  @encoding_found = nil

  buffer = @firstchunk.slice!(0, len)
  remaining = len - buffer.length
  if remaining > 0
    tail = @io.read(remaining)
    buffer << tail if tail
  end
  buffer.empty? ? nil : buffer
end
333
+ end
334
+ end
335
+ end
336
+ end