nokogiri 1.5.10 → 1.10.4

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (182) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE-DEPENDENCIES.md +1614 -0
  3. data/LICENSE.md +9 -0
  4. data/README.md +198 -0
  5. data/bin/nokogiri +50 -10
  6. data/dependencies.yml +72 -0
  7. data/ext/nokogiri/extconf.rb +634 -92
  8. data/ext/nokogiri/html_document.c +8 -8
  9. data/ext/nokogiri/html_element_description.c +15 -15
  10. data/ext/nokogiri/html_entity_lookup.c +1 -1
  11. data/ext/nokogiri/html_sax_parser_context.c +4 -4
  12. data/ext/nokogiri/html_sax_push_parser.c +2 -2
  13. data/ext/nokogiri/nokogiri.c +20 -12
  14. data/ext/nokogiri/nokogiri.h +1 -44
  15. data/ext/nokogiri/xml_attr.c +34 -25
  16. data/ext/nokogiri/xml_cdata.c +12 -6
  17. data/ext/nokogiri/xml_comment.c +18 -3
  18. data/ext/nokogiri/xml_document.c +64 -32
  19. data/ext/nokogiri/xml_dtd.c +2 -2
  20. data/ext/nokogiri/xml_encoding_handler.c +3 -3
  21. data/ext/nokogiri/xml_entity_reference.c +1 -1
  22. data/ext/nokogiri/xml_io.c +11 -6
  23. data/ext/nokogiri/xml_namespace.c +50 -17
  24. data/ext/nokogiri/xml_namespace.h +3 -2
  25. data/ext/nokogiri/xml_node.c +459 -240
  26. data/ext/nokogiri/xml_node_set.c +166 -147
  27. data/ext/nokogiri/xml_node_set.h +2 -4
  28. data/ext/nokogiri/xml_processing_instruction.c +2 -2
  29. data/ext/nokogiri/xml_reader.c +6 -19
  30. data/ext/nokogiri/xml_sax_parser.c +11 -13
  31. data/ext/nokogiri/xml_sax_parser_context.c +41 -1
  32. data/ext/nokogiri/xml_sax_push_parser.c +56 -12
  33. data/ext/nokogiri/xml_schema.c +1 -1
  34. data/ext/nokogiri/xml_syntax_error.c +11 -5
  35. data/ext/nokogiri/xml_syntax_error.h +1 -1
  36. data/ext/nokogiri/xml_text.c +1 -1
  37. data/ext/nokogiri/xml_xpath_context.c +17 -38
  38. data/ext/nokogiri/xslt_stylesheet.c +10 -10
  39. data/lib/nokogiri/css/node.rb +0 -50
  40. data/lib/nokogiri/css/parser.rb +263 -233
  41. data/lib/nokogiri/css/parser.y +54 -40
  42. data/lib/nokogiri/css/tokenizer.rb +104 -103
  43. data/lib/nokogiri/css/tokenizer.rex +5 -5
  44. data/lib/nokogiri/css/xpath_visitor.rb +78 -19
  45. data/lib/nokogiri/decorators/slop.rb +12 -5
  46. data/lib/nokogiri/html/document.rb +102 -21
  47. data/lib/nokogiri/html/document_fragment.rb +11 -3
  48. data/lib/nokogiri/html/sax/parser.rb +12 -2
  49. data/lib/nokogiri/html/sax/push_parser.rb +22 -2
  50. data/lib/nokogiri/version.rb +40 -22
  51. data/lib/nokogiri/xml/builder.rb +34 -31
  52. data/lib/nokogiri/xml/document.rb +20 -14
  53. data/lib/nokogiri/xml/document_fragment.rb +50 -2
  54. data/lib/nokogiri/xml/dtd.rb +14 -4
  55. data/lib/nokogiri/xml/entity_reference.rb +18 -0
  56. data/lib/nokogiri/xml/node.rb +148 -203
  57. data/lib/nokogiri/xml/node_set.rb +139 -123
  58. data/lib/nokogiri/xml/parse_options.rb +22 -0
  59. data/lib/nokogiri/xml/sax/document.rb +1 -1
  60. data/lib/nokogiri/xml/sax/parser.rb +7 -8
  61. data/lib/nokogiri/xml/searchable.rb +230 -0
  62. data/lib/nokogiri/xml/syntax_error.rb +24 -1
  63. data/lib/nokogiri/xml.rb +3 -1
  64. data/lib/nokogiri.rb +40 -24
  65. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +78 -0
  66. data/patches/libxml2/0002-Remove-script-macro-support.patch +40 -0
  67. data/patches/libxml2/0003-Update-entities-to-remove-handling-of-ssi.patch +44 -0
  68. data/patches/libxslt/0001-Fix-security-framework-bypass.patch +120 -0
  69. data/ports/archives/libxml2-2.9.9.tar.gz +0 -0
  70. data/ports/archives/libxslt-1.1.33.tar.gz +0 -0
  71. metadata +252 -388
  72. data/.autotest +0 -26
  73. data/.gemtest +0 -0
  74. data/CHANGELOG.ja.rdoc +0 -785
  75. data/CHANGELOG.rdoc +0 -783
  76. data/C_CODING_STYLE.rdoc +0 -33
  77. data/Manifest.txt +0 -303
  78. data/README.ja.rdoc +0 -106
  79. data/README.rdoc +0 -175
  80. data/ROADMAP.md +0 -90
  81. data/Rakefile +0 -228
  82. data/STANDARD_RESPONSES.md +0 -47
  83. data/Y_U_NO_GEMSPEC.md +0 -155
  84. data/build_all +0 -105
  85. data/tasks/cross_compile.rb +0 -150
  86. data/tasks/nokogiri.org.rb +0 -24
  87. data/tasks/test.rb +0 -95
  88. data/test/css/test_nthiness.rb +0 -159
  89. data/test/css/test_parser.rb +0 -341
  90. data/test/css/test_tokenizer.rb +0 -198
  91. data/test/css/test_xpath_visitor.rb +0 -91
  92. data/test/decorators/test_slop.rb +0 -16
  93. data/test/files/2ch.html +0 -108
  94. data/test/files/address_book.rlx +0 -12
  95. data/test/files/address_book.xml +0 -10
  96. data/test/files/bar/bar.xsd +0 -4
  97. data/test/files/dont_hurt_em_why.xml +0 -422
  98. data/test/files/encoding.html +0 -82
  99. data/test/files/encoding.xhtml +0 -84
  100. data/test/files/exslt.xml +0 -8
  101. data/test/files/exslt.xslt +0 -35
  102. data/test/files/foo/foo.xsd +0 -4
  103. data/test/files/metacharset.html +0 -10
  104. data/test/files/noencoding.html +0 -47
  105. data/test/files/po.xml +0 -32
  106. data/test/files/po.xsd +0 -66
  107. data/test/files/shift_jis.html +0 -10
  108. data/test/files/shift_jis.xml +0 -5
  109. data/test/files/snuggles.xml +0 -3
  110. data/test/files/staff.dtd +0 -10
  111. data/test/files/staff.xml +0 -59
  112. data/test/files/staff.xslt +0 -32
  113. data/test/files/test_document_url/bar.xml +0 -2
  114. data/test/files/test_document_url/document.dtd +0 -4
  115. data/test/files/test_document_url/document.xml +0 -6
  116. data/test/files/tlm.html +0 -850
  117. data/test/files/to_be_xincluded.xml +0 -2
  118. data/test/files/valid_bar.xml +0 -2
  119. data/test/files/xinclude.xml +0 -4
  120. data/test/helper.rb +0 -154
  121. data/test/html/sax/test_parser.rb +0 -141
  122. data/test/html/sax/test_parser_context.rb +0 -46
  123. data/test/html/test_builder.rb +0 -164
  124. data/test/html/test_document.rb +0 -552
  125. data/test/html/test_document_encoding.rb +0 -138
  126. data/test/html/test_document_fragment.rb +0 -261
  127. data/test/html/test_element_description.rb +0 -105
  128. data/test/html/test_named_characters.rb +0 -14
  129. data/test/html/test_node.rb +0 -196
  130. data/test/html/test_node_encoding.rb +0 -27
  131. data/test/namespaces/test_additional_namespaces_in_builder_doc.rb +0 -14
  132. data/test/namespaces/test_namespaces_in_builder_doc.rb +0 -75
  133. data/test/namespaces/test_namespaces_in_created_doc.rb +0 -75
  134. data/test/namespaces/test_namespaces_in_parsed_doc.rb +0 -66
  135. data/test/test_convert_xpath.rb +0 -135
  136. data/test/test_css_cache.rb +0 -45
  137. data/test/test_encoding_handler.rb +0 -46
  138. data/test/test_memory_leak.rb +0 -156
  139. data/test/test_nokogiri.rb +0 -132
  140. data/test/test_reader.rb +0 -555
  141. data/test/test_soap4r_sax.rb +0 -52
  142. data/test/test_xslt_transforms.rb +0 -254
  143. data/test/xml/node/test_save_options.rb +0 -28
  144. data/test/xml/node/test_subclass.rb +0 -44
  145. data/test/xml/sax/test_parser.rb +0 -366
  146. data/test/xml/sax/test_parser_context.rb +0 -106
  147. data/test/xml/sax/test_push_parser.rb +0 -157
  148. data/test/xml/test_attr.rb +0 -64
  149. data/test/xml/test_attribute_decl.rb +0 -86
  150. data/test/xml/test_builder.rb +0 -306
  151. data/test/xml/test_c14n.rb +0 -151
  152. data/test/xml/test_cdata.rb +0 -48
  153. data/test/xml/test_comment.rb +0 -29
  154. data/test/xml/test_document.rb +0 -828
  155. data/test/xml/test_document_encoding.rb +0 -28
  156. data/test/xml/test_document_fragment.rb +0 -223
  157. data/test/xml/test_dtd.rb +0 -103
  158. data/test/xml/test_dtd_encoding.rb +0 -33
  159. data/test/xml/test_element_content.rb +0 -56
  160. data/test/xml/test_element_decl.rb +0 -73
  161. data/test/xml/test_entity_decl.rb +0 -122
  162. data/test/xml/test_entity_reference.rb +0 -245
  163. data/test/xml/test_namespace.rb +0 -95
  164. data/test/xml/test_node.rb +0 -1137
  165. data/test/xml/test_node_attributes.rb +0 -96
  166. data/test/xml/test_node_encoding.rb +0 -107
  167. data/test/xml/test_node_inheritance.rb +0 -32
  168. data/test/xml/test_node_reparenting.rb +0 -374
  169. data/test/xml/test_node_set.rb +0 -755
  170. data/test/xml/test_parse_options.rb +0 -64
  171. data/test/xml/test_processing_instruction.rb +0 -30
  172. data/test/xml/test_reader_encoding.rb +0 -142
  173. data/test/xml/test_relax_ng.rb +0 -60
  174. data/test/xml/test_schema.rb +0 -103
  175. data/test/xml/test_syntax_error.rb +0 -12
  176. data/test/xml/test_text.rb +0 -45
  177. data/test/xml/test_unparented_node.rb +0 -422
  178. data/test/xml/test_xinclude.rb +0 -83
  179. data/test/xml/test_xpath.rb +0 -295
  180. data/test/xslt/test_custom_functions.rb +0 -133
  181. data/test/xslt/test_exception_handling.rb +0 -37
  182. data/test_all +0 -81
@@ -0,0 +1,230 @@
1
+ module Nokogiri
2
+ module XML
3
+ #
4
+ # The Searchable module declares the interface used for searching your DOM.
5
+ #
6
+ # It implements the public methods `search`, `css`, and `xpath`,
7
+ # as well as allowing specific implementations to specialize some
8
+ # of the important behaviors.
9
+ #
10
+ module Searchable
11
+ # Regular expression used by Searchable#search to determine if a query
12
+ # string is CSS or XPath
13
+ LOOKS_LIKE_XPATH = /^(\.\/|\/|\.\.|\.$)/
14
+
15
+ ###
16
+ # call-seq: search *paths, [namespace-bindings, xpath-variable-bindings, custom-handler-class]
17
+ #
18
+ # Search this object for +paths+. +paths+ must be one or more XPath or CSS queries:
19
+ #
20
+ # node.search("div.employee", ".//title")
21
+ #
22
+ # A hash of namespace bindings may be appended:
23
+ #
24
+ # node.search('.//bike:tire', {'bike' => 'http://schwinn.com/'})
25
+ # node.search('bike|tire', {'bike' => 'http://schwinn.com/'})
26
+ #
27
+ # For XPath queries, a hash of variable bindings may also be
28
+ # appended to the namespace bindings. For example:
29
+ #
30
+ # node.search('.//address[@domestic=$value]', nil, {:value => 'Yes'})
31
+ #
32
+ # Custom XPath functions and CSS pseudo-selectors may also be
33
+ # defined. To define custom functions create a class and
34
+ # implement the function you want to define. The first argument
35
+ # to the method will be the current matching NodeSet. Any other
36
+ # arguments are ones that you pass in. Note that this class may
37
+ # appear anywhere in the argument list. For example:
38
+ #
39
+ # node.search('.//title[regex(., "\w+")]', 'div.employee:regex("[0-9]+")',
40
+ # Class.new {
41
+ # def regex node_set, regex
42
+ # node_set.find_all { |node| node['some_attribute'] =~ /#{regex}/ }
43
+ # end
44
+ # }.new
45
+ # )
46
+ #
47
+ # See Searchable#xpath and Searchable#css for further usage help.
48
+ def search *args
49
+ paths, handler, ns, binds = extract_params(args)
50
+
51
+ xpaths = paths.map(&:to_s).map do |path|
52
+ (path =~ LOOKS_LIKE_XPATH) ? path : xpath_query_from_css_rule(path, ns)
53
+ end.flatten.uniq
54
+
55
+ xpath(*(xpaths + [ns, handler, binds].compact))
56
+ end
57
+ alias :/ :search
58
+
59
+ ###
60
+ # call-seq: at *paths, [namespace-bindings, xpath-variable-bindings, custom-handler-class]
61
+ #
62
+ # Search this object for +paths+, and return only the first
63
+ # result. +paths+ must be one or more XPath or CSS queries.
64
+ #
65
+ # See Searchable#search for more information.
66
+ def at *args
67
+ search(*args).first
68
+ end
69
+ alias :% :at
70
+
71
+ ###
72
+ # call-seq: css *rules, [namespace-bindings, custom-pseudo-class]
73
+ #
74
+ # Search this object for CSS +rules+. +rules+ must be one or more CSS
75
+ # selectors. For example:
76
+ #
77
+ # node.css('title')
78
+ # node.css('body h1.bold')
79
+ # node.css('div + p.green', 'div#one')
80
+ #
81
+ # A hash of namespace bindings may be appended. For example:
82
+ #
83
+ # node.css('bike|tire', {'bike' => 'http://schwinn.com/'})
84
+ #
85
+ # Custom CSS pseudo classes may also be defined. To define
86
+ # custom pseudo classes, create a class and implement the custom
87
+ # pseudo class you want defined. The first argument to the
88
+ # method will be the current matching NodeSet. Any other
89
+ # arguments are ones that you pass in. For example:
90
+ #
91
+ # node.css('title:regex("\w+")', Class.new {
92
+ # def regex node_set, regex
93
+ # node_set.find_all { |node| node['some_attribute'] =~ /#{regex}/ }
94
+ # end
95
+ # }.new)
96
+ #
97
+ # Note that the CSS query string is case-sensitive with regards
98
+ # to your document type. That is, if you're looking for "H1" in
99
+ # an HTML document, you'll never find anything, since HTML tags
100
+ # will match only lowercase CSS queries. However, "H1" might be
101
+ # found in an XML document, where tag names are case-sensitive
102
+ # (e.g., "H1" is distinct from "h1").
103
+ #
104
+ def css *args
105
+ rules, handler, ns, _ = extract_params(args)
106
+
107
+ css_internal self, rules, handler, ns
108
+ end
109
+
110
+ ##
111
+ # call-seq: at_css *rules, [namespace-bindings, custom-pseudo-class]
112
+ #
113
+ # Search this object for CSS +rules+, and return only the first
114
+ # match. +rules+ must be one or more CSS selectors.
115
+ #
116
+ # See Searchable#css for more information.
117
+ def at_css *args
118
+ css(*args).first
119
+ end
120
+
121
+ ###
122
+ # call-seq: xpath *paths, [namespace-bindings, variable-bindings, custom-handler-class]
123
+ #
124
+ # Search this node for XPath +paths+. +paths+ must be one or more XPath
125
+ # queries.
126
+ #
127
+ # node.xpath('.//title')
128
+ #
129
+ # A hash of namespace bindings may be appended. For example:
130
+ #
131
+ # node.xpath('.//foo:name', {'foo' => 'http://example.org/'})
132
+ # node.xpath('.//xmlns:name', node.root.namespaces)
133
+ #
134
+ # A hash of variable bindings may also be appended to the namespace bindings. For example:
135
+ #
136
+ # node.xpath('.//address[@domestic=$value]', nil, {:value => 'Yes'})
137
+ #
138
+ # Custom XPath functions may also be defined. To define custom
139
+ # functions create a class and implement the function you want
140
+ # to define. The first argument to the method will be the
141
+ # current matching NodeSet. Any other arguments are ones that
142
+ # you pass in. Note that this class may appear anywhere in the
143
+ # argument list. For example:
144
+ #
145
+ # node.xpath('.//title[regex(., "\w+")]', Class.new {
146
+ # def regex node_set, regex
147
+ # node_set.find_all { |node| node['some_attribute'] =~ /#{regex}/ }
148
+ # end
149
+ # }.new)
150
+ #
151
+ def xpath *args
152
+ paths, handler, ns, binds = extract_params(args)
153
+
154
+ xpath_internal self, paths, handler, ns, binds
155
+ end
156
+
157
+ ##
158
+ # call-seq: at_xpath *paths, [namespace-bindings, variable-bindings, custom-handler-class]
159
+ #
160
+ # Search this node for XPath +paths+, and return only the first
161
+ # match. +paths+ must be one or more XPath queries.
162
+ #
163
+ # See Searchable#xpath for more information.
164
+ def at_xpath *args
165
+ xpath(*args).first
166
+ end
167
+
168
+ private
169
+
170
+ def css_internal node, rules, handler, ns
171
+ xpath_internal node, css_rules_to_xpath(rules, ns), handler, ns, nil
172
+ end
173
+
174
+ def xpath_internal node, paths, handler, ns, binds
175
+ document = node.document
176
+ return NodeSet.new(document) unless document
177
+
178
+ if paths.length == 1
179
+ return xpath_impl(node, paths.first, handler, ns, binds)
180
+ end
181
+
182
+ NodeSet.new(document) do |combined|
183
+ paths.each do |path|
184
+ xpath_impl(node, path, handler, ns, binds).each { |set| combined << set }
185
+ end
186
+ end
187
+ end
188
+
189
+ def xpath_impl node, path, handler, ns, binds
190
+ ctx = XPathContext.new(node)
191
+ ctx.register_namespaces(ns)
192
+ path = path.gsub(/xmlns:/, ' :') unless Nokogiri.uses_libxml?
193
+
194
+ binds.each do |key,value|
195
+ ctx.register_variable key.to_s, value
196
+ end if binds
197
+
198
+ ctx.evaluate(path, handler)
199
+ end
200
+
201
+ def css_rules_to_xpath(rules, ns)
202
+ rules.map { |rule| xpath_query_from_css_rule(rule, ns) }
203
+ end
204
+
205
+ def xpath_query_from_css_rule rule, ns
206
+ self.class::IMPLIED_XPATH_CONTEXTS.map do |implied_xpath_context|
207
+ CSS.xpath_for(rule.to_s, :prefix => implied_xpath_context, :ns => ns)
208
+ end.join(' | ')
209
+ end
210
+
211
+ def extract_params params # :nodoc:
212
+ handler = params.find do |param|
213
+ ![Hash, String, Symbol].include?(param.class)
214
+ end
215
+ params -= [handler] if handler
216
+
217
+ hashes = []
218
+ while Hash === params.last || params.last.nil?
219
+ hashes << params.pop
220
+ break if params.empty?
221
+ end
222
+ ns, binds = hashes.reverse
223
+
224
+ ns ||= document.root ? document.root.namespaces : {}
225
+
226
+ [params, handler, ns, binds]
227
+ end
228
+ end
229
+ end
230
+ end
@@ -40,7 +40,30 @@ module Nokogiri
40
40
  end
41
41
 
42
42
  def to_s
43
- super.chomp
43
+ message = super.chomp
44
+ [location_to_s, level_to_s, message].
45
+ compact.join(": ").
46
+ force_encoding(message.encoding)
47
+ end
48
+
49
+ private
50
+
51
+ def level_to_s
52
+ case level
53
+ when 3 then "FATAL"
54
+ when 2 then "ERROR"
55
+ when 1 then "WARNING"
56
+ else nil
57
+ end
58
+ end
59
+
60
+ def nil_or_zero?(attribute)
61
+ attribute.nil? || attribute.zero?
62
+ end
63
+
64
+ def location_to_s
65
+ return nil if nil_or_zero?(line) && nil_or_zero?(column)
66
+ "#{line}:#{column}"
44
67
  end
45
68
  end
46
69
  end
data/lib/nokogiri/xml.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require 'nokogiri/xml/pp'
2
2
  require 'nokogiri/xml/parse_options'
3
3
  require 'nokogiri/xml/sax'
4
+ require 'nokogiri/xml/searchable'
4
5
  require 'nokogiri/xml/node'
5
6
  require 'nokogiri/xml/attribute_decl'
6
7
  require 'nokogiri/xml/element_decl'
@@ -22,6 +23,7 @@ require 'nokogiri/xml/builder'
22
23
  require 'nokogiri/xml/reader'
23
24
  require 'nokogiri/xml/notation'
24
25
  require 'nokogiri/xml/entity_decl'
26
+ require 'nokogiri/xml/entity_reference'
25
27
  require 'nokogiri/xml/schema'
26
28
  require 'nokogiri/xml/relax_ng'
27
29
 
@@ -47,7 +49,7 @@ module Nokogiri
47
49
  # Nokogiri::XML::Reader for more information
48
50
  def Reader string_or_io, url = nil, encoding = nil, options = ParseOptions::STRICT
49
51
 
50
- options = Nokogiri::XML::ParseOptions.new(options) if Fixnum === options
52
+ options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
51
53
  # Give the options to the user
52
54
  yield options if block_given?
53
55
 
data/lib/nokogiri.rb CHANGED
@@ -2,9 +2,6 @@
2
2
  # Modify the PATH on windows so that the external DLLs will get loaded.
3
3
 
4
4
  require 'rbconfig'
5
- ENV['PATH'] = [File.expand_path(
6
- File.join(File.dirname(__FILE__), "..", "ext", "nokogiri")
7
- ), ENV['PATH']].compact.join(';') if RbConfig::CONFIG['host_os'] =~ /(mswin|mingw)/i
8
5
 
9
6
  if defined?(RUBY_ENGINE) && RUBY_ENGINE == "jruby"
10
7
  # The line below caused a problem on non-GAE rack environment.
@@ -13,7 +10,7 @@ if defined?(RUBY_ENGINE) && RUBY_ENGINE == "jruby"
13
10
  # However, simply cutting defined?(JRuby::Rack::VERSION) off resulted in
14
11
  # an unable-to-load-nokogiri problem. Thus, now, Nokogiri checks the presence
15
12
  # of appengine-rack.jar in $LOAD_PATH. If Nokogiri is on GAE, Nokogiri
16
- # should skip loading xml jars. This is because those are in WEB-INF/lib and
13
+ # should skip loading xml jars. This is because those are in WEB-INF/lib and
17
14
  # already set in the classpath.
18
15
  unless $LOAD_PATH.to_s.include?("appengine-rack")
19
16
  require 'stringio'
@@ -22,10 +19,18 @@ if defined?(RUBY_ENGINE) && RUBY_ENGINE == "jruby"
22
19
  require 'nekohtml.jar'
23
20
  require 'nekodtd.jar'
24
21
  require 'xercesImpl.jar'
22
+ require 'serializer.jar'
23
+ require 'xalan.jar'
24
+ require 'xml-apis.jar'
25
25
  end
26
26
  end
27
27
 
28
- require 'nokogiri/nokogiri'
28
+ begin
29
+ RUBY_VERSION =~ /(\d+\.\d+)/
30
+ require "nokogiri/#{$1}/nokogiri"
31
+ rescue LoadError
32
+ require 'nokogiri/nokogiri'
33
+ end
29
34
  require 'nokogiri/version'
30
35
  require 'nokogiri/syntax_error'
31
36
  require 'nokogiri/xml'
@@ -36,7 +41,8 @@ require 'nokogiri/css'
36
41
  require 'nokogiri/html/builder'
37
42
 
38
43
  # Nokogiri parses and searches XML/HTML very quickly, and also has
39
- # correctly implemented CSS3 selector support as well as XPath support.
44
+ # correctly implemented CSS3 selector support as well as XPath 1.0
45
+ # support.
40
46
  #
41
47
  # Parsing a document returns either a Nokogiri::XML::Document, or a
42
48
  # Nokogiri::HTML::Document depending on the kind of document you parse.
@@ -58,27 +64,26 @@ require 'nokogiri/html/builder'
58
64
  # puts link.content
59
65
  # end
60
66
  #
61
- # See Nokogiri::XML::Node#css for more information about CSS searching.
62
- # See Nokogiri::XML::Node#xpath for more information about XPath searching.
67
+ # See Nokogiri::XML::Searchable#css for more information about CSS searching.
68
+ # See Nokogiri::XML::Searchable#xpath for more information about XPath searching.
63
69
  module Nokogiri
64
70
  class << self
65
71
  ###
66
72
  # Parse an HTML or XML document. +string+ contains the document.
67
73
  def parse string, url = nil, encoding = nil, options = nil
68
- doc =
69
- if string.respond_to?(:read) ||
70
- string =~ /^\s*<[^Hh>]*html/i # Probably html
71
- Nokogiri.HTML(
72
- string,
73
- url,
74
- encoding, options || XML::ParseOptions::DEFAULT_HTML
75
- )
76
- else
77
- Nokogiri.XML(string, url, encoding,
78
- options || XML::ParseOptions::DEFAULT_XML)
79
- end
80
- yield doc if block_given?
81
- doc
74
+ if string.respond_to?(:read) ||
75
+ /^\s*<(?:!DOCTYPE\s+)?html[\s>]/i === string[0, 512]
76
+ # Expect an HTML indicator to appear within the first 512
77
+ # characters of a document. (<?xml ?> + <?xml-stylesheet ?>
78
+ # shouldn't be that long)
79
+ Nokogiri.HTML(string, url, encoding,
80
+ options || XML::ParseOptions::DEFAULT_HTML)
81
+ else
82
+ Nokogiri.XML(string, url, encoding,
83
+ options || XML::ParseOptions::DEFAULT_XML)
84
+ end.tap { |doc|
85
+ yield doc if block_given?
86
+ }
82
87
  end
83
88
 
84
89
  ###
@@ -109,7 +114,19 @@ module Nokogiri
109
114
  def Slop(*args, &block)
110
115
  Nokogiri(*args, &block).slop!
111
116
  end
117
+
118
+ def install_default_aliases
119
+ # Make sure to support some popular encoding aliases not known by
120
+ # all iconv implementations.
121
+ {
122
+ 'Windows-31J' => 'CP932', # Windows-31J is the IANA registered name of CP932.
123
+ }.each { |alias_name, name|
124
+ EncodingHandler.alias(name, alias_name) if EncodingHandler[alias_name].nil?
125
+ }
126
+ end
112
127
  end
128
+
129
+ Nokogiri.install_default_aliases
113
130
  end
114
131
 
115
132
  ###
@@ -120,8 +137,7 @@ end
120
137
  # To specify the type of document, use Nokogiri.XML or Nokogiri.HTML.
121
138
  def Nokogiri(*args, &block)
122
139
  if block_given?
123
- builder = Nokogiri::HTML::Builder.new(&block)
124
- return builder.doc.root
140
+ Nokogiri::HTML::Builder.new(&block).doc.root
125
141
  else
126
142
  Nokogiri.parse(*args)
127
143
  end
@@ -0,0 +1,78 @@
1
+ From c5538465c08a8ea248a370bf55bc39cd3385e4af Mon Sep 17 00:00:00 2001
2
+ From: Mike Dalessio <mike.dalessio@gmail.com>
3
+ Date: Thu, 29 Mar 2018 14:09:00 -0400
4
+ Subject: [PATCH] Revert "Do not URI escape in server side includes"
5
+
6
+ This reverts commit 960f0e275616cadc29671a218d7fb9b69eb35588.
7
+ ---
8
+ HTMLtree.c | 49 +++++++++++--------------------------------------
9
+ 1 file changed, 11 insertions(+), 38 deletions(-)
10
+
11
+ diff --git a/HTMLtree.c b/HTMLtree.c
12
+ index 2fd0c9c..67160c5 100644
13
+ --- a/HTMLtree.c
14
+ +++ b/HTMLtree.c
15
+ @@ -717,49 +717,22 @@ htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
16
+ (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
17
+ ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
18
+ (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
19
+ + xmlChar *escaped;
20
+ xmlChar *tmp = value;
21
+ - /* xmlURIEscapeStr() escapes '"' so it can be safely used. */
22
+ - xmlBufCCat(buf->buffer, "\"");
23
+
24
+ while (IS_BLANK_CH(*tmp)) tmp++;
25
+
26
+ - /* URI Escape everything, except server side includes. */
27
+ - for ( ; ; ) {
28
+ - xmlChar *escaped;
29
+ - xmlChar endChar;
30
+ - xmlChar *end = NULL;
31
+ - xmlChar *start = (xmlChar *)xmlStrstr(tmp, BAD_CAST "<!--");
32
+ - if (start != NULL) {
33
+ - end = (xmlChar *)xmlStrstr(tmp, BAD_CAST "-->");
34
+ - if (end != NULL) {
35
+ - *start = '\0';
36
+ - }
37
+ - }
38
+ -
39
+ - /* Escape the whole string, or until start (set to '\0'). */
40
+ - escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
41
+ - if (escaped != NULL) {
42
+ - xmlBufCat(buf->buffer, escaped);
43
+ - xmlFree(escaped);
44
+ - } else {
45
+ - xmlBufCat(buf->buffer, tmp);
46
+ - }
47
+ -
48
+ - if (end == NULL) { /* Everything has been written. */
49
+ - break;
50
+ - }
51
+ -
52
+ - /* Do not escape anything within server side includes. */
53
+ - *start = '<'; /* Restore the first character of "<!--". */
54
+ - end += 3; /* strlen("-->") */
55
+ - endChar = *end;
56
+ - *end = '\0';
57
+ - xmlBufCat(buf->buffer, start);
58
+ - *end = endChar;
59
+ - tmp = end;
60
+ + /*
61
+ + * the < and > have already been escaped at the entity level
62
+ + * And doing so here breaks server side includes
63
+ + */
64
+ + escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>");
65
+ + if (escaped != NULL) {
66
+ + xmlBufWriteQuotedString(buf->buffer, escaped);
67
+ + xmlFree(escaped);
68
+ + } else {
69
+ + xmlBufWriteQuotedString(buf->buffer, value);
70
+ }
71
+ -
72
+ - xmlBufCCat(buf->buffer, "\"");
73
+ } else {
74
+ xmlBufWriteQuotedString(buf->buffer, value);
75
+ }
76
+ --
77
+ 2.9.5
78
+
@@ -0,0 +1,40 @@
1
+ From 27e4aa8d885e47a296ea78d114dbbe8fc7aa3508 Mon Sep 17 00:00:00 2001
2
+ From: Kevin Solorio <soloriok@gmail.com>
3
+ Date: Fri, 1 Feb 2019 14:32:42 -0800
4
+ Subject: [PATCH] Revert-support-html-h-b-7-1
5
+
6
+ ---
7
+ entities.c | 17 -----------------
8
+ 1 file changed, 17 deletions(-)
9
+
10
+ diff --git a/entities.c b/entities.c
11
+ index 43549bc5..82652f6d 100644
12
+ --- a/entities.c
13
+ +++ b/entities.c
14
+ @@ -623,23 +623,6 @@ xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input, int attr) {
15
+ *out++ = 't';
16
+ *out++ = ';';
17
+ } else if (*cur == '&') {
18
+ - /*
19
+ - * Special handling of &{...} construct from HTML 4, see
20
+ - * http://www.w3.org/TR/html401/appendix/notes.html#h-B.7.1
21
+ - */
22
+ - if (html && attr && (cur[1] == '{') &&
23
+ - (strchr((const char *) cur, '}'))) {
24
+ - while (*cur != '}') {
25
+ - *out++ = *cur++;
26
+ - indx = out - buffer;
27
+ - if (indx + 100 > buffer_size) {
28
+ - growBufferReentrant();
29
+ - out = &buffer[indx];
30
+ - }
31
+ - }
32
+ - *out++ = *cur++;
33
+ - continue;
34
+ - }
35
+ *out++ = '&';
36
+ *out++ = 'a';
37
+ *out++ = 'm';
38
+ --
39
+ 2.16.2
40
+
@@ -0,0 +1,44 @@
1
+ From ffc08467744bd2305d41ca882c37fa30adf3a067 Mon Sep 17 00:00:00 2001
2
+ From: Kevin Solorio <soloriok@gmail.com>
3
+ Date: Wed, 27 Feb 2019 14:34:17 -0800
4
+ Subject: [PATCH 2/2] update entities.c to remove handling of ssi
5
+
6
+ ---
7
+ entities.c | 21 ---------------------
8
+ 1 file changed, 21 deletions(-)
9
+
10
+ diff --git a/entities.c b/entities.c
11
+ index 43549bc5..5c4a2a60 100644
12
+ --- a/entities.c
13
+ +++ b/entities.c
14
+ @@ -592,27 +592,6 @@ xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input, int attr) {
15
+ * By default one have to encode at least '<', '>', '"' and '&' !
16
+ */
17
+ if (*cur == '<') {
18
+ - const xmlChar *end;
19
+ -
20
+ - /*
21
+ - * Special handling of server side include in HTML attributes
22
+ - */
23
+ - if (html && attr &&
24
+ - (cur[1] == '!') && (cur[2] == '-') && (cur[3] == '-') &&
25
+ - ((end = xmlStrstr(cur, BAD_CAST "-->")) != NULL)) {
26
+ - while (cur != end) {
27
+ - *out++ = *cur++;
28
+ - indx = out - buffer;
29
+ - if (indx + 100 > buffer_size) {
30
+ - growBufferReentrant();
31
+ - out = &buffer[indx];
32
+ - }
33
+ - }
34
+ - *out++ = *cur++;
35
+ - *out++ = *cur++;
36
+ - *out++ = *cur++;
37
+ - continue;
38
+ - }
39
+ *out++ = '&';
40
+ *out++ = 'l';
41
+ *out++ = 't';
42
+ --
43
+ 2.16.2
44
+
@@ -0,0 +1,120 @@
1
+ From e03553605b45c88f0b4b2980adfbbb8f6fca2fd6 Mon Sep 17 00:00:00 2001
2
+ From: Nick Wellnhofer <wellnhofer@aevum.de>
3
+ Date: Sun, 24 Mar 2019 09:51:39 +0100
4
+ Subject: [PATCH] Fix security framework bypass
5
+
6
+ xsltCheckRead and xsltCheckWrite return -1 in case of error but callers
7
+ don't check for this condition and allow access. With a specially
8
+ crafted URL, xsltCheckRead could be tricked into returning an error
9
+ because of a supposedly invalid URL that would still be loaded
10
+ successfully later on.
11
+
12
+ Fixes #12.
13
+
14
+ Thanks to Felix Wilhelm for the report.
15
+ ---
16
+ libxslt/documents.c | 18 ++++++++++--------
17
+ libxslt/imports.c | 9 +++++----
18
+ libxslt/transform.c | 9 +++++----
19
+ libxslt/xslt.c | 9 +++++----
20
+ 4 files changed, 25 insertions(+), 20 deletions(-)
21
+
22
+ diff --git a/libxslt/documents.c b/libxslt/documents.c
23
+ index 3f3a731..4aad11b 100644
24
+ --- a/libxslt/documents.c
25
+ +++ b/libxslt/documents.c
26
+ @@ -296,10 +296,11 @@ xsltLoadDocument(xsltTransformContextPtr ctxt, const xmlChar *URI) {
27
+ int res;
28
+
29
+ res = xsltCheckRead(ctxt->sec, ctxt, URI);
30
+ - if (res == 0) {
31
+ - xsltTransformError(ctxt, NULL, NULL,
32
+ - "xsltLoadDocument: read rights for %s denied\n",
33
+ - URI);
34
+ + if (res <= 0) {
35
+ + if (res == 0)
36
+ + xsltTransformError(ctxt, NULL, NULL,
37
+ + "xsltLoadDocument: read rights for %s denied\n",
38
+ + URI);
39
+ return(NULL);
40
+ }
41
+ }
42
+ @@ -372,10 +373,11 @@ xsltLoadStyleDocument(xsltStylesheetPtr style, const xmlChar *URI) {
43
+ int res;
44
+
45
+ res = xsltCheckRead(sec, NULL, URI);
46
+ - if (res == 0) {
47
+ - xsltTransformError(NULL, NULL, NULL,
48
+ - "xsltLoadStyleDocument: read rights for %s denied\n",
49
+ - URI);
50
+ + if (res <= 0) {
51
+ + if (res == 0)
52
+ + xsltTransformError(NULL, NULL, NULL,
53
+ + "xsltLoadStyleDocument: read rights for %s denied\n",
54
+ + URI);
55
+ return(NULL);
56
+ }
57
+ }
58
+ diff --git a/libxslt/imports.c b/libxslt/imports.c
59
+ index 874870c..3783b24 100644
60
+ --- a/libxslt/imports.c
61
+ +++ b/libxslt/imports.c
62
+ @@ -130,10 +130,11 @@ xsltParseStylesheetImport(xsltStylesheetPtr style, xmlNodePtr cur) {
63
+ int secres;
64
+
65
+ secres = xsltCheckRead(sec, NULL, URI);
66
+ - if (secres == 0) {
67
+ - xsltTransformError(NULL, NULL, NULL,
68
+ - "xsl:import: read rights for %s denied\n",
69
+ - URI);
70
+ + if (secres <= 0) {
71
+ + if (secres == 0)
72
+ + xsltTransformError(NULL, NULL, NULL,
73
+ + "xsl:import: read rights for %s denied\n",
74
+ + URI);
75
+ goto error;
76
+ }
77
+ }
78
+ diff --git a/libxslt/transform.c b/libxslt/transform.c
79
+ index 1379391..0636dbd 100644
80
+ --- a/libxslt/transform.c
81
+ +++ b/libxslt/transform.c
82
+ @@ -3493,10 +3493,11 @@ xsltDocumentElem(xsltTransformContextPtr ctxt, xmlNodePtr node,
83
+ */
84
+ if (ctxt->sec != NULL) {
85
+ ret = xsltCheckWrite(ctxt->sec, ctxt, filename);
86
+ - if (ret == 0) {
87
+ - xsltTransformError(ctxt, NULL, inst,
88
+ - "xsltDocumentElem: write rights for %s denied\n",
89
+ - filename);
90
+ + if (ret <= 0) {
91
+ + if (ret == 0)
92
+ + xsltTransformError(ctxt, NULL, inst,
93
+ + "xsltDocumentElem: write rights for %s denied\n",
94
+ + filename);
95
+ xmlFree(URL);
96
+ xmlFree(filename);
97
+ return;
98
+ diff --git a/libxslt/xslt.c b/libxslt/xslt.c
99
+ index 780a5ad..a234eb7 100644
100
+ --- a/libxslt/xslt.c
101
+ +++ b/libxslt/xslt.c
102
+ @@ -6763,10 +6763,11 @@ xsltParseStylesheetFile(const xmlChar* filename) {
103
+ int res;
104
+
105
+ res = xsltCheckRead(sec, NULL, filename);
106
+ - if (res == 0) {
107
+ - xsltTransformError(NULL, NULL, NULL,
108
+ - "xsltParseStylesheetFile: read rights for %s denied\n",
109
+ - filename);
110
+ + if (res <= 0) {
111
+ + if (res == 0)
112
+ + xsltTransformError(NULL, NULL, NULL,
113
+ + "xsltParseStylesheetFile: read rights for %s denied\n",
114
+ + filename);
115
+ return(NULL);
116
+ }
117
+ }
118
+ --
119
+ 2.17.1
120
+
Binary file