nokogiri 1.10.10 → 1.13.9

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (220)
  1. checksums.yaml +4 -4
  2. data/Gemfile +5 -0
  3. data/LICENSE-DEPENDENCIES.md +1173 -884
  4. data/LICENSE.md +1 -1
  5. data/README.md +178 -96
  6. data/bin/nokogiri +63 -50
  7. data/dependencies.yml +13 -64
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +761 -424
  10. data/ext/nokogiri/gumbo.c +584 -0
  11. data/ext/nokogiri/html4_document.c +166 -0
  12. data/ext/nokogiri/html4_element_description.c +294 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser_context.c +119 -0
  15. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  16. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  17. data/ext/nokogiri/nokogiri.c +228 -91
  18. data/ext/nokogiri/nokogiri.h +199 -88
  19. data/ext/nokogiri/test_global_handlers.c +40 -0
  20. data/ext/nokogiri/xml_attr.c +17 -17
  21. data/ext/nokogiri/xml_attribute_decl.c +21 -21
  22. data/ext/nokogiri/xml_cdata.c +14 -19
  23. data/ext/nokogiri/xml_comment.c +19 -26
  24. data/ext/nokogiri/xml_document.c +296 -220
  25. data/ext/nokogiri/xml_document_fragment.c +12 -16
  26. data/ext/nokogiri/xml_dtd.c +64 -58
  27. data/ext/nokogiri/xml_element_content.c +31 -26
  28. data/ext/nokogiri/xml_element_decl.c +25 -25
  29. data/ext/nokogiri/xml_encoding_handler.c +43 -18
  30. data/ext/nokogiri/xml_entity_decl.c +37 -35
  31. data/ext/nokogiri/xml_entity_reference.c +16 -18
  32. data/ext/nokogiri/xml_namespace.c +98 -53
  33. data/ext/nokogiri/xml_node.c +1065 -653
  34. data/ext/nokogiri/xml_node_set.c +178 -166
  35. data/ext/nokogiri/xml_processing_instruction.c +17 -19
  36. data/ext/nokogiri/xml_reader.c +277 -175
  37. data/ext/nokogiri/xml_relax_ng.c +52 -28
  38. data/ext/nokogiri/xml_sax_parser.c +112 -112
  39. data/ext/nokogiri/xml_sax_parser_context.c +112 -86
  40. data/ext/nokogiri/xml_sax_push_parser.c +36 -27
  41. data/ext/nokogiri/xml_schema.c +98 -48
  42. data/ext/nokogiri/xml_syntax_error.c +42 -21
  43. data/ext/nokogiri/xml_text.c +14 -18
  44. data/ext/nokogiri/xml_xpath_context.c +226 -115
  45. data/ext/nokogiri/xslt_stylesheet.c +265 -173
  46. data/gumbo-parser/CHANGES.md +63 -0
  47. data/gumbo-parser/Makefile +101 -0
  48. data/gumbo-parser/THANKS +27 -0
  49. data/gumbo-parser/src/Makefile +34 -0
  50. data/gumbo-parser/src/README.md +41 -0
  51. data/gumbo-parser/src/ascii.c +75 -0
  52. data/gumbo-parser/src/ascii.h +115 -0
  53. data/gumbo-parser/src/attribute.c +42 -0
  54. data/gumbo-parser/src/attribute.h +17 -0
  55. data/gumbo-parser/src/char_ref.c +22225 -0
  56. data/gumbo-parser/src/char_ref.h +29 -0
  57. data/gumbo-parser/src/char_ref.rl +2154 -0
  58. data/gumbo-parser/src/error.c +626 -0
  59. data/gumbo-parser/src/error.h +148 -0
  60. data/gumbo-parser/src/foreign_attrs.c +104 -0
  61. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  62. data/gumbo-parser/src/gumbo.h +943 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/parser.c +4875 -0
  66. data/gumbo-parser/src/parser.h +41 -0
  67. data/gumbo-parser/src/replacement.h +33 -0
  68. data/gumbo-parser/src/string_buffer.c +103 -0
  69. data/gumbo-parser/src/string_buffer.h +68 -0
  70. data/gumbo-parser/src/string_piece.c +48 -0
  71. data/gumbo-parser/src/svg_attrs.c +174 -0
  72. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  73. data/gumbo-parser/src/svg_tags.c +137 -0
  74. data/gumbo-parser/src/svg_tags.gperf +55 -0
  75. data/gumbo-parser/src/tag.c +222 -0
  76. data/gumbo-parser/src/tag_lookup.c +382 -0
  77. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  78. data/gumbo-parser/src/tag_lookup.h +13 -0
  79. data/gumbo-parser/src/token_buffer.c +79 -0
  80. data/gumbo-parser/src/token_buffer.h +71 -0
  81. data/gumbo-parser/src/token_type.h +17 -0
  82. data/gumbo-parser/src/tokenizer.c +3463 -0
  83. data/gumbo-parser/src/tokenizer.h +112 -0
  84. data/gumbo-parser/src/tokenizer_states.h +339 -0
  85. data/gumbo-parser/src/utf8.c +245 -0
  86. data/gumbo-parser/src/utf8.h +164 -0
  87. data/gumbo-parser/src/util.c +68 -0
  88. data/gumbo-parser/src/util.h +30 -0
  89. data/gumbo-parser/src/vector.c +111 -0
  90. data/gumbo-parser/src/vector.h +45 -0
  91. data/lib/nokogiri/class_resolver.rb +67 -0
  92. data/lib/nokogiri/css/node.rb +10 -8
  93. data/lib/nokogiri/css/parser.rb +397 -377
  94. data/lib/nokogiri/css/parser.y +250 -245
  95. data/lib/nokogiri/css/parser_extras.rb +54 -49
  96. data/lib/nokogiri/css/syntax_error.rb +3 -1
  97. data/lib/nokogiri/css/tokenizer.rb +5 -3
  98. data/lib/nokogiri/css/tokenizer.rex +3 -2
  99. data/lib/nokogiri/css/xpath_visitor.rb +218 -91
  100. data/lib/nokogiri/css.rb +50 -17
  101. data/lib/nokogiri/decorators/slop.rb +9 -7
  102. data/lib/nokogiri/extension.rb +31 -0
  103. data/lib/nokogiri/gumbo.rb +15 -0
  104. data/lib/nokogiri/html.rb +38 -27
  105. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  106. data/lib/nokogiri/{html → html4}/document.rb +103 -105
  107. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  108. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  109. data/lib/nokogiri/html4/element_description_defaults.rb +578 -0
  110. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  111. data/lib/nokogiri/{html → html4}/sax/parser.rb +17 -16
  112. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  113. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
  114. data/lib/nokogiri/html4.rb +46 -0
  115. data/lib/nokogiri/html5/document.rb +91 -0
  116. data/lib/nokogiri/html5/document_fragment.rb +83 -0
  117. data/lib/nokogiri/html5/node.rb +100 -0
  118. data/lib/nokogiri/html5.rb +478 -0
  119. data/lib/nokogiri/jruby/dependencies.rb +21 -0
  120. data/lib/nokogiri/syntax_error.rb +2 -0
  121. data/lib/nokogiri/version/constant.rb +6 -0
  122. data/lib/nokogiri/version/info.rb +222 -0
  123. data/lib/nokogiri/version.rb +3 -108
  124. data/lib/nokogiri/xml/attr.rb +6 -3
  125. data/lib/nokogiri/xml/attribute_decl.rb +3 -1
  126. data/lib/nokogiri/xml/builder.rb +74 -33
  127. data/lib/nokogiri/xml/cdata.rb +3 -1
  128. data/lib/nokogiri/xml/character_data.rb +2 -0
  129. data/lib/nokogiri/xml/document.rb +224 -86
  130. data/lib/nokogiri/xml/document_fragment.rb +46 -44
  131. data/lib/nokogiri/xml/dtd.rb +4 -2
  132. data/lib/nokogiri/xml/element_content.rb +2 -0
  133. data/lib/nokogiri/xml/element_decl.rb +3 -1
  134. data/lib/nokogiri/xml/entity_decl.rb +4 -2
  135. data/lib/nokogiri/xml/entity_reference.rb +2 -0
  136. data/lib/nokogiri/xml/namespace.rb +3 -0
  137. data/lib/nokogiri/xml/node/save_options.rb +10 -5
  138. data/lib/nokogiri/xml/node.rb +884 -378
  139. data/lib/nokogiri/xml/node_set.rb +51 -54
  140. data/lib/nokogiri/xml/notation.rb +13 -0
  141. data/lib/nokogiri/xml/parse_options.rb +22 -8
  142. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  143. data/lib/nokogiri/xml/pp/node.rb +25 -26
  144. data/lib/nokogiri/xml/pp.rb +4 -2
  145. data/lib/nokogiri/xml/processing_instruction.rb +3 -1
  146. data/lib/nokogiri/xml/reader.rb +21 -28
  147. data/lib/nokogiri/xml/relax_ng.rb +8 -2
  148. data/lib/nokogiri/xml/sax/document.rb +45 -49
  149. data/lib/nokogiri/xml/sax/parser.rb +38 -34
  150. data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
  151. data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
  152. data/lib/nokogiri/xml/sax.rb +6 -4
  153. data/lib/nokogiri/xml/schema.rb +19 -9
  154. data/lib/nokogiri/xml/searchable.rb +112 -72
  155. data/lib/nokogiri/xml/syntax_error.rb +6 -4
  156. data/lib/nokogiri/xml/text.rb +2 -0
  157. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  158. data/lib/nokogiri/xml/xpath.rb +15 -4
  159. data/lib/nokogiri/xml/xpath_context.rb +3 -3
  160. data/lib/nokogiri/xml.rb +38 -37
  161. data/lib/nokogiri/xslt/stylesheet.rb +3 -1
  162. data/lib/nokogiri/xslt.rb +29 -20
  163. data/lib/nokogiri.rb +49 -65
  164. data/lib/xsd/xmlparser/nokogiri.rb +26 -24
  165. data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
  166. data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
  167. data/patches/libxml2/{0004-libxml2.la-is-in-top_builddir.patch → 0003-libxml2.la-is-in-top_builddir.patch} +1 -1
  168. data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
  169. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  170. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +3037 -0
  171. data/ports/archives/libxml2-2.10.3.tar.xz +0 -0
  172. data/ports/archives/libxslt-1.1.37.tar.xz +0 -0
  173. metadata +189 -142
  174. data/ext/nokogiri/html_document.c +0 -170
  175. data/ext/nokogiri/html_document.h +0 -10
  176. data/ext/nokogiri/html_element_description.c +0 -279
  177. data/ext/nokogiri/html_element_description.h +0 -10
  178. data/ext/nokogiri/html_entity_lookup.c +0 -32
  179. data/ext/nokogiri/html_entity_lookup.h +0 -8
  180. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  181. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  182. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  183. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  184. data/ext/nokogiri/xml_attr.h +0 -9
  185. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  186. data/ext/nokogiri/xml_cdata.h +0 -9
  187. data/ext/nokogiri/xml_comment.h +0 -9
  188. data/ext/nokogiri/xml_document.h +0 -23
  189. data/ext/nokogiri/xml_document_fragment.h +0 -10
  190. data/ext/nokogiri/xml_dtd.h +0 -10
  191. data/ext/nokogiri/xml_element_content.h +0 -10
  192. data/ext/nokogiri/xml_element_decl.h +0 -9
  193. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  194. data/ext/nokogiri/xml_entity_decl.h +0 -10
  195. data/ext/nokogiri/xml_entity_reference.h +0 -9
  196. data/ext/nokogiri/xml_io.c +0 -61
  197. data/ext/nokogiri/xml_io.h +0 -11
  198. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  199. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  200. data/ext/nokogiri/xml_namespace.h +0 -14
  201. data/ext/nokogiri/xml_node.h +0 -13
  202. data/ext/nokogiri/xml_node_set.h +0 -12
  203. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  204. data/ext/nokogiri/xml_reader.h +0 -10
  205. data/ext/nokogiri/xml_relax_ng.h +0 -9
  206. data/ext/nokogiri/xml_sax_parser.h +0 -39
  207. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  208. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  209. data/ext/nokogiri/xml_schema.h +0 -9
  210. data/ext/nokogiri/xml_syntax_error.h +0 -13
  211. data/ext/nokogiri/xml_text.h +0 -9
  212. data/ext/nokogiri/xml_xpath_context.h +0 -10
  213. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  214. data/lib/nokogiri/html/document_fragment.rb +0 -49
  215. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  216. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  217. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
  218. data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch +0 -32
  219. data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
  220. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
data/lib/nokogiri/html.rb CHANGED
@@ -1,37 +1,48 @@
1
- require 'nokogiri/html/entity_lookup'
2
- require 'nokogiri/html/document'
3
- require 'nokogiri/html/document_fragment'
4
- require 'nokogiri/html/sax/parser_context'
5
- require 'nokogiri/html/sax/parser'
6
- require 'nokogiri/html/sax/push_parser'
7
- require 'nokogiri/html/element_description'
8
- require 'nokogiri/html/element_description_defaults'
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require_relative "html4"
9
5
 
10
6
  module Nokogiri
11
- class << self
12
- ###
13
- # Parse HTML. Convenience method for Nokogiri::HTML::Document.parse
14
- def HTML thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block
15
- Nokogiri::HTML::Document.parse(thing, url, encoding, options, &block)
16
- end
17
- end
7
+ # Alias for Nokogiri::HTML4
8
+ HTML = Nokogiri::HTML4
9
+
10
+ # :singleton-method: HTML
11
+ # :call-seq: HTML(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) → Nokogiri::HTML4::Document
12
+ #
13
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
18
14
 
15
+ # :nodoc:
16
+ define_singleton_method(:HTML, Nokogiri.method(:HTML4))
17
+
18
+ # 💡 This module/namespace is an alias for Nokogiri::HTML4 as of v1.12.0. Before v1.12.0,
19
+ # Nokogiri::HTML4 did not exist, and this was the module/namespace for all HTML-related
20
+ # classes.
19
21
  module HTML
20
- class << self
21
- ###
22
- # Parse HTML. Convenience method for Nokogiri::HTML::Document.parse
23
- def parse thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block
24
- Document.parse(thing, url, encoding, options, &block)
22
+ # 💡 This class is an alias for Nokogiri::HTML4::Document as of v1.12.0.
23
+ class Document < Nokogiri::XML::Document
24
+ end
25
+
26
+ # 💡 This class is an alias for Nokogiri::HTML4::DocumentFragment as of v1.12.0.
27
+ class DocumentFragment < Nokogiri::XML::DocumentFragment
28
+ end
29
+
30
+ # 💡 This class is an alias for Nokogiri::HTML4::Builder as of v1.12.0.
31
+ class Builder < Nokogiri::XML::Builder
32
+ end
33
+
34
+ module SAX
35
+ # 💡 This class is an alias for Nokogiri::HTML4::SAX::Parser as of v1.12.0.
36
+ class Parser < Nokogiri::XML::SAX::Parser
25
37
  end
26
38
 
27
- ####
28
- # Parse a fragment from +string+ in to a NodeSet.
29
- def fragment string, encoding = nil
30
- HTML::DocumentFragment.parse string, encoding
39
+ # 💡 This class is an alias for Nokogiri::HTML4::SAX::ParserContext as of v1.12.0.
40
+ class ParserContext < Nokogiri::XML::SAX::ParserContext
31
41
  end
32
- end
33
42
 
34
- # Instance of Nokogiri::HTML::EntityLookup
35
- NamedCharacters = EntityLookup.new
43
+ # 💡 This class is an alias for Nokogiri::HTML4::SAX::PushParser as of v1.12.0.
44
+ class PushParser
45
+ end
46
+ end
36
47
  end
37
48
  end
data/lib/nokogiri/{html → html4}/builder.rb CHANGED
@@ -1,5 +1,7 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Nokogiri
2
- module HTML
4
+ module HTML4
3
5
  ###
4
6
  # Nokogiri HTML builder is used for building HTML documents. It is very
5
7
  # similar to the Nokogiri::XML::Builder. In fact, you should go read the
@@ -11,7 +13,7 @@ module Nokogiri
11
13
  # Create an HTML document with a body that has an onload attribute, and a
12
14
  # span tag with a class of "bold" that has content of "Hello world".
13
15
  #
14
- # builder = Nokogiri::HTML::Builder.new do |doc|
16
+ # builder = Nokogiri::HTML4::Builder.new do |doc|
15
17
  # doc.html {
16
18
  # doc.body(:onload => 'some_func();') {
17
19
  # doc.span.bold {
data/lib/nokogiri/{html → html4}/document.rb CHANGED
@@ -1,15 +1,19 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require "pathname"
5
+
1
6
  module Nokogiri
2
- module HTML
7
+ module HTML4
3
8
  class Document < Nokogiri::XML::Document
4
9
  ###
5
10
  # Get the meta tag encoding for this document. If there is no meta tag,
6
11
  # then nil is returned.
7
12
  def meta_encoding
8
- case
9
- when meta = at('//meta[@charset]')
13
+ if (meta = at_xpath("//meta[@charset]"))
10
14
  meta[:charset]
11
- when meta = meta_content_type
12
- meta['content'][/charset\s*=\s*([\w-]+)/i, 1]
15
+ elsif (meta = meta_content_type)
16
+ meta["content"][/charset\s*=\s*([\w-]+)/i, 1]
13
17
  end
14
18
  end
15
19
 
@@ -29,24 +33,22 @@ module Nokogiri
29
33
  #
30
34
  # Beware in CRuby, that libxml2 automatically inserts a meta tag
31
35
  # into a head element.
32
- def meta_encoding= encoding
33
- case
34
- when meta = meta_content_type
35
- meta['content'] = 'text/html; charset=%s' % encoding
36
+ def meta_encoding=(encoding)
37
+ if (meta = meta_content_type)
38
+ meta["content"] = format("text/html; charset=%s", encoding)
36
39
  encoding
37
- when meta = at('//meta[@charset]')
38
- meta['charset'] = encoding
40
+ elsif (meta = at_xpath("//meta[@charset]"))
41
+ meta["charset"] = encoding
39
42
  else
40
- meta = XML::Node.new('meta', self)
41
- if dtd = internal_subset and dtd.html5_dtd?
42
- meta['charset'] = encoding
43
+ meta = XML::Node.new("meta", self)
44
+ if (dtd = internal_subset) && dtd.html5_dtd?
45
+ meta["charset"] = encoding
43
46
  else
44
- meta['http-equiv'] = 'Content-Type'
45
- meta['content'] = 'text/html; charset=%s' % encoding
47
+ meta["http-equiv"] = "Content-Type"
48
+ meta["content"] = format("text/html; charset=%s", encoding)
46
49
  end
47
50
 
48
- case
49
- when head = at('//head')
51
+ if (head = at_xpath("//head"))
50
52
  head.prepend_child(meta)
51
53
  else
52
54
  set_metadata_element(meta)
@@ -56,9 +58,9 @@ module Nokogiri
56
58
  end
57
59
 
58
60
  def meta_content_type
59
- xpath('//meta[@http-equiv and boolean(@content)]').find { |node|
60
- node['http-equiv'] =~ /\AContent-Type\z/i
61
- }
61
+ xpath("//meta[@http-equiv and boolean(@content)]").find do |node|
62
+ node["http-equiv"] =~ /\AContent-Type\z/i
63
+ end
62
64
  end
63
65
  private :meta_content_type
64
66
 
@@ -66,7 +68,7 @@ module Nokogiri
66
68
  # Get the title string of this document. Return nil if there is
67
69
  # no title tag.
68
70
  def title
69
- title = at('//title') and title.inner_text
71
+ (title = at_xpath("//title")) && title.inner_text
70
72
  end
71
73
 
72
74
  ###
@@ -82,52 +84,50 @@ module Nokogiri
82
84
  # content element (typically <body>) if any.
83
85
  def title=(text)
84
86
  tnode = XML::Text.new(text, self)
85
- if title = at('//title')
87
+ if (title = at_xpath("//title"))
86
88
  title.children = tnode
87
89
  return text
88
90
  end
89
91
 
90
- title = XML::Node.new('title', self) << tnode
91
- case
92
- when head = at('//head')
92
+ title = XML::Node.new("title", self) << tnode
93
+ if (head = at_xpath("//head"))
93
94
  head << title
94
- when meta = at('//meta[@charset]') || meta_content_type
95
+ elsif (meta = (at_xpath("//meta[@charset]") || meta_content_type))
95
96
  # better put after charset declaration
96
97
  meta.add_next_sibling(title)
97
98
  else
98
99
  set_metadata_element(title)
99
100
  end
100
- text
101
101
  end
102
102
 
103
- def set_metadata_element(element)
104
- case
105
- when head = at('//head')
103
+ def set_metadata_element(element) # rubocop:disable Naming/AccessorMethodName
104
+ if (head = at_xpath("//head"))
106
105
  head << element
107
- when html = at('//html')
108
- head = html.prepend_child(XML::Node.new('head', self))
106
+ elsif (html = at_xpath("//html"))
107
+ head = html.prepend_child(XML::Node.new("head", self))
109
108
  head.prepend_child(element)
110
- when first = children.find { |node|
111
- case node
112
- when XML::Element, XML::Text
113
- true
114
- end
115
- }
109
+ elsif (first = children.find do |node|
110
+ case node
111
+ when XML::Element, XML::Text
112
+ true
113
+ end
114
+ end)
116
115
  # We reach here only if the underlying document model
117
116
  # allows <html>/<head> elements to be omitted and does not
118
117
  # automatically supply them.
119
118
  first.add_previous_sibling(element)
120
119
  else
121
- html = add_child(XML::Node.new('html', self))
122
- head = html.add_child(XML::Node.new('head', self))
120
+ html = add_child(XML::Node.new("html", self))
121
+ head = html.add_child(XML::Node.new("head", self))
123
122
  head.prepend_child(element)
124
123
  end
125
124
  end
126
125
  private :set_metadata_element
127
126
 
128
127
  ####
129
- # Serialize Node using +options+. Save options can also be set using a
130
- # block. See SaveOptions.
128
+ # Serialize Node using +options+. Save options can also be set using a block.
129
+ #
130
+ # See also Nokogiri::XML::Node::SaveOptions and Node@Serialization+and+Generating+Output.
131
131
  #
132
132
  # These two statements are equivalent:
133
133
  #
@@ -139,15 +139,25 @@ module Nokogiri
139
139
  # config.format.as_xml
140
140
  # end
141
141
  #
142
- def serialize options = {}
142
+ def serialize(options = {})
143
143
  options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
144
144
  super
145
145
  end
146
146
 
147
147
  ####
148
148
  # Create a Nokogiri::XML::DocumentFragment from +tags+
149
- def fragment tags = nil
150
- DocumentFragment.new(self, tags, self.root)
149
+ def fragment(tags = nil)
150
+ DocumentFragment.new(self, tags, root)
151
+ end
152
+
153
+ # :call-seq:
154
+ # xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
155
+ #
156
+ # [Returns] The document type which determines CSS-to-XPath translation.
157
+ #
158
+ # See XPathVisitor for more information.
159
+ def xpath_doctype
160
+ Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4
151
161
  end
152
162
 
153
163
  class << self
@@ -159,12 +169,12 @@ module Nokogiri
159
169
  # is a number that sets options in the parser, such as
160
170
  # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
161
171
  # Nokogiri::XML::ParseOptions.
162
- def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
163
-
172
+ def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
164
173
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
165
- # Give the options to the user
166
174
  yield options if block_given?
167
175
 
176
+ url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
177
+
168
178
  if string_or_io.respond_to?(:encoding)
169
179
  unless string_or_io.encoding.name == "ASCII-8BIT"
170
180
  encoding ||= string_or_io.encoding.name
@@ -172,7 +182,12 @@ module Nokogiri
172
182
  end
173
183
 
174
184
  if string_or_io.respond_to?(:read)
175
- url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
185
+ if string_or_io.is_a?(Pathname)
186
+ # resolve the Pathname to the file and open it as an IO object, see #2110
187
+ string_or_io = string_or_io.expand_path.open
188
+ url ||= string_or_io.path
189
+ end
190
+
176
191
  unless encoding
177
192
  # Libxml2's parser has poor support for encoding
178
193
  # detection. First, it does not recognize the HTML5
@@ -196,7 +211,7 @@ module Nokogiri
196
211
  end
197
212
 
198
213
  # read_memory pukes on empty docs
199
- if string_or_io.nil? or string_or_io.empty?
214
+ if string_or_io.nil? || string_or_io.empty?
200
215
  return encoding ? new.tap { |i| i.encoding = encoding } : new
201
216
  end
202
217
 
@@ -206,37 +221,39 @@ module Nokogiri
206
221
  end
207
222
  end
208
223
 
209
- class EncodingFound < StandardError # :nodoc:
224
+ class EncodingFound < StandardError # :nodoc: all
210
225
  attr_reader :found_encoding
211
226
 
212
227
  def initialize(encoding)
213
228
  @found_encoding = encoding
214
- super("encoding found: %s" % encoding)
229
+ super(format("encoding found: %s", encoding))
215
230
  end
216
231
  end
217
232
 
218
- class EncodingReader # :nodoc:
219
- class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
233
+ # :nodoc: all
234
+ class EncodingReader
235
+ class SAXHandler < Nokogiri::XML::SAX::Document
220
236
  attr_reader :encoding
221
-
237
+
222
238
  def initialize
223
239
  @encoding = nil
224
240
  super()
225
241
  end
226
-
242
+
227
243
  def start_element(name, attrs = [])
228
- return unless name == 'meta'
244
+ return unless name == "meta"
245
+
229
246
  attr = Hash[attrs]
230
- charset = attr['charset'] and
231
- @encoding = charset
232
- http_equiv = attr['http-equiv'] and
233
- http_equiv.match(/\AContent-Type\z/i) and
234
- content = attr['content'] and
235
- m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
236
- @encoding = m[1]
247
+ (charset = attr["charset"]) &&
248
+ (@encoding = charset)
249
+ (http_equiv = attr["http-equiv"]) &&
250
+ http_equiv.match(/\AContent-Type\z/i) &&
251
+ (content = attr["content"]) &&
252
+ (m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
253
+ (@encoding = m[1])
237
254
  end
238
255
  end
239
-
256
+
240
257
  class JumpSAXHandler < SAXHandler
241
258
  def initialize(jumptag)
242
259
  @jumptag = jumptag
@@ -245,53 +262,34 @@ module Nokogiri
245
262
 
246
263
  def start_element(name, attrs = [])
247
264
  super
248
- throw @jumptag, @encoding if @encoding
249
- throw @jumptag, nil if name =~ /\A(?:div|h1|img|p|br)\z/
265
+ throw(@jumptag, @encoding) if @encoding
266
+ throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
250
267
  end
251
268
  end
252
269
 
253
270
  def self.detect_encoding(chunk)
254
- if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
255
- return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
256
- end
257
- m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
258
- return Nokogiri.XML(m[1]).encoding
271
+ (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
272
+ (return Nokogiri.XML(m[1]).encoding)
259
273
 
260
274
  if Nokogiri.jruby?
261
- m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
262
- return m[4]
263
- catch(:encoding_found) {
264
- Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
275
+ (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
276
+ (return m[4])
277
+ catch(:encoding_found) do
278
+ Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
265
279
  nil
266
- }
280
+ end
267
281
  else
268
282
  handler = SAXHandler.new
269
- parser = Nokogiri::HTML::SAX::PushParser.new(handler)
270
- parser << chunk rescue Nokogiri::SyntaxError
283
+ parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
284
+ begin
285
+ parser << chunk
286
+ rescue
287
+ Nokogiri::SyntaxError
288
+ end
271
289
  handler.encoding
272
290
  end
273
291
  end
274
292
 
275
- def self.is_jruby_without_fix?
276
- JRUBY_VERSION.split('.').join.to_i < 165
277
- end
278
-
279
- def self.detect_encoding_for_jruby_without_fix(chunk)
280
- m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
281
- return Nokogiri.XML(m[1]).encoding
282
-
283
- m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
284
- return m[4]
285
-
286
- catch(:encoding_found) {
287
- Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
288
- nil
289
- }
290
- rescue Nokogiri::SyntaxError, RuntimeError
291
- # Ignore parser errors that nokogiri may raise
292
- nil
293
- end
294
-
295
293
  def initialize(io)
296
294
  @io = io
297
295
  @firstchunk = nil
@@ -299,20 +297,20 @@ module Nokogiri
299
297
  end
300
298
 
301
299
  # This method is used by the C extension so that
302
- # Nokogiri::HTML::Document#read_io() does not leak memory when
300
+ # Nokogiri::HTML4::Document#read_io() does not leak memory when
303
301
  # EncodingFound is raised.
304
302
  attr_reader :encoding_found
305
303
 
306
304
  def read(len)
307
305
  # no support for a call without len
308
306
 
309
- if !@firstchunk
310
- @firstchunk = @io.read(len) or return nil
307
+ unless @firstchunk
308
+ (@firstchunk = @io.read(len)) || (return nil)
311
309
 
312
310
  # This implementation expects that the first call from
313
311
  # htmlReadIO() is made with a length long enough (~1KB) to
314
312
  # achieve advanced encoding detection.
315
- if encoding = EncodingReader.detect_encoding(@firstchunk)
313
+ if (encoding = EncodingReader.detect_encoding(@firstchunk))
316
314
  # The first chunk is stored for the next read in retry.
317
315
  raise @encoding_found = EncodingFound.new(encoding)
318
316
  end
@@ -321,7 +319,7 @@ module Nokogiri
321
319
 
322
320
  ret = @firstchunk.slice!(0, len)
323
321
  if (len -= ret.length) > 0
324
- rest = @io.read(len) and ret << rest
322
+ (rest = @io.read(len)) && ret << (rest)
325
323
  end
326
324
  if ret.empty?
327
325
  nil
data/lib/nokogiri/html4/document_fragment.rb ADDED
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML4
5
+ class DocumentFragment < Nokogiri::XML::DocumentFragment
6
+ ####
7
+ # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
8
+ def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
9
+ doc = HTML4::Document.new
10
+
11
+ encoding ||= if tags.respond_to?(:encoding)
12
+ encoding = tags.encoding
13
+ if encoding == ::Encoding::ASCII_8BIT
14
+ "UTF-8"
15
+ else
16
+ encoding.name
17
+ end
18
+ else
19
+ "UTF-8"
20
+ end
21
+
22
+ doc.encoding = encoding
23
+
24
+ new(doc, tags, nil, options, &block)
25
+ end
26
+
27
+ def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML)
28
+ return self unless tags
29
+
30
+ options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
31
+ yield options if block_given?
32
+
33
+ if ctx
34
+ preexisting_errors = document.errors.dup
35
+ node_set = ctx.parse("<div>#{tags}</div>", options)
36
+ node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
37
+ self.errors = document.errors - preexisting_errors
38
+ else
39
+ # This is a horrible hack, but I don't care
40
+ path = if /^\s*?<body/i.match?(tags)
41
+ "/html/body"
42
+ else
43
+ "/html/body/node()"
44
+ end
45
+
46
+ temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding, options)
47
+ temp_doc.xpath(path).each { |child| child.parent = self }
48
+ self.errors = temp_doc.errors
49
+ end
50
+ children
51
+ end
52
+ end
53
+ end
54
+ end
data/lib/nokogiri/{html → html4}/element_description.rb CHANGED
@@ -1,5 +1,7 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Nokogiri
2
- module HTML
4
+ module HTML4
3
5
  class ElementDescription
4
6
  ###
5
7
  # Is this element a block element?