nokogiri 1.10.9 → 1.18.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (230)
  1. checksums.yaml +4 -4
  2. data/Gemfile +38 -0
  3. data/LICENSE-DEPENDENCIES.md +1632 -1022
  4. data/LICENSE.md +1 -1
  5. data/README.md +190 -95
  6. data/bin/nokogiri +63 -50
  7. data/dependencies.yml +34 -66
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +909 -422
  10. data/ext/nokogiri/gumbo.c +610 -0
  11. data/ext/nokogiri/html4_document.c +171 -0
  12. data/ext/nokogiri/html4_element_description.c +299 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser.c +40 -0
  15. data/ext/nokogiri/html4_sax_parser_context.c +98 -0
  16. data/ext/nokogiri/html4_sax_push_parser.c +96 -0
  17. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  18. data/ext/nokogiri/nokogiri.c +258 -105
  19. data/ext/nokogiri/nokogiri.h +207 -90
  20. data/ext/nokogiri/test_global_handlers.c +40 -0
  21. data/ext/nokogiri/xml_attr.c +18 -18
  22. data/ext/nokogiri/xml_attribute_decl.c +22 -22
  23. data/ext/nokogiri/xml_cdata.c +33 -33
  24. data/ext/nokogiri/xml_comment.c +19 -31
  25. data/ext/nokogiri/xml_document.c +499 -323
  26. data/ext/nokogiri/xml_document_fragment.c +17 -36
  27. data/ext/nokogiri/xml_dtd.c +65 -59
  28. data/ext/nokogiri/xml_element_content.c +63 -55
  29. data/ext/nokogiri/xml_element_decl.c +31 -31
  30. data/ext/nokogiri/xml_encoding_handler.c +54 -21
  31. data/ext/nokogiri/xml_entity_decl.c +37 -35
  32. data/ext/nokogiri/xml_entity_reference.c +17 -19
  33. data/ext/nokogiri/xml_namespace.c +131 -61
  34. data/ext/nokogiri/xml_node.c +1429 -723
  35. data/ext/nokogiri/xml_node_set.c +257 -225
  36. data/ext/nokogiri/xml_processing_instruction.c +18 -20
  37. data/ext/nokogiri/xml_reader.c +340 -231
  38. data/ext/nokogiri/xml_relax_ng.c +87 -99
  39. data/ext/nokogiri/xml_sax_parser.c +269 -176
  40. data/ext/nokogiri/xml_sax_parser_context.c +286 -152
  41. data/ext/nokogiri/xml_sax_push_parser.c +111 -64
  42. data/ext/nokogiri/xml_schema.c +132 -140
  43. data/ext/nokogiri/xml_syntax_error.c +52 -23
  44. data/ext/nokogiri/xml_text.c +37 -30
  45. data/ext/nokogiri/xml_xpath_context.c +373 -185
  46. data/ext/nokogiri/xslt_stylesheet.c +342 -191
  47. data/gumbo-parser/CHANGES.md +63 -0
  48. data/gumbo-parser/Makefile +129 -0
  49. data/gumbo-parser/THANKS +27 -0
  50. data/gumbo-parser/src/Makefile +34 -0
  51. data/gumbo-parser/src/README.md +41 -0
  52. data/gumbo-parser/src/ascii.c +75 -0
  53. data/gumbo-parser/src/ascii.h +115 -0
  54. data/gumbo-parser/src/attribute.c +42 -0
  55. data/gumbo-parser/src/attribute.h +17 -0
  56. data/gumbo-parser/src/char_ref.c +22225 -0
  57. data/gumbo-parser/src/char_ref.h +29 -0
  58. data/gumbo-parser/src/char_ref.rl +2154 -0
  59. data/gumbo-parser/src/error.c +658 -0
  60. data/gumbo-parser/src/error.h +152 -0
  61. data/gumbo-parser/src/foreign_attrs.c +103 -0
  62. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/nokogiri_gumbo.h +953 -0
  66. data/gumbo-parser/src/parser.c +4932 -0
  67. data/gumbo-parser/src/parser.h +41 -0
  68. data/gumbo-parser/src/replacement.h +33 -0
  69. data/gumbo-parser/src/string_buffer.c +103 -0
  70. data/gumbo-parser/src/string_buffer.h +68 -0
  71. data/gumbo-parser/src/string_piece.c +48 -0
  72. data/gumbo-parser/src/svg_attrs.c +174 -0
  73. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  74. data/gumbo-parser/src/svg_tags.c +137 -0
  75. data/gumbo-parser/src/svg_tags.gperf +55 -0
  76. data/gumbo-parser/src/tag.c +223 -0
  77. data/gumbo-parser/src/tag_lookup.c +382 -0
  78. data/gumbo-parser/src/tag_lookup.gperf +170 -0
  79. data/gumbo-parser/src/tag_lookup.h +13 -0
  80. data/gumbo-parser/src/token_buffer.c +79 -0
  81. data/gumbo-parser/src/token_buffer.h +71 -0
  82. data/gumbo-parser/src/token_type.h +17 -0
  83. data/gumbo-parser/src/tokenizer.c +3464 -0
  84. data/gumbo-parser/src/tokenizer.h +112 -0
  85. data/gumbo-parser/src/tokenizer_states.h +339 -0
  86. data/gumbo-parser/src/utf8.c +245 -0
  87. data/gumbo-parser/src/utf8.h +164 -0
  88. data/gumbo-parser/src/util.c +66 -0
  89. data/gumbo-parser/src/util.h +34 -0
  90. data/gumbo-parser/src/vector.c +111 -0
  91. data/gumbo-parser/src/vector.h +45 -0
  92. data/lib/nokogiri/class_resolver.rb +67 -0
  93. data/lib/nokogiri/css/node.rb +14 -8
  94. data/lib/nokogiri/css/parser.rb +399 -377
  95. data/lib/nokogiri/css/parser.y +250 -245
  96. data/lib/nokogiri/css/parser_extras.rb +16 -71
  97. data/lib/nokogiri/css/selector_cache.rb +38 -0
  98. data/lib/nokogiri/css/syntax_error.rb +3 -1
  99. data/lib/nokogiri/css/tokenizer.rb +7 -5
  100. data/lib/nokogiri/css/tokenizer.rex +11 -9
  101. data/lib/nokogiri/css/xpath_visitor.rb +242 -96
  102. data/lib/nokogiri/css.rb +122 -17
  103. data/lib/nokogiri/decorators/slop.rb +11 -11
  104. data/lib/nokogiri/encoding_handler.rb +57 -0
  105. data/lib/nokogiri/extension.rb +32 -0
  106. data/lib/nokogiri/gumbo.rb +15 -0
  107. data/lib/nokogiri/html.rb +38 -27
  108. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  109. data/lib/nokogiri/html4/document.rb +235 -0
  110. data/lib/nokogiri/html4/document_fragment.rb +166 -0
  111. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  112. data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
  113. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  114. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  115. data/lib/nokogiri/html4/sax/parser.rb +48 -0
  116. data/lib/nokogiri/html4/sax/parser_context.rb +15 -0
  117. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
  118. data/lib/nokogiri/html4.rb +42 -0
  119. data/lib/nokogiri/html5/builder.rb +40 -0
  120. data/lib/nokogiri/html5/document.rb +199 -0
  121. data/lib/nokogiri/html5/document_fragment.rb +200 -0
  122. data/lib/nokogiri/html5/node.rb +103 -0
  123. data/lib/nokogiri/html5.rb +368 -0
  124. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  125. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  126. data/lib/nokogiri/syntax_error.rb +2 -0
  127. data/lib/nokogiri/version/constant.rb +6 -0
  128. data/lib/nokogiri/version/info.rb +224 -0
  129. data/lib/nokogiri/version.rb +3 -108
  130. data/lib/nokogiri/xml/attr.rb +55 -3
  131. data/lib/nokogiri/xml/attribute_decl.rb +6 -2
  132. data/lib/nokogiri/xml/builder.rb +83 -35
  133. data/lib/nokogiri/xml/cdata.rb +3 -1
  134. data/lib/nokogiri/xml/character_data.rb +2 -0
  135. data/lib/nokogiri/xml/document.rb +359 -130
  136. data/lib/nokogiri/xml/document_fragment.rb +170 -54
  137. data/lib/nokogiri/xml/dtd.rb +4 -2
  138. data/lib/nokogiri/xml/element_content.rb +12 -2
  139. data/lib/nokogiri/xml/element_decl.rb +6 -2
  140. data/lib/nokogiri/xml/entity_decl.rb +7 -3
  141. data/lib/nokogiri/xml/entity_reference.rb +2 -0
  142. data/lib/nokogiri/xml/namespace.rb +44 -0
  143. data/lib/nokogiri/xml/node/save_options.rb +23 -8
  144. data/lib/nokogiri/xml/node.rb +1168 -420
  145. data/lib/nokogiri/xml/node_set.rb +145 -67
  146. data/lib/nokogiri/xml/notation.rb +13 -0
  147. data/lib/nokogiri/xml/parse_options.rb +145 -52
  148. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  149. data/lib/nokogiri/xml/pp/node.rb +47 -30
  150. data/lib/nokogiri/xml/pp.rb +4 -2
  151. data/lib/nokogiri/xml/processing_instruction.rb +4 -1
  152. data/lib/nokogiri/xml/reader.rb +68 -41
  153. data/lib/nokogiri/xml/relax_ng.rb +60 -17
  154. data/lib/nokogiri/xml/sax/document.rb +198 -111
  155. data/lib/nokogiri/xml/sax/parser.rb +144 -67
  156. data/lib/nokogiri/xml/sax/parser_context.rb +119 -6
  157. data/lib/nokogiri/xml/sax/push_parser.rb +9 -5
  158. data/lib/nokogiri/xml/sax.rb +54 -4
  159. data/lib/nokogiri/xml/schema.rb +116 -39
  160. data/lib/nokogiri/xml/searchable.rb +139 -95
  161. data/lib/nokogiri/xml/syntax_error.rb +29 -5
  162. data/lib/nokogiri/xml/text.rb +2 -0
  163. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  164. data/lib/nokogiri/xml/xpath.rb +15 -4
  165. data/lib/nokogiri/xml/xpath_context.rb +15 -4
  166. data/lib/nokogiri/xml.rb +45 -55
  167. data/lib/nokogiri/xslt/stylesheet.rb +32 -8
  168. data/lib/nokogiri/xslt.rb +103 -30
  169. data/lib/nokogiri.rb +59 -75
  170. data/lib/xsd/xmlparser/nokogiri.rb +32 -29
  171. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  172. data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
  173. data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
  174. data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
  175. data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
  176. data/ports/archives/libxml2-2.13.6.tar.xz +0 -0
  177. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  178. metadata +123 -295
  179. data/ext/nokogiri/html_document.c +0 -170
  180. data/ext/nokogiri/html_document.h +0 -10
  181. data/ext/nokogiri/html_element_description.c +0 -279
  182. data/ext/nokogiri/html_element_description.h +0 -10
  183. data/ext/nokogiri/html_entity_lookup.c +0 -32
  184. data/ext/nokogiri/html_entity_lookup.h +0 -8
  185. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  186. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  187. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  188. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  189. data/ext/nokogiri/xml_attr.h +0 -9
  190. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  191. data/ext/nokogiri/xml_cdata.h +0 -9
  192. data/ext/nokogiri/xml_comment.h +0 -9
  193. data/ext/nokogiri/xml_document.h +0 -23
  194. data/ext/nokogiri/xml_document_fragment.h +0 -10
  195. data/ext/nokogiri/xml_dtd.h +0 -10
  196. data/ext/nokogiri/xml_element_content.h +0 -10
  197. data/ext/nokogiri/xml_element_decl.h +0 -9
  198. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  199. data/ext/nokogiri/xml_entity_decl.h +0 -10
  200. data/ext/nokogiri/xml_entity_reference.h +0 -9
  201. data/ext/nokogiri/xml_io.c +0 -61
  202. data/ext/nokogiri/xml_io.h +0 -11
  203. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  204. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  205. data/ext/nokogiri/xml_namespace.h +0 -14
  206. data/ext/nokogiri/xml_node.h +0 -13
  207. data/ext/nokogiri/xml_node_set.h +0 -12
  208. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  209. data/ext/nokogiri/xml_reader.h +0 -10
  210. data/ext/nokogiri/xml_relax_ng.h +0 -9
  211. data/ext/nokogiri/xml_sax_parser.h +0 -39
  212. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  213. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  214. data/ext/nokogiri/xml_schema.h +0 -9
  215. data/ext/nokogiri/xml_syntax_error.h +0 -13
  216. data/ext/nokogiri/xml_text.h +0 -9
  217. data/ext/nokogiri/xml_xpath_context.h +0 -10
  218. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  219. data/lib/nokogiri/html/document.rb +0 -335
  220. data/lib/nokogiri/html/document_fragment.rb +0 -49
  221. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  222. data/lib/nokogiri/html/sax/parser.rb +0 -62
  223. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  224. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
  225. data/patches/libxml2/0004-libxml2.la-is-in-top_builddir.patch +0 -25
  226. data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch +0 -32
  227. data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
  228. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
  229. data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
  230. data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
data/lib/nokogiri/css.rb CHANGED
@@ -1,27 +1,132 @@
1
- require 'nokogiri/css/node'
2
- require 'nokogiri/css/xpath_visitor'
3
- x = $-w
4
- $-w = false
5
- require 'nokogiri/css/parser'
6
- $-w = x
7
-
8
- require 'nokogiri/css/tokenizer'
9
- require 'nokogiri/css/syntax_error'
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
10
3
 
11
4
  module Nokogiri
5
+ # Translate a CSS selector into an XPath 1.0 query
12
6
  module CSS
13
7
  class << self
14
- ###
15
- # Parse this CSS selector in +selector+. Returns an AST.
16
- def parse selector
17
- Parser.new.parse selector
8
+ # TODO: Deprecate this method ahead of 2.0 and delete it in 2.0.
9
+ # It is not used by Nokogiri and shouldn't be part of the public API.
10
+ def parse(selector) # :nodoc:
11
+ warn("Nokogiri::CSS.parse is deprecated and will be removed in a future version of Nokogiri. Use Nokogiri::CSS::Parser#parse instead.", uplevel: 1, category: :deprecated)
12
+ Parser.new.parse(selector)
18
13
  end
19
14
 
20
- ###
21
- # Get the XPath for +selector+.
22
- def xpath_for selector, options={}
23
- Parser.new(options[:ns] || {}).xpath_for selector, options
15
+ # :call-seq:
16
+ # xpath_for(selector_list) → Array<String>
17
+ # xpath_for(selector_list [, prefix:] [, ns:] [, visitor:] [, cache:]) → Array<String>
18
+ #
19
+ # Translate a CSS selector list to the equivalent XPath expressions.
20
+ #
21
+ # 💡 Note that translated queries are cached by default for performance concerns.
22
+ #
23
+ # ⚠ Users should prefer Nokogiri::XML::Searchable#css, which is mixed into all document and
24
+ # node classes, for querying documents with CSS selectors. This method is the underlying
25
+ # mechanism used by XML::Searchable and is provided solely for advanced users to translate
26
+ # \CSS selectors to XPath directly.
27
+ #
28
+ # Also see Nokogiri::XML::Searchable#css for documentation on supported CSS selector features,
29
+ # some extended syntax that Nokogiri supports, and advanced CSS features like pseudo-class
30
+ # functions.
31
+ #
32
+ # [Parameters]
33
+ # - +selector_list+ (String)
34
+ #
35
+ # The CSS selector to be translated into XPath. This is always a String, but that string
36
+ # value may be a {selector list}[https://www.w3.org/TR/selectors-4/#grouping] (see
37
+ # examples).
38
+ #
39
+ # [Keyword arguments]
40
+ # - +prefix:+ (String)
41
+ #
42
+ # The XPath expression prefix which determines the search context. See Nokogiri::XML::XPath
43
+ # for standard options. Default is +XPath::GLOBAL_SEARCH_PREFIX+.
44
+ #
45
+ # - +ns:+ (Hash<String ⇒ String>, nil)
46
+ #
47
+ # Namespaces that are referenced in the query, if any. This is a hash where the keys are the
48
+ # namespace prefix and the values are the namespace URIs. Default is +nil+ indicating an
49
+ # empty set of namespaces.
50
+ #
51
+ # - +visitor:+ (Nokogiri::CSS::XPathVisitor)
52
+ #
53
+ # Use this XPathVisitor object to transform the CSS AST into XPath expressions. See
54
+ # Nokogiri::CSS::XPathVisitor for more information on some of the complex behavior that can
55
+ # be customized for your document type. Default is +Nokogiri::CSS::XPathVisitor.new+.
56
+ #
57
+ # ⚠ Note that this option is mutually exclusive with +prefix+ and +ns+. If +visitor+ is
58
+ # provided, +prefix+ and +ns+ must not be present.
59
+ #
60
+ # - +cache:+ (Boolean)
61
+ #
62
+ # Whether to use the SelectorCache for the translated query to ensure that repeated queries
63
+ # don't incur the overhead of re-parsing the selector. Default is +true+.
64
+ #
65
+ # [Returns] (Array<String>) The equivalent set of XPath expressions for +selector_list+
66
+ #
67
+ # *Example* with a simple selector:
68
+ #
69
+ # Nokogiri::CSS.xpath_for("div") # => ["//div"]
70
+ #
71
+ # *Example* with a compound selector:
72
+ #
73
+ # Nokogiri::CSS.xpath_for("div.xl") # => ["//div[contains(concat(' ',normalize-space(@class),' '),' xl ')]"]
74
+ #
75
+ # *Example* with a complex selector:
76
+ #
77
+ # Nokogiri::CSS.xpath_for("h1 + div") # => ["//h1/following-sibling::*[1]/self::div"]
78
+ #
79
+ # *Example* with a selector list:
80
+ #
81
+ # Nokogiri::CSS.xpath_for("h1, h2, h3") # => ["//h1", "//h2", "//h3"]
82
+ #
83
+ def xpath_for(
84
+ selector, options = nil,
85
+ prefix: options&.delete(:prefix),
86
+ visitor: options&.delete(:visitor),
87
+ ns: options&.delete(:ns),
88
+ cache: true
89
+ )
90
+ unless options.nil?
91
+ warn("Nokogiri::CSS.xpath_for: Passing options as an explicit hash is deprecated. Use keyword arguments instead. This will become an error in a future release.", uplevel: 1, category: :deprecated)
92
+ end
93
+
94
+ raise(TypeError, "no implicit conversion of #{selector.inspect} to String") unless selector.respond_to?(:to_str)
95
+
96
+ selector = selector.to_str
97
+ raise(Nokogiri::CSS::SyntaxError, "empty CSS selector") if selector.empty?
98
+
99
+ if visitor
100
+ raise ArgumentError, "cannot provide both :prefix and :visitor" if prefix
101
+ raise ArgumentError, "cannot provide both :ns and :visitor" if ns
102
+ end
103
+
104
+ visitor ||= begin
105
+ visitor_kw = {}
106
+ visitor_kw[:prefix] = prefix if prefix
107
+ visitor_kw[:namespaces] = ns if ns
108
+
109
+ Nokogiri::CSS::XPathVisitor.new(**visitor_kw)
110
+ end
111
+
112
+ if cache
113
+ key = SelectorCache.key(selector: selector, visitor: visitor)
114
+ SelectorCache[key] ||= Parser.new.xpath_for(selector, visitor)
115
+ else
116
+ Parser.new.xpath_for(selector, visitor)
117
+ end
24
118
  end
25
119
  end
26
120
  end
27
121
  end
122
+
123
+ require_relative "css/selector_cache"
124
+ require_relative "css/node"
125
+ require_relative "css/xpath_visitor"
126
+ x = $-w
127
+ $-w = false
128
+ require_relative "css/parser"
129
+ $-w = x
130
+
131
+ require_relative "css/tokenizer"
132
+ require_relative "css/syntax_error"
data/lib/nokogiri/decorators/slop.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Nokogiri
2
4
  module Decorators
3
5
  ###
@@ -9,31 +11,29 @@ module Nokogiri
9
11
 
10
12
  ###
11
13
  # look for node with +name+. See Nokogiri.Slop
12
- def method_missing name, *args, &block
14
+ def method_missing(name, *args, &block)
13
15
  if args.empty?
14
- list = xpath("#{XPATH_PREFIX}#{name.to_s.sub(/^_/, '')}")
15
- elsif args.first.is_a? Hash
16
+ list = xpath("#{XPATH_PREFIX}#{name.to_s.sub(/^_/, "")}")
17
+ elsif args.first.is_a?(Hash)
16
18
  hash = args.first
17
19
  if hash[:css]
18
20
  list = css("#{name}#{hash[:css]}")
19
21
  elsif hash[:xpath]
20
- conds = Array(hash[:xpath]).join(' and ')
22
+ conds = Array(hash[:xpath]).join(" and ")
21
23
  list = xpath("#{XPATH_PREFIX}#{name}[#{conds}]")
22
24
  end
23
25
  else
24
- CSS::Parser.without_cache do
25
- list = xpath(
26
- *CSS.xpath_for("#{name}#{args.first}", :prefix => XPATH_PREFIX)
27
- )
28
- end
26
+ list = xpath(
27
+ *CSS.xpath_for("#{name}#{args.first}", prefix: XPATH_PREFIX, cache: false),
28
+ )
29
29
  end
30
30
 
31
31
  super if list.empty?
32
32
  list.length == 1 ? list.first : list
33
33
  end
34
34
 
35
- def respond_to_missing? name, include_private = false
36
- list = xpath("#{XPATH_PREFIX}#{name.to_s.sub(/^_/, '')}")
35
+ def respond_to_missing?(name, include_private = false)
36
+ list = xpath("#{XPATH_PREFIX}#{name.to_s.sub(/^_/, "")}")
37
37
 
38
38
  !list.empty?
39
39
  end
data/lib/nokogiri/encoding_handler.rb ADDED
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module Nokogiri
5
+ class EncodingHandler
6
+ # Popular encoding aliases not known by all iconv implementations that Nokogiri should support.
7
+ USEFUL_ALIASES = {
8
+ # alias_name => true_name
9
+ "ISO-2022-JP" => "ISO-2022-JP", # only for JRuby tests, this is a no-op in CRuby
10
+ "NOKOGIRI-SENTINEL" => "ISO-2022-JP", # indicating that Nokogiri has installed aliases
11
+ "Windows-31J" => "CP932", # Windows-31J is the IANA registered name of CP932.
12
+ }
13
+
14
+ class << self
15
+ def install_default_aliases
16
+ USEFUL_ALIASES.each do |alias_name, name|
17
+ EncodingHandler.alias(name, alias_name) if EncodingHandler[alias_name].nil?
18
+ end
19
+ end
20
+ end
21
+
22
+ # :stopdoc:
23
+ if Nokogiri.jruby?
24
+ class << self
25
+ def [](name)
26
+ storage.key?(name) ? new(storage[name]) : nil
27
+ end
28
+
29
+ def alias(name, alias_name)
30
+ storage[alias_name] = name
31
+ end
32
+
33
+ def delete(name)
34
+ storage.delete(name)
35
+ end
36
+
37
+ def clear_aliases!
38
+ storage.clear
39
+ end
40
+
41
+ private
42
+
43
+ def storage
44
+ @storage ||= {}
45
+ end
46
+ end
47
+
48
+ def initialize(name)
49
+ @name = name
50
+ end
51
+
52
+ attr_reader :name
53
+ end
54
+ end
55
+ end
56
+
57
+ Nokogiri::EncodingHandler.install_default_aliases
data/lib/nokogiri/extension.rb ADDED
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ # load the C or Java extension
4
+ begin
5
+ # native precompiled gems package shared libraries in <gem_dir>/lib/nokogiri/<ruby_version>
6
+ RUBY_VERSION =~ /(\d+\.\d+)/
7
+ require_relative "#{Regexp.last_match(1)}/nokogiri"
8
+ rescue LoadError => e
9
+ if e.message.include?("GLIBC")
10
+ warn(<<~EOM)
11
+
12
+ ERROR: It looks like you're trying to use Nokogiri as a precompiled native gem on a system
13
+ with an unsupported version of glibc.
14
+
15
+ #{e.message}
16
+
17
+ If that's the case, then please install Nokogiri via the `ruby` platform gem:
18
+ gem install nokogiri --platform=ruby
19
+ or:
20
+ bundle config set force_ruby_platform true
21
+
22
+ Please visit https://nokogiri.org/tutorials/installing_nokogiri.html for more help.
23
+
24
+ EOM
25
+ raise e
26
+ end
27
+
28
+ # use "require" instead of "require_relative" because non-native gems will place C extension files
29
+ # in Gem::BasicSpecification#extension_dir after compilation (during normal installation), which
30
+ # is in $LOAD_PATH but not necessarily relative to this file (see #2300)
31
+ require "nokogiri/nokogiri"
32
+ end
data/lib/nokogiri/gumbo.rb ADDED
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module Gumbo
5
+ # The default maximum number of attributes per element.
6
+ DEFAULT_MAX_ATTRIBUTES = 400
7
+
8
+ # The default maximum number of errors for parsing a document or a fragment.
9
+ DEFAULT_MAX_ERRORS = 0
10
+
11
+ # The default maximum depth of the DOM tree produced by parsing a document
12
+ # or fragment.
13
+ DEFAULT_MAX_TREE_DEPTH = 400
14
+ end
15
+ end
data/lib/nokogiri/html.rb CHANGED
@@ -1,37 +1,48 @@
1
- require 'nokogiri/html/entity_lookup'
2
- require 'nokogiri/html/document'
3
- require 'nokogiri/html/document_fragment'
4
- require 'nokogiri/html/sax/parser_context'
5
- require 'nokogiri/html/sax/parser'
6
- require 'nokogiri/html/sax/push_parser'
7
- require 'nokogiri/html/element_description'
8
- require 'nokogiri/html/element_description_defaults'
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require_relative "html4"
9
5
 
10
6
  module Nokogiri
11
- class << self
12
- ###
13
- # Parse HTML. Convenience method for Nokogiri::HTML::Document.parse
14
- def HTML thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block
15
- Nokogiri::HTML::Document.parse(thing, url, encoding, options, &block)
16
- end
17
- end
7
+ # Alias for Nokogiri::HTML4
8
+ HTML = Nokogiri::HTML4
9
+
10
+ # :singleton-method: HTML
11
+ # :call-seq: HTML(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) → Nokogiri::HTML4::Document
12
+ #
13
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
18
14
 
15
+ # :nodoc:
16
+ define_singleton_method(:HTML, Nokogiri.method(:HTML4))
17
+
18
+ # 💡 This module/namespace is an alias for Nokogiri::HTML4 as of v1.12.0. Before v1.12.0,
19
+ # Nokogiri::HTML4 did not exist, and this was the module/namespace for all HTML-related
20
+ # classes.
19
21
  module HTML
20
- class << self
21
- ###
22
- # Parse HTML. Convenience method for Nokogiri::HTML::Document.parse
23
- def parse thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block
24
- Document.parse(thing, url, encoding, options, &block)
22
+ # 💡 This class is an alias for Nokogiri::HTML4::Document as of v1.12.0.
23
+ class Document < Nokogiri::XML::Document
24
+ end
25
+
26
+ # 💡 This class is an alias for Nokogiri::HTML4::DocumentFragment as of v1.12.0.
27
+ class DocumentFragment < Nokogiri::XML::DocumentFragment
28
+ end
29
+
30
+ # 💡 This class is an alias for Nokogiri::HTML4::Builder as of v1.12.0.
31
+ class Builder < Nokogiri::XML::Builder
32
+ end
33
+
34
+ module SAX
35
+ # 💡 This class is an alias for Nokogiri::HTML4::SAX::Parser as of v1.12.0.
36
+ class Parser < Nokogiri::XML::SAX::Parser
25
37
  end
26
38
 
27
- ####
28
- # Parse a fragment from +string+ in to a NodeSet.
29
- def fragment string, encoding = nil
30
- HTML::DocumentFragment.parse string, encoding
39
+ # 💡 This class is an alias for Nokogiri::HTML4::SAX::ParserContext as of v1.12.0.
40
+ class ParserContext < Nokogiri::XML::SAX::ParserContext
31
41
  end
32
- end
33
42
 
34
- # Instance of Nokogiri::HTML::EntityLookup
35
- NamedCharacters = EntityLookup.new
43
+ # 💡 This class is an alias for Nokogiri::HTML4::SAX::PushParser as of v1.12.0.
44
+ class PushParser
45
+ end
46
+ end
36
47
  end
37
48
  end
data/lib/nokogiri/{html → html4}/builder.rb CHANGED
@@ -1,5 +1,7 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Nokogiri
2
- module HTML
4
+ module HTML4
3
5
  ###
4
6
  # Nokogiri HTML builder is used for building HTML documents. It is very
5
7
  # similar to the Nokogiri::XML::Builder. In fact, you should go read the
@@ -11,7 +13,7 @@ module Nokogiri
11
13
  # Create an HTML document with a body that has an onload attribute, and a
12
14
  # span tag with a class of "bold" that has content of "Hello world".
13
15
  #
14
- # builder = Nokogiri::HTML::Builder.new do |doc|
16
+ # builder = Nokogiri::HTML4::Builder.new do |doc|
15
17
  # doc.html {
16
18
  # doc.body(:onload => 'some_func();') {
17
19
  # doc.span.bold {
data/lib/nokogiri/html4/document.rb ADDED
@@ -0,0 +1,235 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require "pathname"
5
+
6
+ module Nokogiri
7
+ module HTML4
8
+ class Document < Nokogiri::XML::Document
9
+ ###
10
+ # Get the meta tag encoding for this document. If there is no meta tag,
11
+ # then nil is returned.
12
+ def meta_encoding
13
+ if (meta = at_xpath("//meta[@charset]"))
14
+ meta[:charset]
15
+ elsif (meta = meta_content_type)
16
+ meta["content"][/charset\s*=\s*([\w-]+)/i, 1]
17
+ end
18
+ end
19
+
20
+ ###
21
+ # Set the meta tag encoding for this document.
22
+ #
23
+ # If a meta encoding tag is already present, its content is
24
+ # replaced with the given text.
25
+ #
26
+ # Otherwise, this method tries to create one at an appropriate
27
+ # place supplying head and/or html elements as necessary, which
28
+ # is inside a head element if any, and before any text node or
29
+ # content element (typically <body>) if any.
30
+ #
31
+ # The result when trying to set an encoding that is different
32
+ # from the document encoding is undefined.
33
+ #
34
+ # Beware in CRuby, that libxml2 automatically inserts a meta tag
35
+ # into a head element.
36
+ def meta_encoding=(encoding)
37
+ if (meta = meta_content_type)
38
+ meta["content"] = format("text/html; charset=%s", encoding)
39
+ encoding
40
+ elsif (meta = at_xpath("//meta[@charset]"))
41
+ meta["charset"] = encoding
42
+ else
43
+ meta = XML::Node.new("meta", self)
44
+ if (dtd = internal_subset) && dtd.html5_dtd?
45
+ meta["charset"] = encoding
46
+ else
47
+ meta["http-equiv"] = "Content-Type"
48
+ meta["content"] = format("text/html; charset=%s", encoding)
49
+ end
50
+
51
+ if (head = at_xpath("//head"))
52
+ head.prepend_child(meta)
53
+ else
54
+ set_metadata_element(meta)
55
+ end
56
+ encoding
57
+ end
58
+ end
59
+
60
+ def meta_content_type
61
+ xpath("//meta[@http-equiv and boolean(@content)]").find do |node|
62
+ node["http-equiv"] =~ /\AContent-Type\z/i
63
+ end
64
+ end
65
+ private :meta_content_type
66
+
67
+ ###
68
+ # Get the title string of this document. Return nil if there is
69
+ # no title tag.
70
+ def title
71
+ (title = at_xpath("//title")) && title.inner_text
72
+ end
73
+
74
+ ###
75
+ # Set the title string of this document.
76
+ #
77
+ # If a title element is already present, its content is replaced
78
+ # with the given text.
79
+ #
80
+ # Otherwise, this method tries to create one at an appropriate
81
+ # place supplying head and/or html elements as necessary, which
82
+ # is inside a head element if any, right after a meta
83
+ # encoding/charset tag if any, and before any text node or
84
+ # content element (typically <body>) if any.
85
+ def title=(text)
86
+ tnode = XML::Text.new(text, self)
87
+ if (title = at_xpath("//title"))
88
+ title.children = tnode
89
+ return text
90
+ end
91
+
92
+ title = XML::Node.new("title", self) << tnode
93
+ if (head = at_xpath("//head"))
94
+ head << title
95
+ elsif (meta = at_xpath("//meta[@charset]") || meta_content_type)
96
+ # better put after charset declaration
97
+ meta.add_next_sibling(title)
98
+ else
99
+ set_metadata_element(title)
100
+ end
101
+ end
102
+
103
+ def set_metadata_element(element) # rubocop:disable Naming/AccessorMethodName
104
+ if (head = at_xpath("//head"))
105
+ head << element
106
+ elsif (html = at_xpath("//html"))
107
+ head = html.prepend_child(XML::Node.new("head", self))
108
+ head.prepend_child(element)
109
+ elsif (first = children.find do |node|
110
+ case node
111
+ when XML::Element, XML::Text
112
+ true
113
+ end
114
+ end)
115
+ # We reach here only if the underlying document model
116
+ # allows <html>/<head> elements to be omitted and does not
117
+ # automatically supply them.
118
+ first.add_previous_sibling(element)
119
+ else
120
+ html = add_child(XML::Node.new("html", self))
121
+ head = html.add_child(XML::Node.new("head", self))
122
+ head.prepend_child(element)
123
+ end
124
+ end
125
+ private :set_metadata_element
126
+
127
+ ####
128
+ # Serialize Node using +options+. Save options can also be set using a block.
129
+ #
130
+ # See also Nokogiri::XML::Node::SaveOptions and Node@Serialization+and+Generating+Output.
131
+ #
132
+ # These two statements are equivalent:
133
+ #
134
+ # node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
135
+ #
136
+ # or
137
+ #
138
+ # node.serialize(:encoding => 'UTF-8') do |config|
139
+ # config.format.as_xml
140
+ # end
141
+ #
142
+ def serialize(options = {})
143
+ options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
144
+ super
145
+ end
146
+
147
+ ####
148
+ # Create a Nokogiri::XML::DocumentFragment from +tags+
149
+ def fragment(tags = nil)
150
+ DocumentFragment.new(self, tags, root)
151
+ end
152
+
153
+ # :call-seq:
154
+ # xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
155
+ #
156
+ # [Returns] The document type which determines CSS-to-XPath translation.
157
+ #
158
+ # See XPathVisitor for more information.
159
+ def xpath_doctype
160
+ Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4
161
+ end
162
+
163
+ class << self
164
+ # :call-seq:
165
+ # parse(input) { |options| ... } => Nokogiri::HTML4::Document
166
+ # parse(input, url:, encoding:, options:) => Nokogiri::HTML4::Document
167
+ #
168
+ # Parse \HTML4 input from a String or IO object, and return a new HTML4::Document.
169
+ #
170
+ # [Required Parameters]
171
+ # - +input+ (String | IO) The content to be parsed.
172
+ #
173
+ # [Optional Keyword Arguments]
174
+ # - +url:+ (String) The base URI for this document.
175
+ #
176
+ # - +encoding:+ (String) The name of the encoding that should be used when processing the
177
+ # document. When not provided, the encoding will be determined based on the document
178
+ # content.
179
+ #
180
+ # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
181
+ # behaviors during parsing. See ParseOptions for more information. The default value is
182
+ # +ParseOptions::DEFAULT_HTML+.
183
+ #
184
+ # [Yields]
185
+ # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
186
+ # can be configured before parsing. See Nokogiri::XML::ParseOptions for more information.
187
+ #
188
+ # [Returns] Nokogiri::HTML4::Document
189
+ def parse(
190
+ input,
191
+ url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
192
+ url: url_, encoding: encoding_, options: options_
193
+ )
194
+ options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
195
+ yield options if block_given?
196
+
197
+ url ||= input.respond_to?(:path) ? input.path : nil
198
+
199
+ if input.respond_to?(:encoding)
200
+ unless input.encoding == Encoding::ASCII_8BIT
201
+ encoding ||= input.encoding.name
202
+ end
203
+ end
204
+
205
+ if input.respond_to?(:read)
206
+ if input.is_a?(Pathname)
207
+ # resolve the Pathname to the file and open it as an IO object, see #2110
208
+ input = input.expand_path.open
209
+ url ||= input.path
210
+ end
211
+
212
+ unless encoding
213
+ input = EncodingReader.new(input)
214
+ begin
215
+ return read_io(input, url, encoding, options.to_i)
216
+ rescue EncodingReader::EncodingFound => e
217
+ encoding = e.found_encoding
218
+ end
219
+ end
220
+ return read_io(input, url, encoding, options.to_i)
221
+ end
222
+
223
+ # read_memory pukes on empty docs
224
+ if input.nil? || input.empty?
225
+ return encoding ? new.tap { |i| i.encoding = encoding } : new
226
+ end
227
+
228
+ encoding ||= EncodingReader.detect_encoding(input)
229
+
230
+ read_memory(input, url, encoding, options.to_i)
231
+ end
232
+ end
233
+ end
234
+ end
235
+ end