nokogiri 1.10.9 → 1.18.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (230)
  1. checksums.yaml +4 -4
  2. data/Gemfile +38 -0
  3. data/LICENSE-DEPENDENCIES.md +1632 -1022
  4. data/LICENSE.md +1 -1
  5. data/README.md +190 -95
  6. data/bin/nokogiri +63 -50
  7. data/dependencies.yml +34 -66
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +909 -422
  10. data/ext/nokogiri/gumbo.c +610 -0
  11. data/ext/nokogiri/html4_document.c +171 -0
  12. data/ext/nokogiri/html4_element_description.c +299 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser.c +40 -0
  15. data/ext/nokogiri/html4_sax_parser_context.c +98 -0
  16. data/ext/nokogiri/html4_sax_push_parser.c +96 -0
  17. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  18. data/ext/nokogiri/nokogiri.c +258 -105
  19. data/ext/nokogiri/nokogiri.h +207 -90
  20. data/ext/nokogiri/test_global_handlers.c +40 -0
  21. data/ext/nokogiri/xml_attr.c +18 -18
  22. data/ext/nokogiri/xml_attribute_decl.c +22 -22
  23. data/ext/nokogiri/xml_cdata.c +33 -33
  24. data/ext/nokogiri/xml_comment.c +19 -31
  25. data/ext/nokogiri/xml_document.c +499 -323
  26. data/ext/nokogiri/xml_document_fragment.c +17 -36
  27. data/ext/nokogiri/xml_dtd.c +65 -59
  28. data/ext/nokogiri/xml_element_content.c +63 -55
  29. data/ext/nokogiri/xml_element_decl.c +31 -31
  30. data/ext/nokogiri/xml_encoding_handler.c +54 -21
  31. data/ext/nokogiri/xml_entity_decl.c +37 -35
  32. data/ext/nokogiri/xml_entity_reference.c +17 -19
  33. data/ext/nokogiri/xml_namespace.c +131 -61
  34. data/ext/nokogiri/xml_node.c +1429 -723
  35. data/ext/nokogiri/xml_node_set.c +257 -225
  36. data/ext/nokogiri/xml_processing_instruction.c +18 -20
  37. data/ext/nokogiri/xml_reader.c +340 -231
  38. data/ext/nokogiri/xml_relax_ng.c +87 -99
  39. data/ext/nokogiri/xml_sax_parser.c +269 -176
  40. data/ext/nokogiri/xml_sax_parser_context.c +286 -152
  41. data/ext/nokogiri/xml_sax_push_parser.c +111 -64
  42. data/ext/nokogiri/xml_schema.c +132 -140
  43. data/ext/nokogiri/xml_syntax_error.c +52 -23
  44. data/ext/nokogiri/xml_text.c +37 -30
  45. data/ext/nokogiri/xml_xpath_context.c +373 -185
  46. data/ext/nokogiri/xslt_stylesheet.c +342 -191
  47. data/gumbo-parser/CHANGES.md +63 -0
  48. data/gumbo-parser/Makefile +129 -0
  49. data/gumbo-parser/THANKS +27 -0
  50. data/gumbo-parser/src/Makefile +34 -0
  51. data/gumbo-parser/src/README.md +41 -0
  52. data/gumbo-parser/src/ascii.c +75 -0
  53. data/gumbo-parser/src/ascii.h +115 -0
  54. data/gumbo-parser/src/attribute.c +42 -0
  55. data/gumbo-parser/src/attribute.h +17 -0
  56. data/gumbo-parser/src/char_ref.c +22225 -0
  57. data/gumbo-parser/src/char_ref.h +29 -0
  58. data/gumbo-parser/src/char_ref.rl +2154 -0
  59. data/gumbo-parser/src/error.c +658 -0
  60. data/gumbo-parser/src/error.h +152 -0
  61. data/gumbo-parser/src/foreign_attrs.c +103 -0
  62. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/nokogiri_gumbo.h +953 -0
  66. data/gumbo-parser/src/parser.c +4932 -0
  67. data/gumbo-parser/src/parser.h +41 -0
  68. data/gumbo-parser/src/replacement.h +33 -0
  69. data/gumbo-parser/src/string_buffer.c +103 -0
  70. data/gumbo-parser/src/string_buffer.h +68 -0
  71. data/gumbo-parser/src/string_piece.c +48 -0
  72. data/gumbo-parser/src/svg_attrs.c +174 -0
  73. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  74. data/gumbo-parser/src/svg_tags.c +137 -0
  75. data/gumbo-parser/src/svg_tags.gperf +55 -0
  76. data/gumbo-parser/src/tag.c +223 -0
  77. data/gumbo-parser/src/tag_lookup.c +382 -0
  78. data/gumbo-parser/src/tag_lookup.gperf +170 -0
  79. data/gumbo-parser/src/tag_lookup.h +13 -0
  80. data/gumbo-parser/src/token_buffer.c +79 -0
  81. data/gumbo-parser/src/token_buffer.h +71 -0
  82. data/gumbo-parser/src/token_type.h +17 -0
  83. data/gumbo-parser/src/tokenizer.c +3464 -0
  84. data/gumbo-parser/src/tokenizer.h +112 -0
  85. data/gumbo-parser/src/tokenizer_states.h +339 -0
  86. data/gumbo-parser/src/utf8.c +245 -0
  87. data/gumbo-parser/src/utf8.h +164 -0
  88. data/gumbo-parser/src/util.c +66 -0
  89. data/gumbo-parser/src/util.h +34 -0
  90. data/gumbo-parser/src/vector.c +111 -0
  91. data/gumbo-parser/src/vector.h +45 -0
  92. data/lib/nokogiri/class_resolver.rb +67 -0
  93. data/lib/nokogiri/css/node.rb +14 -8
  94. data/lib/nokogiri/css/parser.rb +399 -377
  95. data/lib/nokogiri/css/parser.y +250 -245
  96. data/lib/nokogiri/css/parser_extras.rb +16 -71
  97. data/lib/nokogiri/css/selector_cache.rb +38 -0
  98. data/lib/nokogiri/css/syntax_error.rb +3 -1
  99. data/lib/nokogiri/css/tokenizer.rb +7 -5
  100. data/lib/nokogiri/css/tokenizer.rex +11 -9
  101. data/lib/nokogiri/css/xpath_visitor.rb +242 -96
  102. data/lib/nokogiri/css.rb +122 -17
  103. data/lib/nokogiri/decorators/slop.rb +11 -11
  104. data/lib/nokogiri/encoding_handler.rb +57 -0
  105. data/lib/nokogiri/extension.rb +32 -0
  106. data/lib/nokogiri/gumbo.rb +15 -0
  107. data/lib/nokogiri/html.rb +38 -27
  108. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  109. data/lib/nokogiri/html4/document.rb +235 -0
  110. data/lib/nokogiri/html4/document_fragment.rb +166 -0
  111. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  112. data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
  113. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  114. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  115. data/lib/nokogiri/html4/sax/parser.rb +48 -0
  116. data/lib/nokogiri/html4/sax/parser_context.rb +15 -0
  117. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
  118. data/lib/nokogiri/html4.rb +42 -0
  119. data/lib/nokogiri/html5/builder.rb +40 -0
  120. data/lib/nokogiri/html5/document.rb +199 -0
  121. data/lib/nokogiri/html5/document_fragment.rb +200 -0
  122. data/lib/nokogiri/html5/node.rb +103 -0
  123. data/lib/nokogiri/html5.rb +368 -0
  124. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  125. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  126. data/lib/nokogiri/syntax_error.rb +2 -0
  127. data/lib/nokogiri/version/constant.rb +6 -0
  128. data/lib/nokogiri/version/info.rb +224 -0
  129. data/lib/nokogiri/version.rb +3 -108
  130. data/lib/nokogiri/xml/attr.rb +55 -3
  131. data/lib/nokogiri/xml/attribute_decl.rb +6 -2
  132. data/lib/nokogiri/xml/builder.rb +83 -35
  133. data/lib/nokogiri/xml/cdata.rb +3 -1
  134. data/lib/nokogiri/xml/character_data.rb +2 -0
  135. data/lib/nokogiri/xml/document.rb +359 -130
  136. data/lib/nokogiri/xml/document_fragment.rb +170 -54
  137. data/lib/nokogiri/xml/dtd.rb +4 -2
  138. data/lib/nokogiri/xml/element_content.rb +12 -2
  139. data/lib/nokogiri/xml/element_decl.rb +6 -2
  140. data/lib/nokogiri/xml/entity_decl.rb +7 -3
  141. data/lib/nokogiri/xml/entity_reference.rb +2 -0
  142. data/lib/nokogiri/xml/namespace.rb +44 -0
  143. data/lib/nokogiri/xml/node/save_options.rb +23 -8
  144. data/lib/nokogiri/xml/node.rb +1168 -420
  145. data/lib/nokogiri/xml/node_set.rb +145 -67
  146. data/lib/nokogiri/xml/notation.rb +13 -0
  147. data/lib/nokogiri/xml/parse_options.rb +145 -52
  148. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  149. data/lib/nokogiri/xml/pp/node.rb +47 -30
  150. data/lib/nokogiri/xml/pp.rb +4 -2
  151. data/lib/nokogiri/xml/processing_instruction.rb +4 -1
  152. data/lib/nokogiri/xml/reader.rb +68 -41
  153. data/lib/nokogiri/xml/relax_ng.rb +60 -17
  154. data/lib/nokogiri/xml/sax/document.rb +198 -111
  155. data/lib/nokogiri/xml/sax/parser.rb +144 -67
  156. data/lib/nokogiri/xml/sax/parser_context.rb +119 -6
  157. data/lib/nokogiri/xml/sax/push_parser.rb +9 -5
  158. data/lib/nokogiri/xml/sax.rb +54 -4
  159. data/lib/nokogiri/xml/schema.rb +116 -39
  160. data/lib/nokogiri/xml/searchable.rb +139 -95
  161. data/lib/nokogiri/xml/syntax_error.rb +29 -5
  162. data/lib/nokogiri/xml/text.rb +2 -0
  163. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  164. data/lib/nokogiri/xml/xpath.rb +15 -4
  165. data/lib/nokogiri/xml/xpath_context.rb +15 -4
  166. data/lib/nokogiri/xml.rb +45 -55
  167. data/lib/nokogiri/xslt/stylesheet.rb +32 -8
  168. data/lib/nokogiri/xslt.rb +103 -30
  169. data/lib/nokogiri.rb +59 -75
  170. data/lib/xsd/xmlparser/nokogiri.rb +32 -29
  171. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  172. data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
  173. data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
  174. data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
  175. data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
  176. data/ports/archives/libxml2-2.13.6.tar.xz +0 -0
  177. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  178. metadata +123 -295
  179. data/ext/nokogiri/html_document.c +0 -170
  180. data/ext/nokogiri/html_document.h +0 -10
  181. data/ext/nokogiri/html_element_description.c +0 -279
  182. data/ext/nokogiri/html_element_description.h +0 -10
  183. data/ext/nokogiri/html_entity_lookup.c +0 -32
  184. data/ext/nokogiri/html_entity_lookup.h +0 -8
  185. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  186. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  187. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  188. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  189. data/ext/nokogiri/xml_attr.h +0 -9
  190. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  191. data/ext/nokogiri/xml_cdata.h +0 -9
  192. data/ext/nokogiri/xml_comment.h +0 -9
  193. data/ext/nokogiri/xml_document.h +0 -23
  194. data/ext/nokogiri/xml_document_fragment.h +0 -10
  195. data/ext/nokogiri/xml_dtd.h +0 -10
  196. data/ext/nokogiri/xml_element_content.h +0 -10
  197. data/ext/nokogiri/xml_element_decl.h +0 -9
  198. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  199. data/ext/nokogiri/xml_entity_decl.h +0 -10
  200. data/ext/nokogiri/xml_entity_reference.h +0 -9
  201. data/ext/nokogiri/xml_io.c +0 -61
  202. data/ext/nokogiri/xml_io.h +0 -11
  203. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  204. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  205. data/ext/nokogiri/xml_namespace.h +0 -14
  206. data/ext/nokogiri/xml_node.h +0 -13
  207. data/ext/nokogiri/xml_node_set.h +0 -12
  208. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  209. data/ext/nokogiri/xml_reader.h +0 -10
  210. data/ext/nokogiri/xml_relax_ng.h +0 -9
  211. data/ext/nokogiri/xml_sax_parser.h +0 -39
  212. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  213. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  214. data/ext/nokogiri/xml_schema.h +0 -9
  215. data/ext/nokogiri/xml_syntax_error.h +0 -13
  216. data/ext/nokogiri/xml_text.h +0 -9
  217. data/ext/nokogiri/xml_xpath_context.h +0 -10
  218. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  219. data/lib/nokogiri/html/document.rb +0 -335
  220. data/lib/nokogiri/html/document_fragment.rb +0 -49
  221. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  222. data/lib/nokogiri/html/sax/parser.rb +0 -62
  223. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  224. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
  225. data/patches/libxml2/0004-libxml2.la-is-in-top_builddir.patch +0 -25
  226. data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch +0 -32
  227. data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
  228. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
  229. /data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
  230. /data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  #--
2
3
  # DO NOT MODIFY!!!!
3
4
  # This file is automatically generated by rex 1.0.7
@@ -6,7 +7,8 @@
6
7
 
7
8
  module Nokogiri
8
9
  module CSS
9
- class Tokenizer # :nodoc:
10
+ # :nodoc: all
11
+ class Tokenizer
10
12
  require 'strscan'
11
13
 
12
14
  class ScanError < StandardError ; end
@@ -61,13 +63,13 @@ class Tokenizer # :nodoc:
61
63
  when (text = @ss.scan(/has\([\s]*/))
62
64
  action { [:HAS, text] }
63
65
 
64
- when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
66
+ when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*\([\s]*/))
65
67
  action { [:FUNCTION, text] }
66
68
 
67
- when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
69
+ when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*/))
68
70
  action { [:IDENT, text] }
69
71
 
70
- when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
72
+ when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))+/))
71
73
  action { [:HASH, text] }
72
74
 
73
75
  when (text = @ss.scan(/[\s]*~=[\s]*/))
@@ -130,7 +132,7 @@ class Tokenizer # :nodoc:
130
132
  when (text = @ss.scan(/[\s]+/))
131
133
  action { [:S, text] }
132
134
 
133
- when (text = @ss.scan(/"([^\n\r\f"]|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*(?<!\\)(?:\\{2})*'/))
135
+ when (text = @ss.scan(/("([^\n\r\f"]|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*')/))
134
136
  action { [:STRING, text] }
135
137
 
136
138
  when (text = @ss.scan(/./))
@@ -1,22 +1,24 @@
1
1
  module Nokogiri
2
2
  module CSS
3
- class Tokenizer # :nodoc:
3
+ # :nodoc: all
4
+ class Tokenizer
4
5
 
5
6
  macro
6
- nl \n|\r\n|\r|\f
7
+ nl (\n|\r\n|\r|\f)
7
8
  w [\s]*
8
9
  nonascii [^\0-\177]
9
10
  num -?([0-9]+|[0-9]*\.[0-9]+)
10
11
  unicode \\[0-9A-Fa-f]{1,6}(\r\n|[\s])?
11
12
 
12
- escape {unicode}|\\[^\n\r\f0-9A-Fa-f]
13
- nmchar [_A-Za-z0-9-]|{nonascii}|{escape}
14
- nmstart [_A-Za-z]|{nonascii}|{escape}
15
- ident [-@]?({nmstart})({nmchar})*
16
- name ({nmchar})+
13
+ escape ({unicode}|\\[^\n\r\f0-9A-Fa-f])
14
+ nmchar ([_A-Za-z0-9-]|{nonascii}|{escape})
15
+ nmstart ([_A-Za-z]|{nonascii}|{escape})
16
+ name {nmstart}{nmchar}*
17
+ ident -?{name}
18
+ charref {nmchar}+
17
19
  string1 "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
18
20
  string2 '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
19
- string {string1}|{string2}
21
+ string ({string1}|{string2})
20
22
 
21
23
  rule
22
24
 
@@ -25,7 +27,7 @@ rule
25
27
  has\({w} { [:HAS, text] }
26
28
  {ident}\({w} { [:FUNCTION, text] }
27
29
  {ident} { [:IDENT, text] }
28
- \#{name} { [:HASH, text] }
30
+ \#{charref} { [:HASH, text] }
29
31
  {w}~={w} { [:INCLUDES, text] }
30
32
  {w}\|={w} { [:DASHMATCH, text] }
31
33
  {w}\^={w} { [:PREFIXMATCH, text] }
@@ -1,64 +1,164 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
1
4
  module Nokogiri
2
5
  module CSS
3
- class XPathVisitor # :nodoc:
4
- def visit_function node
6
+ # When translating CSS selectors to XPath queries with Nokogiri::CSS.xpath_for, the XPathVisitor
7
+ # class allows for changing some of the behaviors related to builtin xpath functions and quirks
8
+ # of HTML5.
9
+ class XPathVisitor
10
+ WILDCARD_NAMESPACES = Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch") # :nodoc:
11
+
12
+ # Enum to direct XPathVisitor when to use Nokogiri builtin XPath functions.
13
+ module BuiltinsConfig
14
+ # Never use Nokogiri builtin functions, always generate vanilla XPath 1.0 queries. This is
15
+ # the default when calling Nokogiri::CSS.xpath_for directly.
16
+ NEVER = :never
17
+
18
+ # Always use Nokogiri builtin functions whenever possible. This is probably only useful for testing.
19
+ ALWAYS = :always
20
+
21
+ # Only use Nokogiri builtin functions when they will be faster than vanilla XPath. This is
22
+ # the behavior chosen when searching for CSS selectors on a Nokogiri document, fragment, or
23
+ # node.
24
+ OPTIMAL = :optimal
25
+
26
+ # :nodoc: array of values for validation
27
+ VALUES = [NEVER, ALWAYS, OPTIMAL]
28
+ end
29
+
30
+ # Enum to direct XPathVisitor when to tweak the XPath query to suit the nature of the document
31
+ # being searched. Note that searches for CSS selectors from a Nokogiri document, fragment, or
32
+ # node will choose the correct option automatically.
33
+ module DoctypeConfig
34
+ # The document being searched is an XML document. This is the default.
35
+ XML = :xml
36
+
37
+ # The document being searched is an HTML4 document.
38
+ HTML4 = :html4
39
+
40
+ # The document being searched is an HTML5 document.
41
+ HTML5 = :html5
42
+
43
+ # :nodoc: array of values for validation
44
+ VALUES = [XML, HTML4, HTML5]
45
+ end
46
+
47
+ # The visitor configuration set via the +builtins:+ keyword argument to XPathVisitor.new.
48
+ attr_reader :builtins
5
49
 
6
- msg = :"visit_function_#{node.value.first.gsub(/[(]/, '')}"
7
- return self.send(msg, node) if self.respond_to?(msg)
50
+ # The visitor configuration set via the +doctype:+ keyword argument to XPathVisitor.new.
51
+ attr_reader :doctype
52
+
53
+ # The visitor configuration set via the +prefix:+ keyword argument to XPathVisitor.new.
54
+ attr_reader :prefix
55
+
56
+ # The visitor configuration set via the +namespaces:+ keyword argument to XPathVisitor.new.
57
+ attr_reader :namespaces
58
+
59
+ # :call-seq:
60
+ # new() → XPathVisitor
61
+ # new(builtins:, doctype:) → XPathVisitor
62
+ #
63
+ # [Parameters]
64
+ # - +builtins:+ (BuiltinsConfig) Determine when to use Nokogiri's built-in xpath functions for performance improvements.
65
+ # - +doctype:+ (DoctypeConfig) Make document-type-specific accommodations for CSS queries.
66
+ #
67
+ # [Returns] XPathVisitor
68
+ #
69
+ def initialize(
70
+ builtins: BuiltinsConfig::NEVER,
71
+ doctype: DoctypeConfig::XML,
72
+ prefix: Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX,
73
+ namespaces: nil
74
+ )
75
+ unless BuiltinsConfig::VALUES.include?(builtins)
76
+ raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter")
77
+ end
78
+ unless DoctypeConfig::VALUES.include?(doctype)
79
+ raise(ArgumentError, "Invalid values #{doctype.inspect} for doctype: keyword parameter")
80
+ end
81
+
82
+ @builtins = builtins
83
+ @doctype = doctype
84
+ @prefix = prefix
85
+ @namespaces = namespaces
86
+ end
87
+
88
+ # :call-seq: config() → Hash
89
+ #
90
+ # [Returns]
91
+ # a Hash representing the configuration of the XPathVisitor, suitable for use as
92
+ # part of the CSS cache key.
93
+ def config
94
+ { builtins: @builtins, doctype: @doctype, prefix: @prefix, namespaces: @namespaces }
95
+ end
96
+
97
+ # :stopdoc:
98
+ def visit_function(node)
99
+ msg = :"visit_function_#{node.value.first.gsub(/[(]/, "")}"
100
+ return send(msg, node) if respond_to?(msg)
8
101
 
9
102
  case node.value.first
10
103
  when /^text\(/
11
- 'child::text()'
104
+ "child::text()"
12
105
  when /^self\(/
13
106
  "self::#{node.value[1]}"
14
107
  when /^eq\(/
15
- "position() = #{node.value[1]}"
108
+ "position()=#{node.value[1]}"
16
109
  when /^(nth|nth-of-type)\(/
17
- if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :NTH
110
+ if node.value[1].is_a?(Nokogiri::CSS::Node) && (node.value[1].type == :NTH)
18
111
  nth(node.value[1])
19
112
  else
20
- "position() = #{node.value[1]}"
113
+ "position()=#{node.value[1]}"
21
114
  end
22
115
  when /^nth-child\(/
23
- if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :NTH
24
- nth(node.value[1], :child => true)
116
+ if node.value[1].is_a?(Nokogiri::CSS::Node) && (node.value[1].type == :NTH)
117
+ nth(node.value[1], child: true)
25
118
  else
26
- "count(preceding-sibling::*) = #{node.value[1].to_i-1}"
119
+ "count(preceding-sibling::*)=#{node.value[1].to_i - 1}"
27
120
  end
28
121
  when /^nth-last-of-type\(/
29
- if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :NTH
30
- nth(node.value[1], :last => true)
122
+ if node.value[1].is_a?(Nokogiri::CSS::Node) && (node.value[1].type == :NTH)
123
+ nth(node.value[1], last: true)
31
124
  else
32
125
  index = node.value[1].to_i - 1
33
- index == 0 ? "position() = last()" : "position() = last() - #{index}"
126
+ index == 0 ? "position()=last()" : "position()=last()-#{index}"
34
127
  end
35
128
  when /^nth-last-child\(/
36
- if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :NTH
37
- nth(node.value[1], :last => true, :child => true)
129
+ if node.value[1].is_a?(Nokogiri::CSS::Node) && (node.value[1].type == :NTH)
130
+ nth(node.value[1], last: true, child: true)
38
131
  else
39
- "count(following-sibling::*) = #{node.value[1].to_i-1}"
132
+ "count(following-sibling::*)=#{node.value[1].to_i - 1}"
40
133
  end
41
134
  when /^(first|first-of-type)\(/
42
- "position() = 1"
135
+ "position()=1"
43
136
  when /^(last|last-of-type)\(/
44
- "position() = last()"
137
+ "position()=last()"
45
138
  when /^contains\(/
46
- "contains(., #{node.value[1]})"
139
+ "contains(.,#{node.value[1]})"
47
140
  when /^gt\(/
48
- "position() > #{node.value[1]}"
141
+ "position()>#{node.value[1]}"
49
142
  when /^only-child\(/
50
- "last() = 1"
143
+ "last()=1"
51
144
  when /^comment\(/
52
145
  "comment()"
53
146
  when /^has\(/
54
- ".//#{node.value[1].accept(self)}"
147
+ is_direct = node.value[1].value[0].nil? # e.g. "has(> a)", "has(~ a)", "has(+ a)"
148
+ ".#{"//" unless is_direct}#{node.value[1].accept(self)}"
55
149
  else
56
- args = ['.'] + node.value[1..-1]
57
- "#{node.value.first}#{args.join(', ')})"
150
+ validate_xpath_function_name(node.value.first)
151
+
152
+ # xpath function call, let's marshal those arguments
153
+ args = ["."]
154
+ args += node.value[1..-1].map do |n|
155
+ n.is_a?(Nokogiri::CSS::Node) ? n.accept(self) : n
156
+ end
157
+ "nokogiri:#{node.value.first}#{args.join(",")})"
58
158
  end
59
159
  end
60
160
 
61
- def visit_not node
161
+ def visit_not(node)
62
162
  child = node.value.first
63
163
  if :ELEMENT_NAME == child.type
64
164
  "not(self::#{child.accept(self)})"
@@ -67,143 +167,179 @@ module Nokogiri
67
167
  end
68
168
  end
69
169
 
70
- def visit_id node
170
+ def visit_id(node)
71
171
  node.value.first =~ /^#(.*)$/
72
- "@id = '#{$1}'"
172
+ "@id='#{Regexp.last_match(1)}'"
73
173
  end
74
174
 
75
- def visit_attribute_condition node
76
- attribute = if (node.value.first.type == :FUNCTION) or (node.value.first.value.first =~ /::/)
77
- ''
78
- else
79
- '@'
80
- end
81
- attribute += node.value.first.accept(self)
82
-
83
- # Support non-standard css
84
- attribute.gsub!(/^@@/, '@')
85
-
86
- return attribute unless node.value.length == 3
175
+ def visit_attribute_condition(node)
176
+ attribute = node.value.first.accept(self)
177
+ return attribute if node.value.length == 1
87
178
 
88
179
  value = node.value.last
89
- value = "'#{value}'" if value !~ /^['"]/
180
+ value = "'#{value}'" unless /^['"]/.match?(value)
90
181
 
91
- if (value[0]==value[-1]) && %q{"'}.include?(value[0])
182
+ # quoted values - see test_attribute_value_with_quotes in test/css/test_parser.rb
183
+ if (value[0] == value[-1]) && %q{"'}.include?(value[0])
92
184
  str_value = value[1..-2]
93
185
  if str_value.include?(value[0])
94
- value = 'concat("' + str_value.split('"', -1).join(%q{", '"', "}) + '", "")'
186
+ value = 'concat("' + str_value.split('"', -1).join(%q{",'"',"}) + '","")'
95
187
  end
96
188
  end
97
189
 
98
190
  case node.value[1]
99
191
  when :equal
100
- attribute + " = " + "#{value}"
192
+ attribute + "=" + value.to_s
101
193
  when :not_equal
102
- attribute + " != " + "#{value}"
194
+ attribute + "!=" + value.to_s
103
195
  when :substring_match
104
- "contains(#{attribute}, #{value})"
196
+ "contains(#{attribute},#{value})"
105
197
  when :prefix_match
106
- "starts-with(#{attribute}, #{value})"
198
+ "starts-with(#{attribute},#{value})"
107
199
  when :dash_match
108
- "#{attribute} = #{value} or starts-with(#{attribute}, concat(#{value}, '-'))"
200
+ "#{attribute}=#{value} or starts-with(#{attribute},concat(#{value},'-'))"
109
201
  when :includes
110
- "contains(concat(\" \", #{attribute}, \" \"),concat(\" \", #{value}, \" \"))"
202
+ value = value[1..-2] # strip quotes
203
+ css_class(attribute, value)
111
204
  when :suffix_match
112
- "substring(#{attribute}, string-length(#{attribute}) - " +
113
- "string-length(#{value}) + 1, string-length(#{value})) = #{value}"
205
+ "substring(#{attribute},string-length(#{attribute})-string-length(#{value})+1,string-length(#{value}))=#{value}"
114
206
  else
115
- attribute + " #{node.value[1]} " + "#{value}"
207
+ attribute + " #{node.value[1]} " + value.to_s
116
208
  end
117
209
  end
118
210
 
119
- def visit_pseudo_class node
120
- if node.value.first.is_a?(Nokogiri::CSS::Node) and node.value.first.type == :FUNCTION
211
+ def visit_pseudo_class(node)
212
+ if node.value.first.is_a?(Nokogiri::CSS::Node) && (node.value.first.type == :FUNCTION)
121
213
  node.value.first.accept(self)
122
214
  else
123
- msg = :"visit_pseudo_class_#{node.value.first.gsub(/[(]/, '')}"
124
- return self.send(msg, node) if self.respond_to?(msg)
215
+ msg = :"visit_pseudo_class_#{node.value.first.gsub(/[(]/, "")}"
216
+ return send(msg, node) if respond_to?(msg)
125
217
 
126
218
  case node.value.first
127
- when "first" then "position() = 1"
128
- when "first-child" then "count(preceding-sibling::*) = 0"
129
- when "last" then "position() = last()"
130
- when "last-child" then "count(following-sibling::*) = 0"
131
- when "first-of-type" then "position() = 1"
132
- when "last-of-type" then "position() = last()"
133
- when "only-child" then "count(preceding-sibling::*) = 0 and count(following-sibling::*) = 0"
134
- when "only-of-type" then "last() = 1"
219
+ when "first" then "position()=1"
220
+ when "first-child" then "count(preceding-sibling::*)=0"
221
+ when "last" then "position()=last()"
222
+ when "last-child" then "count(following-sibling::*)=0"
223
+ when "first-of-type" then "position()=1"
224
+ when "last-of-type" then "position()=last()"
225
+ when "only-child" then "count(preceding-sibling::*)=0 and count(following-sibling::*)=0"
226
+ when "only-of-type" then "last()=1"
135
227
  when "empty" then "not(node())"
136
228
  when "parent" then "node()"
137
229
  when "root" then "not(parent::*)"
138
230
  else
139
- node.value.first + "(.)"
231
+ validate_xpath_function_name(node.value.first)
232
+ "nokogiri:#{node.value.first}(.)"
140
233
  end
141
234
  end
142
235
  end
143
236
 
144
- def visit_class_condition node
145
- "contains(concat(' ', normalize-space(@class), ' '), ' #{node.value.first} ')"
237
+ def visit_class_condition(node)
238
+ css_class("@class", node.value.first)
146
239
  end
147
240
 
148
- def visit_combinator node
241
+ def visit_combinator(node)
149
242
  if is_of_type_pseudo_class?(node.value.last)
150
- "#{node.value.first.accept(self) if node.value.first}][#{node.value.last.accept(self)}"
243
+ "#{node.value.first&.accept(self)}][#{node.value.last.accept(self)}"
151
244
  else
152
- "#{node.value.first.accept(self) if node.value.first} and #{node.value.last.accept(self)}"
245
+ "#{node.value.first&.accept(self)} and #{node.value.last.accept(self)}"
153
246
  end
154
247
  end
155
248
 
156
249
  {
157
- 'direct_adjacent_selector' => "/following-sibling::*[1]/self::",
158
- 'following_selector' => "/following-sibling::",
159
- 'descendant_selector' => '//',
160
- 'child_selector' => '/',
161
- }.each do |k,v|
162
- class_eval %{
250
+ "direct_adjacent_selector" => "/following-sibling::*[1]/self::",
251
+ "following_selector" => "/following-sibling::",
252
+ "descendant_selector" => "//",
253
+ "child_selector" => "/",
254
+ }.each do |k, v|
255
+ class_eval <<~RUBY, __FILE__, __LINE__ + 1
163
256
  def visit_#{k} node
164
257
  "\#{node.value.first.accept(self) if node.value.first}#{v}\#{node.value.last.accept(self)}"
165
258
  end
166
- }
259
+ RUBY
167
260
  end
168
261
 
169
- def visit_conditional_selector node
170
- node.value.first.accept(self) + '[' +
171
- node.value.last.accept(self) + ']'
262
+ def visit_conditional_selector(node)
263
+ node.value.first.accept(self) + "[" +
264
+ node.value.last.accept(self) + "]"
265
+ end
266
+
267
+ def visit_element_name(node)
268
+ if @doctype == DoctypeConfig::HTML5 && html5_element_name_needs_namespace_handling(node)
269
+ # HTML5 has namespaces that should be ignored in CSS queries
270
+ # https://github.com/sparklemotion/nokogiri/issues/2376
271
+ if @builtins == BuiltinsConfig::ALWAYS || (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?)
272
+ if WILDCARD_NAMESPACES
273
+ "*:#{node.value.first}"
274
+ else
275
+ "*[nokogiri-builtin:local-name-is('#{node.value.first}')]"
276
+ end
277
+ else
278
+ "*[local-name()='#{node.value.first}']"
279
+ end
280
+ elsif node.value.length == 2 # has a namespace prefix
281
+ if node.value.first.nil? # namespace prefix is empty
282
+ node.value.last
283
+ else
284
+ node.value.join(":")
285
+ end
286
+ elsif node.value.first != "*" && @namespaces&.key?("xmlns")
287
+ # apply the default namespace (if one is present) to a non-wildcard selector
288
+ "xmlns:#{node.value.first}"
289
+ else
290
+ node.value.first
291
+ end
172
292
  end
173
293
 
174
- def visit_element_name node
175
- node.value.first
294
+ def visit_attrib_name(node)
295
+ "@#{node.value.first}"
176
296
  end
177
297
 
178
- def accept node
298
+ def accept(node)
179
299
  node.accept(self)
180
300
  end
181
301
 
182
- private
183
- def nth node, options={}
184
- raise ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}" unless node.value.size == 4
302
+ private
185
303
 
186
- a, b = read_a_and_positive_b node.value
304
+ def validate_xpath_function_name(name)
305
+ if name.start_with?("-")
306
+ raise Nokogiri::CSS::SyntaxError, "Invalid XPath function name '#{name}'"
307
+ end
308
+ end
309
+
310
+ def html5_element_name_needs_namespace_handling(node)
311
+ # if there is already a namespace (i.e., it is a prefixed QName), use it as normal
312
+ node.value.length == 1 &&
313
+ # if this is the wildcard selector "*", use it as normal
314
+ node.value.first != "*"
315
+ end
316
+
317
+ def nth(node, options = {})
318
+ unless node.value.size == 4
319
+ raise(ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}")
320
+ end
321
+
322
+ a, b = read_a_and_positive_b(node.value)
187
323
  position = if options[:child]
188
- options[:last] ? "(count(following-sibling::*) + 1)" : "(count(preceding-sibling::*) + 1)"
324
+ options[:last] ? "(count(following-sibling::*)+1)" : "(count(preceding-sibling::*)+1)"
189
325
  else
190
326
  options[:last] ? "(last()-position()+1)" : "position()"
191
327
  end
192
328
 
193
329
  if b.zero?
194
- "(#{position} mod #{a}) = 0"
330
+ "(#{position} mod #{a})=0"
195
331
  else
196
332
  compare = a < 0 ? "<=" : ">="
197
333
  if a.abs == 1
198
- "#{position} #{compare} #{b}"
334
+ "#{position}#{compare}#{b}"
199
335
  else
200
- "(#{position} #{compare} #{b}) and (((#{position}-#{b}) mod #{a.abs}) = 0)"
336
+ "(#{position}#{compare}#{b}) and (((#{position}-#{b}) mod #{a.abs})=0)"
201
337
  end
202
338
  end
203
339
  end
204
340
 
205
- def read_a_and_positive_b values
206
- op = values[2]
341
+ def read_a_and_positive_b(values)
342
+ op = values[2].strip
207
343
  if op == "+"
208
344
  a = values[0].to_i
209
345
  b = values[3].to_i
@@ -216,15 +352,25 @@ module Nokogiri
216
352
  [a, b]
217
353
  end
218
354
 
219
- def is_of_type_pseudo_class? node
220
- if node.type==:PSEUDO_CLASS
221
- if node.value[0].is_a?(Nokogiri::CSS::Node) and node.value[0].type == :FUNCTION
355
+ def is_of_type_pseudo_class?(node) # rubocop:disable Naming/PredicateName
356
+ if node.type == :PSEUDO_CLASS
357
+ if node.value[0].is_a?(Nokogiri::CSS::Node) && (node.value[0].type == :FUNCTION)
222
358
  node.value[0].value[0]
223
359
  else
224
360
  node.value[0]
225
361
  end =~ /(nth|first|last|only)-of-type(\()?/
226
362
  end
227
363
  end
364
+
365
+ def css_class(hay, needle)
366
+ if @builtins == BuiltinsConfig::ALWAYS || (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?)
367
+ # use the builtin implementation
368
+ "nokogiri-builtin:css-class(#{hay},'#{needle}')"
369
+ else
370
+ # use only ordinary xpath functions
371
+ "contains(concat(' ',normalize-space(#{hay}),' '),' #{needle} ')"
372
+ end
373
+ end
228
374
  end
229
375
  end
230
376
  end