nokogiri 1.10.9 → 1.18.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (230)
  1. checksums.yaml +4 -4
  2. data/Gemfile +38 -0
  3. data/LICENSE-DEPENDENCIES.md +1632 -1022
  4. data/LICENSE.md +1 -1
  5. data/README.md +190 -95
  6. data/bin/nokogiri +63 -50
  7. data/dependencies.yml +34 -66
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +909 -422
  10. data/ext/nokogiri/gumbo.c +610 -0
  11. data/ext/nokogiri/html4_document.c +171 -0
  12. data/ext/nokogiri/html4_element_description.c +299 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser.c +40 -0
  15. data/ext/nokogiri/html4_sax_parser_context.c +98 -0
  16. data/ext/nokogiri/html4_sax_push_parser.c +96 -0
  17. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  18. data/ext/nokogiri/nokogiri.c +258 -105
  19. data/ext/nokogiri/nokogiri.h +207 -90
  20. data/ext/nokogiri/test_global_handlers.c +40 -0
  21. data/ext/nokogiri/xml_attr.c +18 -18
  22. data/ext/nokogiri/xml_attribute_decl.c +22 -22
  23. data/ext/nokogiri/xml_cdata.c +33 -33
  24. data/ext/nokogiri/xml_comment.c +19 -31
  25. data/ext/nokogiri/xml_document.c +499 -323
  26. data/ext/nokogiri/xml_document_fragment.c +17 -36
  27. data/ext/nokogiri/xml_dtd.c +65 -59
  28. data/ext/nokogiri/xml_element_content.c +63 -55
  29. data/ext/nokogiri/xml_element_decl.c +31 -31
  30. data/ext/nokogiri/xml_encoding_handler.c +54 -21
  31. data/ext/nokogiri/xml_entity_decl.c +37 -35
  32. data/ext/nokogiri/xml_entity_reference.c +17 -19
  33. data/ext/nokogiri/xml_namespace.c +131 -61
  34. data/ext/nokogiri/xml_node.c +1429 -723
  35. data/ext/nokogiri/xml_node_set.c +257 -225
  36. data/ext/nokogiri/xml_processing_instruction.c +18 -20
  37. data/ext/nokogiri/xml_reader.c +340 -231
  38. data/ext/nokogiri/xml_relax_ng.c +87 -99
  39. data/ext/nokogiri/xml_sax_parser.c +269 -176
  40. data/ext/nokogiri/xml_sax_parser_context.c +286 -152
  41. data/ext/nokogiri/xml_sax_push_parser.c +111 -64
  42. data/ext/nokogiri/xml_schema.c +132 -140
  43. data/ext/nokogiri/xml_syntax_error.c +52 -23
  44. data/ext/nokogiri/xml_text.c +37 -30
  45. data/ext/nokogiri/xml_xpath_context.c +373 -185
  46. data/ext/nokogiri/xslt_stylesheet.c +342 -191
  47. data/gumbo-parser/CHANGES.md +63 -0
  48. data/gumbo-parser/Makefile +129 -0
  49. data/gumbo-parser/THANKS +27 -0
  50. data/gumbo-parser/src/Makefile +34 -0
  51. data/gumbo-parser/src/README.md +41 -0
  52. data/gumbo-parser/src/ascii.c +75 -0
  53. data/gumbo-parser/src/ascii.h +115 -0
  54. data/gumbo-parser/src/attribute.c +42 -0
  55. data/gumbo-parser/src/attribute.h +17 -0
  56. data/gumbo-parser/src/char_ref.c +22225 -0
  57. data/gumbo-parser/src/char_ref.h +29 -0
  58. data/gumbo-parser/src/char_ref.rl +2154 -0
  59. data/gumbo-parser/src/error.c +658 -0
  60. data/gumbo-parser/src/error.h +152 -0
  61. data/gumbo-parser/src/foreign_attrs.c +103 -0
  62. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/nokogiri_gumbo.h +953 -0
  66. data/gumbo-parser/src/parser.c +4932 -0
  67. data/gumbo-parser/src/parser.h +41 -0
  68. data/gumbo-parser/src/replacement.h +33 -0
  69. data/gumbo-parser/src/string_buffer.c +103 -0
  70. data/gumbo-parser/src/string_buffer.h +68 -0
  71. data/gumbo-parser/src/string_piece.c +48 -0
  72. data/gumbo-parser/src/svg_attrs.c +174 -0
  73. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  74. data/gumbo-parser/src/svg_tags.c +137 -0
  75. data/gumbo-parser/src/svg_tags.gperf +55 -0
  76. data/gumbo-parser/src/tag.c +223 -0
  77. data/gumbo-parser/src/tag_lookup.c +382 -0
  78. data/gumbo-parser/src/tag_lookup.gperf +170 -0
  79. data/gumbo-parser/src/tag_lookup.h +13 -0
  80. data/gumbo-parser/src/token_buffer.c +79 -0
  81. data/gumbo-parser/src/token_buffer.h +71 -0
  82. data/gumbo-parser/src/token_type.h +17 -0
  83. data/gumbo-parser/src/tokenizer.c +3464 -0
  84. data/gumbo-parser/src/tokenizer.h +112 -0
  85. data/gumbo-parser/src/tokenizer_states.h +339 -0
  86. data/gumbo-parser/src/utf8.c +245 -0
  87. data/gumbo-parser/src/utf8.h +164 -0
  88. data/gumbo-parser/src/util.c +66 -0
  89. data/gumbo-parser/src/util.h +34 -0
  90. data/gumbo-parser/src/vector.c +111 -0
  91. data/gumbo-parser/src/vector.h +45 -0
  92. data/lib/nokogiri/class_resolver.rb +67 -0
  93. data/lib/nokogiri/css/node.rb +14 -8
  94. data/lib/nokogiri/css/parser.rb +399 -377
  95. data/lib/nokogiri/css/parser.y +250 -245
  96. data/lib/nokogiri/css/parser_extras.rb +16 -71
  97. data/lib/nokogiri/css/selector_cache.rb +38 -0
  98. data/lib/nokogiri/css/syntax_error.rb +3 -1
  99. data/lib/nokogiri/css/tokenizer.rb +7 -5
  100. data/lib/nokogiri/css/tokenizer.rex +11 -9
  101. data/lib/nokogiri/css/xpath_visitor.rb +242 -96
  102. data/lib/nokogiri/css.rb +122 -17
  103. data/lib/nokogiri/decorators/slop.rb +11 -11
  104. data/lib/nokogiri/encoding_handler.rb +57 -0
  105. data/lib/nokogiri/extension.rb +32 -0
  106. data/lib/nokogiri/gumbo.rb +15 -0
  107. data/lib/nokogiri/html.rb +38 -27
  108. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  109. data/lib/nokogiri/html4/document.rb +235 -0
  110. data/lib/nokogiri/html4/document_fragment.rb +166 -0
  111. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  112. data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
  113. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  114. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  115. data/lib/nokogiri/html4/sax/parser.rb +48 -0
  116. data/lib/nokogiri/html4/sax/parser_context.rb +15 -0
  117. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
  118. data/lib/nokogiri/html4.rb +42 -0
  119. data/lib/nokogiri/html5/builder.rb +40 -0
  120. data/lib/nokogiri/html5/document.rb +199 -0
  121. data/lib/nokogiri/html5/document_fragment.rb +200 -0
  122. data/lib/nokogiri/html5/node.rb +103 -0
  123. data/lib/nokogiri/html5.rb +368 -0
  124. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  125. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  126. data/lib/nokogiri/syntax_error.rb +2 -0
  127. data/lib/nokogiri/version/constant.rb +6 -0
  128. data/lib/nokogiri/version/info.rb +224 -0
  129. data/lib/nokogiri/version.rb +3 -108
  130. data/lib/nokogiri/xml/attr.rb +55 -3
  131. data/lib/nokogiri/xml/attribute_decl.rb +6 -2
  132. data/lib/nokogiri/xml/builder.rb +83 -35
  133. data/lib/nokogiri/xml/cdata.rb +3 -1
  134. data/lib/nokogiri/xml/character_data.rb +2 -0
  135. data/lib/nokogiri/xml/document.rb +359 -130
  136. data/lib/nokogiri/xml/document_fragment.rb +170 -54
  137. data/lib/nokogiri/xml/dtd.rb +4 -2
  138. data/lib/nokogiri/xml/element_content.rb +12 -2
  139. data/lib/nokogiri/xml/element_decl.rb +6 -2
  140. data/lib/nokogiri/xml/entity_decl.rb +7 -3
  141. data/lib/nokogiri/xml/entity_reference.rb +2 -0
  142. data/lib/nokogiri/xml/namespace.rb +44 -0
  143. data/lib/nokogiri/xml/node/save_options.rb +23 -8
  144. data/lib/nokogiri/xml/node.rb +1168 -420
  145. data/lib/nokogiri/xml/node_set.rb +145 -67
  146. data/lib/nokogiri/xml/notation.rb +13 -0
  147. data/lib/nokogiri/xml/parse_options.rb +145 -52
  148. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  149. data/lib/nokogiri/xml/pp/node.rb +47 -30
  150. data/lib/nokogiri/xml/pp.rb +4 -2
  151. data/lib/nokogiri/xml/processing_instruction.rb +4 -1
  152. data/lib/nokogiri/xml/reader.rb +68 -41
  153. data/lib/nokogiri/xml/relax_ng.rb +60 -17
  154. data/lib/nokogiri/xml/sax/document.rb +198 -111
  155. data/lib/nokogiri/xml/sax/parser.rb +144 -67
  156. data/lib/nokogiri/xml/sax/parser_context.rb +119 -6
  157. data/lib/nokogiri/xml/sax/push_parser.rb +9 -5
  158. data/lib/nokogiri/xml/sax.rb +54 -4
  159. data/lib/nokogiri/xml/schema.rb +116 -39
  160. data/lib/nokogiri/xml/searchable.rb +139 -95
  161. data/lib/nokogiri/xml/syntax_error.rb +29 -5
  162. data/lib/nokogiri/xml/text.rb +2 -0
  163. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  164. data/lib/nokogiri/xml/xpath.rb +15 -4
  165. data/lib/nokogiri/xml/xpath_context.rb +15 -4
  166. data/lib/nokogiri/xml.rb +45 -55
  167. data/lib/nokogiri/xslt/stylesheet.rb +32 -8
  168. data/lib/nokogiri/xslt.rb +103 -30
  169. data/lib/nokogiri.rb +59 -75
  170. data/lib/xsd/xmlparser/nokogiri.rb +32 -29
  171. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  172. data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
  173. data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
  174. data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
  175. data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
  176. data/ports/archives/libxml2-2.13.6.tar.xz +0 -0
  177. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  178. metadata +123 -295
  179. data/ext/nokogiri/html_document.c +0 -170
  180. data/ext/nokogiri/html_document.h +0 -10
  181. data/ext/nokogiri/html_element_description.c +0 -279
  182. data/ext/nokogiri/html_element_description.h +0 -10
  183. data/ext/nokogiri/html_entity_lookup.c +0 -32
  184. data/ext/nokogiri/html_entity_lookup.h +0 -8
  185. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  186. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  187. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  188. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  189. data/ext/nokogiri/xml_attr.h +0 -9
  190. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  191. data/ext/nokogiri/xml_cdata.h +0 -9
  192. data/ext/nokogiri/xml_comment.h +0 -9
  193. data/ext/nokogiri/xml_document.h +0 -23
  194. data/ext/nokogiri/xml_document_fragment.h +0 -10
  195. data/ext/nokogiri/xml_dtd.h +0 -10
  196. data/ext/nokogiri/xml_element_content.h +0 -10
  197. data/ext/nokogiri/xml_element_decl.h +0 -9
  198. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  199. data/ext/nokogiri/xml_entity_decl.h +0 -10
  200. data/ext/nokogiri/xml_entity_reference.h +0 -9
  201. data/ext/nokogiri/xml_io.c +0 -61
  202. data/ext/nokogiri/xml_io.h +0 -11
  203. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  204. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  205. data/ext/nokogiri/xml_namespace.h +0 -14
  206. data/ext/nokogiri/xml_node.h +0 -13
  207. data/ext/nokogiri/xml_node_set.h +0 -12
  208. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  209. data/ext/nokogiri/xml_reader.h +0 -10
  210. data/ext/nokogiri/xml_relax_ng.h +0 -9
  211. data/ext/nokogiri/xml_sax_parser.h +0 -39
  212. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  213. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  214. data/ext/nokogiri/xml_schema.h +0 -9
  215. data/ext/nokogiri/xml_syntax_error.h +0 -13
  216. data/ext/nokogiri/xml_text.h +0 -9
  217. data/ext/nokogiri/xml_xpath_context.h +0 -10
  218. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  219. data/lib/nokogiri/html/document.rb +0 -335
  220. data/lib/nokogiri/html/document_fragment.rb +0 -49
  221. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  222. data/lib/nokogiri/html/sax/parser.rb +0 -62
  223. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  224. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
  225. data/patches/libxml2/0004-libxml2.la-is-in-top_builddir.patch +0 -25
  226. data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch +0 -32
  227. data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
  228. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
  229. data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
  230. data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
@@ -1,169 +1,256 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Nokogiri
2
4
  module XML
3
- ###
4
- # SAX Parsers are event driven parsers. Nokogiri provides two different
5
- # event based parsers when dealing with XML. If you want to do SAX style
6
- # parsing using HTML, check out Nokogiri::HTML::SAX.
7
- #
8
- # The basic way a SAX style parser works is by creating a parser,
9
- # telling the parser about the events we're interested in, then giving
10
- # the parser some XML to process. The parser will notify you when
11
- # it encounters events you said you would like to know about.
12
- #
13
- # To register for events, you simply subclass Nokogiri::XML::SAX::Document,
14
- # and implement the methods for which you would like notification.
15
- #
16
- # For example, if I want to be notified when a document ends, and when an
17
- # element starts, I would write a class like this:
18
- #
19
- # class MyDocument < Nokogiri::XML::SAX::Document
20
- # def end_document
21
- # puts "the document has ended"
22
- # end
23
- #
24
- # def start_element name, attributes = []
25
- # puts "#{name} started"
26
- # end
27
- # end
28
- #
29
- # Then I would instantiate a SAX parser with this document, and feed the
30
- # parser some XML
31
- #
32
- # # Create a new parser
33
- # parser = Nokogiri::XML::SAX::Parser.new(MyDocument.new)
34
- #
35
- # # Feed the parser some XML
36
- # parser.parse(File.open(ARGV[0]))
37
- #
38
- # Now my document handler will be called when each node starts, and when
39
- # then document ends. To see what kinds of events are available, take
40
- # a look at Nokogiri::XML::SAX::Document.
41
- #
42
- # Two SAX parsers for XML are available, a parser that reads from a string
43
- # or IO object as it feels necessary, and a parser that lets you spoon
44
- # feed it XML. If you want to let Nokogiri deal with reading your XML,
45
- # use the Nokogiri::XML::SAX::Parser. If you want to have fine grain
46
- # control over the XML input, use the Nokogiri::XML::SAX::PushParser.
47
5
  module SAX
48
- ###
49
- # This class is used for registering types of events you are interested
50
- # in handling. All of the methods on this class are available as
51
- # possible events while parsing an XML document. To register for any
52
- # particular event, just subclass this class and implement the methods
53
- # you are interested in knowing about.
54
- #
55
- # To only be notified about start and end element events, write a class
56
- # like this:
57
- #
58
- # class MyDocument < Nokogiri::XML::SAX::Document
59
- # def start_element name, attrs = []
60
- # puts "#{name} started!"
61
- # end
6
+ # :markup: markdown
7
+ #
8
+ # The SAX::Document class is used for registering types of events you are interested in
9
+ # handling. All of the methods on this class are available as possible events while parsing an
10
+ # \XML document. To register for any particular event, subclass this class and implement the
11
+ # methods you are interested in knowing about.
62
12
  #
63
- # def end_element name
64
- # puts "#{name} ended"
13
+ # To only be notified about start and end element events, write a class like this:
14
+ #
15
+ # class MyHandler < Nokogiri::XML::SAX::Document
16
+ # def start_element name, attrs = []
17
+ # puts "#{name} started!"
18
+ # end
19
+ #
20
+ # def end_element name
21
+ # puts "#{name} ended"
22
+ # end
65
23
  # end
66
- # end
67
24
  #
68
- # You can use this event handler for any SAX style parser included with
69
- # Nokogiri. See Nokogiri::XML::SAX, and Nokogiri::HTML::SAX.
25
+ # You can use this event handler for any SAX-style parser included with Nokogiri.
26
+ #
27
+ # See also:
28
+ #
29
+ # - Nokogiri::XML::SAX
30
+ # - Nokogiri::HTML4::SAX
31
+ #
32
+ # ### Entity Handling
33
+ #
34
+ # ⚠ Entity handling is complicated in a SAX parser! Please read this section carefully if
35
+ # you're not getting the behavior you expect.
36
+ #
37
+ # Entities will be reported to the user via callbacks to #characters, to #reference, or
38
+ # possibly to both. The behavior is determined by a combination of _entity type_ and the value
39
+ # of ParserContext#replace_entities. (Recall that the default value of
40
+ # ParserContext#replace_entities is `false`.)
41
+ #
42
+ # ⚠ <b>It is UNSAFE to set ParserContext#replace_entities to `true`</b> when parsing untrusted
43
+ # documents.
44
+ #
45
+ # 💡 For more information on entity types, see [Wikipedia's page on
46
+ # DTDs](https://en.wikipedia.org/wiki/Document_type_definition#Entity_declarations).
47
+ #
48
+ # | Entity type | #characters | #reference |
49
+ # |--------------------------------------|------------------------------------|-------------------------------------|
50
+ # | Char ref (e.g., <tt>&#146;</tt>) | always | never |
51
+ # | Predefined (e.g., <tt>&amp;</tt>) | always | never |
52
+ # | Undeclared † | never | <tt>#replace_entities == false</tt> |
53
+ # | Internal | always | <tt>#replace_entities == false</tt> |
54
+ # | External † | <tt>#replace_entities == true</tt> | <tt>#replace_entities == false</tt> |
55
+ #
56
+ # &nbsp;
57
+ #
58
+ # † In the case where the replacement text for the entity is unknown (e.g., an undeclared entity
59
+ # or an external entity that could not be resolved because of network issues), then the
60
+ # replacement text will not be reported. If ParserContext#replace_entities is `true`, this
61
+ # means the #characters callback will not be invoked. If ParserContext#replace_entities is
62
+ # `false`, then the #reference callback will be invoked, but with `nil` for the `content`
63
+ # argument.
64
+ #
70
65
  class Document
71
66
  ###
72
- # Called when an XML declaration is parsed
73
- def xmldecl version, encoding, standalone
67
+ # Called when an \XML declaration is parsed.
68
+ #
69
+ # [Parameters]
70
+ # - +version+ (String) the version attribute
71
+ # - +encoding+ (String, nil) the encoding of the document if present, else +nil+
72
+ # - +standalone+ ("yes", "no", nil) the standalone attribute if present, else +nil+
73
+ def xmldecl(version, encoding, standalone)
74
74
  end
75
75
 
76
76
  ###
77
- # Called when document starts parsing
77
+ # Called when document starts parsing.
78
78
  def start_document
79
79
  end
80
80
 
81
81
  ###
82
- # Called when document ends parsing
82
+ # Called when document ends parsing.
83
83
  def end_document
84
84
  end
85
85
 
86
86
  ###
87
- # Called at the beginning of an element
88
- # * +name+ is the name of the tag
89
- # * +attrs+ are an assoc list of namespaces and attributes, e.g.:
87
+ # Called at the beginning of an element.
88
+ #
89
+ # [Parameters]
90
+ # - +name+ (String) the name of the element
91
+ # - +attrs+ (Array<Array<String>>) an assoc list of namespace declarations and attributes, e.g.:
90
92
  # [ ["xmlns:foo", "http://sample.net"], ["size", "large"] ]
91
- def start_element name, attrs = []
93
+ #
94
+ # 💡If you're dealing with XML and need to handle namespaces, use the
95
+ # #start_element_namespace method instead.
96
+ #
97
+ # Note that the element namespace and any attribute namespaces are not provided, and so any
98
+ # namespaced elements or attributes will be returned as strings including the prefix:
99
+ #
100
+ # parser.parse(<<~XML)
101
+ # <root xmlns:foo='http://foo.example.com/' xmlns='http://example.com/'>
102
+ # <foo:bar foo:quux="xxx">hello world</foo:bar>
103
+ # </root>
104
+ # XML
105
+ #
106
+ # assert_pattern do
107
+ # parser.document.start_elements => [
108
+ # ["root", [["xmlns:foo", "http://foo.example.com/"], ["xmlns", "http://example.com/"]]],
109
+ # ["foo:bar", [["foo:quux", "xxx"]]],
110
+ # ]
111
+ # end
112
+ #
113
+ def start_element(name, attrs = [])
92
114
  end
93
115
 
94
116
  ###
95
- # Called at the end of an element
96
- # +name+ is the tag name
97
- def end_element name
117
+ # Called at the end of an element.
118
+ #
119
+ # [Parameters]
120
+ # - +name+ (String) the name of the element being closed
121
+ #
122
+ def end_element(name)
98
123
  end
99
124
 
100
125
  ###
101
- # Called at the beginning of an element
102
- # +name+ is the element name
103
- # +attrs+ is a list of attributes
104
- # +prefix+ is the namespace prefix for the element
105
- # +uri+ is the associated namespace URI
106
- # +ns+ is a hash of namespace prefix:urls associated with the element
107
- def start_element_namespace name, attrs = [], prefix = nil, uri = nil, ns = []
108
- ###
126
+ # Called at the beginning of an element.
127
+ #
128
+ # [Parameters]
129
+ # - +name+ (String) is the name of the element
130
+ # - +attrs+ (Array<Attribute>) is an array of structs with the following properties:
131
+ # - +localname+ (String) the local name of the attribute
132
+ # - +value+ (String) the value of the attribute
133
+ # - +prefix+ (String, nil) the namespace prefix of the attribute
134
+ # - +uri+ (String, nil) the namespace URI of the attribute
135
+ # - +prefix+ (String, nil) is the namespace prefix for the element
136
+ # - +uri+ (String, nil) is the associated URI for the element's namespace
137
+ # - +ns+ (Array<Array<String, String>>) is an assoc list of namespace declarations on the element
138
+ #
139
+ # 💡If you're dealing with HTML or don't care about namespaces, try #start_element instead.
140
+ #
141
+ # [Example]
142
+ # it "start_elements_namespace is called with namespaced attributes" do
143
+ # parser.parse(<<~XML)
144
+ # <root xmlns:foo='http://foo.example.com/'>
145
+ # <foo:a foo:bar='hello' />
146
+ # </root>
147
+ # XML
148
+ #
149
+ # assert_pattern do
150
+ # parser.document.start_elements_namespace => [
151
+ # [
152
+ # "root",
153
+ # [],
154
+ # nil, nil,
155
+ # [["foo", "http://foo.example.com/"]], # namespace declarations
156
+ # ], [
157
+ # "a",
158
+ # [Nokogiri::XML::SAX::Parser::Attribute(localname: "bar", prefix: "foo", uri: "http://foo.example.com/", value: "hello")], # prefixed attribute
159
+ # "foo", "http://foo.example.com/", # prefix and uri for the "a" element
160
+ # [],
161
+ # ]
162
+ # ]
163
+ # end
164
+ # end
165
+ #
166
+ def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = []) # rubocop:disable Metrics/ParameterLists
109
167
  # Deal with SAX v1 interface
110
- name = [prefix, name].compact.join(':')
111
- attributes = ns.map { |ns_prefix,ns_uri|
112
- [['xmlns', ns_prefix].compact.join(':'), ns_uri]
113
- } + attrs.map { |attr|
114
- [[attr.prefix, attr.localname].compact.join(':'), attr.value]
115
- }
116
- start_element name, attributes
168
+ name = [prefix, name].compact.join(":")
169
+ attributes = ns.map do |ns_prefix, ns_uri|
170
+ [["xmlns", ns_prefix].compact.join(":"), ns_uri]
171
+ end + attrs.map do |attr|
172
+ [[attr.prefix, attr.localname].compact.join(":"), attr.value]
173
+ end
174
+ start_element(name, attributes)
117
175
  end
118
176
 
119
177
  ###
120
- # Called at the end of an element
121
- # +name+ is the element's name
122
- # +prefix+ is the namespace prefix associated with the element
123
- # +uri+ is the associated namespace URI
124
- def end_element_namespace name, prefix = nil, uri = nil
125
- ###
178
+ # Called at the end of an element.
179
+ #
180
+ # [Parameters]
181
+ # - +name+ (String) is the name of the element
182
+ # - +prefix+ (String, nil) is the namespace prefix for the element
183
+ # - +uri+ (String, nil) is the associated URI for the element's namespace
184
+ #
185
+ def end_element_namespace(name, prefix = nil, uri = nil)
126
186
  # Deal with SAX v1 interface
127
- end_element [prefix, name].compact.join(':')
187
+ end_element([prefix, name].compact.join(":"))
128
188
  end
129
189
 
130
190
  ###
131
- # Characters read between a tag. This method might be called multiple
132
- # times given one contiguous string of characters.
191
+ # Called when character data is parsed, and for parsed entities when
192
+ # ParserContext#replace_entities is +true+.
193
+ #
194
+ # [Parameters]
195
+ # - +string+ contains the character data or entity replacement text
196
+ #
197
+ # ⚠ Please see Document@Entity+Handling for important information about how entities are handled.
198
+ #
199
+ # ⚠ This method might be called multiple times for a contiguous string of characters.
200
+ #
201
+ def characters(string)
202
+ end
203
+
204
+ ###
205
+ # Called when a parsed entity is referenced and not replaced.
206
+ #
207
+ # [Parameters]
208
+ # - +name+ (String) is the name of the entity
209
+ # - +content+ (String, nil) is the replacement text for the entity, if known
210
+ #
211
+ # ⚠ Please see Document@Entity+Handling for important information about how entities are handled.
212
+ #
213
+ # ⚠ An internal entity may result in a call to both #characters and #reference.
214
+ #
215
+ # Since v1.17.0
133
216
  #
134
- # +string+ contains the character data
135
- def characters string
217
+ def reference(name, content)
136
218
  end
137
219
 
138
220
  ###
139
221
  # Called when comments are encountered
140
- # +string+ contains the comment data
141
- def comment string
222
+ # [Parameters]
223
+ # - +string+ contains the comment data
224
+ def comment(string)
142
225
  end
143
226
 
144
227
  ###
145
228
  # Called on document warnings
146
- # +string+ contains the warning
147
- def warning string
229
+ # [Parameters]
230
+ # - +string+ contains the warning
231
+ def warning(string)
148
232
  end
149
233
 
150
234
  ###
151
235
  # Called on document errors
152
- # +string+ contains the error
153
- def error string
236
+ # [Parameters]
237
+ # - +string+ contains the error
238
+ def error(string)
154
239
  end
155
240
 
156
241
  ###
157
242
  # Called when cdata blocks are found
158
- # +string+ contains the cdata content
159
- def cdata_block string
243
+ # [Parameters]
244
+ # - +string+ contains the cdata content
245
+ def cdata_block(string)
160
246
  end
161
247
 
162
248
  ###
163
249
  # Called when processing instructions are found
164
- # +name+ is the target of the instruction
165
- # +content+ is the value of the instruction
166
- def processing_instruction name, content
250
+ # [Parameters]
251
+ # - +name+ is the target of the instruction
252
+ # - +content+ is the value of the instruction
253
+ def processing_instruction(name, content)
167
254
  end
168
255
  end
169
256
  end
@@ -1,17 +1,18 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Nokogiri
2
4
  module XML
3
5
  module SAX
4
6
  ###
5
- # This parser is a SAX style parser that reads it's input as it
6
- # deems necessary. The parser takes a Nokogiri::XML::SAX::Document,
7
- # an optional encoding, then given an XML input, sends messages to
8
- # the Nokogiri::XML::SAX::Document.
7
+ # This parser is a SAX style parser that reads its input as it deems necessary. The parser
8
+ # takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an XML input, sends
9
+ # messages to the Nokogiri::XML::SAX::Document.
9
10
  #
10
11
  # Here is an example of using this parser:
11
12
  #
12
13
  # # Create a subclass of Nokogiri::XML::SAX::Document and implement
13
14
  # # the events we care about:
14
- # class MyDoc < Nokogiri::XML::SAX::Document
15
+ # class MyHandler < Nokogiri::XML::SAX::Document
15
16
  # def start_element name, attrs = []
16
17
  # puts "starting: #{name}"
17
18
  # end
@@ -21,44 +22,54 @@ module Nokogiri
21
22
  # end
22
23
  # end
23
24
  #
24
- # # Create our parser
25
- # parser = Nokogiri::XML::SAX::Parser.new(MyDoc.new)
25
+ # parser = Nokogiri::XML::SAX::Parser.new(MyHandler.new)
26
+ #
27
+ # # Hand an IO object to the parser, which will read the XML from the IO.
28
+ # File.open(path_to_xml) do |f|
29
+ # parser.parse(f)
30
+ # end
31
+ #
32
+ # For more information about \SAX parsers, see Nokogiri::XML::SAX.
33
+ #
34
+ # Also see Nokogiri::XML::SAX::Document for the available events.
26
35
  #
27
- # # Send some XML to the parser
28
- # parser.parse(File.open(ARGV[0]))
36
+ # For \HTML documents, use the subclass Nokogiri::HTML4::SAX::Parser.
29
37
  #
30
- # For more information about SAX parsers, see Nokogiri::XML::SAX. Also
31
- # see Nokogiri::XML::SAX::Document for the available events.
32
38
  class Parser
39
+ # to dynamically resolve ParserContext in inherited methods
40
+ include Nokogiri::ClassResolver
41
+
42
+ # Structure used for marshalling attributes for some callbacks in XML::SAX::Document.
33
43
  class Attribute < Struct.new(:localname, :prefix, :uri, :value)
34
44
  end
35
45
 
36
- # Encodinds this parser supports
37
- ENCODINGS = {
38
- 'NONE' => 0, # No char encoding detected
39
- 'UTF-8' => 1, # UTF-8
40
- 'UTF16LE' => 2, # UTF-16 little endian
41
- 'UTF16BE' => 3, # UTF-16 big endian
42
- 'UCS4LE' => 4, # UCS-4 little endian
43
- 'UCS4BE' => 5, # UCS-4 big endian
44
- 'EBCDIC' => 6, # EBCDIC uh!
45
- 'UCS4-2143' => 7, # UCS-4 unusual ordering
46
- 'UCS4-3412' => 8, # UCS-4 unusual ordering
47
- 'UCS2' => 9, # UCS-2
48
- 'ISO-8859-1' => 10, # ISO-8859-1 ISO Latin 1
49
- 'ISO-8859-2' => 11, # ISO-8859-2 ISO Latin 2
50
- 'ISO-8859-3' => 12, # ISO-8859-3
51
- 'ISO-8859-4' => 13, # ISO-8859-4
52
- 'ISO-8859-5' => 14, # ISO-8859-5
53
- 'ISO-8859-6' => 15, # ISO-8859-6
54
- 'ISO-8859-7' => 16, # ISO-8859-7
55
- 'ISO-8859-8' => 17, # ISO-8859-8
56
- 'ISO-8859-9' => 18, # ISO-8859-9
57
- 'ISO-2022-JP' => 19, # ISO-2022-JP
58
- 'SHIFT-JIS' => 20, # Shift_JIS
59
- 'EUC-JP' => 21, # EUC-JP
60
- 'ASCII' => 22, # pure ASCII
46
+ ENCODINGS = { # :nodoc:
47
+ "NONE" => 0, # No char encoding detected
48
+ "UTF-8" => 1, # UTF-8
49
+ "UTF16LE" => 2, # UTF-16 little endian
50
+ "UTF16BE" => 3, # UTF-16 big endian
51
+ "UCS4LE" => 4, # UCS-4 little endian
52
+ "UCS4BE" => 5, # UCS-4 big endian
53
+ "EBCDIC" => 6, # EBCDIC uh!
54
+ "UCS4-2143" => 7, # UCS-4 unusual ordering
55
+ "UCS4-3412" => 8, # UCS-4 unusual ordering
56
+ "UCS2" => 9, # UCS-2
57
+ "ISO-8859-1" => 10, # ISO-8859-1 ISO Latin 1
58
+ "ISO-8859-2" => 11, # ISO-8859-2 ISO Latin 2
59
+ "ISO-8859-3" => 12, # ISO-8859-3
60
+ "ISO-8859-4" => 13, # ISO-8859-4
61
+ "ISO-8859-5" => 14, # ISO-8859-5
62
+ "ISO-8859-6" => 15, # ISO-8859-6
63
+ "ISO-8859-7" => 16, # ISO-8859-7
64
+ "ISO-8859-8" => 17, # ISO-8859-8
65
+ "ISO-8859-9" => 18, # ISO-8859-9
66
+ "ISO-2022-JP" => 19, # ISO-2022-JP
67
+ "SHIFT-JIS" => 20, # Shift_JIS
68
+ "EUC-JP" => 21, # EUC-JP
69
+ "ASCII" => 22, # pure ASCII
61
70
  }
71
+ REVERSE_ENCODINGS = ENCODINGS.invert # :nodoc:
72
+ deprecate_constant :ENCODINGS
62
73
 
63
74
  # The Nokogiri::XML::SAX::Document where events will be sent.
64
75
  attr_accessor :document
@@ -66,55 +77,121 @@ module Nokogiri
66
77
  # The encoding being used for this document.
67
78
  attr_accessor :encoding
68
79
 
69
- # Create a new Parser with +doc+ and +encoding+
70
- def initialize doc = Nokogiri::XML::SAX::Document.new, encoding = 'UTF-8'
71
- @encoding = check_encoding(encoding)
80
+ ###
81
+ # :call-seq:
82
+ # new ⇒ SAX::Parser
83
+ # new(handler) ⇒ SAX::Parser
84
+ # new(handler, encoding) ⇒ SAX::Parser
85
+ #
86
+ # Create a new Parser.
87
+ #
88
+ # [Parameters]
89
+ # - +handler+ (optional Nokogiri::XML::SAX::Document) The document that will receive
90
+ # events. Will create a new Nokogiri::XML::SAX::Document if not given, which is accessible
91
+ # through the #document attribute.
92
+ # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
93
+ # parsing the input. (default +nil+ for auto-detection)
94
+ #
95
+ def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = nil)
96
+ @encoding = encoding
72
97
  @document = doc
73
98
  @warned = false
99
+
100
+ initialize_native unless Nokogiri.jruby?
74
101
  end
75
102
 
76
103
  ###
77
- # Parse given +thing+ which may be a string containing xml, or an
78
- # IO object.
79
- def parse thing, &block
80
- if thing.respond_to?(:read) && thing.respond_to?(:close)
81
- parse_io(thing, &block)
104
+ # :call-seq:
105
+ # parse(input) { |parser_context| ... }
106
+ #
107
+ # Parse the input, sending events to the SAX::Document at #document.
108
+ #
109
+ # [Parameters]
110
+ # - +input+ (String, IO) The input to parse.
111
+ #
112
+ # If +input+ quacks like a readable IO object, this method forwards to Parser#parse_io,
113
+ # otherwise it forwards to Parser#parse_memory.
114
+ #
115
+ # [Yields]
116
+ # If a block is given, the underlying ParserContext object will be yielded. This can be used
117
+ # to set options on the parser context before parsing begins.
118
+ #
119
+ def parse(input, &block)
120
+ if input.respond_to?(:read) && input.respond_to?(:close)
121
+ parse_io(input, &block)
82
122
  else
83
- parse_memory(thing, &block)
123
+ parse_memory(input, &block)
84
124
  end
85
125
  end
86
126
 
87
127
  ###
88
- # Parse given +io+
89
- def parse_io io, encoding = 'ASCII'
90
- @encoding = check_encoding(encoding)
91
- ctx = ParserContext.io(io, ENCODINGS[@encoding])
128
+ # :call-seq:
129
+ # parse_io(io) { |parser_context| ... }
130
+ # parse_io(io, encoding) { |parser_context| ... }
131
+ #
132
+ # Parse an input stream.
133
+ #
134
+ # [Parameters]
135
+ # - +io+ (IO) The readable IO object from which to read input
136
+ # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
137
+ # parsing the input, or +nil+ for auto-detection. (default #encoding)
138
+ #
139
+ # [Yields]
140
+ # If a block is given, the underlying ParserContext object will be yielded. This can be used
141
+ # to set options on the parser context before parsing begins.
142
+ #
143
+ def parse_io(io, encoding = @encoding)
144
+ ctx = related_class("ParserContext").io(io, encoding)
92
145
  yield ctx if block_given?
93
- ctx.parse_with self
146
+ ctx.parse_with(self)
94
147
  end
95
148
 
96
149
  ###
97
- # Parse a file with +filename+
98
- def parse_file filename
99
- raise ArgumentError unless filename
100
- raise Errno::ENOENT unless File.exist?(filename)
101
- raise Errno::EISDIR if File.directory?(filename)
102
- ctx = ParserContext.file filename
150
+ # :call-seq:
151
+ # parse_memory(input) { |parser_context| ... }
152
+ # parse_memory(input, encoding) { |parser_context| ... }
153
+ #
154
+ # Parse an input string.
155
+ #
156
+ # [Parameters]
157
+ # - +input+ (String) The input string to be parsed.
158
+ # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
159
+ # parsing the input, or +nil+ for auto-detection. (default #encoding)
160
+ #
161
+ # [Yields]
162
+ # If a block is given, the underlying ParserContext object will be yielded. This can be used
163
+ # to set options on the parser context before parsing begins.
164
+ #
165
+ def parse_memory(input, encoding = @encoding)
166
+ ctx = related_class("ParserContext").memory(input, encoding)
103
167
  yield ctx if block_given?
104
- ctx.parse_with self
168
+ ctx.parse_with(self)
105
169
  end
106
170
 
107
- def parse_memory data
108
- ctx = ParserContext.memory data
109
- yield ctx if block_given?
110
- ctx.parse_with self
111
- end
171
+ ###
172
+ # :call-seq:
173
+ # parse_file(filename) { |parser_context| ... }
174
+ # parse_file(filename, encoding) { |parser_context| ... }
175
+ #
176
+ # Parse a file.
177
+ #
178
+ # [Parameters]
179
+ # - +filename+ (String) The path to the file to be parsed.
180
+ # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
181
+ # parsing the input, or +nil+ for auto-detection. (default #encoding)
182
+ #
183
+ # [Yields]
184
+ # If a block is given, the underlying ParserContext object will be yielded. This can be used
185
+ # to set options on the parser context before parsing begins.
186
+ #
187
+ def parse_file(filename, encoding = @encoding)
188
+ raise ArgumentError, "no filename provided" unless filename
189
+ raise Errno::ENOENT unless File.exist?(filename)
190
+ raise Errno::EISDIR if File.directory?(filename)
112
191
 
113
- private
114
- def check_encoding(encoding)
115
- encoding.upcase.tap do |enc|
116
- raise ArgumentError.new("'#{enc}' is not a valid encoding") unless ENCODINGS[enc]
117
- end
192
+ ctx = related_class("ParserContext").file(filename, encoding)
193
+ yield ctx if block_given?
194
+ ctx.parse_with(self)
118
195
  end
119
196
  end
120
197
  end