nokogiri 1.16.8-x86_64-linux → 1.17.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +11 -21
  3. data/README.md +4 -0
  4. data/dependencies.yml +6 -6
  5. data/ext/nokogiri/extconf.rb +191 -137
  6. data/ext/nokogiri/gumbo.c +69 -53
  7. data/ext/nokogiri/html4_document.c +10 -4
  8. data/ext/nokogiri/html4_element_description.c +18 -18
  9. data/ext/nokogiri/html4_sax_parser.c +40 -0
  10. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  11. data/ext/nokogiri/html4_sax_push_parser.c +25 -24
  12. data/ext/nokogiri/include/libexslt/exsltconfig.h +3 -3
  13. data/ext/nokogiri/include/libxml2/libxml/HTMLparser.h +12 -19
  14. data/ext/nokogiri/include/libxml2/libxml/c14n.h +1 -12
  15. data/ext/nokogiri/include/libxml2/libxml/debugXML.h +1 -1
  16. data/ext/nokogiri/include/libxml2/libxml/encoding.h +9 -0
  17. data/ext/nokogiri/include/libxml2/libxml/entities.h +12 -1
  18. data/ext/nokogiri/include/libxml2/libxml/hash.h +19 -0
  19. data/ext/nokogiri/include/libxml2/libxml/list.h +2 -2
  20. data/ext/nokogiri/include/libxml2/libxml/nanohttp.h +17 -0
  21. data/ext/nokogiri/include/libxml2/libxml/parser.h +60 -54
  22. data/ext/nokogiri/include/libxml2/libxml/parserInternals.h +9 -1
  23. data/ext/nokogiri/include/libxml2/libxml/pattern.h +6 -0
  24. data/ext/nokogiri/include/libxml2/libxml/tree.h +32 -12
  25. data/ext/nokogiri/include/libxml2/libxml/uri.h +11 -0
  26. data/ext/nokogiri/include/libxml2/libxml/valid.h +29 -2
  27. data/ext/nokogiri/include/libxml2/libxml/xinclude.h +7 -0
  28. data/ext/nokogiri/include/libxml2/libxml/xmlIO.h +21 -4
  29. data/ext/nokogiri/include/libxml2/libxml/xmlerror.h +14 -0
  30. data/ext/nokogiri/include/libxml2/libxml/xmlexports.h +111 -15
  31. data/ext/nokogiri/include/libxml2/libxml/xmlmemory.h +8 -45
  32. data/ext/nokogiri/include/libxml2/libxml/xmlreader.h +2 -0
  33. data/ext/nokogiri/include/libxml2/libxml/xmlsave.h +5 -0
  34. data/ext/nokogiri/include/libxml2/libxml/xmlunicode.h +165 -1
  35. data/ext/nokogiri/include/libxml2/libxml/xmlversion.h +7 -171
  36. data/ext/nokogiri/include/libxml2/libxml/xmlwriter.h +1 -0
  37. data/ext/nokogiri/include/libxml2/libxml/xpath.h +4 -0
  38. data/ext/nokogiri/include/libxslt/xsltInternals.h +3 -0
  39. data/ext/nokogiri/include/libxslt/xsltconfig.h +4 -37
  40. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  41. data/ext/nokogiri/nokogiri.c +9 -2
  42. data/ext/nokogiri/nokogiri.h +18 -33
  43. data/ext/nokogiri/xml_attr.c +1 -1
  44. data/ext/nokogiri/xml_cdata.c +2 -10
  45. data/ext/nokogiri/xml_comment.c +3 -8
  46. data/ext/nokogiri/xml_document.c +163 -156
  47. data/ext/nokogiri/xml_document_fragment.c +10 -25
  48. data/ext/nokogiri/xml_dtd.c +1 -1
  49. data/ext/nokogiri/xml_element_content.c +9 -9
  50. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  51. data/ext/nokogiri/xml_namespace.c +6 -6
  52. data/ext/nokogiri/xml_node.c +130 -104
  53. data/ext/nokogiri/xml_node_set.c +46 -44
  54. data/ext/nokogiri/xml_reader.c +54 -58
  55. data/ext/nokogiri/xml_relax_ng.c +35 -56
  56. data/ext/nokogiri/xml_sax_parser.c +156 -88
  57. data/ext/nokogiri/xml_sax_parser_context.c +213 -131
  58. data/ext/nokogiri/xml_sax_push_parser.c +68 -49
  59. data/ext/nokogiri/xml_schema.c +50 -85
  60. data/ext/nokogiri/xml_syntax_error.c +19 -11
  61. data/ext/nokogiri/xml_text.c +2 -4
  62. data/ext/nokogiri/xml_xpath_context.c +2 -2
  63. data/ext/nokogiri/xslt_stylesheet.c +8 -8
  64. data/lib/nokogiri/3.0/nokogiri.so +0 -0
  65. data/lib/nokogiri/3.1/nokogiri.so +0 -0
  66. data/lib/nokogiri/3.2/nokogiri.so +0 -0
  67. data/lib/nokogiri/3.3/nokogiri.so +0 -0
  68. data/lib/nokogiri/class_resolver.rb +1 -1
  69. data/lib/nokogiri/css/node.rb +6 -2
  70. data/lib/nokogiri/css/parser.rb +6 -4
  71. data/lib/nokogiri/css/parser.y +2 -2
  72. data/lib/nokogiri/css/parser_extras.rb +6 -66
  73. data/lib/nokogiri/css/selector_cache.rb +38 -0
  74. data/lib/nokogiri/css/tokenizer.rb +4 -4
  75. data/lib/nokogiri/css/tokenizer.rex +9 -8
  76. data/lib/nokogiri/css/xpath_visitor.rb +42 -6
  77. data/lib/nokogiri/css.rb +86 -20
  78. data/lib/nokogiri/decorators/slop.rb +3 -5
  79. data/lib/nokogiri/encoding_handler.rb +2 -2
  80. data/lib/nokogiri/html4/document.rb +44 -23
  81. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  82. data/lib/nokogiri/html4/encoding_reader.rb +1 -1
  83. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  84. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  85. data/lib/nokogiri/html4.rb +9 -14
  86. data/lib/nokogiri/html5/builder.rb +40 -0
  87. data/lib/nokogiri/html5/document.rb +61 -30
  88. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  89. data/lib/nokogiri/html5/node.rb +4 -4
  90. data/lib/nokogiri/html5.rb +114 -72
  91. data/lib/nokogiri/version/constant.rb +1 -1
  92. data/lib/nokogiri/xml/builder.rb +8 -1
  93. data/lib/nokogiri/xml/document.rb +70 -26
  94. data/lib/nokogiri/xml/document_fragment.rb +84 -13
  95. data/lib/nokogiri/xml/node.rb +82 -11
  96. data/lib/nokogiri/xml/node_set.rb +9 -7
  97. data/lib/nokogiri/xml/parse_options.rb +1 -1
  98. data/lib/nokogiri/xml/pp/node.rb +6 -1
  99. data/lib/nokogiri/xml/reader.rb +46 -13
  100. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  101. data/lib/nokogiri/xml/sax/document.rb +174 -83
  102. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  103. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  104. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  105. data/lib/nokogiri/xml/sax.rb +48 -0
  106. data/lib/nokogiri/xml/schema.rb +112 -45
  107. data/lib/nokogiri/xml/searchable.rb +6 -8
  108. data/lib/nokogiri/xml/syntax_error.rb +22 -0
  109. data/lib/nokogiri/xml.rb +13 -24
  110. data/lib/nokogiri/xslt.rb +3 -9
  111. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  112. metadata +8 -4
  113. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
@@ -3,36 +3,73 @@
3
3
  module Nokogiri
4
4
  module XML
5
5
  class << self
6
- ###
7
- # Create a new Nokogiri::XML::RelaxNG document from +string_or_io+.
8
- # See Nokogiri::XML::RelaxNG for an example.
9
- def RelaxNG(string_or_io, options = ParseOptions::DEFAULT_SCHEMA)
10
- RelaxNG.new(string_or_io, options)
6
+ # :call-seq:
7
+ # RelaxNG(input) → Nokogiri::XML::RelaxNG
8
+ # RelaxNG(input, options:) → Nokogiri::XML::RelaxNG
9
+ #
10
+ # Convenience method for Nokogiri::XML::RelaxNG.new
11
+ def RelaxNG(...)
12
+ RelaxNG.new(...)
11
13
  end
12
14
  end
13
15
 
14
- ###
15
- # Nokogiri::XML::RelaxNG is used for validating XML against a
16
- # RelaxNG schema.
16
+ # Nokogiri::XML::RelaxNG is used for validating \XML against a RELAX NG schema definition.
17
17
  #
18
- # == Synopsis
18
+ # 🛡 <b>Do not use this class for untrusted schema documents.</b> RELAX NG input is always
19
+ # treated as *trusted*, meaning that the underlying parsing libraries <b>will access network
20
+ # resources</b>. This is counter to Nokogiri's "untrusted by default" security policy, but is an
21
+ # unfortunate limitation of the underlying libraries.
19
22
  #
20
- # Validate an XML document against a RelaxNG schema. Loop over the errors
21
- # that are returned and print them out:
23
+ # *Example:* Determine whether an \XML document is valid.
22
24
  #
23
- # schema = Nokogiri::XML::RelaxNG(File.open(ADDRESS_SCHEMA_FILE))
24
- # doc = Nokogiri::XML(File.open(ADDRESS_XML_FILE))
25
+ # schema = Nokogiri::XML::RelaxNG.new(File.read(RELAX_NG_FILE))
26
+ # doc = Nokogiri::XML::Document.parse(File.read(XML_FILE))
27
+ # schema.valid?(doc) # Boolean
25
28
  #
26
- # schema.validate(doc).each do |error|
27
- # puts error.message
28
- # end
29
+ # *Example:* Validate an \XML document against a \RelaxNG schema, and capture any errors that are found.
29
30
  #
30
- # The list of errors are Nokogiri::XML::SyntaxError objects.
31
+ # schema = Nokogiri::XML::RelaxNG.new(File.open(RELAX_NG_FILE))
32
+ # doc = Nokogiri::XML::Document.parse(File.open(XML_FILE))
33
+ # errors = schema.validate(doc) # Array<SyntaxError>
34
+ #
35
+ # *Example:* Validate an \XML document using a Document containing a RELAX NG schema definition.
36
+ #
37
+ # schema_doc = Nokogiri::XML::Document.parse(File.read(RELAX_NG_FILE))
38
+ # schema = Nokogiri::XML::RelaxNG.from_document(schema_doc)
39
+ # doc = Nokogiri::XML::Document.parse(File.open(XML_FILE))
40
+ # schema.valid?(doc) # Boolean
31
41
  #
32
- # NOTE: RelaxNG input is always treated as TRUSTED documents, meaning that they will cause the
33
- # underlying parsing libraries to access network resources. This is counter to Nokogiri's
34
- # "untrusted by default" security policy, but is a limitation of the underlying libraries.
35
42
  class RelaxNG < Nokogiri::XML::Schema
43
+ # :call-seq:
44
+ # new(input) → Nokogiri::XML::RelaxNG
45
+ # new(input, options:) → Nokogiri::XML::RelaxNG
46
+ #
47
+ # Parse a RELAX NG schema definition from a String or IO to create a new Nokogiri::XML::RelaxNG.
48
+ #
49
+ # [Parameters]
50
+ # - +input+ (String | IO) RELAX NG schema definition
51
+ # - +options:+ (Nokogiri::XML::ParseOptions)
52
+ # Defaults to Nokogiri::XML::ParseOptions::DEFAULT_SCHEMA ⚠ Unused
53
+ #
54
+ # [Returns] Nokogiri::XML::RelaxNG
55
+ #
56
+ # ⚠ +options:+ is currently unused by this method and is present only as a placeholder for
57
+ # future functionality.
58
+ #
59
+ # Also see convenience method Nokogiri::XML::RelaxNG()
60
+ def self.new(input, parse_options_ = ParseOptions::DEFAULT_SCHEMA, options: parse_options_)
61
+ from_document(Nokogiri::XML::Document.parse(input), options)
62
+ end
63
+
64
+ # :call-seq:
65
+ # read_memory(input) → Nokogiri::XML::RelaxNG
66
+ # read_memory(input, options:) → Nokogiri::XML::RelaxNG
67
+ #
68
+ # Convenience method for Nokogiri::XML::RelaxNG.new.
69
+ def self.read_memory(...)
70
+ # TODO deprecate this method
71
+ new(...)
72
+ end
36
73
  end
37
74
  end
38
75
  end
@@ -2,106 +2,168 @@
2
2
 
3
3
  module Nokogiri
4
4
  module XML
5
- ###
6
- # SAX Parsers are event driven parsers. Nokogiri provides two different event based parsers when
7
- # dealing with XML. If you want to do SAX style parsing using HTML, check out
8
- # Nokogiri::HTML4::SAX.
9
- #
10
- # The basic way a SAX style parser works is by creating a parser, telling the parser about the
11
- # events we're interested in, then giving the parser some XML to process. The parser will notify
12
- # you when it encounters events you said you would like to know about.
13
- #
14
- # To register for events, you simply subclass Nokogiri::XML::SAX::Document, and implement the
15
- # methods for which you would like notification.
16
- #
17
- # For example, if I want to be notified when a document ends, and when an element starts, I
18
- # would write a class like this:
19
- #
20
- # class MyDocument < Nokogiri::XML::SAX::Document
21
- # def end_document
22
- # puts "the document has ended"
23
- # end
24
- #
25
- # def start_element name, attributes = []
26
- # puts "#{name} started"
27
- # end
28
- # end
29
- #
30
- # Then I would instantiate a SAX parser with this document, and feed the parser some XML
31
- #
32
- # # Create a new parser
33
- # parser = Nokogiri::XML::SAX::Parser.new(MyDocument.new)
34
- #
35
- # # Feed the parser some XML
36
- # parser.parse(File.open(ARGV[0]))
37
- #
38
- # Now my document handler will be called when each node starts, and when then document ends. To
39
- # see what kinds of events are available, take a look at Nokogiri::XML::SAX::Document.
40
- #
41
- # Two SAX parsers for XML are available, a parser that reads from a string or IO object as it
42
- # feels necessary, and a parser that lets you spoon feed it XML. If you want to let Nokogiri
43
- # deal with reading your XML, use the Nokogiri::XML::SAX::Parser. If you want to have fine grain
44
- # control over the XML input, use the Nokogiri::XML::SAX::PushParser.
45
5
  module SAX
46
- ###
47
- # This class is used for registering types of events you are interested in handling. All of
48
- # the methods on this class are available as possible events while parsing an XML document. To
49
- # register for any particular event, just subclass this class and implement the methods you
50
- # are interested in knowing about.
6
+ # :markup: markdown
7
+ #
8
+ # The SAX::Document class is used for registering types of events you are interested in
9
+ # handling. All of the methods on this class are available as possible events while parsing an
10
+ # \XML document. To register for any particular event, subclass this class and implement the
11
+ # methods you are interested in knowing about.
51
12
  #
52
13
  # To only be notified about start and end element events, write a class like this:
53
14
  #
54
- # class MyDocument < Nokogiri::XML::SAX::Document
55
- # def start_element name, attrs = []
56
- # puts "#{name} started!"
57
- # end
15
+ # class MyHandler < Nokogiri::XML::SAX::Document
16
+ # def start_element name, attrs = []
17
+ # puts "#{name} started!"
18
+ # end
58
19
  #
59
- # def end_element name
60
- # puts "#{name} ended"
20
+ # def end_element name
21
+ # puts "#{name} ended"
22
+ # end
61
23
  # end
62
- # end
63
24
  #
64
- # You can use this event handler for any SAX style parser included with Nokogiri. See
65
- # Nokogiri::XML::SAX, and Nokogiri::HTML4::SAX.
25
+ # You can use this event handler for any SAX-style parser included with Nokogiri.
26
+ #
27
+ # See also:
28
+ #
29
+ # - Nokogiri::XML::SAX
30
+ # - Nokogiri::HTML4::SAX
31
+ #
32
+ # ### Entity Handling
33
+ #
34
+ # ⚠ Entity handling is complicated in a SAX parser! Please read this section carefully if
35
+ # you're not getting the behavior you expect.
36
+ #
37
+ # Entities will be reported to the user via callbacks to #characters, to #reference, or
38
+ # possibly to both. The behavior is determined by a combination of _entity type_ and the value
39
+ # of ParserContext#replace_entities. (Recall that the default value of
40
+ # ParserContext#replace_entities is `false`.)
41
+ #
42
+ # ⚠ <b>It is UNSAFE to set ParserContext#replace_entities to `true`</b> when parsing untrusted
43
+ # documents.
44
+ #
45
+ # 💡 For more information on entity types, see [Wikipedia's page on
46
+ # DTDs](https://en.wikipedia.org/wiki/Document_type_definition#Entity_declarations).
47
+ #
48
+ # | Entity type | #characters | #reference |
49
+ # |--------------------------------------|------------------------------------|-------------------------------------|
50
+ # | Char ref (e.g., <tt>&#146;</tt>) | always | never |
51
+ # | Predefined (e.g., <tt>&amp;</tt>) | always | never |
52
+ # | Undeclared † | never | <tt>#replace_entities == false</tt> |
53
+ # | Internal | always | <tt>#replace_entities == false</tt> |
54
+ # | External † | <tt>#replace_entities == true</tt> | <tt>#replace_entities == false</tt> |
55
+ #
56
+ # &nbsp;
57
+ #
58
+ # † In the case where the replacement text for the entity is unknown (e.g., an undeclared entity
59
+ # or an external entity that could not be resolved because of network issues), then the
60
+ # replacement text will not be reported. If ParserContext#replace_entities is `true`, this
61
+ # means the #characters callback will not be invoked. If ParserContext#replace_entities is
62
+ # `false`, then the #reference callback will be invoked, but with `nil` for the `content`
63
+ # argument.
64
+ #
66
65
  class Document
67
66
  ###
68
- # Called when an XML declaration is parsed
67
+ # Called when an \XML declaration is parsed.
68
+ #
69
+ # [Parameters]
70
+ # - +version+ (String) the version attribute
71
+ # - +encoding+ (String, nil) the encoding of the document if present, else +nil+
72
+ # - +standalone+ ("yes", "no", nil) the standalone attribute if present, else +nil+
69
73
  def xmldecl(version, encoding, standalone)
70
74
  end
71
75
 
72
76
  ###
73
- # Called when document starts parsing
77
+ # Called when document starts parsing.
74
78
  def start_document
75
79
  end
76
80
 
77
81
  ###
78
- # Called when document ends parsing
82
+ # Called when document ends parsing.
79
83
  def end_document
80
84
  end
81
85
 
82
86
  ###
83
- # Called at the beginning of an element
84
- # * +name+ is the name of the tag
85
- # * +attrs+ are an assoc list of namespaces and attributes, e.g.:
87
+ # Called at the beginning of an element.
88
+ #
89
+ # [Parameters]
90
+ # - +name+ (String) the name of the element
91
+ # - +attrs+ (Array<Array<String>>) an assoc list of namespace declarations and attributes, e.g.:
86
92
  # [ ["xmlns:foo", "http://sample.net"], ["size", "large"] ]
93
+ #
94
+ # 💡 If you're dealing with XML and need to handle namespaces, use the
95
+ # #start_element_namespace method instead.
96
+ #
97
+ # Note that the element namespace and any attribute namespaces are not provided, and so any
98
+ # namespaced elements or attributes will be returned as strings including the prefix:
99
+ #
100
+ # parser.parse(<<~XML)
101
+ # <root xmlns:foo='http://foo.example.com/' xmlns='http://example.com/'>
102
+ # <foo:bar foo:quux="xxx">hello world</foo:bar>
103
+ # </root>
104
+ # XML
105
+ #
106
+ # assert_pattern do
107
+ # parser.document.start_elements => [
108
+ # ["root", [["xmlns:foo", "http://foo.example.com/"], ["xmlns", "http://example.com/"]]],
109
+ # ["foo:bar", [["foo:quux", "xxx"]]],
110
+ # ]
111
+ # end
112
+ #
87
113
  def start_element(name, attrs = [])
88
114
  end
89
115
 
90
116
  ###
91
- # Called at the end of an element
92
- # +name+ is the tag name
117
+ # Called at the end of an element.
118
+ #
119
+ # [Parameters]
120
+ # - +name+ (String) the name of the element being closed
121
+ #
93
122
  def end_element(name)
94
123
  end
95
124
 
96
125
  ###
97
- # Called at the beginning of an element
98
- # +name+ is the element name
99
- # +attrs+ is a list of attributes
100
- # +prefix+ is the namespace prefix for the element
101
- # +uri+ is the associated namespace URI
102
- # +ns+ is a hash of namespace prefix:urls associated with the element
126
+ # Called at the beginning of an element.
127
+ #
128
+ # [Parameters]
129
+ # - +name+ (String) is the name of the element
130
+ # - +attrs+ (Array<Attribute>) is an array of structs with the following properties:
131
+ # - +localname+ (String) the local name of the attribute
132
+ # - +value+ (String) the value of the attribute
133
+ # - +prefix+ (String, nil) the namespace prefix of the attribute
134
+ # - +uri+ (String, nil) the namespace URI of the attribute
135
+ # - +prefix+ (String, nil) is the namespace prefix for the element
136
+ # - +uri+ (String, nil) is the associated URI for the element's namespace
137
+ # - +ns+ (Array<Array<String, String>>) is an assoc list of namespace declarations on the element
138
+ #
139
+ # 💡 If you're dealing with HTML or don't care about namespaces, try #start_element instead.
140
+ #
141
+ # [Example]
142
+ # it "start_elements_namespace is called with namespaced attributes" do
143
+ # parser.parse(<<~XML)
144
+ # <root xmlns:foo='http://foo.example.com/'>
145
+ # <foo:a foo:bar='hello' />
146
+ # </root>
147
+ # XML
148
+ #
149
+ # assert_pattern do
150
+ # parser.document.start_elements_namespace => [
151
+ # [
152
+ # "root",
153
+ # [],
154
+ # nil, nil,
155
+ # [["foo", "http://foo.example.com/"]], # namespace declarations
156
+ # ], [
157
+ # "a",
158
+ # [Nokogiri::XML::SAX::Parser::Attribute(localname: "bar", prefix: "foo", uri: "http://foo.example.com/", value: "hello")], # prefixed attribute
159
+ # "foo", "http://foo.example.com/", # prefix and uri for the "a" element
160
+ # [],
161
+ # ]
162
+ # ]
163
+ # end
164
+ # end
165
+ #
103
166
  def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = []) # rubocop:disable Metrics/ParameterLists
104
- ###
105
167
  # Deal with SAX v1 interface
106
168
  name = [prefix, name].compact.join(":")
107
169
  attributes = ns.map do |ns_prefix, ns_uri|
@@ -113,52 +175,81 @@ module Nokogiri
113
175
  end
114
176
 
115
177
  ###
116
- # Called at the end of an element
117
- # +name+ is the element's name
118
- # +prefix+ is the namespace prefix associated with the element
119
- # +uri+ is the associated namespace URI
178
+ # Called at the end of an element.
179
+ #
180
+ # [Parameters]
181
+ # - +name+ (String) is the name of the element
182
+ # - +prefix+ (String, nil) is the namespace prefix for the element
183
+ # - +uri+ (String, nil) is the associated URI for the element's namespace
184
+ #
120
185
  def end_element_namespace(name, prefix = nil, uri = nil)
121
- ###
122
186
  # Deal with SAX v1 interface
123
187
  end_element([prefix, name].compact.join(":"))
124
188
  end
125
189
 
126
190
  ###
127
- # Characters read between a tag. This method might be called multiple
128
- # times given one contiguous string of characters.
191
+ # Called when character data is parsed, and for parsed entities when
192
+ # ParserContext#replace_entities is +true+.
193
+ #
194
+ # [Parameters]
195
+ # - +string+ contains the character data or entity replacement text
196
+ #
197
+ # ⚠ Please see Document@Entity+Handling for important information about how entities are handled.
198
+ #
199
+ # ⚠ This method might be called multiple times for a contiguous string of characters.
129
200
  #
130
- # +string+ contains the character data
131
201
  def characters(string)
132
202
  end
133
203
 
204
+ ###
205
+ # Called when a parsed entity is referenced and not replaced.
206
+ #
207
+ # [Parameters]
208
+ # - +name+ (String) is the name of the entity
209
+ # - +content+ (String, nil) is the replacement text for the entity, if known
210
+ #
211
+ # ⚠ Please see Document@Entity+Handling for important information about how entities are handled.
212
+ #
213
+ # ⚠ An internal entity may result in a call to both #characters and #reference.
214
+ #
215
+ # Since v1.17.0
216
+ #
217
+ def reference(name, content)
218
+ end
219
+
134
220
  ###
135
221
  # Called when comments are encountered
136
- # +string+ contains the comment data
222
+ # [Parameters]
223
+ # - +string+ contains the comment data
137
224
  def comment(string)
138
225
  end
139
226
 
140
227
  ###
141
228
  # Called on document warnings
142
- # +string+ contains the warning
229
+ # [Parameters]
230
+ # - +string+ contains the warning
143
231
  def warning(string)
144
232
  end
145
233
 
146
234
  ###
147
235
  # Called on document errors
148
- # +string+ contains the error
236
+ # [Parameters]
237
+ # - +string+ contains the error
149
238
  def error(string)
150
239
  end
151
240
 
152
241
  ###
153
242
  # Called when cdata blocks are found
154
- # +string+ contains the cdata content
243
+ # [Parameters]
244
+ # - +string+ contains the cdata content
155
245
  def cdata_block(string)
156
246
  end
157
247
 
158
248
  ###
159
249
  # Called when processing instructions are found
160
- # +name+ is the target of the instruction
161
- # +content+ is the value of the instruction
250
+ # [Parameters]
251
+ # - +name+ is the target of the instruction
252
+ # - +content+ is the value of the instruction
162
253
  def processing_instruction(name, content)
163
254
  end
164
255
  end
@@ -4,16 +4,15 @@ module Nokogiri
4
4
  module XML
5
5
  module SAX
6
6
  ###
7
- # This parser is a SAX style parser that reads it's input as it
8
- # deems necessary. The parser takes a Nokogiri::XML::SAX::Document,
9
- # an optional encoding, then given an XML input, sends messages to
10
- # the Nokogiri::XML::SAX::Document.
7
+ # This parser is a SAX style parser that reads its input as it deems necessary. The parser
8
+ # takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an XML input, sends
9
+ # messages to the Nokogiri::XML::SAX::Document.
11
10
  #
12
11
  # Here is an example of using this parser:
13
12
  #
14
13
  # # Create a subclass of Nokogiri::XML::SAX::Document and implement
15
14
  # # the events we care about:
16
- # class MyDoc < Nokogiri::XML::SAX::Document
15
+ # class MyHandler < Nokogiri::XML::SAX::Document
17
16
  # def start_element name, attrs = []
18
17
  # puts "starting: #{name}"
19
18
  # end
@@ -23,20 +22,28 @@ module Nokogiri
23
22
  # end
24
23
  # end
25
24
  #
26
- # # Create our parser
27
- # parser = Nokogiri::XML::SAX::Parser.new(MyDoc.new)
25
+ # parser = Nokogiri::XML::SAX::Parser.new(MyHandler.new)
28
26
  #
29
- # # Send some XML to the parser
30
- # parser.parse(File.open(ARGV[0]))
27
+ # # Hand an IO object to the parser, which will read the XML from the IO.
28
+ # File.open(path_to_xml) do |f|
29
+ # parser.parse(f)
30
+ # end
31
+ #
32
+ # For more information about \SAX parsers, see Nokogiri::XML::SAX.
33
+ #
34
+ # Also see Nokogiri::XML::SAX::Document for the available events.
35
+ #
36
+ # For \HTML documents, use the subclass Nokogiri::HTML4::SAX::Parser.
31
37
  #
32
- # For more information about SAX parsers, see Nokogiri::XML::SAX. Also
33
- # see Nokogiri::XML::SAX::Document for the available events.
34
38
  class Parser
39
+ # to dynamically resolve ParserContext in inherited methods
40
+ include Nokogiri::ClassResolver
41
+
42
+ # Structure used for marshalling attributes for some callbacks in XML::SAX::Document.
35
43
  class Attribute < Struct.new(:localname, :prefix, :uri, :value)
36
44
  end
37
45
 
38
- # Encodinds this parser supports
39
- ENCODINGS = {
46
+ ENCODINGS = { # :nodoc:
40
47
  "NONE" => 0, # No char encoding detected
41
48
  "UTF-8" => 1, # UTF-8
42
49
  "UTF16LE" => 2, # UTF-16 little endian
@@ -61,6 +68,8 @@ module Nokogiri
61
68
  "EUC-JP" => 21, # EUC-JP
62
69
  "ASCII" => 22, # pure ASCII
63
70
  }
71
+ REVERSE_ENCODINGS = ENCODINGS.invert # :nodoc:
72
+ deprecate_constant :ENCODINGS
64
73
 
65
74
  # The Nokogiri::XML::SAX::Document where events will be sent.
66
75
  attr_accessor :document
@@ -68,57 +77,122 @@ module Nokogiri
68
77
  # The encoding beings used for this document.
69
78
  attr_accessor :encoding
70
79
 
71
- # Create a new Parser with +doc+ and +encoding+
72
- def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = "UTF-8")
73
- @encoding = check_encoding(encoding)
80
+ ###
81
+ # :call-seq:
82
+ # new ⇒ SAX::Parser
83
+ # new(handler) ⇒ SAX::Parser
84
+ # new(handler, encoding) ⇒ SAX::Parser
85
+ #
86
+ # Create a new Parser.
87
+ #
88
+ # [Parameters]
89
+ # - +handler+ (optional Nokogiri::XML::SAX::Document) The document that will receive
90
+ # events. Will create a new Nokogiri::XML::SAX::Document if not given, which is accessible
91
+ # through the #document attribute.
92
+ # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
93
+ # parsing the input. (default +nil+ for auto-detection)
94
+ #
95
+ def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = nil)
96
+ @encoding = encoding
74
97
  @document = doc
75
98
  @warned = false
99
+
100
+ initialize_native unless Nokogiri.jruby?
76
101
  end
77
102
 
78
103
  ###
79
- # Parse given +thing+ which may be a string containing xml, or an
80
- # IO object.
81
- def parse(thing, &block)
82
- if thing.respond_to?(:read) && thing.respond_to?(:close)
83
- parse_io(thing, &block)
104
+ # :call-seq:
105
+ # parse(input) { |parser_context| ... }
106
+ #
107
+ # Parse the input, sending events to the SAX::Document at #document.
108
+ #
109
+ # [Parameters]
110
+ # - +input+ (String, IO) The input to parse.
111
+ #
112
+ # If +input+ quacks like a readable IO object, this method forwards to Parser#parse_io,
113
+ # otherwise it forwards to Parser#parse_memory.
114
+ #
115
+ # [Yields]
116
+ # If a block is given, the underlying ParserContext object will be yielded. This can be used
117
+ # to set options on the parser context before parsing begins.
118
+ #
119
+ def parse(input, &block)
120
+ if input.respond_to?(:read) && input.respond_to?(:close)
121
+ parse_io(input, &block)
84
122
  else
85
- parse_memory(thing, &block)
123
+ parse_memory(input, &block)
86
124
  end
87
125
  end
88
126
 
89
127
  ###
90
- # Parse given +io+
128
+ # :call-seq:
129
+ # parse_io(io) { |parser_context| ... }
130
+ # parse_io(io, encoding) { |parser_context| ... }
131
+ #
132
+ # Parse an input stream.
133
+ #
134
+ # [Parameters]
135
+ # - +io+ (IO) The readable IO object from which to read input
136
+ # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
137
+ # parsing the input, or +nil+ for auto-detection. (default #encoding)
138
+ #
139
+ # [Yields]
140
+ # If a block is given, the underlying ParserContext object will be yielded. This can be used
141
+ # to set options on the parser context before parsing begins.
142
+ #
91
143
  def parse_io(io, encoding = @encoding)
92
- ctx = ParserContext.io(io, ENCODINGS[check_encoding(encoding)])
144
+ ctx = related_class("ParserContext").io(io, encoding)
93
145
  yield ctx if block_given?
94
146
  ctx.parse_with(self)
95
147
  end
96
148
 
97
149
  ###
98
- # Parse a file with +filename+
99
- def parse_file(filename)
100
- raise ArgumentError unless filename
101
- raise Errno::ENOENT unless File.exist?(filename)
102
- raise Errno::EISDIR if File.directory?(filename)
103
-
104
- ctx = ParserContext.file(filename)
150
+ # :call-seq:
151
+ # parse_memory(input) { |parser_context| ... }
152
+ # parse_memory(input, encoding) { |parser_context| ... }
153
+ #
154
+ # Parse an input string.
155
+ #
156
+ # [Parameters]
157
+ # - +input+ (String) The input string to be parsed.
158
+ # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
159
+ # parsing the input, or +nil+ for auto-detection. (default #encoding)
160
+ #
161
+ # [Yields]
162
+ # If a block is given, the underlying ParserContext object will be yielded. This can be used
163
+ # to set options on the parser context before parsing begins.
164
+ #
165
+ def parse_memory(input, encoding = @encoding)
166
+ ctx = related_class("ParserContext").memory(input, encoding)
105
167
  yield ctx if block_given?
106
168
  ctx.parse_with(self)
107
169
  end
108
170
 
109
- def parse_memory(data)
110
- ctx = ParserContext.memory(data)
171
+ ###
172
+ # :call-seq:
173
+ # parse_file(filename) { |parser_context| ... }
174
+ # parse_file(filename, encoding) { |parser_context| ... }
175
+ #
176
+ # Parse a file.
177
+ #
178
+ # [Parameters]
179
+ # - +filename+ (String) The path to the file to be parsed.
180
+ # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
181
+ # parsing the input, or +nil+ for auto-detection. (default #encoding)
182
+ #
183
+ # [Yields]
184
+ # If a block is given, the underlying ParserContext object will be yielded. This can be used
185
+ # to set options on the parser context before parsing begins.
186
+ #
187
+ def parse_file(filename, encoding = @encoding)
188
+ raise ArgumentError, "no filename provided" unless filename
189
+ raise Errno::ENOENT unless File.exist?(filename)
190
+ raise Errno::EISDIR if File.directory?(filename)
191
+
192
+ ctx = related_class("ParserContext").file(filename, encoding)
111
193
  yield ctx if block_given?
112
194
  ctx.parse_with(self)
113
195
  end
114
-
115
- private
116
-
117
- def check_encoding(encoding)
118
- encoding.upcase.tap do |enc|
119
- raise ArgumentError, "'#{enc}' is not a valid encoding" unless ENCODINGS[enc]
120
- end
121
- end
122
196
  end
123
197
  end
124
198
  end