nokogiri 1.16.8-x86_64-darwin → 1.17.1-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +11 -21
- data/README.md +4 -0
- data/dependencies.yml +6 -6
- data/ext/nokogiri/extconf.rb +191 -137
- data/ext/nokogiri/gumbo.c +69 -53
- data/ext/nokogiri/html4_document.c +10 -4
- data/ext/nokogiri/html4_element_description.c +18 -18
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +48 -58
- data/ext/nokogiri/html4_sax_push_parser.c +25 -24
- data/ext/nokogiri/include/libexslt/exsltconfig.h +3 -3
- data/ext/nokogiri/include/libxml2/libxml/HTMLparser.h +12 -19
- data/ext/nokogiri/include/libxml2/libxml/c14n.h +1 -12
- data/ext/nokogiri/include/libxml2/libxml/debugXML.h +1 -1
- data/ext/nokogiri/include/libxml2/libxml/encoding.h +9 -0
- data/ext/nokogiri/include/libxml2/libxml/entities.h +12 -1
- data/ext/nokogiri/include/libxml2/libxml/hash.h +19 -0
- data/ext/nokogiri/include/libxml2/libxml/list.h +2 -2
- data/ext/nokogiri/include/libxml2/libxml/nanohttp.h +17 -0
- data/ext/nokogiri/include/libxml2/libxml/parser.h +60 -54
- data/ext/nokogiri/include/libxml2/libxml/parserInternals.h +9 -1
- data/ext/nokogiri/include/libxml2/libxml/pattern.h +6 -0
- data/ext/nokogiri/include/libxml2/libxml/tree.h +32 -12
- data/ext/nokogiri/include/libxml2/libxml/uri.h +11 -0
- data/ext/nokogiri/include/libxml2/libxml/valid.h +29 -2
- data/ext/nokogiri/include/libxml2/libxml/xinclude.h +7 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlIO.h +21 -4
- data/ext/nokogiri/include/libxml2/libxml/xmlerror.h +14 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlexports.h +111 -15
- data/ext/nokogiri/include/libxml2/libxml/xmlmemory.h +8 -45
- data/ext/nokogiri/include/libxml2/libxml/xmlreader.h +2 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlsave.h +5 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlunicode.h +165 -1
- data/ext/nokogiri/include/libxml2/libxml/xmlversion.h +7 -171
- data/ext/nokogiri/include/libxml2/libxml/xmlwriter.h +1 -0
- data/ext/nokogiri/include/libxml2/libxml/xpath.h +4 -0
- data/ext/nokogiri/include/libxslt/xsltInternals.h +3 -0
- data/ext/nokogiri/include/libxslt/xsltconfig.h +4 -37
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +9 -2
- data/ext/nokogiri/nokogiri.h +18 -33
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_cdata.c +2 -10
- data/ext/nokogiri/xml_comment.c +3 -8
- data/ext/nokogiri/xml_document.c +163 -156
- data/ext/nokogiri/xml_document_fragment.c +10 -25
- data/ext/nokogiri/xml_dtd.c +1 -1
- data/ext/nokogiri/xml_element_content.c +9 -9
- data/ext/nokogiri/xml_encoding_handler.c +4 -4
- data/ext/nokogiri/xml_namespace.c +6 -6
- data/ext/nokogiri/xml_node.c +134 -103
- data/ext/nokogiri/xml_node_set.c +46 -44
- data/ext/nokogiri/xml_reader.c +54 -58
- data/ext/nokogiri/xml_relax_ng.c +35 -56
- data/ext/nokogiri/xml_sax_parser.c +156 -88
- data/ext/nokogiri/xml_sax_parser_context.c +213 -131
- data/ext/nokogiri/xml_sax_push_parser.c +68 -49
- data/ext/nokogiri/xml_schema.c +50 -85
- data/ext/nokogiri/xml_syntax_error.c +19 -11
- data/ext/nokogiri/xml_text.c +2 -4
- data/ext/nokogiri/xml_xpath_context.c +2 -2
- data/ext/nokogiri/xslt_stylesheet.c +8 -8
- data/lib/nokogiri/3.0/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.1/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.2/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.3/nokogiri.bundle +0 -0
- data/lib/nokogiri/class_resolver.rb +1 -1
- data/lib/nokogiri/css/node.rb +6 -2
- data/lib/nokogiri/css/parser.rb +6 -4
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +6 -66
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -4
- data/lib/nokogiri/css/tokenizer.rex +9 -8
- data/lib/nokogiri/css/xpath_visitor.rb +42 -6
- data/lib/nokogiri/css.rb +86 -20
- data/lib/nokogiri/decorators/slop.rb +3 -5
- data/lib/nokogiri/encoding_handler.rb +2 -2
- data/lib/nokogiri/html4/document.rb +44 -23
- data/lib/nokogiri/html4/document_fragment.rb +124 -12
- data/lib/nokogiri/html4/encoding_reader.rb +1 -1
- data/lib/nokogiri/html4/sax/parser.rb +23 -38
- data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
- data/lib/nokogiri/html4.rb +9 -14
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +61 -30
- data/lib/nokogiri/html5/document_fragment.rb +130 -20
- data/lib/nokogiri/html5/node.rb +4 -4
- data/lib/nokogiri/html5.rb +114 -72
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/xml/builder.rb +8 -1
- data/lib/nokogiri/xml/document.rb +70 -26
- data/lib/nokogiri/xml/document_fragment.rb +84 -13
- data/lib/nokogiri/xml/node.rb +82 -11
- data/lib/nokogiri/xml/node_set.rb +9 -7
- data/lib/nokogiri/xml/parse_options.rb +1 -1
- data/lib/nokogiri/xml/pp/node.rb +6 -1
- data/lib/nokogiri/xml/reader.rb +46 -13
- data/lib/nokogiri/xml/relax_ng.rb +57 -20
- data/lib/nokogiri/xml/sax/document.rb +174 -83
- data/lib/nokogiri/xml/sax/parser.rb +115 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
- data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
- data/lib/nokogiri/xml/sax.rb +48 -0
- data/lib/nokogiri/xml/schema.rb +112 -45
- data/lib/nokogiri/xml/searchable.rb +6 -8
- data/lib/nokogiri/xml/syntax_error.rb +22 -0
- data/lib/nokogiri/xml.rb +13 -24
- data/lib/nokogiri/xslt.rb +3 -9
- data/lib/xsd/xmlparser/nokogiri.rb +3 -4
- metadata +8 -4
- data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
@@ -3,36 +3,73 @@
|
|
3
3
|
module Nokogiri
|
4
4
|
module XML
|
5
5
|
class << self
|
6
|
-
|
7
|
-
#
|
8
|
-
#
|
9
|
-
|
10
|
-
|
6
|
+
# :call-seq:
|
7
|
+
# RelaxNG(input) → Nokogiri::XML::RelaxNG
|
8
|
+
# RelaxNG(input, options:) → Nokogiri::XML::RelaxNG
|
9
|
+
#
|
10
|
+
# Convenience method for Nokogiri::XML::RelaxNG.new
|
11
|
+
def RelaxNG(...)
|
12
|
+
RelaxNG.new(...)
|
11
13
|
end
|
12
14
|
end
|
13
15
|
|
14
|
-
|
15
|
-
# Nokogiri::XML::RelaxNG is used for validating XML against a
|
16
|
-
# RelaxNG schema.
|
16
|
+
# Nokogiri::XML::RelaxNG is used for validating \XML against a RELAX NG schema definition.
|
17
17
|
#
|
18
|
-
#
|
18
|
+
# 🛡 <b>Do not use this class for untrusted schema documents.</b> RELAX NG input is always
|
19
|
+
# treated as *trusted*, meaning that the underlying parsing libraries <b>will access network
|
20
|
+
# resources</b>. This is counter to Nokogiri's "untrusted by default" security policy, but is an
|
21
|
+
# unfortunate limitation of the underlying libraries.
|
19
22
|
#
|
20
|
-
#
|
21
|
-
# that are returned and print them out:
|
23
|
+
# *Example:* Determine whether an \XML document is valid.
|
22
24
|
#
|
23
|
-
# schema
|
24
|
-
# doc
|
25
|
+
# schema = Nokogiri::XML::RelaxNG.new(File.read(RELAX_NG_FILE))
|
26
|
+
# doc = Nokogiri::XML::Document.parse(File.read(XML_FILE))
|
27
|
+
# schema.valid?(doc) # Boolean
|
25
28
|
#
|
26
|
-
#
|
27
|
-
# puts error.message
|
28
|
-
# end
|
29
|
+
# *Example:* Validate an \XML document against a \RelaxNG schema, and capture any errors that are found.
|
29
30
|
#
|
30
|
-
#
|
31
|
+
# schema = Nokogiri::XML::RelaxNG.new(File.open(RELAX_NG_FILE))
|
32
|
+
# doc = Nokogiri::XML::Document.parse(File.open(XML_FILE))
|
33
|
+
# errors = schema.validate(doc) # Array<SyntaxError>
|
34
|
+
#
|
35
|
+
# *Example:* Validate an \XML document using a Document containing a RELAX NG schema definition.
|
36
|
+
#
|
37
|
+
# schema_doc = Nokogiri::XML::Document.parse(File.read(RELAX_NG_FILE))
|
38
|
+
# schema = Nokogiri::XML::RelaxNG.from_document(schema_doc)
|
39
|
+
# doc = Nokogiri::XML::Document.parse(File.open(XML_FILE))
|
40
|
+
# schema.valid?(doc) # Boolean
|
31
41
|
#
|
32
|
-
# NOTE: RelaxNG input is always treated as TRUSTED documents, meaning that they will cause the
|
33
|
-
# underlying parsing libraries to access network resources. This is counter to Nokogiri's
|
34
|
-
# "untrusted by default" security policy, but is a limitation of the underlying libraries.
|
35
42
|
class RelaxNG < Nokogiri::XML::Schema
|
43
|
+
# :call-seq:
|
44
|
+
# new(input) → Nokogiri::XML::RelaxNG
|
45
|
+
# new(input, options:) → Nokogiri::XML::RelaxNG
|
46
|
+
#
|
47
|
+
# Parse a RELAX NG schema definition from a String or IO to create a new Nokogiri::XML::RelaxNG.
|
48
|
+
#
|
49
|
+
# [Parameters]
|
50
|
+
# - +input+ (String | IO) RELAX NG schema definition
|
51
|
+
# - +options:+ (Nokogiri::XML::ParseOptions)
|
52
|
+
# Defaults to Nokogiri::XML::ParseOptions::DEFAULT_SCHEMA ⚠ Unused
|
53
|
+
#
|
54
|
+
# [Returns] Nokogiri::XML::RelaxNG
|
55
|
+
#
|
56
|
+
# ⚠ +options+ is currently unused by this method and is present only as a placeholder for
|
57
|
+
# future functionality.
|
58
|
+
#
|
59
|
+
# Also see convenience method Nokogiri::XML::RelaxNG()
|
60
|
+
def self.new(input, parse_options_ = ParseOptions::DEFAULT_SCHEMA, options: parse_options_)
|
61
|
+
from_document(Nokogiri::XML::Document.parse(input), options)
|
62
|
+
end
|
63
|
+
|
64
|
+
# :call-seq:
|
65
|
+
# read_memory(input) → Nokogiri::XML::RelaxNG
|
66
|
+
# read_memory(input, options:) → Nokogiri::XML::RelaxNG
|
67
|
+
#
|
68
|
+
# Convenience method for Nokogiri::XML::RelaxNG.new.
|
69
|
+
def self.read_memory(...)
|
70
|
+
# TODO deprecate this method
|
71
|
+
new(...)
|
72
|
+
end
|
36
73
|
end
|
37
74
|
end
|
38
75
|
end
|
@@ -2,106 +2,168 @@
|
|
2
2
|
|
3
3
|
module Nokogiri
|
4
4
|
module XML
|
5
|
-
###
|
6
|
-
# SAX Parsers are event driven parsers. Nokogiri provides two different event based parsers when
|
7
|
-
# dealing with XML. If you want to do SAX style parsing using HTML, check out
|
8
|
-
# Nokogiri::HTML4::SAX.
|
9
|
-
#
|
10
|
-
# The basic way a SAX style parser works is by creating a parser, telling the parser about the
|
11
|
-
# events we're interested in, then giving the parser some XML to process. The parser will notify
|
12
|
-
# you when it encounters events you said you would like to know about.
|
13
|
-
#
|
14
|
-
# To register for events, you simply subclass Nokogiri::XML::SAX::Document, and implement the
|
15
|
-
# methods for which you would like notification.
|
16
|
-
#
|
17
|
-
# For example, if I want to be notified when a document ends, and when an element starts, I
|
18
|
-
# would write a class like this:
|
19
|
-
#
|
20
|
-
# class MyDocument < Nokogiri::XML::SAX::Document
|
21
|
-
# def end_document
|
22
|
-
# puts "the document has ended"
|
23
|
-
# end
|
24
|
-
#
|
25
|
-
# def start_element name, attributes = []
|
26
|
-
# puts "#{name} started"
|
27
|
-
# end
|
28
|
-
# end
|
29
|
-
#
|
30
|
-
# Then I would instantiate a SAX parser with this document, and feed the parser some XML
|
31
|
-
#
|
32
|
-
# # Create a new parser
|
33
|
-
# parser = Nokogiri::XML::SAX::Parser.new(MyDocument.new)
|
34
|
-
#
|
35
|
-
# # Feed the parser some XML
|
36
|
-
# parser.parse(File.open(ARGV[0]))
|
37
|
-
#
|
38
|
-
# Now my document handler will be called when each node starts, and when then document ends. To
|
39
|
-
# see what kinds of events are available, take a look at Nokogiri::XML::SAX::Document.
|
40
|
-
#
|
41
|
-
# Two SAX parsers for XML are available, a parser that reads from a string or IO object as it
|
42
|
-
# feels necessary, and a parser that lets you spoon feed it XML. If you want to let Nokogiri
|
43
|
-
# deal with reading your XML, use the Nokogiri::XML::SAX::Parser. If you want to have fine grain
|
44
|
-
# control over the XML input, use the Nokogiri::XML::SAX::PushParser.
|
45
5
|
module SAX
|
46
|
-
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
6
|
+
# :markup: markdown
|
7
|
+
#
|
8
|
+
# The SAX::Document class is used for registering types of events you are interested in
|
9
|
+
# handling. All of the methods on this class are available as possible events while parsing an
|
10
|
+
# \XML document. To register for any particular event, subclass this class and implement the
|
11
|
+
# methods you are interested in knowing about.
|
51
12
|
#
|
52
13
|
# To only be notified about start and end element events, write a class like this:
|
53
14
|
#
|
54
|
-
#
|
55
|
-
#
|
56
|
-
#
|
57
|
-
#
|
15
|
+
# class MyHandler < Nokogiri::XML::SAX::Document
|
16
|
+
# def start_element name, attrs = []
|
17
|
+
# puts "#{name} started!"
|
18
|
+
# end
|
58
19
|
#
|
59
|
-
#
|
60
|
-
#
|
20
|
+
# def end_element name
|
21
|
+
# puts "#{name} ended"
|
22
|
+
# end
|
61
23
|
# end
|
62
|
-
# end
|
63
24
|
#
|
64
|
-
# You can use this event handler for any SAX
|
65
|
-
#
|
25
|
+
# You can use this event handler for any SAX-style parser included with Nokogiri.
|
26
|
+
#
|
27
|
+
# See also:
|
28
|
+
#
|
29
|
+
# - Nokogiri::XML::SAX
|
30
|
+
# - Nokogiri::HTML4::SAX
|
31
|
+
#
|
32
|
+
# ### Entity Handling
|
33
|
+
#
|
34
|
+
# ⚠ Entity handling is complicated in a SAX parser! Please read this section carefully if
|
35
|
+
# you're not getting the behavior you expect.
|
36
|
+
#
|
37
|
+
# Entities will be reported to the user via callbacks to #characters, to #reference, or
|
38
|
+
# possibly to both. The behavior is determined by a combination of _entity type_ and the value
|
39
|
+
# of ParserContext#replace_entities. (Recall that the default value of
|
40
|
+
# ParserContext#replace_entities is `false`.)
|
41
|
+
#
|
42
|
+
# ⚠ <b>It is UNSAFE to set ParserContext#replace_entities to `true`</b> when parsing untrusted
|
43
|
+
# documents.
|
44
|
+
#
|
45
|
+
# 💡 For more information on entity types, see [Wikipedia's page on
|
46
|
+
# DTDs](https://en.wikipedia.org/wiki/Document_type_definition#Entity_declarations).
|
47
|
+
#
|
48
|
+
# | Entity type | #characters | #reference |
|
49
|
+
# |--------------------------------------|------------------------------------|-------------------------------------|
|
50
|
+
# | Char ref (e.g., <tt>’</tt>) | always | never |
|
51
|
+
# | Predefined (e.g., <tt>&</tt>) | always | never |
|
52
|
+
# | Undeclared † | never | <tt>#replace_entities == false</tt> |
|
53
|
+
# | Internal | always | <tt>#replace_entities == false</tt> |
|
54
|
+
# | External † | <tt>#replace_entities == true</tt> | <tt>#replace_entities == false</tt> |
|
55
|
+
#
|
56
|
+
#
|
57
|
+
#
|
58
|
+
# † In the case where the replacement text for the entity is unknown (e.g., an undeclared entity
|
59
|
+
# or an external entity that could not be resolved because of network issues), then the
|
60
|
+
# replacement text will not be reported. If ParserContext#replace_entities is `true`, this
|
61
|
+
# means the #characters callback will not be invoked. If ParserContext#replace_entities is
|
62
|
+
# `false`, then the #reference callback will be invoked, but with `nil` for the `content`
|
63
|
+
# argument.
|
64
|
+
#
|
66
65
|
class Document
|
67
66
|
###
|
68
|
-
# Called when an XML declaration is parsed
|
67
|
+
# Called when an \XML declaration is parsed.
|
68
|
+
#
|
69
|
+
# [Parameters]
|
70
|
+
# - +version+ (String) the version attribute
|
71
|
+
# - +encoding+ (String, nil) the encoding of the document if present, else +nil+
|
72
|
+
# - +standalone+ ("yes", "no", nil) the standalone attribute if present, else +nil+
|
69
73
|
def xmldecl(version, encoding, standalone)
|
70
74
|
end
|
71
75
|
|
72
76
|
###
|
73
|
-
# Called when document starts parsing
|
77
|
+
# Called when document starts parsing.
|
74
78
|
def start_document
|
75
79
|
end
|
76
80
|
|
77
81
|
###
|
78
|
-
# Called when document ends parsing
|
82
|
+
# Called when document ends parsing.
|
79
83
|
def end_document
|
80
84
|
end
|
81
85
|
|
82
86
|
###
|
83
|
-
# Called at the beginning of an element
|
84
|
-
#
|
85
|
-
#
|
87
|
+
# Called at the beginning of an element.
|
88
|
+
#
|
89
|
+
# [Parameters]
|
90
|
+
# - +name+ (String) the name of the element
|
91
|
+
# - +attrs+ (Array<Array<String>>) an assoc list of namespace declarations and attributes, e.g.:
|
86
92
|
# [ ["xmlns:foo", "http://sample.net"], ["size", "large"] ]
|
93
|
+
#
|
94
|
+
# 💡 If you're dealing with XML and need to handle namespaces, use the
|
95
|
+
# #start_element_namespace method instead.
|
96
|
+
#
|
97
|
+
# Note that the element namespace and any attribute namespaces are not provided, and so any
|
98
|
+
# namespaced elements or attributes will be returned as strings including the prefix:
|
99
|
+
#
|
100
|
+
# parser.parse(<<~XML)
|
101
|
+
# <root xmlns:foo='http://foo.example.com/' xmlns='http://example.com/'>
|
102
|
+
# <foo:bar foo:quux="xxx">hello world</foo:bar>
|
103
|
+
# </root>
|
104
|
+
# XML
|
105
|
+
#
|
106
|
+
# assert_pattern do
|
107
|
+
# parser.document.start_elements => [
|
108
|
+
# ["root", [["xmlns:foo", "http://foo.example.com/"], ["xmlns", "http://example.com/"]]],
|
109
|
+
# ["foo:bar", [["foo:quux", "xxx"]]],
|
110
|
+
# ]
|
111
|
+
# end
|
112
|
+
#
|
87
113
|
def start_element(name, attrs = [])
|
88
114
|
end
|
89
115
|
|
90
116
|
###
|
91
|
-
# Called at the end of an element
|
92
|
-
#
|
117
|
+
# Called at the end of an element.
|
118
|
+
#
|
119
|
+
# [Parameters]
|
120
|
+
# - +name+ (String) the name of the element being closed
|
121
|
+
#
|
93
122
|
def end_element(name)
|
94
123
|
end
|
95
124
|
|
96
125
|
###
|
97
|
-
# Called at the beginning of an element
|
98
|
-
#
|
99
|
-
#
|
100
|
-
# +
|
101
|
-
# +
|
102
|
-
# +
|
126
|
+
# Called at the beginning of an element.
|
127
|
+
#
|
128
|
+
# [Parameters]
|
129
|
+
# - +name+ (String) is the name of the element
|
130
|
+
# - +attrs+ (Array<Attribute>) is an array of structs with the following properties:
|
131
|
+
# - +localname+ (String) the local name of the attribute
|
132
|
+
# - +value+ (String) the value of the attribute
|
133
|
+
# - +prefix+ (String, nil) the namespace prefix of the attribute
|
134
|
+
# - +uri+ (String, nil) the namespace URI of the attribute
|
135
|
+
# - +prefix+ (String, nil) is the namespace prefix for the element
|
136
|
+
# - +uri+ (String, nil) is the associated URI for the element's namespace
|
137
|
+
# - +ns+ (Array<Array<String, String>>) is an assoc list of namespace declarations on the element
|
138
|
+
#
|
139
|
+
# 💡 If you're dealing with HTML or don't care about namespaces, try #start_element instead.
|
140
|
+
#
|
141
|
+
# [Example]
|
142
|
+
# it "start_elements_namespace is called with namespaced attributes" do
|
143
|
+
# parser.parse(<<~XML)
|
144
|
+
# <root xmlns:foo='http://foo.example.com/'>
|
145
|
+
# <foo:a foo:bar='hello' />
|
146
|
+
# </root>
|
147
|
+
# XML
|
148
|
+
#
|
149
|
+
# assert_pattern do
|
150
|
+
# parser.document.start_elements_namespace => [
|
151
|
+
# [
|
152
|
+
# "root",
|
153
|
+
# [],
|
154
|
+
# nil, nil,
|
155
|
+
# [["foo", "http://foo.example.com/"]], # namespace declarations
|
156
|
+
# ], [
|
157
|
+
# "a",
|
158
|
+
# [Nokogiri::XML::SAX::Parser::Attribute(localname: "bar", prefix: "foo", uri: "http://foo.example.com/", value: "hello")], # prefixed attribute
|
159
|
+
# "foo", "http://foo.example.com/", # prefix and uri for the "a" element
|
160
|
+
# [],
|
161
|
+
# ]
|
162
|
+
# ]
|
163
|
+
# end
|
164
|
+
# end
|
165
|
+
#
|
103
166
|
def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = []) # rubocop:disable Metrics/ParameterLists
|
104
|
-
###
|
105
167
|
# Deal with SAX v1 interface
|
106
168
|
name = [prefix, name].compact.join(":")
|
107
169
|
attributes = ns.map do |ns_prefix, ns_uri|
|
@@ -113,52 +175,81 @@ module Nokogiri
|
|
113
175
|
end
|
114
176
|
|
115
177
|
###
|
116
|
-
# Called at the end of an element
|
117
|
-
#
|
118
|
-
#
|
119
|
-
# +
|
178
|
+
# Called at the end of an element.
|
179
|
+
#
|
180
|
+
# [Parameters]
|
181
|
+
# - +name+ (String) is the name of the element
|
182
|
+
# - +prefix+ (String, nil) is the namespace prefix for the element
|
183
|
+
# - +uri+ (String, nil) is the associated URI for the element's namespace
|
184
|
+
#
|
120
185
|
def end_element_namespace(name, prefix = nil, uri = nil)
|
121
|
-
###
|
122
186
|
# Deal with SAX v1 interface
|
123
187
|
end_element([prefix, name].compact.join(":"))
|
124
188
|
end
|
125
189
|
|
126
190
|
###
|
127
|
-
#
|
128
|
-
#
|
191
|
+
# Called when character data is parsed, and for parsed entities when
|
192
|
+
# ParserContext#replace_entities is +true+.
|
193
|
+
#
|
194
|
+
# [Parameters]
|
195
|
+
# - +string+ contains the character data or entity replacement text
|
196
|
+
#
|
197
|
+
# ⚠ Please see Document@Entity+Handling for important information about how entities are handled.
|
198
|
+
#
|
199
|
+
# ⚠ This method might be called multiple times for a contiguous string of characters.
|
129
200
|
#
|
130
|
-
# +string+ contains the character data
|
131
201
|
def characters(string)
|
132
202
|
end
|
133
203
|
|
204
|
+
###
|
205
|
+
# Called when a parsed entity is referenced and not replaced.
|
206
|
+
#
|
207
|
+
# [Parameters]
|
208
|
+
# - +name+ (String) is the name of the entity
|
209
|
+
# - +content+ (String, nil) is the replacement text for the entity, if known
|
210
|
+
#
|
211
|
+
# ⚠ Please see Document@Entity+Handling for important information about how entities are handled.
|
212
|
+
#
|
213
|
+
# ⚠ An internal entity may result in a call to both #characters and #reference.
|
214
|
+
#
|
215
|
+
# Since v1.17.0
|
216
|
+
#
|
217
|
+
def reference(name, content)
|
218
|
+
end
|
219
|
+
|
134
220
|
###
|
135
221
|
# Called when comments are encountered
|
136
|
-
#
|
222
|
+
# [Parameters]
|
223
|
+
# - +string+ contains the comment data
|
137
224
|
def comment(string)
|
138
225
|
end
|
139
226
|
|
140
227
|
###
|
141
228
|
# Called on document warnings
|
142
|
-
#
|
229
|
+
# [Parameters]
|
230
|
+
# - +string+ contains the warning
|
143
231
|
def warning(string)
|
144
232
|
end
|
145
233
|
|
146
234
|
###
|
147
235
|
# Called on document errors
|
148
|
-
#
|
236
|
+
# [Parameters]
|
237
|
+
# - +string+ contains the error
|
149
238
|
def error(string)
|
150
239
|
end
|
151
240
|
|
152
241
|
###
|
153
242
|
# Called when cdata blocks are found
|
154
|
-
#
|
243
|
+
# [Parameters]
|
244
|
+
# - +string+ contains the cdata content
|
155
245
|
def cdata_block(string)
|
156
246
|
end
|
157
247
|
|
158
248
|
###
|
159
249
|
# Called when processing instructions are found
|
160
|
-
#
|
161
|
-
# +
|
250
|
+
# [Parameters]
|
251
|
+
# - +name+ is the target of the instruction
|
252
|
+
# - +content+ is the value of the instruction
|
162
253
|
def processing_instruction(name, content)
|
163
254
|
end
|
164
255
|
end
|
@@ -4,16 +4,15 @@ module Nokogiri
|
|
4
4
|
module XML
|
5
5
|
module SAX
|
6
6
|
###
|
7
|
-
# This parser is a SAX style parser that reads
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# the Nokogiri::XML::SAX::Document.
|
7
|
+
# This parser is a SAX style parser that reads its input as it deems necessary. The parser
|
8
|
+
# takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an XML input, sends
|
9
|
+
# messages to the Nokogiri::XML::SAX::Document.
|
11
10
|
#
|
12
11
|
# Here is an example of using this parser:
|
13
12
|
#
|
14
13
|
# # Create a subclass of Nokogiri::XML::SAX::Document and implement
|
15
14
|
# # the events we care about:
|
16
|
-
# class
|
15
|
+
# class MyHandler < Nokogiri::XML::SAX::Document
|
17
16
|
# def start_element name, attrs = []
|
18
17
|
# puts "starting: #{name}"
|
19
18
|
# end
|
@@ -23,20 +22,28 @@ module Nokogiri
|
|
23
22
|
# end
|
24
23
|
# end
|
25
24
|
#
|
26
|
-
#
|
27
|
-
# parser = Nokogiri::XML::SAX::Parser.new(MyDoc.new)
|
25
|
+
# parser = Nokogiri::XML::SAX::Parser.new(MyHandler.new)
|
28
26
|
#
|
29
|
-
# #
|
30
|
-
#
|
27
|
+
# # Hand an IO object to the parser, which will read the XML from the IO.
|
28
|
+
# File.open(path_to_xml) do |f|
|
29
|
+
# parser.parse(f)
|
30
|
+
# end
|
31
|
+
#
|
32
|
+
# For more information about \SAX parsers, see Nokogiri::XML::SAX.
|
33
|
+
#
|
34
|
+
# Also see Nokogiri::XML::SAX::Document for the available events.
|
35
|
+
#
|
36
|
+
# For \HTML documents, use the subclass Nokogiri::HTML4::SAX::Parser.
|
31
37
|
#
|
32
|
-
# For more information about SAX parsers, see Nokogiri::XML::SAX. Also
|
33
|
-
# see Nokogiri::XML::SAX::Document for the available events.
|
34
38
|
class Parser
|
39
|
+
# to dynamically resolve ParserContext in inherited methods
|
40
|
+
include Nokogiri::ClassResolver
|
41
|
+
|
42
|
+
# Structure used for marshalling attributes for some callbacks in XML::SAX::Document.
|
35
43
|
class Attribute < Struct.new(:localname, :prefix, :uri, :value)
|
36
44
|
end
|
37
45
|
|
38
|
-
|
39
|
-
ENCODINGS = {
|
46
|
+
ENCODINGS = { # :nodoc:
|
40
47
|
"NONE" => 0, # No char encoding detected
|
41
48
|
"UTF-8" => 1, # UTF-8
|
42
49
|
"UTF16LE" => 2, # UTF-16 little endian
|
@@ -61,6 +68,8 @@ module Nokogiri
|
|
61
68
|
"EUC-JP" => 21, # EUC-JP
|
62
69
|
"ASCII" => 22, # pure ASCII
|
63
70
|
}
|
71
|
+
REVERSE_ENCODINGS = ENCODINGS.invert # :nodoc:
|
72
|
+
deprecate_constant :ENCODINGS
|
64
73
|
|
65
74
|
# The Nokogiri::XML::SAX::Document where events will be sent.
|
66
75
|
attr_accessor :document
|
@@ -68,57 +77,122 @@ module Nokogiri
|
|
68
77
|
# The encoding being used for this document.
|
69
78
|
attr_accessor :encoding
|
70
79
|
|
71
|
-
|
72
|
-
|
73
|
-
|
80
|
+
###
|
81
|
+
# :call-seq:
|
82
|
+
# new ⇒ SAX::Parser
|
83
|
+
# new(handler) ⇒ SAX::Parser
|
84
|
+
# new(handler, encoding) ⇒ SAX::Parser
|
85
|
+
#
|
86
|
+
# Create a new Parser.
|
87
|
+
#
|
88
|
+
# [Parameters]
|
89
|
+
# - +handler+ (optional Nokogiri::XML::SAX::Document) The document that will receive
|
90
|
+
# events. Will create a new Nokogiri::XML::SAX::Document if not given, which is accessible
|
91
|
+
# through the #document attribute.
|
92
|
+
# - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
|
93
|
+
# parsing the input. (default +nil+ for auto-detection)
|
94
|
+
#
|
95
|
+
def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = nil)
|
96
|
+
@encoding = encoding
|
74
97
|
@document = doc
|
75
98
|
@warned = false
|
99
|
+
|
100
|
+
initialize_native unless Nokogiri.jruby?
|
76
101
|
end
|
77
102
|
|
78
103
|
###
|
79
|
-
#
|
80
|
-
#
|
81
|
-
|
82
|
-
|
83
|
-
|
104
|
+
# :call-seq:
|
105
|
+
# parse(input) { |parser_context| ... }
|
106
|
+
#
|
107
|
+
# Parse the input, sending events to the SAX::Document at #document.
|
108
|
+
#
|
109
|
+
# [Parameters]
|
110
|
+
# - +input+ (String, IO) The input to parse.
|
111
|
+
#
|
112
|
+
# If +input+ quacks like a readable IO object, this method forwards to Parser#parse_io,
|
113
|
+
# otherwise it forwards to Parser#parse_memory.
|
114
|
+
#
|
115
|
+
# [Yields]
|
116
|
+
# If a block is given, the underlying ParserContext object will be yielded. This can be used
|
117
|
+
# to set options on the parser context before parsing begins.
|
118
|
+
#
|
119
|
+
def parse(input, &block)
|
120
|
+
if input.respond_to?(:read) && input.respond_to?(:close)
|
121
|
+
parse_io(input, &block)
|
84
122
|
else
|
85
|
-
parse_memory(
|
123
|
+
parse_memory(input, &block)
|
86
124
|
end
|
87
125
|
end
|
88
126
|
|
89
127
|
###
|
90
|
-
#
|
128
|
+
# :call-seq:
|
129
|
+
# parse_io(io) { |parser_context| ... }
|
130
|
+
# parse_io(io, encoding) { |parser_context| ... }
|
131
|
+
#
|
132
|
+
# Parse an input stream.
|
133
|
+
#
|
134
|
+
# [Parameters]
|
135
|
+
# - +io+ (IO) The readable IO object from which to read input
|
136
|
+
# - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
|
137
|
+
# parsing the input, or +nil+ for auto-detection. (default #encoding)
|
138
|
+
#
|
139
|
+
# [Yields]
|
140
|
+
# If a block is given, the underlying ParserContext object will be yielded. This can be used
|
141
|
+
# to set options on the parser context before parsing begins.
|
142
|
+
#
|
91
143
|
def parse_io(io, encoding = @encoding)
|
92
|
-
ctx = ParserContext.io(io,
|
144
|
+
ctx = related_class("ParserContext").io(io, encoding)
|
93
145
|
yield ctx if block_given?
|
94
146
|
ctx.parse_with(self)
|
95
147
|
end
|
96
148
|
|
97
149
|
###
|
98
|
-
#
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
150
|
+
# :call-seq:
|
151
|
+
# parse_memory(input) { |parser_context| ... }
|
152
|
+
# parse_memory(input, encoding) { |parser_context| ... }
|
153
|
+
#
|
154
|
+
# Parse an input string.
|
155
|
+
#
|
156
|
+
# [Parameters]
|
157
|
+
# - +input+ (String) The input string to be parsed.
|
158
|
+
# - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
|
159
|
+
# parsing the input, or +nil+ for auto-detection. (default #encoding)
|
160
|
+
#
|
161
|
+
# [Yields]
|
162
|
+
# If a block is given, the underlying ParserContext object will be yielded. This can be used
|
163
|
+
# to set options on the parser context before parsing begins.
|
164
|
+
#
|
165
|
+
def parse_memory(input, encoding = @encoding)
|
166
|
+
ctx = related_class("ParserContext").memory(input, encoding)
|
105
167
|
yield ctx if block_given?
|
106
168
|
ctx.parse_with(self)
|
107
169
|
end
|
108
170
|
|
109
|
-
|
110
|
-
|
171
|
+
###
|
172
|
+
# :call-seq:
|
173
|
+
# parse_file(filename) { |parser_context| ... }
|
174
|
+
# parse_file(filename, encoding) { |parser_context| ... }
|
175
|
+
#
|
176
|
+
# Parse a file.
|
177
|
+
#
|
178
|
+
# [Parameters]
|
179
|
+
# - +filename+ (String) The path to the file to be parsed.
|
180
|
+
# - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
|
181
|
+
# parsing the input, or +nil+ for auto-detection. (default #encoding)
|
182
|
+
#
|
183
|
+
# [Yields]
|
184
|
+
# If a block is given, the underlying ParserContext object will be yielded. This can be used
|
185
|
+
# to set options on the parser context before parsing begins.
|
186
|
+
#
|
187
|
+
def parse_file(filename, encoding = @encoding)
|
188
|
+
raise ArgumentError, "no filename provided" unless filename
|
189
|
+
raise Errno::ENOENT unless File.exist?(filename)
|
190
|
+
raise Errno::EISDIR if File.directory?(filename)
|
191
|
+
|
192
|
+
ctx = related_class("ParserContext").file(filename, encoding)
|
111
193
|
yield ctx if block_given?
|
112
194
|
ctx.parse_with(self)
|
113
195
|
end
|
114
|
-
|
115
|
-
private
|
116
|
-
|
117
|
-
def check_encoding(encoding)
|
118
|
-
encoding.upcase.tap do |enc|
|
119
|
-
raise ArgumentError, "'#{enc}' is not a valid encoding" unless ENCODINGS[enc]
|
120
|
-
end
|
121
|
-
end
|
122
196
|
end
|
123
197
|
end
|
124
198
|
end
|