nokogiri 1.16.8-x86_64-darwin → 1.17.0-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +11 -21
- data/README.md +4 -0
- data/dependencies.yml +6 -6
- data/ext/nokogiri/extconf.rb +191 -137
- data/ext/nokogiri/gumbo.c +69 -53
- data/ext/nokogiri/html4_document.c +10 -4
- data/ext/nokogiri/html4_element_description.c +18 -18
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +48 -58
- data/ext/nokogiri/html4_sax_push_parser.c +25 -24
- data/ext/nokogiri/include/libexslt/exsltconfig.h +3 -3
- data/ext/nokogiri/include/libxml2/libxml/HTMLparser.h +12 -19
- data/ext/nokogiri/include/libxml2/libxml/c14n.h +1 -12
- data/ext/nokogiri/include/libxml2/libxml/debugXML.h +1 -1
- data/ext/nokogiri/include/libxml2/libxml/encoding.h +9 -0
- data/ext/nokogiri/include/libxml2/libxml/entities.h +12 -1
- data/ext/nokogiri/include/libxml2/libxml/hash.h +19 -0
- data/ext/nokogiri/include/libxml2/libxml/list.h +2 -2
- data/ext/nokogiri/include/libxml2/libxml/nanohttp.h +17 -0
- data/ext/nokogiri/include/libxml2/libxml/parser.h +60 -54
- data/ext/nokogiri/include/libxml2/libxml/parserInternals.h +9 -1
- data/ext/nokogiri/include/libxml2/libxml/pattern.h +6 -0
- data/ext/nokogiri/include/libxml2/libxml/tree.h +32 -12
- data/ext/nokogiri/include/libxml2/libxml/uri.h +11 -0
- data/ext/nokogiri/include/libxml2/libxml/valid.h +29 -2
- data/ext/nokogiri/include/libxml2/libxml/xinclude.h +7 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlIO.h +21 -4
- data/ext/nokogiri/include/libxml2/libxml/xmlerror.h +14 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlexports.h +111 -15
- data/ext/nokogiri/include/libxml2/libxml/xmlmemory.h +8 -45
- data/ext/nokogiri/include/libxml2/libxml/xmlreader.h +2 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlsave.h +5 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlunicode.h +165 -1
- data/ext/nokogiri/include/libxml2/libxml/xmlversion.h +7 -171
- data/ext/nokogiri/include/libxml2/libxml/xmlwriter.h +1 -0
- data/ext/nokogiri/include/libxml2/libxml/xpath.h +4 -0
- data/ext/nokogiri/include/libxslt/xsltInternals.h +3 -0
- data/ext/nokogiri/include/libxslt/xsltconfig.h +4 -37
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +9 -2
- data/ext/nokogiri/nokogiri.h +18 -33
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_cdata.c +2 -10
- data/ext/nokogiri/xml_comment.c +3 -8
- data/ext/nokogiri/xml_document.c +163 -156
- data/ext/nokogiri/xml_document_fragment.c +10 -25
- data/ext/nokogiri/xml_dtd.c +1 -1
- data/ext/nokogiri/xml_element_content.c +9 -9
- data/ext/nokogiri/xml_encoding_handler.c +4 -4
- data/ext/nokogiri/xml_namespace.c +6 -6
- data/ext/nokogiri/xml_node.c +130 -104
- data/ext/nokogiri/xml_node_set.c +46 -44
- data/ext/nokogiri/xml_reader.c +54 -58
- data/ext/nokogiri/xml_relax_ng.c +35 -56
- data/ext/nokogiri/xml_sax_parser.c +156 -88
- data/ext/nokogiri/xml_sax_parser_context.c +213 -131
- data/ext/nokogiri/xml_sax_push_parser.c +68 -49
- data/ext/nokogiri/xml_schema.c +50 -85
- data/ext/nokogiri/xml_syntax_error.c +19 -11
- data/ext/nokogiri/xml_text.c +2 -4
- data/ext/nokogiri/xml_xpath_context.c +2 -2
- data/ext/nokogiri/xslt_stylesheet.c +8 -8
- data/lib/nokogiri/3.0/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.1/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.2/nokogiri.bundle +0 -0
- data/lib/nokogiri/3.3/nokogiri.bundle +0 -0
- data/lib/nokogiri/class_resolver.rb +1 -1
- data/lib/nokogiri/css/node.rb +6 -2
- data/lib/nokogiri/css/parser.rb +6 -4
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +6 -66
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -4
- data/lib/nokogiri/css/tokenizer.rex +9 -8
- data/lib/nokogiri/css/xpath_visitor.rb +42 -6
- data/lib/nokogiri/css.rb +86 -20
- data/lib/nokogiri/decorators/slop.rb +3 -5
- data/lib/nokogiri/encoding_handler.rb +2 -2
- data/lib/nokogiri/html4/document.rb +44 -23
- data/lib/nokogiri/html4/document_fragment.rb +124 -12
- data/lib/nokogiri/html4/encoding_reader.rb +1 -1
- data/lib/nokogiri/html4/sax/parser.rb +23 -38
- data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
- data/lib/nokogiri/html4.rb +9 -14
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +61 -30
- data/lib/nokogiri/html5/document_fragment.rb +130 -20
- data/lib/nokogiri/html5/node.rb +4 -4
- data/lib/nokogiri/html5.rb +114 -72
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/xml/builder.rb +8 -1
- data/lib/nokogiri/xml/document.rb +70 -26
- data/lib/nokogiri/xml/document_fragment.rb +84 -13
- data/lib/nokogiri/xml/node.rb +82 -11
- data/lib/nokogiri/xml/node_set.rb +9 -7
- data/lib/nokogiri/xml/parse_options.rb +1 -1
- data/lib/nokogiri/xml/pp/node.rb +6 -1
- data/lib/nokogiri/xml/reader.rb +46 -13
- data/lib/nokogiri/xml/relax_ng.rb +57 -20
- data/lib/nokogiri/xml/sax/document.rb +174 -83
- data/lib/nokogiri/xml/sax/parser.rb +115 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
- data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
- data/lib/nokogiri/xml/sax.rb +48 -0
- data/lib/nokogiri/xml/schema.rb +112 -45
- data/lib/nokogiri/xml/searchable.rb +6 -8
- data/lib/nokogiri/xml/syntax_error.rb +22 -0
- data/lib/nokogiri/xml.rb +13 -24
- data/lib/nokogiri/xslt.rb +3 -9
- data/lib/xsd/xmlparser/nokogiri.rb +3 -4
- metadata +8 -4
- data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
@@ -3,36 +3,73 @@
|
|
3
3
|
module Nokogiri
|
4
4
|
module XML
|
5
5
|
class << self
|
6
|
-
|
7
|
-
#
|
8
|
-
#
|
9
|
-
|
10
|
-
|
6
|
+
# :call-seq:
|
7
|
+
# RelaxNG(input) → Nokogiri::XML::RelaxNG
|
8
|
+
# RelaxNG(input, options:) → Nokogiri::XML::RelaxNG
|
9
|
+
#
|
10
|
+
# Convenience method for Nokogiri::XML::RelaxNG.new
|
11
|
+
def RelaxNG(...)
|
12
|
+
RelaxNG.new(...)
|
11
13
|
end
|
12
14
|
end
|
13
15
|
|
14
|
-
|
15
|
-
# Nokogiri::XML::RelaxNG is used for validating XML against a
|
16
|
-
# RelaxNG schema.
|
16
|
+
# Nokogiri::XML::RelaxNG is used for validating \XML against a RELAX NG schema definition.
|
17
17
|
#
|
18
|
-
#
|
18
|
+
# 🛡 <b>Do not use this class for untrusted schema documents.</b> RELAX NG input is always
|
19
|
+
# treated as *trusted*, meaning that the underlying parsing libraries <b>will access network
|
20
|
+
# resources</b>. This is counter to Nokogiri's "untrusted by default" security policy, but is an
|
21
|
+
# unfortunate limitation of the underlying libraries.
|
19
22
|
#
|
20
|
-
#
|
21
|
-
# that are returned and print them out:
|
23
|
+
# *Example:* Determine whether an \XML document is valid.
|
22
24
|
#
|
23
|
-
# schema
|
24
|
-
# doc
|
25
|
+
# schema = Nokogiri::XML::RelaxNG.new(File.read(RELAX_NG_FILE))
|
26
|
+
# doc = Nokogiri::XML::Document.parse(File.read(XML_FILE))
|
27
|
+
# schema.valid?(doc) # Boolean
|
25
28
|
#
|
26
|
-
#
|
27
|
-
# puts error.message
|
28
|
-
# end
|
29
|
+
# *Example:* Validate an \XML document against a \RelaxNG schema, and capture any errors that are found.
|
29
30
|
#
|
30
|
-
#
|
31
|
+
# schema = Nokogiri::XML::RelaxNG.new(File.open(RELAX_NG_FILE))
|
32
|
+
# doc = Nokogiri::XML::Document.parse(File.open(XML_FILE))
|
33
|
+
# errors = schema.validate(doc) # Array<SyntaxError>
|
34
|
+
#
|
35
|
+
# *Example:* Validate an \XML document using a Document containing a RELAX NG schema definition.
|
36
|
+
#
|
37
|
+
# schema_doc = Nokogiri::XML::Document.parse(File.read(RELAX_NG_FILE))
|
38
|
+
# schema = Nokogiri::XML::RelaxNG.from_document(schema_doc)
|
39
|
+
# doc = Nokogiri::XML::Document.parse(File.open(XML_FILE))
|
40
|
+
# schema.valid?(doc) # Boolean
|
31
41
|
#
|
32
|
-
# NOTE: RelaxNG input is always treated as TRUSTED documents, meaning that they will cause the
|
33
|
-
# underlying parsing libraries to access network resources. This is counter to Nokogiri's
|
34
|
-
# "untrusted by default" security policy, but is a limitation of the underlying libraries.
|
35
42
|
class RelaxNG < Nokogiri::XML::Schema
|
43
|
+
# :call-seq:
|
44
|
+
# new(input) → Nokogiri::XML::RelaxNG
|
45
|
+
# new(input, options:) → Nokogiri::XML::RelaxNG
|
46
|
+
#
|
47
|
+
# Parse a RELAX NG schema definition from a String or IO to create a new Nokogiri::XML::RelaxNG.
|
48
|
+
#
|
49
|
+
# [Parameters]
|
50
|
+
# - +input+ (String | IO) RELAX NG schema definition
|
51
|
+
# - +options:+ (Nokogiri::XML::ParseOptions)
|
52
|
+
# Defaults to Nokogiri::XML::ParseOptions::DEFAULT_SCHEMA ⚠ Unused
|
53
|
+
#
|
54
|
+
# [Returns] Nokogiri::XML::RelaxNG
|
55
|
+
#
|
56
|
+
# ⚠ +options+ is currently unused by this method and is present only as a placeholder for
|
57
|
+
# future functionality.
|
58
|
+
#
|
59
|
+
# Also see convenience method Nokogiri::XML::RelaxNG()
|
60
|
+
def self.new(input, parse_options_ = ParseOptions::DEFAULT_SCHEMA, options: parse_options_)
|
61
|
+
from_document(Nokogiri::XML::Document.parse(input), options)
|
62
|
+
end
|
63
|
+
|
64
|
+
# :call-seq:
|
65
|
+
# read_memory(input) → Nokogiri::XML::RelaxNG
|
66
|
+
# read_memory(input, options:) → Nokogiri::XML::RelaxNG
|
67
|
+
#
|
68
|
+
# Convenience method for Nokogiri::XML::RelaxNG.new.
|
69
|
+
def self.read_memory(...)
|
70
|
+
# TODO deprecate this method
|
71
|
+
new(...)
|
72
|
+
end
|
36
73
|
end
|
37
74
|
end
|
38
75
|
end
|
@@ -2,106 +2,168 @@
|
|
2
2
|
|
3
3
|
module Nokogiri
|
4
4
|
module XML
|
5
|
-
###
|
6
|
-
# SAX Parsers are event driven parsers. Nokogiri provides two different event based parsers when
|
7
|
-
# dealing with XML. If you want to do SAX style parsing using HTML, check out
|
8
|
-
# Nokogiri::HTML4::SAX.
|
9
|
-
#
|
10
|
-
# The basic way a SAX style parser works is by creating a parser, telling the parser about the
|
11
|
-
# events we're interested in, then giving the parser some XML to process. The parser will notify
|
12
|
-
# you when it encounters events you said you would like to know about.
|
13
|
-
#
|
14
|
-
# To register for events, you simply subclass Nokogiri::XML::SAX::Document, and implement the
|
15
|
-
# methods for which you would like notification.
|
16
|
-
#
|
17
|
-
# For example, if I want to be notified when a document ends, and when an element starts, I
|
18
|
-
# would write a class like this:
|
19
|
-
#
|
20
|
-
# class MyDocument < Nokogiri::XML::SAX::Document
|
21
|
-
# def end_document
|
22
|
-
# puts "the document has ended"
|
23
|
-
# end
|
24
|
-
#
|
25
|
-
# def start_element name, attributes = []
|
26
|
-
# puts "#{name} started"
|
27
|
-
# end
|
28
|
-
# end
|
29
|
-
#
|
30
|
-
# Then I would instantiate a SAX parser with this document, and feed the parser some XML
|
31
|
-
#
|
32
|
-
# # Create a new parser
|
33
|
-
# parser = Nokogiri::XML::SAX::Parser.new(MyDocument.new)
|
34
|
-
#
|
35
|
-
# # Feed the parser some XML
|
36
|
-
# parser.parse(File.open(ARGV[0]))
|
37
|
-
#
|
38
|
-
# Now my document handler will be called when each node starts, and when the document ends. To
|
39
|
-
# see what kinds of events are available, take a look at Nokogiri::XML::SAX::Document.
|
40
|
-
#
|
41
|
-
# Two SAX parsers for XML are available, a parser that reads from a string or IO object as it
|
42
|
-
# feels necessary, and a parser that lets you spoon feed it XML. If you want to let Nokogiri
|
43
|
-
# deal with reading your XML, use the Nokogiri::XML::SAX::Parser. If you want to have fine grain
|
44
|
-
# control over the XML input, use the Nokogiri::XML::SAX::PushParser.
|
45
5
|
module SAX
|
46
|
-
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
6
|
+
# :markup: markdown
|
7
|
+
#
|
8
|
+
# The SAX::Document class is used for registering types of events you are interested in
|
9
|
+
# handling. All of the methods on this class are available as possible events while parsing an
|
10
|
+
# \XML document. To register for any particular event, subclass this class and implement the
|
11
|
+
# methods you are interested in knowing about.
|
51
12
|
#
|
52
13
|
# To only be notified about start and end element events, write a class like this:
|
53
14
|
#
|
54
|
-
#
|
55
|
-
#
|
56
|
-
#
|
57
|
-
#
|
15
|
+
# class MyHandler < Nokogiri::XML::SAX::Document
|
16
|
+
# def start_element name, attrs = []
|
17
|
+
# puts "#{name} started!"
|
18
|
+
# end
|
58
19
|
#
|
59
|
-
#
|
60
|
-
#
|
20
|
+
# def end_element name
|
21
|
+
# puts "#{name} ended"
|
22
|
+
# end
|
61
23
|
# end
|
62
|
-
# end
|
63
24
|
#
|
64
|
-
# You can use this event handler for any SAX
|
65
|
-
#
|
25
|
+
# You can use this event handler for any SAX-style parser included with Nokogiri.
|
26
|
+
#
|
27
|
+
# See also:
|
28
|
+
#
|
29
|
+
# - Nokogiri::XML::SAX
|
30
|
+
# - Nokogiri::HTML4::SAX
|
31
|
+
#
|
32
|
+
# ### Entity Handling
|
33
|
+
#
|
34
|
+
# ⚠ Entity handling is complicated in a SAX parser! Please read this section carefully if
|
35
|
+
# you're not getting the behavior you expect.
|
36
|
+
#
|
37
|
+
# Entities will be reported to the user via callbacks to #characters, to #reference, or
|
38
|
+
# possibly to both. The behavior is determined by a combination of _entity type_ and the value
|
39
|
+
# of ParserContext#replace_entities. (Recall that the default value of
|
40
|
+
# ParserContext#replace_entities is `false`.)
|
41
|
+
#
|
42
|
+
# ⚠ <b>It is UNSAFE to set ParserContext#replace_entities to `true`</b> when parsing untrusted
|
43
|
+
# documents.
|
44
|
+
#
|
45
|
+
# 💡 For more information on entity types, see [Wikipedia's page on
|
46
|
+
# DTDs](https://en.wikipedia.org/wiki/Document_type_definition#Entity_declarations).
|
47
|
+
#
|
48
|
+
# | Entity type | #characters | #reference |
|
49
|
+
# |--------------------------------------|------------------------------------|-------------------------------------|
|
50
|
+
# | Char ref (e.g., <tt>&#146;</tt>) | always | never |
|
51
|
+
# | Predefined (e.g., <tt>&amp;</tt>) | always | never |
|
52
|
+
# | Undeclared † | never | <tt>#replace_entities == false</tt> |
|
53
|
+
# | Internal | always | <tt>#replace_entities == false</tt> |
|
54
|
+
# | External † | <tt>#replace_entities == true</tt> | <tt>#replace_entities == false</tt> |
|
55
|
+
#
|
56
|
+
#
|
57
|
+
#
|
58
|
+
# † In the case where the replacement text for the entity is unknown (e.g., an undeclared entity
|
59
|
+
# or an external entity that could not be resolved because of network issues), then the
|
60
|
+
# replacement text will not be reported. If ParserContext#replace_entities is `true`, this
|
61
|
+
# means the #characters callback will not be invoked. If ParserContext#replace_entities is
|
62
|
+
# `false`, then the #reference callback will be invoked, but with `nil` for the `content`
|
63
|
+
# argument.
|
64
|
+
#
|
66
65
|
class Document
|
67
66
|
###
|
68
|
-
# Called when an XML declaration is parsed
|
67
|
+
# Called when an \XML declaration is parsed.
|
68
|
+
#
|
69
|
+
# [Parameters]
|
70
|
+
# - +version+ (String) the version attribute
|
71
|
+
# - +encoding+ (String, nil) the encoding of the document if present, else +nil+
|
72
|
+
# - +standalone+ ("yes", "no", nil) the standalone attribute if present, else +nil+
|
69
73
|
def xmldecl(version, encoding, standalone)
|
70
74
|
end
|
71
75
|
|
72
76
|
###
|
73
|
-
# Called when document starts parsing
|
77
|
+
# Called when document starts parsing.
|
74
78
|
def start_document
|
75
79
|
end
|
76
80
|
|
77
81
|
###
|
78
|
-
# Called when document ends parsing
|
82
|
+
# Called when document ends parsing.
|
79
83
|
def end_document
|
80
84
|
end
|
81
85
|
|
82
86
|
###
|
83
|
-
# Called at the beginning of an element
|
84
|
-
#
|
85
|
-
#
|
87
|
+
# Called at the beginning of an element.
|
88
|
+
#
|
89
|
+
# [Parameters]
|
90
|
+
# - +name+ (String) the name of the element
|
91
|
+
# - +attrs+ (Array<Array<String>>) an assoc list of namespace declarations and attributes, e.g.:
|
86
92
|
# [ ["xmlns:foo", "http://sample.net"], ["size", "large"] ]
|
93
|
+
#
|
94
|
+
# 💡 If you're dealing with XML and need to handle namespaces, use the
|
95
|
+
# #start_element_namespace method instead.
|
96
|
+
#
|
97
|
+
# Note that the element namespace and any attribute namespaces are not provided, and so any
|
98
|
+
# namespaced elements or attributes will be returned as strings including the prefix:
|
99
|
+
#
|
100
|
+
# parser.parse(<<~XML)
|
101
|
+
# <root xmlns:foo='http://foo.example.com/' xmlns='http://example.com/'>
|
102
|
+
# <foo:bar foo:quux="xxx">hello world</foo:bar>
|
103
|
+
# </root>
|
104
|
+
# XML
|
105
|
+
#
|
106
|
+
# assert_pattern do
|
107
|
+
# parser.document.start_elements => [
|
108
|
+
# ["root", [["xmlns:foo", "http://foo.example.com/"], ["xmlns", "http://example.com/"]]],
|
109
|
+
# ["foo:bar", [["foo:quux", "xxx"]]],
|
110
|
+
# ]
|
111
|
+
# end
|
112
|
+
#
|
87
113
|
def start_element(name, attrs = [])
|
88
114
|
end
|
89
115
|
|
90
116
|
###
|
91
|
-
# Called at the end of an element
|
92
|
-
#
|
117
|
+
# Called at the end of an element.
|
118
|
+
#
|
119
|
+
# [Parameters]
|
120
|
+
# - +name+ (String) the name of the element being closed
|
121
|
+
#
|
93
122
|
def end_element(name)
|
94
123
|
end
|
95
124
|
|
96
125
|
###
|
97
|
-
# Called at the beginning of an element
|
98
|
-
#
|
99
|
-
#
|
100
|
-
# +
|
101
|
-
# +
|
102
|
-
# +
|
126
|
+
# Called at the beginning of an element.
|
127
|
+
#
|
128
|
+
# [Parameters]
|
129
|
+
# - +name+ (String) is the name of the element
|
130
|
+
# - +attrs+ (Array<Attribute>) is an array of structs with the following properties:
|
131
|
+
# - +localname+ (String) the local name of the attribute
|
132
|
+
# - +value+ (String) the value of the attribute
|
133
|
+
# - +prefix+ (String, nil) the namespace prefix of the attribute
|
134
|
+
# - +uri+ (String, nil) the namespace URI of the attribute
|
135
|
+
# - +prefix+ (String, nil) is the namespace prefix for the element
|
136
|
+
# - +uri+ (String, nil) is the associated URI for the element's namespace
|
137
|
+
# - +ns+ (Array<Array<String, String>>) is an assoc list of namespace declarations on the element
|
138
|
+
#
|
139
|
+
# 💡 If you're dealing with HTML or don't care about namespaces, try #start_element instead.
|
140
|
+
#
|
141
|
+
# [Example]
|
142
|
+
# it "start_elements_namespace is called with namespaced attributes" do
|
143
|
+
# parser.parse(<<~XML)
|
144
|
+
# <root xmlns:foo='http://foo.example.com/'>
|
145
|
+
# <foo:a foo:bar='hello' />
|
146
|
+
# </root>
|
147
|
+
# XML
|
148
|
+
#
|
149
|
+
# assert_pattern do
|
150
|
+
# parser.document.start_elements_namespace => [
|
151
|
+
# [
|
152
|
+
# "root",
|
153
|
+
# [],
|
154
|
+
# nil, nil,
|
155
|
+
# [["foo", "http://foo.example.com/"]], # namespace declarations
|
156
|
+
# ], [
|
157
|
+
# "a",
|
158
|
+
# [Nokogiri::XML::SAX::Parser::Attribute(localname: "bar", prefix: "foo", uri: "http://foo.example.com/", value: "hello")], # prefixed attribute
|
159
|
+
# "foo", "http://foo.example.com/", # prefix and uri for the "a" element
|
160
|
+
# [],
|
161
|
+
# ]
|
162
|
+
# ]
|
163
|
+
# end
|
164
|
+
# end
|
165
|
+
#
|
103
166
|
def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = []) # rubocop:disable Metrics/ParameterLists
|
104
|
-
###
|
105
167
|
# Deal with SAX v1 interface
|
106
168
|
name = [prefix, name].compact.join(":")
|
107
169
|
attributes = ns.map do |ns_prefix, ns_uri|
|
@@ -113,52 +175,81 @@ module Nokogiri
|
|
113
175
|
end
|
114
176
|
|
115
177
|
###
|
116
|
-
# Called at the end of an element
|
117
|
-
#
|
118
|
-
#
|
119
|
-
# +
|
178
|
+
# Called at the end of an element.
|
179
|
+
#
|
180
|
+
# [Parameters]
|
181
|
+
# - +name+ (String) is the name of the element
|
182
|
+
# - +prefix+ (String, nil) is the namespace prefix for the element
|
183
|
+
# - +uri+ (String, nil) is the associated URI for the element's namespace
|
184
|
+
#
|
120
185
|
def end_element_namespace(name, prefix = nil, uri = nil)
|
121
|
-
###
|
122
186
|
# Deal with SAX v1 interface
|
123
187
|
end_element([prefix, name].compact.join(":"))
|
124
188
|
end
|
125
189
|
|
126
190
|
###
|
127
|
-
#
|
128
|
-
#
|
191
|
+
# Called when character data is parsed, and for parsed entities when
|
192
|
+
# ParserContext#replace_entities is +true+.
|
193
|
+
#
|
194
|
+
# [Parameters]
|
195
|
+
# - +string+ contains the character data or entity replacement text
|
196
|
+
#
|
197
|
+
# ⚠ Please see Document@Entity+Handling for important information about how entities are handled.
|
198
|
+
#
|
199
|
+
# ⚠ This method might be called multiple times for a contiguous string of characters.
|
129
200
|
#
|
130
|
-
# +string+ contains the character data
|
131
201
|
def characters(string)
|
132
202
|
end
|
133
203
|
|
204
|
+
###
|
205
|
+
# Called when a parsed entity is referenced and not replaced.
|
206
|
+
#
|
207
|
+
# [Parameters]
|
208
|
+
# - +name+ (String) is the name of the entity
|
209
|
+
# - +content+ (String, nil) is the replacement text for the entity, if known
|
210
|
+
#
|
211
|
+
# ⚠ Please see Document@Entity+Handling for important information about how entities are handled.
|
212
|
+
#
|
213
|
+
# ⚠ An internal entity may result in a call to both #characters and #reference.
|
214
|
+
#
|
215
|
+
# Since v1.17.0
|
216
|
+
#
|
217
|
+
def reference(name, content)
|
218
|
+
end
|
219
|
+
|
134
220
|
###
|
135
221
|
# Called when comments are encountered
|
136
|
-
#
|
222
|
+
# [Parameters]
|
223
|
+
# - +string+ contains the comment data
|
137
224
|
def comment(string)
|
138
225
|
end
|
139
226
|
|
140
227
|
###
|
141
228
|
# Called on document warnings
|
142
|
-
#
|
229
|
+
# [Parameters]
|
230
|
+
# - +string+ contains the warning
|
143
231
|
def warning(string)
|
144
232
|
end
|
145
233
|
|
146
234
|
###
|
147
235
|
# Called on document errors
|
148
|
-
#
|
236
|
+
# [Parameters]
|
237
|
+
# - +string+ contains the error
|
149
238
|
def error(string)
|
150
239
|
end
|
151
240
|
|
152
241
|
###
|
153
242
|
# Called when cdata blocks are found
|
154
|
-
#
|
243
|
+
# [Parameters]
|
244
|
+
# - +string+ contains the cdata content
|
155
245
|
def cdata_block(string)
|
156
246
|
end
|
157
247
|
|
158
248
|
###
|
159
249
|
# Called when processing instructions are found
|
160
|
-
#
|
161
|
-
# +
|
250
|
+
# [Parameters]
|
251
|
+
# - +name+ is the target of the instruction
|
252
|
+
# - +content+ is the value of the instruction
|
162
253
|
def processing_instruction(name, content)
|
163
254
|
end
|
164
255
|
end
|
@@ -4,16 +4,15 @@ module Nokogiri
|
|
4
4
|
module XML
|
5
5
|
module SAX
|
6
6
|
###
|
7
|
-
# This parser is a SAX style parser that reads
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# the Nokogiri::XML::SAX::Document.
|
7
|
+
# This parser is a SAX style parser that reads its input as it deems necessary. The parser
|
8
|
+
# takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an XML input, sends
|
9
|
+
# messages to the Nokogiri::XML::SAX::Document.
|
11
10
|
#
|
12
11
|
# Here is an example of using this parser:
|
13
12
|
#
|
14
13
|
# # Create a subclass of Nokogiri::XML::SAX::Document and implement
|
15
14
|
# # the events we care about:
|
16
|
-
# class
|
15
|
+
# class MyHandler < Nokogiri::XML::SAX::Document
|
17
16
|
# def start_element name, attrs = []
|
18
17
|
# puts "starting: #{name}"
|
19
18
|
# end
|
@@ -23,20 +22,28 @@ module Nokogiri
|
|
23
22
|
# end
|
24
23
|
# end
|
25
24
|
#
|
26
|
-
#
|
27
|
-
# parser = Nokogiri::XML::SAX::Parser.new(MyDoc.new)
|
25
|
+
# parser = Nokogiri::XML::SAX::Parser.new(MyHandler.new)
|
28
26
|
#
|
29
|
-
# #
|
30
|
-
#
|
27
|
+
# # Hand an IO object to the parser, which will read the XML from the IO.
|
28
|
+
# File.open(path_to_xml) do |f|
|
29
|
+
# parser.parse(f)
|
30
|
+
# end
|
31
|
+
#
|
32
|
+
# For more information about \SAX parsers, see Nokogiri::XML::SAX.
|
33
|
+
#
|
34
|
+
# Also see Nokogiri::XML::SAX::Document for the available events.
|
35
|
+
#
|
36
|
+
# For \HTML documents, use the subclass Nokogiri::HTML4::SAX::Parser.
|
31
37
|
#
|
32
|
-
# For more information about SAX parsers, see Nokogiri::XML::SAX. Also
|
33
|
-
# see Nokogiri::XML::SAX::Document for the available events.
|
34
38
|
class Parser
|
39
|
+
# to dynamically resolve ParserContext in inherited methods
|
40
|
+
include Nokogiri::ClassResolver
|
41
|
+
|
42
|
+
# Structure used for marshalling attributes for some callbacks in XML::SAX::Document.
|
35
43
|
class Attribute < Struct.new(:localname, :prefix, :uri, :value)
|
36
44
|
end
|
37
45
|
|
38
|
-
|
39
|
-
ENCODINGS = {
|
46
|
+
ENCODINGS = { # :nodoc:
|
40
47
|
"NONE" => 0, # No char encoding detected
|
41
48
|
"UTF-8" => 1, # UTF-8
|
42
49
|
"UTF16LE" => 2, # UTF-16 little endian
|
@@ -61,6 +68,8 @@ module Nokogiri
|
|
61
68
|
"EUC-JP" => 21, # EUC-JP
|
62
69
|
"ASCII" => 22, # pure ASCII
|
63
70
|
}
|
71
|
+
REVERSE_ENCODINGS = ENCODINGS.invert # :nodoc:
|
72
|
+
deprecate_constant :ENCODINGS
|
64
73
|
|
65
74
|
# The Nokogiri::XML::SAX::Document where events will be sent.
|
66
75
|
attr_accessor :document
|
@@ -68,57 +77,122 @@ module Nokogiri
|
|
68
77
|
# The encoding being used for this document.
|
69
78
|
attr_accessor :encoding
|
70
79
|
|
71
|
-
|
72
|
-
|
73
|
-
|
80
|
+
###
|
81
|
+
# :call-seq:
|
82
|
+
# new ⇒ SAX::Parser
|
83
|
+
# new(handler) ⇒ SAX::Parser
|
84
|
+
# new(handler, encoding) ⇒ SAX::Parser
|
85
|
+
#
|
86
|
+
# Create a new Parser.
|
87
|
+
#
|
88
|
+
# [Parameters]
|
89
|
+
# - +handler+ (optional Nokogiri::XML::SAX::Document) The document that will receive
|
90
|
+
# events. Will create a new Nokogiri::XML::SAX::Document if not given, which is accessible
|
91
|
+
# through the #document attribute.
|
92
|
+
# - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
|
93
|
+
# parsing the input. (default +nil+ for auto-detection)
|
94
|
+
#
|
95
|
+
def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = nil)
|
96
|
+
@encoding = encoding
|
74
97
|
@document = doc
|
75
98
|
@warned = false
|
99
|
+
|
100
|
+
initialize_native unless Nokogiri.jruby?
|
76
101
|
end
|
77
102
|
|
78
103
|
###
|
79
|
-
#
|
80
|
-
#
|
81
|
-
|
82
|
-
|
83
|
-
|
104
|
+
# :call-seq:
|
105
|
+
# parse(input) { |parser_context| ... }
|
106
|
+
#
|
107
|
+
# Parse the input, sending events to the SAX::Document at #document.
|
108
|
+
#
|
109
|
+
# [Parameters]
|
110
|
+
# - +input+ (String, IO) The input to parse.
|
111
|
+
#
|
112
|
+
# If +input+ quacks like a readable IO object, this method forwards to Parser#parse_io,
|
113
|
+
# otherwise it forwards to Parser#parse_memory.
|
114
|
+
#
|
115
|
+
# [Yields]
|
116
|
+
# If a block is given, the underlying ParserContext object will be yielded. This can be used
|
117
|
+
# to set options on the parser context before parsing begins.
|
118
|
+
#
|
119
|
+
def parse(input, &block)
|
120
|
+
if input.respond_to?(:read) && input.respond_to?(:close)
|
121
|
+
parse_io(input, &block)
|
84
122
|
else
|
85
|
-
parse_memory(
|
123
|
+
parse_memory(input, &block)
|
86
124
|
end
|
87
125
|
end
|
88
126
|
|
89
127
|
###
|
90
|
-
#
|
128
|
+
# :call-seq:
|
129
|
+
# parse_io(io) { |parser_context| ... }
|
130
|
+
# parse_io(io, encoding) { |parser_context| ... }
|
131
|
+
#
|
132
|
+
# Parse an input stream.
|
133
|
+
#
|
134
|
+
# [Parameters]
|
135
|
+
# - +io+ (IO) The readable IO object from which to read input
|
136
|
+
# - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
|
137
|
+
# parsing the input, or +nil+ for auto-detection. (default #encoding)
|
138
|
+
#
|
139
|
+
# [Yields]
|
140
|
+
# If a block is given, the underlying ParserContext object will be yielded. This can be used
|
141
|
+
# to set options on the parser context before parsing begins.
|
142
|
+
#
|
91
143
|
def parse_io(io, encoding = @encoding)
|
92
|
-
ctx = ParserContext.io(io,
|
144
|
+
ctx = related_class("ParserContext").io(io, encoding)
|
93
145
|
yield ctx if block_given?
|
94
146
|
ctx.parse_with(self)
|
95
147
|
end
|
96
148
|
|
97
149
|
###
|
98
|
-
#
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
150
|
+
# :call-seq:
|
151
|
+
# parse_memory(input) { |parser_context| ... }
|
152
|
+
# parse_memory(input, encoding) { |parser_context| ... }
|
153
|
+
#
|
154
|
+
# Parse an input string.
|
155
|
+
#
|
156
|
+
# [Parameters]
|
157
|
+
# - +input+ (String) The input string to be parsed.
|
158
|
+
# - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
|
159
|
+
# parsing the input, or +nil+ for auto-detection. (default #encoding)
|
160
|
+
#
|
161
|
+
# [Yields]
|
162
|
+
# If a block is given, the underlying ParserContext object will be yielded. This can be used
|
163
|
+
# to set options on the parser context before parsing begins.
|
164
|
+
#
|
165
|
+
def parse_memory(input, encoding = @encoding)
|
166
|
+
ctx = related_class("ParserContext").memory(input, encoding)
|
105
167
|
yield ctx if block_given?
|
106
168
|
ctx.parse_with(self)
|
107
169
|
end
|
108
170
|
|
109
|
-
|
110
|
-
|
171
|
+
###
|
172
|
+
# :call-seq:
|
173
|
+
# parse_file(filename) { |parser_context| ... }
|
174
|
+
# parse_file(filename, encoding) { |parser_context| ... }
|
175
|
+
#
|
176
|
+
# Parse a file.
|
177
|
+
#
|
178
|
+
# [Parameters]
|
179
|
+
# - +filename+ (String) The path to the file to be parsed.
|
180
|
+
# - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
|
181
|
+
# parsing the input, or +nil+ for auto-detection. (default #encoding)
|
182
|
+
#
|
183
|
+
# [Yields]
|
184
|
+
# If a block is given, the underlying ParserContext object will be yielded. This can be used
|
185
|
+
# to set options on the parser context before parsing begins.
|
186
|
+
#
|
187
|
+
def parse_file(filename, encoding = @encoding)
|
188
|
+
raise ArgumentError, "no filename provided" unless filename
|
189
|
+
raise Errno::ENOENT unless File.exist?(filename)
|
190
|
+
raise Errno::EISDIR if File.directory?(filename)
|
191
|
+
|
192
|
+
ctx = related_class("ParserContext").file(filename, encoding)
|
111
193
|
yield ctx if block_given?
|
112
194
|
ctx.parse_with(self)
|
113
195
|
end
|
114
|
-
|
115
|
-
private
|
116
|
-
|
117
|
-
def check_encoding(encoding)
|
118
|
-
encoding.upcase.tap do |enc|
|
119
|
-
raise ArgumentError, "'#{enc}' is not a valid encoding" unless ENCODINGS[enc]
|
120
|
-
end
|
121
|
-
end
|
122
196
|
end
|
123
197
|
end
|
124
198
|
end
|