nokogiri 1.10.9 → 1.18.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +38 -0
- data/LICENSE-DEPENDENCIES.md +1632 -1022
- data/LICENSE.md +1 -1
- data/README.md +190 -95
- data/bin/nokogiri +63 -50
- data/dependencies.yml +34 -66
- data/ext/nokogiri/depend +38 -358
- data/ext/nokogiri/extconf.rb +909 -422
- data/ext/nokogiri/gumbo.c +610 -0
- data/ext/nokogiri/html4_document.c +171 -0
- data/ext/nokogiri/html4_element_description.c +299 -0
- data/ext/nokogiri/html4_entity_lookup.c +37 -0
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +98 -0
- data/ext/nokogiri/html4_sax_push_parser.c +96 -0
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +258 -105
- data/ext/nokogiri/nokogiri.h +207 -90
- data/ext/nokogiri/test_global_handlers.c +40 -0
- data/ext/nokogiri/xml_attr.c +18 -18
- data/ext/nokogiri/xml_attribute_decl.c +22 -22
- data/ext/nokogiri/xml_cdata.c +33 -33
- data/ext/nokogiri/xml_comment.c +19 -31
- data/ext/nokogiri/xml_document.c +499 -323
- data/ext/nokogiri/xml_document_fragment.c +17 -36
- data/ext/nokogiri/xml_dtd.c +65 -59
- data/ext/nokogiri/xml_element_content.c +63 -55
- data/ext/nokogiri/xml_element_decl.c +31 -31
- data/ext/nokogiri/xml_encoding_handler.c +54 -21
- data/ext/nokogiri/xml_entity_decl.c +37 -35
- data/ext/nokogiri/xml_entity_reference.c +17 -19
- data/ext/nokogiri/xml_namespace.c +131 -61
- data/ext/nokogiri/xml_node.c +1429 -723
- data/ext/nokogiri/xml_node_set.c +257 -225
- data/ext/nokogiri/xml_processing_instruction.c +18 -20
- data/ext/nokogiri/xml_reader.c +340 -231
- data/ext/nokogiri/xml_relax_ng.c +87 -99
- data/ext/nokogiri/xml_sax_parser.c +269 -176
- data/ext/nokogiri/xml_sax_parser_context.c +286 -152
- data/ext/nokogiri/xml_sax_push_parser.c +111 -64
- data/ext/nokogiri/xml_schema.c +132 -140
- data/ext/nokogiri/xml_syntax_error.c +52 -23
- data/ext/nokogiri/xml_text.c +37 -30
- data/ext/nokogiri/xml_xpath_context.c +373 -185
- data/ext/nokogiri/xslt_stylesheet.c +342 -191
- data/gumbo-parser/CHANGES.md +63 -0
- data/gumbo-parser/Makefile +129 -0
- data/gumbo-parser/THANKS +27 -0
- data/gumbo-parser/src/Makefile +34 -0
- data/gumbo-parser/src/README.md +41 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +42 -0
- data/gumbo-parser/src/attribute.h +17 -0
- data/gumbo-parser/src/char_ref.c +22225 -0
- data/gumbo-parser/src/char_ref.h +29 -0
- data/gumbo-parser/src/char_ref.rl +2154 -0
- data/gumbo-parser/src/error.c +658 -0
- data/gumbo-parser/src/error.h +152 -0
- data/gumbo-parser/src/foreign_attrs.c +103 -0
- data/gumbo-parser/src/foreign_attrs.gperf +27 -0
- data/gumbo-parser/src/insertion_mode.h +33 -0
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/nokogiri_gumbo.h +953 -0
- data/gumbo-parser/src/parser.c +4932 -0
- data/gumbo-parser/src/parser.h +41 -0
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +103 -0
- data/gumbo-parser/src/string_buffer.h +68 -0
- data/gumbo-parser/src/string_piece.c +48 -0
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_attrs.gperf +77 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/svg_tags.gperf +55 -0
- data/gumbo-parser/src/tag.c +223 -0
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.gperf +170 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +17 -0
- data/gumbo-parser/src/tokenizer.c +3464 -0
- data/gumbo-parser/src/tokenizer.h +112 -0
- data/gumbo-parser/src/tokenizer_states.h +339 -0
- data/gumbo-parser/src/utf8.c +245 -0
- data/gumbo-parser/src/utf8.h +164 -0
- data/gumbo-parser/src/util.c +66 -0
- data/gumbo-parser/src/util.h +34 -0
- data/gumbo-parser/src/vector.c +111 -0
- data/gumbo-parser/src/vector.h +45 -0
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +14 -8
- data/lib/nokogiri/css/parser.rb +399 -377
- data/lib/nokogiri/css/parser.y +250 -245
- data/lib/nokogiri/css/parser_extras.rb +16 -71
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/syntax_error.rb +3 -1
- data/lib/nokogiri/css/tokenizer.rb +7 -5
- data/lib/nokogiri/css/tokenizer.rex +11 -9
- data/lib/nokogiri/css/xpath_visitor.rb +242 -96
- data/lib/nokogiri/css.rb +122 -17
- data/lib/nokogiri/decorators/slop.rb +11 -11
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +32 -0
- data/lib/nokogiri/gumbo.rb +15 -0
- data/lib/nokogiri/html.rb +38 -27
- data/lib/nokogiri/{html → html4}/builder.rb +4 -2
- data/lib/nokogiri/html4/document.rb +235 -0
- data/lib/nokogiri/html4/document_fragment.rb +166 -0
- data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
- data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
- data/lib/nokogiri/html4/sax/parser.rb +48 -0
- data/lib/nokogiri/html4/sax/parser_context.rb +15 -0
- data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
- data/lib/nokogiri/html4.rb +42 -0
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +199 -0
- data/lib/nokogiri/html5/document_fragment.rb +200 -0
- data/lib/nokogiri/html5/node.rb +103 -0
- data/lib/nokogiri/html5.rb +368 -0
- data/lib/nokogiri/jruby/dependencies.rb +3 -0
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/syntax_error.rb +2 -0
- data/lib/nokogiri/version/constant.rb +6 -0
- data/lib/nokogiri/version/info.rb +224 -0
- data/lib/nokogiri/version.rb +3 -108
- data/lib/nokogiri/xml/attr.rb +55 -3
- data/lib/nokogiri/xml/attribute_decl.rb +6 -2
- data/lib/nokogiri/xml/builder.rb +83 -35
- data/lib/nokogiri/xml/cdata.rb +3 -1
- data/lib/nokogiri/xml/character_data.rb +2 -0
- data/lib/nokogiri/xml/document.rb +359 -130
- data/lib/nokogiri/xml/document_fragment.rb +170 -54
- data/lib/nokogiri/xml/dtd.rb +4 -2
- data/lib/nokogiri/xml/element_content.rb +12 -2
- data/lib/nokogiri/xml/element_decl.rb +6 -2
- data/lib/nokogiri/xml/entity_decl.rb +7 -3
- data/lib/nokogiri/xml/entity_reference.rb +2 -0
- data/lib/nokogiri/xml/namespace.rb +44 -0
- data/lib/nokogiri/xml/node/save_options.rb +23 -8
- data/lib/nokogiri/xml/node.rb +1168 -420
- data/lib/nokogiri/xml/node_set.rb +145 -67
- data/lib/nokogiri/xml/notation.rb +13 -0
- data/lib/nokogiri/xml/parse_options.rb +145 -52
- data/lib/nokogiri/xml/pp/character_data.rb +9 -6
- data/lib/nokogiri/xml/pp/node.rb +47 -30
- data/lib/nokogiri/xml/pp.rb +4 -2
- data/lib/nokogiri/xml/processing_instruction.rb +4 -1
- data/lib/nokogiri/xml/reader.rb +68 -41
- data/lib/nokogiri/xml/relax_ng.rb +60 -17
- data/lib/nokogiri/xml/sax/document.rb +198 -111
- data/lib/nokogiri/xml/sax/parser.rb +144 -67
- data/lib/nokogiri/xml/sax/parser_context.rb +119 -6
- data/lib/nokogiri/xml/sax/push_parser.rb +9 -5
- data/lib/nokogiri/xml/sax.rb +54 -4
- data/lib/nokogiri/xml/schema.rb +116 -39
- data/lib/nokogiri/xml/searchable.rb +139 -95
- data/lib/nokogiri/xml/syntax_error.rb +29 -5
- data/lib/nokogiri/xml/text.rb +2 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
- data/lib/nokogiri/xml/xpath.rb +15 -4
- data/lib/nokogiri/xml/xpath_context.rb +15 -4
- data/lib/nokogiri/xml.rb +45 -55
- data/lib/nokogiri/xslt/stylesheet.rb +32 -8
- data/lib/nokogiri/xslt.rb +103 -30
- data/lib/nokogiri.rb +59 -75
- data/lib/xsd/xmlparser/nokogiri.rb +32 -29
- data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
- data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
- data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
- data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
- data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
- data/ports/archives/libxml2-2.13.6.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
- metadata +123 -295
- data/ext/nokogiri/html_document.c +0 -170
- data/ext/nokogiri/html_document.h +0 -10
- data/ext/nokogiri/html_element_description.c +0 -279
- data/ext/nokogiri/html_element_description.h +0 -10
- data/ext/nokogiri/html_entity_lookup.c +0 -32
- data/ext/nokogiri/html_entity_lookup.h +0 -8
- data/ext/nokogiri/html_sax_parser_context.c +0 -116
- data/ext/nokogiri/html_sax_parser_context.h +0 -11
- data/ext/nokogiri/html_sax_push_parser.c +0 -87
- data/ext/nokogiri/html_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_attr.h +0 -9
- data/ext/nokogiri/xml_attribute_decl.h +0 -9
- data/ext/nokogiri/xml_cdata.h +0 -9
- data/ext/nokogiri/xml_comment.h +0 -9
- data/ext/nokogiri/xml_document.h +0 -23
- data/ext/nokogiri/xml_document_fragment.h +0 -10
- data/ext/nokogiri/xml_dtd.h +0 -10
- data/ext/nokogiri/xml_element_content.h +0 -10
- data/ext/nokogiri/xml_element_decl.h +0 -9
- data/ext/nokogiri/xml_encoding_handler.h +0 -8
- data/ext/nokogiri/xml_entity_decl.h +0 -10
- data/ext/nokogiri/xml_entity_reference.h +0 -9
- data/ext/nokogiri/xml_io.c +0 -61
- data/ext/nokogiri/xml_io.h +0 -11
- data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
- data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
- data/ext/nokogiri/xml_namespace.h +0 -14
- data/ext/nokogiri/xml_node.h +0 -13
- data/ext/nokogiri/xml_node_set.h +0 -12
- data/ext/nokogiri/xml_processing_instruction.h +0 -9
- data/ext/nokogiri/xml_reader.h +0 -10
- data/ext/nokogiri/xml_relax_ng.h +0 -9
- data/ext/nokogiri/xml_sax_parser.h +0 -39
- data/ext/nokogiri/xml_sax_parser_context.h +0 -10
- data/ext/nokogiri/xml_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_schema.h +0 -9
- data/ext/nokogiri/xml_syntax_error.h +0 -13
- data/ext/nokogiri/xml_text.h +0 -9
- data/ext/nokogiri/xml_xpath_context.h +0 -10
- data/ext/nokogiri/xslt_stylesheet.h +0 -14
- data/lib/nokogiri/html/document.rb +0 -335
- data/lib/nokogiri/html/document_fragment.rb +0 -49
- data/lib/nokogiri/html/element_description_defaults.rb +0 -671
- data/lib/nokogiri/html/sax/parser.rb +0 -62
- data/lib/nokogiri/html/sax/parser_context.rb +0 -16
- data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
- data/patches/libxml2/0004-libxml2.la-is-in-top_builddir.patch +0 -25
- data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch +0 -32
- data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
- data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
- /data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
- /data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
@@ -1,169 +1,256 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
4
|
module XML
|
3
|
-
###
|
4
|
-
# SAX Parsers are event driven parsers. Nokogiri provides two different
|
5
|
-
# event based parsers when dealing with XML. If you want to do SAX style
|
6
|
-
# parsing using HTML, check out Nokogiri::HTML::SAX.
|
7
|
-
#
|
8
|
-
# The basic way a SAX style parser works is by creating a parser,
|
9
|
-
# telling the parser about the events we're interested in, then giving
|
10
|
-
# the parser some XML to process. The parser will notify you when
|
11
|
-
# it encounters events you said you would like to know about.
|
12
|
-
#
|
13
|
-
# To register for events, you simply subclass Nokogiri::XML::SAX::Document,
|
14
|
-
# and implement the methods for which you would like notification.
|
15
|
-
#
|
16
|
-
# For example, if I want to be notified when a document ends, and when an
|
17
|
-
# element starts, I would write a class like this:
|
18
|
-
#
|
19
|
-
# class MyDocument < Nokogiri::XML::SAX::Document
|
20
|
-
# def end_document
|
21
|
-
# puts "the document has ended"
|
22
|
-
# end
|
23
|
-
#
|
24
|
-
# def start_element name, attributes = []
|
25
|
-
# puts "#{name} started"
|
26
|
-
# end
|
27
|
-
# end
|
28
|
-
#
|
29
|
-
# Then I would instantiate a SAX parser with this document, and feed the
|
30
|
-
# parser some XML
|
31
|
-
#
|
32
|
-
# # Create a new parser
|
33
|
-
# parser = Nokogiri::XML::SAX::Parser.new(MyDocument.new)
|
34
|
-
#
|
35
|
-
# # Feed the parser some XML
|
36
|
-
# parser.parse(File.open(ARGV[0]))
|
37
|
-
#
|
38
|
-
# Now my document handler will be called when each node starts, and when
|
39
|
-
# then document ends. To see what kinds of events are available, take
|
40
|
-
# a look at Nokogiri::XML::SAX::Document.
|
41
|
-
#
|
42
|
-
# Two SAX parsers for XML are available, a parser that reads from a string
|
43
|
-
# or IO object as it feels necessary, and a parser that lets you spoon
|
44
|
-
# feed it XML. If you want to let Nokogiri deal with reading your XML,
|
45
|
-
# use the Nokogiri::XML::SAX::Parser. If you want to have fine grain
|
46
|
-
# control over the XML input, use the Nokogiri::XML::SAX::PushParser.
|
47
5
|
module SAX
|
48
|
-
|
49
|
-
#
|
50
|
-
#
|
51
|
-
# possible events while parsing an
|
52
|
-
# particular event,
|
53
|
-
# you are interested in knowing about.
|
54
|
-
#
|
55
|
-
# To only be notified about start and end element events, write a class
|
56
|
-
# like this:
|
57
|
-
#
|
58
|
-
# class MyDocument < Nokogiri::XML::SAX::Document
|
59
|
-
# def start_element name, attrs = []
|
60
|
-
# puts "#{name} started!"
|
61
|
-
# end
|
6
|
+
# :markup: markdown
|
7
|
+
#
|
8
|
+
# The SAX::Document class is used for registering types of events you are interested in
|
9
|
+
# handling. All of the methods on this class are available as possible events while parsing an
|
10
|
+
# \XML document. To register for any particular event, subclass this class and implement the
|
11
|
+
# methods you are interested in knowing about.
|
62
12
|
#
|
63
|
-
#
|
64
|
-
#
|
13
|
+
# To only be notified about start and end element events, write a class like this:
|
14
|
+
#
|
15
|
+
# class MyHandler < Nokogiri::XML::SAX::Document
|
16
|
+
# def start_element name, attrs = []
|
17
|
+
# puts "#{name} started!"
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
# def end_element name
|
21
|
+
# puts "#{name} ended"
|
22
|
+
# end
|
65
23
|
# end
|
66
|
-
# end
|
67
24
|
#
|
68
|
-
# You can use this event handler for any SAX
|
69
|
-
#
|
25
|
+
# You can use this event handler for any SAX-style parser included with Nokogiri.
|
26
|
+
#
|
27
|
+
# See also:
|
28
|
+
#
|
29
|
+
# - Nokogiri::XML::SAX
|
30
|
+
# - Nokogiri::HTML4::SAX
|
31
|
+
#
|
32
|
+
# ### Entity Handling
|
33
|
+
#
|
34
|
+
# ⚠ Entity handling is complicated in a SAX parser! Please read this section carefully if
|
35
|
+
# you're not getting the behavior you expect.
|
36
|
+
#
|
37
|
+
# Entities will be reported to the user via callbacks to #characters, to #reference, or
|
38
|
+
# possibly to both. The behavior is determined by a combination of _entity type_ and the value
|
39
|
+
# of ParserContext#replace_entities. (Recall that the default value of
|
40
|
+
# ParserContext#replace_entities is `false`.)
|
41
|
+
#
|
42
|
+
# ⚠ <b>It is UNSAFE to set ParserContext#replace_entities to `true`</b> when parsing untrusted
|
43
|
+
# documents.
|
44
|
+
#
|
45
|
+
# 💡 For more information on entity types, see [Wikipedia's page on
|
46
|
+
# DTDs](https://en.wikipedia.org/wiki/Document_type_definition#Entity_declarations).
|
47
|
+
#
|
48
|
+
# | Entity type | #characters | #reference |
|
49
|
+
# |--------------------------------------|------------------------------------|-------------------------------------|
|
50
|
+
# | Char ref (e.g., <tt>’</tt>) | always | never |
|
51
|
+
# | Predefined (e.g., <tt>&</tt>) | always | never |
|
52
|
+
# | Undeclared † | never | <tt>#replace_entities == false</tt> |
|
53
|
+
# | Internal | always | <tt>#replace_entities == false</tt> |
|
54
|
+
# | External † | <tt>#replace_entities == true</tt> | <tt>#replace_entities == false</tt> |
|
55
|
+
#
|
56
|
+
#
|
57
|
+
#
|
58
|
+
# † In the case where the replacement text for the entity is unknown (e.g., an undeclared entity
|
59
|
+
# or an external entity that could not be resolved because of network issues), then the
|
60
|
+
# replacement text will not be reported. If ParserContext#replace_entities is `true`, this
|
61
|
+
# means the #characters callback will not be invoked. If ParserContext#replace_entities is
|
62
|
+
# `false`, then the #reference callback will be invoked, but with `nil` for the `content`
|
63
|
+
# argument.
|
64
|
+
#
|
70
65
|
class Document
|
71
66
|
###
|
72
|
-
# Called when an XML declaration is parsed
|
73
|
-
|
67
|
+
# Called when an \XML declaration is parsed.
|
68
|
+
#
|
69
|
+
# [Parameters]
|
70
|
+
# - +version+ (String) the version attribute
|
71
|
+
# - +encoding+ (String, nil) the encoding of the document if present, else +nil+
|
72
|
+
# - +standalone+ ("yes", "no", nil) the standalone attribute if present, else +nil+
|
73
|
+
def xmldecl(version, encoding, standalone)
|
74
74
|
end
|
75
75
|
|
76
76
|
###
|
77
|
-
# Called when document starts parsing
|
77
|
+
# Called when document starts parsing.
|
78
78
|
def start_document
|
79
79
|
end
|
80
80
|
|
81
81
|
###
|
82
|
-
# Called when document ends parsing
|
82
|
+
# Called when document ends parsing.
|
83
83
|
def end_document
|
84
84
|
end
|
85
85
|
|
86
86
|
###
|
87
|
-
# Called at the beginning of an element
|
88
|
-
#
|
89
|
-
#
|
87
|
+
# Called at the beginning of an element.
|
88
|
+
#
|
89
|
+
# [Parameters]
|
90
|
+
# - +name+ (String) the name of the element
|
91
|
+
# - +attrs+ (Array<Array<String>>) an assoc list of namespace declarations and attributes, e.g.:
|
90
92
|
# [ ["xmlns:foo", "http://sample.net"], ["size", "large"] ]
|
91
|
-
|
93
|
+
#
|
94
|
+
# 💡 If you're dealing with XML and need to handle namespaces, use the
|
95
|
+
# #start_element_namespace method instead.
|
96
|
+
#
|
97
|
+
# Note that the element namespace and any attribute namespaces are not provided, and so any
|
98
|
+
# namespaced elements or attributes will be returned as strings including the prefix:
|
99
|
+
#
|
100
|
+
# parser.parse(<<~XML)
|
101
|
+
# <root xmlns:foo='http://foo.example.com/' xmlns='http://example.com/'>
|
102
|
+
# <foo:bar foo:quux="xxx">hello world</foo:bar>
|
103
|
+
# </root>
|
104
|
+
# XML
|
105
|
+
#
|
106
|
+
# assert_pattern do
|
107
|
+
# parser.document.start_elements => [
|
108
|
+
# ["root", [["xmlns:foo", "http://foo.example.com/"], ["xmlns", "http://example.com/"]]],
|
109
|
+
# ["foo:bar", [["foo:quux", "xxx"]]],
|
110
|
+
# ]
|
111
|
+
# end
|
112
|
+
#
|
113
|
+
def start_element(name, attrs = [])
|
92
114
|
end
|
93
115
|
|
94
116
|
###
|
95
|
-
# Called at the end of an element
|
96
|
-
#
|
97
|
-
|
117
|
+
# Called at the end of an element.
|
118
|
+
#
|
119
|
+
# [Parameters]
|
120
|
+
# - +name+ (String) the name of the element being closed
|
121
|
+
#
|
122
|
+
def end_element(name)
|
98
123
|
end
|
99
124
|
|
100
125
|
###
|
101
|
-
# Called at the beginning of an element
|
102
|
-
#
|
103
|
-
#
|
104
|
-
# +
|
105
|
-
# +
|
106
|
-
# +
|
107
|
-
|
108
|
-
|
126
|
+
# Called at the beginning of an element.
|
127
|
+
#
|
128
|
+
# [Parameters]
|
129
|
+
# - +name+ (String) is the name of the element
|
130
|
+
# - +attrs+ (Array<Attribute>) is an array of structs with the following properties:
|
131
|
+
# - +localname+ (String) the local name of the attribute
|
132
|
+
# - +value+ (String) the value of the attribute
|
133
|
+
# - +prefix+ (String, nil) the namespace prefix of the attribute
|
134
|
+
# - +uri+ (String, nil) the namespace URI of the attribute
|
135
|
+
# - +prefix+ (String, nil) is the namespace prefix for the element
|
136
|
+
# - +uri+ (String, nil) is the associated URI for the element's namespace
|
137
|
+
# - +ns+ (Array<Array<String, String>>) is an assoc list of namespace declarations on the element
|
138
|
+
#
|
139
|
+
# 💡 If you're dealing with HTML or don't care about namespaces, try #start_element instead.
|
140
|
+
#
|
141
|
+
# [Example]
|
142
|
+
# it "start_elements_namespace is called with namespaced attributes" do
|
143
|
+
# parser.parse(<<~XML)
|
144
|
+
# <root xmlns:foo='http://foo.example.com/'>
|
145
|
+
# <foo:a foo:bar='hello' />
|
146
|
+
# </root>
|
147
|
+
# XML
|
148
|
+
#
|
149
|
+
# assert_pattern do
|
150
|
+
# parser.document.start_elements_namespace => [
|
151
|
+
# [
|
152
|
+
# "root",
|
153
|
+
# [],
|
154
|
+
# nil, nil,
|
155
|
+
# [["foo", "http://foo.example.com/"]], # namespace declarations
|
156
|
+
# ], [
|
157
|
+
# "a",
|
158
|
+
# [Nokogiri::XML::SAX::Parser::Attribute(localname: "bar", prefix: "foo", uri: "http://foo.example.com/", value: "hello")], # prefixed attribute
|
159
|
+
# "foo", "http://foo.example.com/", # prefix and uri for the "a" element
|
160
|
+
# [],
|
161
|
+
# ]
|
162
|
+
# ]
|
163
|
+
# end
|
164
|
+
# end
|
165
|
+
#
|
166
|
+
def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = []) # rubocop:disable Metrics/ParameterLists
|
109
167
|
# Deal with SAX v1 interface
|
110
|
-
name = [prefix, name].compact.join(
|
111
|
-
attributes = ns.map
|
112
|
-
[[
|
113
|
-
|
114
|
-
[[attr.prefix, attr.localname].compact.join(
|
115
|
-
|
116
|
-
start_element
|
168
|
+
name = [prefix, name].compact.join(":")
|
169
|
+
attributes = ns.map do |ns_prefix, ns_uri|
|
170
|
+
[["xmlns", ns_prefix].compact.join(":"), ns_uri]
|
171
|
+
end + attrs.map do |attr|
|
172
|
+
[[attr.prefix, attr.localname].compact.join(":"), attr.value]
|
173
|
+
end
|
174
|
+
start_element(name, attributes)
|
117
175
|
end
|
118
176
|
|
119
177
|
###
|
120
|
-
# Called at the end of an element
|
121
|
-
#
|
122
|
-
#
|
123
|
-
# +
|
124
|
-
|
125
|
-
|
178
|
+
# Called at the end of an element.
|
179
|
+
#
|
180
|
+
# [Parameters]
|
181
|
+
# - +name+ (String) is the name of the element
|
182
|
+
# - +prefix+ (String, nil) is the namespace prefix for the element
|
183
|
+
# - +uri+ (String, nil) is the associated URI for the element's namespace
|
184
|
+
#
|
185
|
+
def end_element_namespace(name, prefix = nil, uri = nil)
|
126
186
|
# Deal with SAX v1 interface
|
127
|
-
end_element
|
187
|
+
end_element([prefix, name].compact.join(":"))
|
128
188
|
end
|
129
189
|
|
130
190
|
###
|
131
|
-
#
|
132
|
-
#
|
191
|
+
# Called when character data is parsed, and for parsed entities when
|
192
|
+
# ParserContext#replace_entities is +true+.
|
193
|
+
#
|
194
|
+
# [Parameters]
|
195
|
+
# - +string+ contains the character data or entity replacement text
|
196
|
+
#
|
197
|
+
# ⚠ Please see Document@Entity+Handling for important information about how entities are handled.
|
198
|
+
#
|
199
|
+
# ⚠ This method might be called multiple times for a contiguous string of characters.
|
200
|
+
#
|
201
|
+
def characters(string)
|
202
|
+
end
|
203
|
+
|
204
|
+
###
|
205
|
+
# Called when a parsed entity is referenced and not replaced.
|
206
|
+
#
|
207
|
+
# [Parameters]
|
208
|
+
# - +name+ (String) is the name of the entity
|
209
|
+
# - +content+ (String, nil) is the replacement text for the entity, if known
|
210
|
+
#
|
211
|
+
# ⚠ Please see Document@Entity+Handling for important information about how entities are handled.
|
212
|
+
#
|
213
|
+
# ⚠ An internal entity may result in a call to both #characters and #reference.
|
214
|
+
#
|
215
|
+
# Since v1.17.0
|
133
216
|
#
|
134
|
-
|
135
|
-
def characters string
|
217
|
+
def reference(name, content)
|
136
218
|
end
|
137
219
|
|
138
220
|
###
|
139
221
|
# Called when comments are encountered
|
140
|
-
#
|
141
|
-
|
222
|
+
# [Parameters]
|
223
|
+
# - +string+ contains the comment data
|
224
|
+
def comment(string)
|
142
225
|
end
|
143
226
|
|
144
227
|
###
|
145
228
|
# Called on document warnings
|
146
|
-
#
|
147
|
-
|
229
|
+
# [Parameters]
|
230
|
+
# - +string+ contains the warning
|
231
|
+
def warning(string)
|
148
232
|
end
|
149
233
|
|
150
234
|
###
|
151
235
|
# Called on document errors
|
152
|
-
#
|
153
|
-
|
236
|
+
# [Parameters]
|
237
|
+
# - +string+ contains the error
|
238
|
+
def error(string)
|
154
239
|
end
|
155
240
|
|
156
241
|
###
|
157
242
|
# Called when cdata blocks are found
|
158
|
-
#
|
159
|
-
|
243
|
+
# [Parameters]
|
244
|
+
# - +string+ contains the cdata content
|
245
|
+
def cdata_block(string)
|
160
246
|
end
|
161
247
|
|
162
248
|
###
|
163
249
|
# Called when processing instructions are found
|
164
|
-
#
|
165
|
-
# +
|
166
|
-
|
250
|
+
# [Parameters]
|
251
|
+
# - +name+ is the target of the instruction
|
252
|
+
# - +content+ is the value of the instruction
|
253
|
+
def processing_instruction(name, content)
|
167
254
|
end
|
168
255
|
end
|
169
256
|
end
|
@@ -1,17 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
4
|
module XML
|
3
5
|
module SAX
|
4
6
|
###
|
5
|
-
# This parser is a SAX style parser that reads
|
6
|
-
#
|
7
|
-
#
|
8
|
-
# the Nokogiri::XML::SAX::Document.
|
7
|
+
# This parser is a SAX style parser that reads its input as it deems necessary. The parser
|
8
|
+
# takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an XML input, sends
|
9
|
+
# messages to the Nokogiri::XML::SAX::Document.
|
9
10
|
#
|
10
11
|
# Here is an example of using this parser:
|
11
12
|
#
|
12
13
|
# # Create a subclass of Nokogiri::XML::SAX::Document and implement
|
13
14
|
# # the events we care about:
|
14
|
-
# class
|
15
|
+
# class MyHandler < Nokogiri::XML::SAX::Document
|
15
16
|
# def start_element name, attrs = []
|
16
17
|
# puts "starting: #{name}"
|
17
18
|
# end
|
@@ -21,44 +22,54 @@ module Nokogiri
|
|
21
22
|
# end
|
22
23
|
# end
|
23
24
|
#
|
24
|
-
#
|
25
|
-
#
|
25
|
+
# parser = Nokogiri::XML::SAX::Parser.new(MyHandler.new)
|
26
|
+
#
|
27
|
+
# # Hand an IO object to the parser, which will read the XML from the IO.
|
28
|
+
# File.open(path_to_xml) do |f|
|
29
|
+
# parser.parse(f)
|
30
|
+
# end
|
31
|
+
#
|
32
|
+
# For more information about \SAX parsers, see Nokogiri::XML::SAX.
|
33
|
+
#
|
34
|
+
# Also see Nokogiri::XML::SAX::Document for the available events.
|
26
35
|
#
|
27
|
-
#
|
28
|
-
# parser.parse(File.open(ARGV[0]))
|
36
|
+
# For \HTML documents, use the subclass Nokogiri::HTML4::SAX::Parser.
|
29
37
|
#
|
30
|
-
# For more information about SAX parsers, see Nokogiri::XML::SAX. Also
|
31
|
-
# see Nokogiri::XML::SAX::Document for the available events.
|
32
38
|
class Parser
|
39
|
+
# to dynamically resolve ParserContext in inherited methods
|
40
|
+
include Nokogiri::ClassResolver
|
41
|
+
|
42
|
+
# Structure used for marshalling attributes for some callbacks in XML::SAX::Document.
|
33
43
|
class Attribute < Struct.new(:localname, :prefix, :uri, :value)
|
34
44
|
end
|
35
45
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
'ASCII' => 22, # pure ASCII
|
46
|
+
ENCODINGS = { # :nodoc:
|
47
|
+
"NONE" => 0, # No char encoding detected
|
48
|
+
"UTF-8" => 1, # UTF-8
|
49
|
+
"UTF16LE" => 2, # UTF-16 little endian
|
50
|
+
"UTF16BE" => 3, # UTF-16 big endian
|
51
|
+
"UCS4LE" => 4, # UCS-4 little endian
|
52
|
+
"UCS4BE" => 5, # UCS-4 big endian
|
53
|
+
"EBCDIC" => 6, # EBCDIC uh!
|
54
|
+
"UCS4-2143" => 7, # UCS-4 unusual ordering
|
55
|
+
"UCS4-3412" => 8, # UCS-4 unusual ordering
|
56
|
+
"UCS2" => 9, # UCS-2
|
57
|
+
"ISO-8859-1" => 10, # ISO-8859-1 ISO Latin 1
|
58
|
+
"ISO-8859-2" => 11, # ISO-8859-2 ISO Latin 2
|
59
|
+
"ISO-8859-3" => 12, # ISO-8859-3
|
60
|
+
"ISO-8859-4" => 13, # ISO-8859-4
|
61
|
+
"ISO-8859-5" => 14, # ISO-8859-5
|
62
|
+
"ISO-8859-6" => 15, # ISO-8859-6
|
63
|
+
"ISO-8859-7" => 16, # ISO-8859-7
|
64
|
+
"ISO-8859-8" => 17, # ISO-8859-8
|
65
|
+
"ISO-8859-9" => 18, # ISO-8859-9
|
66
|
+
"ISO-2022-JP" => 19, # ISO-2022-JP
|
67
|
+
"SHIFT-JIS" => 20, # Shift_JIS
|
68
|
+
"EUC-JP" => 21, # EUC-JP
|
69
|
+
"ASCII" => 22, # pure ASCII
|
61
70
|
}
|
71
|
+
REVERSE_ENCODINGS = ENCODINGS.invert # :nodoc:
|
72
|
+
deprecate_constant :ENCODINGS
|
62
73
|
|
63
74
|
# The Nokogiri::XML::SAX::Document where events will be sent.
|
64
75
|
attr_accessor :document
|
@@ -66,55 +77,121 @@ module Nokogiri
|
|
66
77
|
# The encoding being used for this document.
|
67
78
|
attr_accessor :encoding
|
68
79
|
|
69
|
-
|
70
|
-
|
71
|
-
|
80
|
+
###
|
81
|
+
# :call-seq:
|
82
|
+
# new ⇒ SAX::Parser
|
83
|
+
# new(handler) ⇒ SAX::Parser
|
84
|
+
# new(handler, encoding) ⇒ SAX::Parser
|
85
|
+
#
|
86
|
+
# Create a new Parser.
|
87
|
+
#
|
88
|
+
# [Parameters]
|
89
|
+
# - +handler+ (optional Nokogiri::XML::SAX::Document) The document that will receive
|
90
|
+
# events. Will create a new Nokogiri::XML::SAX::Document if not given, which is accessible
|
91
|
+
# through the #document attribute.
|
92
|
+
# - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
|
93
|
+
# parsing the input. (default +nil+ for auto-detection)
|
94
|
+
#
|
95
|
+
def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = nil)
|
96
|
+
@encoding = encoding
|
72
97
|
@document = doc
|
73
98
|
@warned = false
|
99
|
+
|
100
|
+
initialize_native unless Nokogiri.jruby?
|
74
101
|
end
|
75
102
|
|
76
103
|
###
|
77
|
-
#
|
78
|
-
#
|
79
|
-
|
80
|
-
|
81
|
-
|
104
|
+
# :call-seq:
|
105
|
+
# parse(input) { |parser_context| ... }
|
106
|
+
#
|
107
|
+
# Parse the input, sending events to the SAX::Document at #document.
|
108
|
+
#
|
109
|
+
# [Parameters]
|
110
|
+
# - +input+ (String, IO) The input to parse.
|
111
|
+
#
|
112
|
+
# If +input+ quacks like a readable IO object, this method forwards to Parser.parse_io,
|
113
|
+
# otherwise it forwards to Parser.parse_memory.
|
114
|
+
#
|
115
|
+
# [Yields]
|
116
|
+
# If a block is given, the underlying ParserContext object will be yielded. This can be used
|
117
|
+
# to set options on the parser context before parsing begins.
|
118
|
+
#
|
119
|
+
def parse(input, &block)
|
120
|
+
if input.respond_to?(:read) && input.respond_to?(:close)
|
121
|
+
parse_io(input, &block)
|
82
122
|
else
|
83
|
-
parse_memory(
|
123
|
+
parse_memory(input, &block)
|
84
124
|
end
|
85
125
|
end
|
86
126
|
|
87
127
|
###
|
88
|
-
#
|
89
|
-
|
90
|
-
|
91
|
-
|
128
|
+
# :call-seq:
|
129
|
+
# parse_io(io) { |parser_context| ... }
|
130
|
+
# parse_io(io, encoding) { |parser_context| ... }
|
131
|
+
#
|
132
|
+
# Parse an input stream.
|
133
|
+
#
|
134
|
+
# [Parameters]
|
135
|
+
# - +io+ (IO) The readable IO object from which to read input
|
136
|
+
# - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
|
137
|
+
# parsing the input, or +nil+ for auto-detection. (default #encoding)
|
138
|
+
#
|
139
|
+
# [Yields]
|
140
|
+
# If a block is given, the underlying ParserContext object will be yielded. This can be used
|
141
|
+
# to set options on the parser context before parsing begins.
|
142
|
+
#
|
143
|
+
def parse_io(io, encoding = @encoding)
|
144
|
+
ctx = related_class("ParserContext").io(io, encoding)
|
92
145
|
yield ctx if block_given?
|
93
|
-
ctx.parse_with
|
146
|
+
ctx.parse_with(self)
|
94
147
|
end
|
95
148
|
|
96
149
|
###
|
97
|
-
#
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
150
|
+
# :call-seq:
|
151
|
+
# parse_memory(input) { |parser_context| ... }
|
152
|
+
# parse_memory(input, encoding) { |parser_context| ... }
|
153
|
+
#
|
154
|
+
# Parse an input string.
|
155
|
+
#
|
156
|
+
# [Parameters]
|
157
|
+
# - +input+ (String) The input string to be parsed.
|
158
|
+
# - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
|
159
|
+
# parsing the input, or +nil+ for auto-detection. (default #encoding)
|
160
|
+
#
|
161
|
+
# [Yields]
|
162
|
+
# If a block is given, the underlying ParserContext object will be yielded. This can be used
|
163
|
+
# to set options on the parser context before parsing begins.
|
164
|
+
#
|
165
|
+
def parse_memory(input, encoding = @encoding)
|
166
|
+
ctx = related_class("ParserContext").memory(input, encoding)
|
103
167
|
yield ctx if block_given?
|
104
|
-
ctx.parse_with
|
168
|
+
ctx.parse_with(self)
|
105
169
|
end
|
106
170
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
171
|
+
###
|
172
|
+
# :call-seq:
|
173
|
+
# parse_file(filename) { |parser_context| ... }
|
174
|
+
# parse_file(filename, encoding) { |parser_context| ... }
|
175
|
+
#
|
176
|
+
# Parse a file.
|
177
|
+
#
|
178
|
+
# [Parameters]
|
179
|
+
# - +filename+ (String) The path to the file to be parsed.
|
180
|
+
# - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
|
181
|
+
# parsing the input, or +nil+ for auto-detection. (default #encoding)
|
182
|
+
#
|
183
|
+
# [Yields]
|
184
|
+
# If a block is given, the underlying ParserContext object will be yielded. This can be used
|
185
|
+
# to set options on the parser context before parsing begins.
|
186
|
+
#
|
187
|
+
def parse_file(filename, encoding = @encoding)
|
188
|
+
raise ArgumentError, "no filename provided" unless filename
|
189
|
+
raise Errno::ENOENT unless File.exist?(filename)
|
190
|
+
raise Errno::EISDIR if File.directory?(filename)
|
112
191
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
raise ArgumentError.new("'#{enc}' is not a valid encoding") unless ENCODINGS[enc]
|
117
|
-
end
|
192
|
+
ctx = related_class("ParserContext").file(filename, encoding)
|
193
|
+
yield ctx if block_given?
|
194
|
+
ctx.parse_with(self)
|
118
195
|
end
|
119
196
|
end
|
120
197
|
end
|