nokogiri 1.10.7 → 1.16.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +42 -0
- data/LICENSE-DEPENDENCIES.md +1632 -1022
- data/LICENSE.md +1 -1
- data/README.md +188 -96
- data/bin/nokogiri +63 -50
- data/dependencies.yml +34 -66
- data/ext/nokogiri/depend +38 -358
- data/ext/nokogiri/extconf.rb +862 -421
- data/ext/nokogiri/gumbo.c +594 -0
- data/ext/nokogiri/html4_document.c +165 -0
- data/ext/nokogiri/html4_element_description.c +299 -0
- data/ext/nokogiri/html4_entity_lookup.c +37 -0
- data/ext/nokogiri/html4_sax_parser_context.c +108 -0
- data/ext/nokogiri/html4_sax_push_parser.c +95 -0
- data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
- data/ext/nokogiri/nokogiri.c +251 -105
- data/ext/nokogiri/nokogiri.h +222 -90
- data/ext/nokogiri/test_global_handlers.c +40 -0
- data/ext/nokogiri/xml_attr.c +17 -17
- data/ext/nokogiri/xml_attribute_decl.c +22 -22
- data/ext/nokogiri/xml_cdata.c +39 -31
- data/ext/nokogiri/xml_comment.c +20 -27
- data/ext/nokogiri/xml_document.c +408 -243
- data/ext/nokogiri/xml_document_fragment.c +13 -17
- data/ext/nokogiri/xml_dtd.c +64 -58
- data/ext/nokogiri/xml_element_content.c +63 -55
- data/ext/nokogiri/xml_element_decl.c +31 -31
- data/ext/nokogiri/xml_encoding_handler.c +54 -21
- data/ext/nokogiri/xml_entity_decl.c +37 -35
- data/ext/nokogiri/xml_entity_reference.c +17 -19
- data/ext/nokogiri/xml_namespace.c +131 -61
- data/ext/nokogiri/xml_node.c +1343 -674
- data/ext/nokogiri/xml_node_set.c +246 -216
- data/ext/nokogiri/xml_processing_instruction.c +18 -20
- data/ext/nokogiri/xml_reader.c +305 -213
- data/ext/nokogiri/xml_relax_ng.c +87 -78
- data/ext/nokogiri/xml_sax_parser.c +149 -124
- data/ext/nokogiri/xml_sax_parser_context.c +149 -103
- data/ext/nokogiri/xml_sax_push_parser.c +65 -37
- data/ext/nokogiri/xml_schema.c +138 -82
- data/ext/nokogiri/xml_syntax_error.c +42 -21
- data/ext/nokogiri/xml_text.c +35 -26
- data/ext/nokogiri/xml_xpath_context.c +363 -178
- data/ext/nokogiri/xslt_stylesheet.c +335 -189
- data/gumbo-parser/CHANGES.md +63 -0
- data/gumbo-parser/Makefile +126 -0
- data/gumbo-parser/THANKS +27 -0
- data/gumbo-parser/src/Makefile +34 -0
- data/gumbo-parser/src/README.md +41 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +42 -0
- data/gumbo-parser/src/attribute.h +17 -0
- data/gumbo-parser/src/char_ref.c +22225 -0
- data/gumbo-parser/src/char_ref.h +29 -0
- data/gumbo-parser/src/char_ref.rl +2154 -0
- data/gumbo-parser/src/error.c +630 -0
- data/gumbo-parser/src/error.h +148 -0
- data/gumbo-parser/src/foreign_attrs.c +103 -0
- data/gumbo-parser/src/foreign_attrs.gperf +27 -0
- data/gumbo-parser/src/insertion_mode.h +33 -0
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/nokogiri_gumbo.h +944 -0
- data/gumbo-parser/src/parser.c +4891 -0
- data/gumbo-parser/src/parser.h +41 -0
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +103 -0
- data/gumbo-parser/src/string_buffer.h +68 -0
- data/gumbo-parser/src/string_piece.c +48 -0
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_attrs.gperf +77 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/svg_tags.gperf +55 -0
- data/gumbo-parser/src/tag.c +223 -0
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.gperf +170 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +17 -0
- data/gumbo-parser/src/tokenizer.c +3464 -0
- data/gumbo-parser/src/tokenizer.h +112 -0
- data/gumbo-parser/src/tokenizer_states.h +339 -0
- data/gumbo-parser/src/utf8.c +245 -0
- data/gumbo-parser/src/utf8.h +164 -0
- data/gumbo-parser/src/util.c +66 -0
- data/gumbo-parser/src/util.h +34 -0
- data/gumbo-parser/src/vector.c +111 -0
- data/gumbo-parser/src/vector.h +45 -0
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +10 -8
- data/lib/nokogiri/css/parser.rb +397 -377
- data/lib/nokogiri/css/parser.y +250 -245
- data/lib/nokogiri/css/parser_extras.rb +54 -49
- data/lib/nokogiri/css/syntax_error.rb +3 -1
- data/lib/nokogiri/css/tokenizer.rb +5 -3
- data/lib/nokogiri/css/tokenizer.rex +3 -2
- data/lib/nokogiri/css/xpath_visitor.rb +205 -96
- data/lib/nokogiri/css.rb +56 -17
- data/lib/nokogiri/decorators/slop.rb +9 -7
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +32 -0
- data/lib/nokogiri/gumbo.rb +15 -0
- data/lib/nokogiri/html.rb +38 -27
- data/lib/nokogiri/{html → html4}/builder.rb +4 -2
- data/lib/nokogiri/html4/document.rb +214 -0
- data/lib/nokogiri/html4/document_fragment.rb +54 -0
- data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
- data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
- data/lib/nokogiri/{html → html4}/sax/parser.rb +17 -16
- data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
- data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
- data/lib/nokogiri/html4.rb +47 -0
- data/lib/nokogiri/html5/document.rb +168 -0
- data/lib/nokogiri/html5/document_fragment.rb +90 -0
- data/lib/nokogiri/html5/node.rb +103 -0
- data/lib/nokogiri/html5.rb +326 -0
- data/lib/nokogiri/jruby/dependencies.rb +3 -0
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/syntax_error.rb +2 -0
- data/lib/nokogiri/version/constant.rb +6 -0
- data/lib/nokogiri/version/info.rb +224 -0
- data/lib/nokogiri/version.rb +3 -108
- data/lib/nokogiri/xml/attr.rb +55 -3
- data/lib/nokogiri/xml/attribute_decl.rb +6 -2
- data/lib/nokogiri/xml/builder.rb +75 -34
- data/lib/nokogiri/xml/cdata.rb +3 -1
- data/lib/nokogiri/xml/character_data.rb +2 -0
- data/lib/nokogiri/xml/document.rb +312 -127
- data/lib/nokogiri/xml/document_fragment.rb +93 -48
- data/lib/nokogiri/xml/dtd.rb +4 -2
- data/lib/nokogiri/xml/element_content.rb +12 -2
- data/lib/nokogiri/xml/element_decl.rb +6 -2
- data/lib/nokogiri/xml/entity_decl.rb +7 -3
- data/lib/nokogiri/xml/entity_reference.rb +2 -0
- data/lib/nokogiri/xml/namespace.rb +44 -0
- data/lib/nokogiri/xml/node/save_options.rb +23 -8
- data/lib/nokogiri/xml/node.rb +1096 -419
- data/lib/nokogiri/xml/node_set.rb +137 -61
- data/lib/nokogiri/xml/notation.rb +13 -0
- data/lib/nokogiri/xml/parse_options.rb +145 -52
- data/lib/nokogiri/xml/pp/character_data.rb +9 -6
- data/lib/nokogiri/xml/pp/node.rb +42 -30
- data/lib/nokogiri/xml/pp.rb +4 -2
- data/lib/nokogiri/xml/processing_instruction.rb +4 -1
- data/lib/nokogiri/xml/reader.rb +21 -28
- data/lib/nokogiri/xml/relax_ng.rb +8 -2
- data/lib/nokogiri/xml/sax/document.rb +45 -49
- data/lib/nokogiri/xml/sax/parser.rb +39 -36
- data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
- data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
- data/lib/nokogiri/xml/sax.rb +6 -4
- data/lib/nokogiri/xml/schema.rb +19 -9
- data/lib/nokogiri/xml/searchable.rb +120 -72
- data/lib/nokogiri/xml/syntax_error.rb +7 -5
- data/lib/nokogiri/xml/text.rb +2 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
- data/lib/nokogiri/xml/xpath.rb +15 -4
- data/lib/nokogiri/xml/xpath_context.rb +3 -3
- data/lib/nokogiri/xml.rb +39 -38
- data/lib/nokogiri/xslt/stylesheet.rb +3 -1
- data/lib/nokogiri/xslt.rb +101 -22
- data/lib/nokogiri.rb +59 -75
- data/lib/xsd/xmlparser/nokogiri.rb +29 -25
- data/patches/libxml2/{0004-libxml2.la-is-in-top_builddir.patch → 0003-libxml2.la-is-in-top_builddir.patch} +1 -1
- data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
- data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
- data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
- data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
- data/ports/archives/libxml2-2.12.3.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
- metadata +121 -291
- data/ext/nokogiri/html_document.c +0 -170
- data/ext/nokogiri/html_document.h +0 -10
- data/ext/nokogiri/html_element_description.c +0 -279
- data/ext/nokogiri/html_element_description.h +0 -10
- data/ext/nokogiri/html_entity_lookup.c +0 -32
- data/ext/nokogiri/html_entity_lookup.h +0 -8
- data/ext/nokogiri/html_sax_parser_context.c +0 -116
- data/ext/nokogiri/html_sax_parser_context.h +0 -11
- data/ext/nokogiri/html_sax_push_parser.c +0 -87
- data/ext/nokogiri/html_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_attr.h +0 -9
- data/ext/nokogiri/xml_attribute_decl.h +0 -9
- data/ext/nokogiri/xml_cdata.h +0 -9
- data/ext/nokogiri/xml_comment.h +0 -9
- data/ext/nokogiri/xml_document.h +0 -23
- data/ext/nokogiri/xml_document_fragment.h +0 -10
- data/ext/nokogiri/xml_dtd.h +0 -10
- data/ext/nokogiri/xml_element_content.h +0 -10
- data/ext/nokogiri/xml_element_decl.h +0 -9
- data/ext/nokogiri/xml_encoding_handler.h +0 -8
- data/ext/nokogiri/xml_entity_decl.h +0 -10
- data/ext/nokogiri/xml_entity_reference.h +0 -9
- data/ext/nokogiri/xml_io.c +0 -61
- data/ext/nokogiri/xml_io.h +0 -11
- data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
- data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
- data/ext/nokogiri/xml_namespace.h +0 -14
- data/ext/nokogiri/xml_node.h +0 -13
- data/ext/nokogiri/xml_node_set.h +0 -12
- data/ext/nokogiri/xml_processing_instruction.h +0 -9
- data/ext/nokogiri/xml_reader.h +0 -10
- data/ext/nokogiri/xml_relax_ng.h +0 -9
- data/ext/nokogiri/xml_sax_parser.h +0 -39
- data/ext/nokogiri/xml_sax_parser_context.h +0 -10
- data/ext/nokogiri/xml_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_schema.h +0 -9
- data/ext/nokogiri/xml_syntax_error.h +0 -13
- data/ext/nokogiri/xml_text.h +0 -9
- data/ext/nokogiri/xml_xpath_context.h +0 -10
- data/ext/nokogiri/xslt_stylesheet.h +0 -14
- data/lib/nokogiri/html/document.rb +0 -335
- data/lib/nokogiri/html/document_fragment.rb +0 -49
- data/lib/nokogiri/html/element_description_defaults.rb +0 -671
- data/lib/nokogiri/html/sax/parser_context.rb +0 -16
- data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
- data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
- data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
- /data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
- /data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
@@ -1,16 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
4
|
module XML
|
5
|
+
# :nodoc: all
|
3
6
|
module PP
|
4
7
|
module CharacterData
|
5
|
-
def pretty_print
|
6
|
-
nice_name = self.class.name.split(
|
7
|
-
pp.group(2, "#(#{nice_name} ",
|
8
|
-
pp.pp
|
8
|
+
def pretty_print(pp)
|
9
|
+
nice_name = self.class.name.split("::").last
|
10
|
+
pp.group(2, "#(#{nice_name} ", ")") do
|
11
|
+
pp.pp(text)
|
9
12
|
end
|
10
13
|
end
|
11
14
|
|
12
|
-
def inspect
|
13
|
-
"#<#{self.class.name}:#{
|
15
|
+
def inspect
|
16
|
+
"#<#{self.class.name}:#{format("0x%x", object_id)} #{text.inspect}>"
|
14
17
|
end
|
15
18
|
end
|
16
19
|
end
|
data/lib/nokogiri/xml/pp/node.rb
CHANGED
@@ -1,53 +1,65 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
4
|
module XML
|
5
|
+
# :nodoc: all
|
3
6
|
module PP
|
4
7
|
module Node
|
5
|
-
|
6
|
-
attributes = inspect_attributes.reject { |x|
|
7
|
-
begin
|
8
|
-
attribute = send x
|
9
|
-
!attribute || (attribute.respond_to?(:empty?) && attribute.empty?)
|
10
|
-
rescue NoMethodError
|
11
|
-
true
|
12
|
-
end
|
13
|
-
}.map { |attribute|
|
14
|
-
"#{attribute.to_s.sub(/_\w+/, 's')}=#{send(attribute).inspect}"
|
15
|
-
}.join ' '
|
16
|
-
"#<#{self.class.name}:#{sprintf("0x%x", object_id)} #{attributes}>"
|
17
|
-
end
|
8
|
+
COLLECTIONS = [:attribute_nodes, :children]
|
18
9
|
|
19
|
-
def
|
20
|
-
|
21
|
-
|
10
|
+
def inspect
|
11
|
+
attributes = inspect_attributes.reject do |x|
|
12
|
+
attribute = send(x)
|
13
|
+
!attribute || (attribute.respond_to?(:empty?) && attribute.empty?)
|
14
|
+
rescue NoMethodError
|
15
|
+
true
|
16
|
+
end
|
17
|
+
attributes = if inspect_attributes.length == 1
|
18
|
+
send(attributes.first).inspect
|
19
|
+
else
|
20
|
+
attributes.map do |attribute|
|
21
|
+
"#{attribute}=#{send(attribute).inspect}"
|
22
|
+
end.join(" ")
|
23
|
+
end
|
24
|
+
"#<#{self.class.name}:#{format("0x%x", object_id)} #{attributes}>"
|
25
|
+
end
|
22
26
|
|
27
|
+
def pretty_print(pp)
|
28
|
+
nice_name = self.class.name.split("::").last
|
29
|
+
pp.group(2, "#(#{nice_name}:#{format("0x%x", object_id)} {", "})") do
|
23
30
|
pp.breakable
|
24
|
-
|
31
|
+
|
32
|
+
attrs = inspect_attributes.filter_map do |t|
|
25
33
|
[t, send(t)] if respond_to?(t)
|
26
|
-
|
34
|
+
end.find_all do |x|
|
27
35
|
if x.last
|
28
|
-
if
|
36
|
+
if COLLECTIONS.include?(x.first)
|
29
37
|
!x.last.empty?
|
30
38
|
else
|
31
39
|
true
|
32
40
|
end
|
33
41
|
end
|
34
|
-
|
42
|
+
end
|
35
43
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
44
|
+
if inspect_attributes.length == 1
|
45
|
+
pp.pp(attrs.first.last)
|
46
|
+
else
|
47
|
+
pp.seplist(attrs) do |v|
|
48
|
+
if COLLECTIONS.include?(v.first)
|
49
|
+
pp.group(2, "#{v.first} = [", "]") do
|
50
|
+
pp.breakable
|
51
|
+
pp.seplist(v.last) do |item|
|
52
|
+
pp.pp(item)
|
53
|
+
end
|
42
54
|
end
|
55
|
+
else
|
56
|
+
pp.text("#{v.first} = ")
|
57
|
+
pp.pp(v.last)
|
43
58
|
end
|
44
|
-
else
|
45
|
-
pp.text "#{v.first} = "
|
46
|
-
pp.pp v.last
|
47
59
|
end
|
48
60
|
end
|
49
|
-
pp.breakable
|
50
61
|
|
62
|
+
pp.breakable
|
51
63
|
end
|
52
64
|
end
|
53
65
|
end
|
data/lib/nokogiri/xml/pp.rb
CHANGED
data/lib/nokogiri/xml/reader.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
4
|
module XML
|
3
5
|
###
|
@@ -7,18 +9,18 @@ module Nokogiri
|
|
7
9
|
#
|
8
10
|
# Here is an example of usage:
|
9
11
|
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
12
|
+
# reader = Nokogiri::XML::Reader(<<-eoxml)
|
13
|
+
# <x xmlns:tenderlove='http://tenderlovemaking.com/'>
|
14
|
+
# <tenderlove:foo awesome='true'>snuggles!</tenderlove:foo>
|
15
|
+
# </x>
|
16
|
+
# eoxml
|
15
17
|
#
|
16
|
-
#
|
18
|
+
# reader.each do |node|
|
17
19
|
#
|
18
|
-
#
|
19
|
-
#
|
20
|
+
# # node is an instance of Nokogiri::XML::Reader
|
21
|
+
# puts node.name
|
20
22
|
#
|
21
|
-
#
|
23
|
+
# end
|
22
24
|
#
|
23
25
|
# Note that Nokogiri::XML::Reader#each can only be called once!! Once
|
24
26
|
# the cursor moves through the entire document, you must parse the
|
@@ -69,41 +71,32 @@ module Nokogiri
|
|
69
71
|
# A list of errors encountered while parsing
|
70
72
|
attr_accessor :errors
|
71
73
|
|
72
|
-
# The encoding for the document
|
73
|
-
attr_reader :encoding
|
74
|
-
|
75
74
|
# The XML source
|
76
75
|
attr_reader :source
|
77
76
|
|
78
|
-
|
77
|
+
alias_method :self_closing?, :empty_element?
|
79
78
|
|
80
|
-
def initialize
|
79
|
+
def initialize(source, url = nil, encoding = nil) # :nodoc:
|
81
80
|
@source = source
|
82
81
|
@errors = []
|
83
82
|
@encoding = encoding
|
84
83
|
end
|
85
84
|
private :initialize
|
86
85
|
|
87
|
-
|
88
|
-
#
|
86
|
+
# Get the attributes and namespaces of the current node as a Hash.
|
87
|
+
#
|
88
|
+
# This is the union of Reader#attribute_hash and Reader#namespaces
|
89
|
+
#
|
90
|
+
# [Returns]
|
91
|
+
# (Hash<String, String>) Attribute names and values, and namespace prefixes and hrefs.
|
89
92
|
def attributes
|
90
|
-
|
91
|
-
[node.name, node.to_s]
|
92
|
-
}].merge(namespaces || {})
|
93
|
-
end
|
94
|
-
|
95
|
-
###
|
96
|
-
# Get a list of attributes for the current node
|
97
|
-
def attribute_nodes
|
98
|
-
nodes = attr_nodes
|
99
|
-
nodes.each { |v| v.instance_variable_set(:@_r, self) }
|
100
|
-
nodes
|
93
|
+
attribute_hash.merge(namespaces)
|
101
94
|
end
|
102
95
|
|
103
96
|
###
|
104
97
|
# Move the cursor through the document yielding the cursor to the block
|
105
98
|
def each
|
106
|
-
while cursor =
|
99
|
+
while (cursor = read)
|
107
100
|
yield cursor
|
108
101
|
end
|
109
102
|
end
|
@@ -1,11 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
4
|
module XML
|
3
5
|
class << self
|
4
6
|
###
|
5
7
|
# Create a new Nokogiri::XML::RelaxNG document from +string_or_io+.
|
6
8
|
# See Nokogiri::XML::RelaxNG for an example.
|
7
|
-
def RelaxNG
|
8
|
-
RelaxNG.new(string_or_io)
|
9
|
+
def RelaxNG(string_or_io, options = ParseOptions::DEFAULT_SCHEMA)
|
10
|
+
RelaxNG.new(string_or_io, options)
|
9
11
|
end
|
10
12
|
end
|
11
13
|
|
@@ -26,6 +28,10 @@ module Nokogiri
|
|
26
28
|
# end
|
27
29
|
#
|
28
30
|
# The list of errors are Nokogiri::XML::SyntaxError objects.
|
31
|
+
#
|
32
|
+
# NOTE: RelaxNG input is always treated as TRUSTED documents, meaning that they will cause the
|
33
|
+
# underlying parsing libraries to access network resources. This is counter to Nokogiri's
|
34
|
+
# "untrusted by default" security policy, but is a limitation of the underlying libraries.
|
29
35
|
class RelaxNG < Nokogiri::XML::Schema
|
30
36
|
end
|
31
37
|
end
|
@@ -1,20 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
4
|
module XML
|
3
5
|
###
|
4
|
-
# SAX Parsers are event driven parsers.
|
5
|
-
#
|
6
|
-
#
|
6
|
+
# SAX Parsers are event driven parsers. Nokogiri provides two different event based parsers when
|
7
|
+
# dealing with XML. If you want to do SAX style parsing using HTML, check out
|
8
|
+
# Nokogiri::HTML4::SAX.
|
7
9
|
#
|
8
|
-
# The basic way a SAX style parser works is by creating a parser,
|
9
|
-
#
|
10
|
-
#
|
11
|
-
# it encounters events you said you would like to know about.
|
10
|
+
# The basic way a SAX style parser works is by creating a parser, telling the parser about the
|
11
|
+
# events we're interested in, then giving the parser some XML to process. The parser will notify
|
12
|
+
# you when it encounters events you said you would like to know about.
|
12
13
|
#
|
13
|
-
# To register for events, you simply subclass Nokogiri::XML::SAX::Document,
|
14
|
-
#
|
14
|
+
# To register for events, you simply subclass Nokogiri::XML::SAX::Document, and implement the
|
15
|
+
# methods for which you would like notification.
|
15
16
|
#
|
16
|
-
# For example, if I want to be notified when a document ends, and when an
|
17
|
-
#
|
17
|
+
# For example, if I want to be notified when a document ends, and when an element starts, I
|
18
|
+
# would write a class like this:
|
18
19
|
#
|
19
20
|
# class MyDocument < Nokogiri::XML::SAX::Document
|
20
21
|
# def end_document
|
@@ -26,8 +27,7 @@ module Nokogiri
|
|
26
27
|
# end
|
27
28
|
# end
|
28
29
|
#
|
29
|
-
# Then I would instantiate a SAX parser with this document, and feed the
|
30
|
-
# parser some XML
|
30
|
+
# Then I would instantiate a SAX parser with this document, and feed the parser some XML
|
31
31
|
#
|
32
32
|
# # Create a new parser
|
33
33
|
# parser = Nokogiri::XML::SAX::Parser.new(MyDocument.new)
|
@@ -35,25 +35,21 @@ module Nokogiri
|
|
35
35
|
# # Feed the parser some XML
|
36
36
|
# parser.parse(File.open(ARGV[0]))
|
37
37
|
#
|
38
|
-
# Now my document handler will be called when each node starts, and when
|
39
|
-
#
|
40
|
-
# a look at Nokogiri::XML::SAX::Document.
|
38
|
+
# Now my document handler will be called when each node starts, and when then document ends. To
|
39
|
+
# see what kinds of events are available, take a look at Nokogiri::XML::SAX::Document.
|
41
40
|
#
|
42
|
-
# Two SAX parsers for XML are available, a parser that reads from a string
|
43
|
-
#
|
44
|
-
#
|
45
|
-
# use the Nokogiri::XML::SAX::Parser. If you want to have fine grain
|
41
|
+
# Two SAX parsers for XML are available, a parser that reads from a string or IO object as it
|
42
|
+
# feels necessary, and a parser that lets you spoon feed it XML. If you want to let Nokogiri
|
43
|
+
# deal with reading your XML, use the Nokogiri::XML::SAX::Parser. If you want to have fine grain
|
46
44
|
# control over the XML input, use the Nokogiri::XML::SAX::PushParser.
|
47
45
|
module SAX
|
48
46
|
###
|
49
|
-
# This class is used for registering types of events you are interested
|
50
|
-
#
|
51
|
-
#
|
52
|
-
#
|
53
|
-
# you are interested in knowing about.
|
47
|
+
# This class is used for registering types of events you are interested in handling. All of
|
48
|
+
# the methods on this class are available as possible events while parsing an XML document. To
|
49
|
+
# register for any particular event, just subclass this class and implement the methods you
|
50
|
+
# are interested in knowing about.
|
54
51
|
#
|
55
|
-
# To only be notified about start and end element events, write a class
|
56
|
-
# like this:
|
52
|
+
# To only be notified about start and end element events, write a class like this:
|
57
53
|
#
|
58
54
|
# class MyDocument < Nokogiri::XML::SAX::Document
|
59
55
|
# def start_element name, attrs = []
|
@@ -65,12 +61,12 @@ module Nokogiri
|
|
65
61
|
# end
|
66
62
|
# end
|
67
63
|
#
|
68
|
-
# You can use this event handler for any SAX style parser included with
|
69
|
-
# Nokogiri
|
64
|
+
# You can use this event handler for any SAX style parser included with Nokogiri. See
|
65
|
+
# Nokogiri::XML::SAX, and Nokogiri::HTML4::SAX.
|
70
66
|
class Document
|
71
67
|
###
|
72
68
|
# Called when an XML declaration is parsed
|
73
|
-
def xmldecl
|
69
|
+
def xmldecl(version, encoding, standalone)
|
74
70
|
end
|
75
71
|
|
76
72
|
###
|
@@ -88,13 +84,13 @@ module Nokogiri
|
|
88
84
|
# * +name+ is the name of the tag
|
89
85
|
# * +attrs+ are an assoc list of namespaces and attributes, e.g.:
|
90
86
|
# [ ["xmlns:foo", "http://sample.net"], ["size", "large"] ]
|
91
|
-
def start_element
|
87
|
+
def start_element(name, attrs = [])
|
92
88
|
end
|
93
89
|
|
94
90
|
###
|
95
91
|
# Called at the end of an element
|
96
92
|
# +name+ is the tag name
|
97
|
-
def end_element
|
93
|
+
def end_element(name)
|
98
94
|
end
|
99
95
|
|
100
96
|
###
|
@@ -104,16 +100,16 @@ module Nokogiri
|
|
104
100
|
# +prefix+ is the namespace prefix for the element
|
105
101
|
# +uri+ is the associated namespace URI
|
106
102
|
# +ns+ is a hash of namespace prefix:urls associated with the element
|
107
|
-
def start_element_namespace
|
103
|
+
def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = []) # rubocop:disable Metrics/ParameterLists
|
108
104
|
###
|
109
105
|
# Deal with SAX v1 interface
|
110
|
-
name = [prefix, name].compact.join(
|
111
|
-
attributes = ns.map
|
112
|
-
[[
|
113
|
-
|
114
|
-
[[attr.prefix, attr.localname].compact.join(
|
115
|
-
|
116
|
-
start_element
|
106
|
+
name = [prefix, name].compact.join(":")
|
107
|
+
attributes = ns.map do |ns_prefix, ns_uri|
|
108
|
+
[["xmlns", ns_prefix].compact.join(":"), ns_uri]
|
109
|
+
end + attrs.map do |attr|
|
110
|
+
[[attr.prefix, attr.localname].compact.join(":"), attr.value]
|
111
|
+
end
|
112
|
+
start_element(name, attributes)
|
117
113
|
end
|
118
114
|
|
119
115
|
###
|
@@ -121,49 +117,49 @@ module Nokogiri
|
|
121
117
|
# +name+ is the element's name
|
122
118
|
# +prefix+ is the namespace prefix associated with the element
|
123
119
|
# +uri+ is the associated namespace URI
|
124
|
-
def end_element_namespace
|
120
|
+
def end_element_namespace(name, prefix = nil, uri = nil)
|
125
121
|
###
|
126
122
|
# Deal with SAX v1 interface
|
127
|
-
end_element
|
123
|
+
end_element([prefix, name].compact.join(":"))
|
128
124
|
end
|
129
125
|
|
130
126
|
###
|
131
|
-
# Characters read between a tag.
|
127
|
+
# Characters read between a tag. This method might be called multiple
|
132
128
|
# times given one contiguous string of characters.
|
133
129
|
#
|
134
130
|
# +string+ contains the character data
|
135
|
-
def characters
|
131
|
+
def characters(string)
|
136
132
|
end
|
137
133
|
|
138
134
|
###
|
139
135
|
# Called when comments are encountered
|
140
136
|
# +string+ contains the comment data
|
141
|
-
def comment
|
137
|
+
def comment(string)
|
142
138
|
end
|
143
139
|
|
144
140
|
###
|
145
141
|
# Called on document warnings
|
146
142
|
# +string+ contains the warning
|
147
|
-
def warning
|
143
|
+
def warning(string)
|
148
144
|
end
|
149
145
|
|
150
146
|
###
|
151
147
|
# Called on document errors
|
152
148
|
# +string+ contains the error
|
153
|
-
def error
|
149
|
+
def error(string)
|
154
150
|
end
|
155
151
|
|
156
152
|
###
|
157
153
|
# Called when cdata blocks are found
|
158
154
|
# +string+ contains the cdata content
|
159
|
-
def cdata_block
|
155
|
+
def cdata_block(string)
|
160
156
|
end
|
161
157
|
|
162
158
|
###
|
163
159
|
# Called when processing instructions are found
|
164
160
|
# +name+ is the target of the instruction
|
165
161
|
# +content+ is the value of the instruction
|
166
|
-
def processing_instruction
|
162
|
+
def processing_instruction(name, content)
|
167
163
|
end
|
168
164
|
end
|
169
165
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
4
|
module XML
|
3
5
|
module SAX
|
@@ -35,29 +37,29 @@ module Nokogiri
|
|
35
37
|
|
36
38
|
# Encodinds this parser supports
|
37
39
|
ENCODINGS = {
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
40
|
+
"NONE" => 0, # No char encoding detected
|
41
|
+
"UTF-8" => 1, # UTF-8
|
42
|
+
"UTF16LE" => 2, # UTF-16 little endian
|
43
|
+
"UTF16BE" => 3, # UTF-16 big endian
|
44
|
+
"UCS4LE" => 4, # UCS-4 little endian
|
45
|
+
"UCS4BE" => 5, # UCS-4 big endian
|
46
|
+
"EBCDIC" => 6, # EBCDIC uh!
|
47
|
+
"UCS4-2143" => 7, # UCS-4 unusual ordering
|
48
|
+
"UCS4-3412" => 8, # UCS-4 unusual ordering
|
49
|
+
"UCS2" => 9, # UCS-2
|
50
|
+
"ISO-8859-1" => 10, # ISO-8859-1 ISO Latin 1
|
51
|
+
"ISO-8859-2" => 11, # ISO-8859-2 ISO Latin 2
|
52
|
+
"ISO-8859-3" => 12, # ISO-8859-3
|
53
|
+
"ISO-8859-4" => 13, # ISO-8859-4
|
54
|
+
"ISO-8859-5" => 14, # ISO-8859-5
|
55
|
+
"ISO-8859-6" => 15, # ISO-8859-6
|
56
|
+
"ISO-8859-7" => 16, # ISO-8859-7
|
57
|
+
"ISO-8859-8" => 17, # ISO-8859-8
|
58
|
+
"ISO-8859-9" => 18, # ISO-8859-9
|
59
|
+
"ISO-2022-JP" => 19, # ISO-2022-JP
|
60
|
+
"SHIFT-JIS" => 20, # Shift_JIS
|
61
|
+
"EUC-JP" => 21, # EUC-JP
|
62
|
+
"ASCII" => 22, # pure ASCII
|
61
63
|
}
|
62
64
|
|
63
65
|
# The Nokogiri::XML::SAX::Document where events will be sent.
|
@@ -67,7 +69,7 @@ module Nokogiri
|
|
67
69
|
attr_accessor :encoding
|
68
70
|
|
69
71
|
# Create a new Parser with +doc+ and +encoding+
|
70
|
-
def initialize
|
72
|
+
def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = "UTF-8")
|
71
73
|
@encoding = check_encoding(encoding)
|
72
74
|
@document = doc
|
73
75
|
@warned = false
|
@@ -76,7 +78,7 @@ module Nokogiri
|
|
76
78
|
###
|
77
79
|
# Parse given +thing+ which may be a string containing xml, or an
|
78
80
|
# IO object.
|
79
|
-
def parse
|
81
|
+
def parse(thing, &block)
|
80
82
|
if thing.respond_to?(:read) && thing.respond_to?(:close)
|
81
83
|
parse_io(thing, &block)
|
82
84
|
else
|
@@ -86,34 +88,35 @@ module Nokogiri
|
|
86
88
|
|
87
89
|
###
|
88
90
|
# Parse given +io+
|
89
|
-
def parse_io
|
90
|
-
|
91
|
-
ctx = ParserContext.io(io, ENCODINGS[@encoding])
|
91
|
+
def parse_io(io, encoding = @encoding)
|
92
|
+
ctx = ParserContext.io(io, ENCODINGS[check_encoding(encoding)])
|
92
93
|
yield ctx if block_given?
|
93
|
-
ctx.parse_with
|
94
|
+
ctx.parse_with(self)
|
94
95
|
end
|
95
96
|
|
96
97
|
###
|
97
98
|
# Parse a file with +filename+
|
98
|
-
def parse_file
|
99
|
+
def parse_file(filename)
|
99
100
|
raise ArgumentError unless filename
|
100
101
|
raise Errno::ENOENT unless File.exist?(filename)
|
101
102
|
raise Errno::EISDIR if File.directory?(filename)
|
102
|
-
|
103
|
+
|
104
|
+
ctx = ParserContext.file(filename)
|
103
105
|
yield ctx if block_given?
|
104
|
-
ctx.parse_with
|
106
|
+
ctx.parse_with(self)
|
105
107
|
end
|
106
108
|
|
107
|
-
def parse_memory
|
108
|
-
ctx = ParserContext.memory
|
109
|
+
def parse_memory(data)
|
110
|
+
ctx = ParserContext.memory(data)
|
109
111
|
yield ctx if block_given?
|
110
|
-
ctx.parse_with
|
112
|
+
ctx.parse_with(self)
|
111
113
|
end
|
112
114
|
|
113
115
|
private
|
116
|
+
|
114
117
|
def check_encoding(encoding)
|
115
118
|
encoding.upcase.tap do |enc|
|
116
|
-
raise ArgumentError
|
119
|
+
raise ArgumentError, "'#{enc}' is not a valid encoding" unless ENCODINGS[enc]
|
117
120
|
end
|
118
121
|
end
|
119
122
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
4
|
module XML
|
3
5
|
module SAX
|
@@ -6,9 +8,12 @@ module Nokogiri
|
|
6
8
|
# by the user. Instead, you should be looking at
|
7
9
|
# Nokogiri::XML::SAX::Parser
|
8
10
|
class ParserContext
|
9
|
-
def self.new
|
10
|
-
[:read, :close].all? { |x| thing.respond_to?(x) }
|
11
|
-
io(thing, Parser::ENCODINGS[encoding])
|
11
|
+
def self.new(thing, encoding = "UTF-8")
|
12
|
+
if [:read, :close].all? { |x| thing.respond_to?(x) }
|
13
|
+
io(thing, Parser::ENCODINGS[encoding])
|
14
|
+
else
|
15
|
+
memory(thing)
|
16
|
+
end
|
12
17
|
end
|
13
18
|
end
|
14
19
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
4
|
module XML
|
3
5
|
module SAX
|
@@ -23,7 +25,6 @@ module Nokogiri
|
|
23
25
|
# parser << "/div>"
|
24
26
|
# parser.finish
|
25
27
|
class PushParser
|
26
|
-
|
27
28
|
# The Nokogiri::XML::SAX::Document on which the PushParser will be
|
28
29
|
# operating
|
29
30
|
attr_accessor :document
|
@@ -31,7 +32,7 @@ module Nokogiri
|
|
31
32
|
###
|
32
33
|
# Create a new PushParser with +doc+ as the SAX Document, providing
|
33
34
|
# an optional +file_name+ and +encoding+
|
34
|
-
def initialize(doc = XML::SAX::Document.new, file_name = nil, encoding =
|
35
|
+
def initialize(doc = XML::SAX::Document.new, file_name = nil, encoding = "UTF-8")
|
35
36
|
@document = doc
|
36
37
|
@encoding = encoding
|
37
38
|
@sax_parser = XML::SAX::Parser.new(doc)
|
@@ -43,16 +44,16 @@ module Nokogiri
|
|
43
44
|
###
|
44
45
|
# Write a +chunk+ of XML to the PushParser. Any callback methods
|
45
46
|
# that can be called will be called immediately.
|
46
|
-
def write
|
47
|
+
def write(chunk, last_chunk = false)
|
47
48
|
native_write(chunk, last_chunk)
|
48
49
|
end
|
49
|
-
|
50
|
+
alias_method :<<, :write
|
50
51
|
|
51
52
|
###
|
52
53
|
# Finish the parsing. This method is only necessary for
|
53
54
|
# Nokogiri::XML::SAX::Document#end_document to be called.
|
54
55
|
def finish
|
55
|
-
write
|
56
|
+
write("", true)
|
56
57
|
end
|
57
58
|
end
|
58
59
|
end
|
data/lib/nokogiri/xml/sax.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "sax/document"
|
4
|
+
require_relative "sax/parser_context"
|
5
|
+
require_relative "sax/parser"
|
6
|
+
require_relative "sax/push_parser"
|