multi_xml 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.mutant.yml +16 -0
- data/.rubocop.yml +14 -5
- data/CHANGELOG.md +8 -0
- data/Gemfile +12 -9
- data/README.md +1 -1
- data/Rakefile +35 -7
- data/Steepfile +22 -0
- data/lib/multi_xml/constants.rb +134 -0
- data/lib/multi_xml/errors.rb +93 -0
- data/lib/multi_xml/file_like.rb +62 -0
- data/lib/multi_xml/helpers.rb +228 -0
- data/lib/multi_xml/parsers/dom_parser.rb +97 -0
- data/lib/multi_xml/parsers/libxml.rb +35 -18
- data/lib/multi_xml/parsers/libxml_sax.rb +103 -0
- data/lib/multi_xml/parsers/nokogiri.rb +39 -22
- data/lib/multi_xml/parsers/nokogiri_sax.rb +102 -0
- data/lib/multi_xml/parsers/oga.rb +48 -51
- data/lib/multi_xml/parsers/ox.rb +99 -57
- data/lib/multi_xml/parsers/rexml.rb +84 -78
- data/lib/multi_xml/parsers/sax_handler.rb +117 -0
- data/lib/multi_xml/version.rb +5 -1
- data/lib/multi_xml.rb +173 -269
- data/sig/multi_xml.rbs +227 -0
- metadata +21 -5
- data/lib/multi_xml/parsers/libxml2_parser.rb +0 -70
|
@@ -1,71 +1,68 @@
|
|
|
1
|
-
require "oga"
|
|
2
|
-
|
|
1
|
+
require "oga"
|
|
2
|
+
require_relative "dom_parser"
|
|
3
3
|
|
|
4
4
|
module MultiXml
|
|
5
5
|
module Parsers
|
|
6
|
-
|
|
7
|
-
|
|
6
|
+
# XML parser using the Oga library
|
|
7
|
+
#
|
|
8
|
+
# @api private
|
|
9
|
+
module Oga
|
|
10
|
+
include DomParser
|
|
8
11
|
extend self
|
|
9
12
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
+
# Get the parse error class for this parser
|
|
14
|
+
#
|
|
15
|
+
# @api private
|
|
16
|
+
# @return [Class] LL::ParserError
|
|
17
|
+
def parse_error = LL::ParserError
|
|
13
18
|
|
|
19
|
+
# Parse XML from an IO object
|
|
20
|
+
#
|
|
21
|
+
# @api private
|
|
22
|
+
# @param io [IO] IO-like object containing XML
|
|
23
|
+
# @return [Hash] Parsed XML as a hash
|
|
24
|
+
# @raise [LL::ParserError] if XML is malformed
|
|
14
25
|
def parse(io)
|
|
15
|
-
|
|
16
|
-
node_to_hash(
|
|
26
|
+
doc = ::Oga.parse_xml(io)
|
|
27
|
+
node_to_hash(doc.children.first)
|
|
17
28
|
end
|
|
18
29
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
# Handle child elements
|
|
35
|
-
each_child(node) do |c|
|
|
36
|
-
if c.is_a?(::Oga::XML::Element)
|
|
37
|
-
node_to_hash(c, node_hash)
|
|
38
|
-
elsif c.is_a?(::Oga::XML::Text) || c.is_a?(::Oga::XML::Cdata)
|
|
39
|
-
node_hash[MultiXml::CONTENT_ROOT] += c.text
|
|
30
|
+
# Collect child nodes into a hash (Oga-specific implementation)
|
|
31
|
+
#
|
|
32
|
+
# Oga uses different node types than Nokogiri/LibXML.
|
|
33
|
+
#
|
|
34
|
+
# @api private
|
|
35
|
+
# @param node [Oga::XML::Element] Parent node
|
|
36
|
+
# @param node_hash [Hash] Hash to populate
|
|
37
|
+
# @return [void]
|
|
38
|
+
def collect_children(node, node_hash)
|
|
39
|
+
each_child(node) do |child|
|
|
40
|
+
case child
|
|
41
|
+
when ::Oga::XML::Element then node_to_hash(child, node_hash)
|
|
42
|
+
when ::Oga::XML::Text, ::Oga::XML::Cdata then node_hash[TEXT_CONTENT_KEY] << child.text
|
|
40
43
|
end
|
|
41
44
|
end
|
|
42
|
-
|
|
43
|
-
# Remove content node if it is empty
|
|
44
|
-
node_hash.delete(MultiXml::CONTENT_ROOT) if node_hash[MultiXml::CONTENT_ROOT].strip.empty?
|
|
45
|
-
|
|
46
|
-
# Handle attributes
|
|
47
|
-
each_attr(node) do |a|
|
|
48
|
-
key = node_name(a)
|
|
49
|
-
v = node_hash[key]
|
|
50
|
-
node_hash[key] = ((v) ? [a.value, v] : a.value)
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
hash
|
|
54
45
|
end
|
|
55
46
|
|
|
56
47
|
private
|
|
57
48
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
49
|
+
# Iterate over child nodes
|
|
50
|
+
#
|
|
51
|
+
# @param node [Oga::XML::Element] Parent node
|
|
52
|
+
# @return [void]
|
|
53
|
+
def each_child(node, &) = node.children.each(&)
|
|
61
54
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
55
|
+
# Iterate over attribute nodes
|
|
56
|
+
#
|
|
57
|
+
# @param node [Oga::XML::Element] Element node
|
|
58
|
+
# @return [void]
|
|
59
|
+
def each_attr(node, &) = node.attributes.each(&)
|
|
65
60
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
61
|
+
# Get the name of a node or attribute
|
|
62
|
+
#
|
|
63
|
+
# @param node [Oga::XML::Node] Node to get name from
|
|
64
|
+
# @return [String] Node name
|
|
65
|
+
def node_name(node) = node.name
|
|
69
66
|
end
|
|
70
67
|
end
|
|
71
68
|
end
|
data/lib/multi_xml/parsers/ox.rb
CHANGED
|
@@ -1,89 +1,131 @@
|
|
|
1
|
-
require "ox"
|
|
2
|
-
|
|
3
|
-
# Each MultiXml parser is expected to parse an XML document into a Hash. The
|
|
4
|
-
# conversion rules are:
|
|
5
|
-
#
|
|
6
|
-
# - Each document starts out as an empty Hash.
|
|
7
|
-
#
|
|
8
|
-
# - Reading an element created an entry in the parent Hash that has a key of
|
|
9
|
-
# the element name and a value of a Hash with attributes as key value
|
|
10
|
-
# pairs. Children are added as described by this rule.
|
|
11
|
-
#
|
|
12
|
-
# - Text and CDATE is stored in the parent element Hash with a key of
|
|
13
|
-
# MultiXml::CONTENT_ROOT and a value of the text itself.
|
|
14
|
-
#
|
|
15
|
-
# - If a key already exists in the Hash then the value associated with the key
|
|
16
|
-
# is converted to an Array with the old and new value in it.
|
|
17
|
-
#
|
|
18
|
-
# - Other elements such as the xml prolog, doctype, and comments are ignored.
|
|
19
|
-
#
|
|
1
|
+
require "ox"
|
|
20
2
|
|
|
21
3
|
module MultiXml
|
|
22
4
|
module Parsers
|
|
23
|
-
|
|
5
|
+
# XML parser using the Ox library (fastest pure-Ruby parser)
|
|
6
|
+
#
|
|
7
|
+
# @api private
|
|
8
|
+
module Ox
|
|
24
9
|
module_function
|
|
25
10
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
11
|
+
# Get the parse error class for this parser
|
|
12
|
+
#
|
|
13
|
+
# @api private
|
|
14
|
+
# @return [Class] Ox::ParseError
|
|
15
|
+
def parse_error = ::Ox::ParseError
|
|
29
16
|
|
|
17
|
+
# Parse XML from an IO object
|
|
18
|
+
#
|
|
19
|
+
# @api private
|
|
20
|
+
# @param io [IO] IO-like object containing XML
|
|
21
|
+
# @return [Hash] Parsed XML as a hash
|
|
30
22
|
def parse(io)
|
|
31
23
|
handler = Handler.new
|
|
32
24
|
::Ox.sax_parse(handler, io, convert_special: true, skip: :skip_return)
|
|
33
|
-
handler.
|
|
25
|
+
handler.result
|
|
34
26
|
end
|
|
35
27
|
|
|
28
|
+
# SAX event handler that builds a hash tree while parsing
|
|
29
|
+
#
|
|
30
|
+
# @api private
|
|
36
31
|
class Handler
|
|
37
|
-
|
|
38
|
-
|
|
32
|
+
# Create a new SAX handler
|
|
33
|
+
#
|
|
34
|
+
# @return [Handler] new handler instance
|
|
39
35
|
def initialize
|
|
40
36
|
@stack = []
|
|
41
37
|
end
|
|
42
38
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
39
|
+
# Get the parsed result
|
|
40
|
+
#
|
|
41
|
+
# @return [Hash, nil] the root hash or nil if empty
|
|
42
|
+
def result = @stack.first
|
|
46
43
|
|
|
47
|
-
|
|
48
|
-
|
|
44
|
+
# Handle start of an element
|
|
45
|
+
#
|
|
46
|
+
# @param name [Symbol] Element name
|
|
47
|
+
# @return [void]
|
|
48
|
+
def start_element(name)
|
|
49
|
+
@stack << {} if @stack.empty?
|
|
50
|
+
child = {}
|
|
51
|
+
add_value(name.to_s, child)
|
|
52
|
+
@stack << child
|
|
49
53
|
end
|
|
50
54
|
|
|
51
|
-
|
|
52
|
-
|
|
55
|
+
# Handle end of an element
|
|
56
|
+
#
|
|
57
|
+
# @param _name [Symbol] Element name (unused)
|
|
58
|
+
# @return [void]
|
|
59
|
+
def end_element(_name)
|
|
60
|
+
strip_whitespace_content if current.key?(TEXT_CONTENT_KEY)
|
|
61
|
+
@stack.pop
|
|
53
62
|
end
|
|
54
63
|
|
|
55
|
-
|
|
56
|
-
|
|
64
|
+
# Handle an attribute
|
|
65
|
+
#
|
|
66
|
+
# @param name [Symbol] Attribute name
|
|
67
|
+
# @param value [String] Attribute value
|
|
68
|
+
# @return [void]
|
|
69
|
+
def attr(name, value)
|
|
70
|
+
add_value(name.to_s, value) unless @stack.empty?
|
|
57
71
|
end
|
|
58
72
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
73
|
+
# Handle text content
|
|
74
|
+
#
|
|
75
|
+
# @param value [String] Text content
|
|
76
|
+
# @return [void]
|
|
77
|
+
def text(value) = add_value(TEXT_CONTENT_KEY, value)
|
|
78
|
+
|
|
79
|
+
# Handle CDATA content
|
|
80
|
+
#
|
|
81
|
+
# @param value [String] CDATA content
|
|
82
|
+
# @return [void]
|
|
83
|
+
def cdata(value) = add_value(TEXT_CONTENT_KEY, value)
|
|
84
|
+
|
|
85
|
+
# Handle parse errors
|
|
86
|
+
#
|
|
87
|
+
# @param message [String] Error message
|
|
88
|
+
# @param line [Integer] Line number
|
|
89
|
+
# @param column [Integer] Column number
|
|
90
|
+
# @return [void]
|
|
91
|
+
# @raise [Ox::ParseError] always
|
|
92
|
+
def error(message, line, column)
|
|
93
|
+
raise ::Ox::ParseError, "#{message} at #{line}:#{column}"
|
|
64
94
|
end
|
|
65
95
|
|
|
66
|
-
|
|
67
|
-
|
|
96
|
+
private
|
|
97
|
+
|
|
98
|
+
# Get the current element hash
|
|
99
|
+
#
|
|
100
|
+
# @return [Hash] current hash being built
|
|
101
|
+
def current = @stack.last
|
|
102
|
+
|
|
103
|
+
# Add a value to the current hash, merging with existing if needed
|
|
104
|
+
#
|
|
105
|
+
# @param key [String] Key to add
|
|
106
|
+
# @param value [Object] Value to add
|
|
107
|
+
# @return [void]
|
|
108
|
+
def add_value(key, value)
|
|
109
|
+
existing = current[key]
|
|
110
|
+
current[key] = existing ? merge_values(existing, value) : value
|
|
68
111
|
end
|
|
69
112
|
|
|
70
|
-
|
|
71
|
-
|
|
113
|
+
# Merge a value with an existing value, creating array if needed
|
|
114
|
+
#
|
|
115
|
+
# @param existing [Object] Existing value
|
|
116
|
+
# @param value [Object] Value to append
|
|
117
|
+
# @return [Array] array with both values
|
|
118
|
+
def merge_values(existing, value)
|
|
119
|
+
existing.is_a?(Array) ? existing << value : [existing, value]
|
|
72
120
|
end
|
|
73
121
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
else
|
|
82
|
-
h[key] = [v, value]
|
|
83
|
-
end
|
|
84
|
-
else
|
|
85
|
-
h[key] = value
|
|
86
|
-
end
|
|
122
|
+
# Remove empty or whitespace-only text content
|
|
123
|
+
#
|
|
124
|
+
# @return [void]
|
|
125
|
+
def strip_whitespace_content
|
|
126
|
+
content = current[TEXT_CONTENT_KEY]
|
|
127
|
+
should_remove = content.empty? || (current.size > 1 && content.strip.empty?)
|
|
128
|
+
current.delete(TEXT_CONTENT_KEY) if should_remove
|
|
87
129
|
end
|
|
88
130
|
end
|
|
89
131
|
end
|
|
@@ -1,111 +1,117 @@
|
|
|
1
|
-
require "rexml/document"
|
|
1
|
+
require "rexml/document"
|
|
2
2
|
|
|
3
3
|
module MultiXml
|
|
4
4
|
module Parsers
|
|
5
|
-
|
|
5
|
+
# XML parser using Ruby's built-in REXML library
|
|
6
|
+
#
|
|
7
|
+
# @api private
|
|
8
|
+
module Rexml
|
|
6
9
|
extend self
|
|
7
10
|
|
|
8
|
-
|
|
9
|
-
::REXML::ParseException
|
|
10
|
-
end
|
|
11
|
-
|
|
12
|
-
# Parse an XML Document IO into a simple hash using REXML
|
|
11
|
+
# Get the parse error class for this parser
|
|
13
12
|
#
|
|
14
|
-
#
|
|
15
|
-
#
|
|
16
|
-
def
|
|
17
|
-
doc = REXML::Document.new(xml)
|
|
18
|
-
raise(REXML::ParseException, "The document #{doc.to_s.inspect} does not have a valid root") unless doc.root
|
|
13
|
+
# @api private
|
|
14
|
+
# @return [Class] REXML::ParseException
|
|
15
|
+
def parse_error = ::REXML::ParseException
|
|
19
16
|
|
|
20
|
-
|
|
17
|
+
# Parse XML from an IO object
|
|
18
|
+
#
|
|
19
|
+
# @api private
|
|
20
|
+
# @param io [IO] IO-like object containing XML
|
|
21
|
+
# @return [Hash] Parsed XML as a hash
|
|
22
|
+
# @raise [REXML::ParseException] if XML is malformed
|
|
23
|
+
def parse(io)
|
|
24
|
+
doc = REXML::Document.new(io)
|
|
25
|
+
element_to_hash({}, doc.root)
|
|
21
26
|
end
|
|
22
27
|
|
|
23
28
|
private
|
|
24
29
|
|
|
25
|
-
# Convert an
|
|
30
|
+
# Convert an element to hash format
|
|
26
31
|
#
|
|
27
|
-
#
|
|
28
|
-
#
|
|
29
|
-
# element::
|
|
30
|
-
#
|
|
31
|
-
def
|
|
32
|
-
|
|
32
|
+
# @api private
|
|
33
|
+
# @param hash [Hash] Accumulator hash
|
|
34
|
+
# @param element [REXML::Element] Element to convert
|
|
35
|
+
# @return [Hash] Updated hash
|
|
36
|
+
def element_to_hash(hash, element)
|
|
37
|
+
add_to_hash(hash, element.name, collapse_element(element))
|
|
33
38
|
end
|
|
34
39
|
|
|
35
|
-
#
|
|
40
|
+
# Collapse an element into a hash with attributes and content
|
|
36
41
|
#
|
|
37
|
-
#
|
|
38
|
-
#
|
|
39
|
-
|
|
40
|
-
|
|
42
|
+
# @api private
|
|
43
|
+
# @param element [REXML::Element] Element to collapse
|
|
44
|
+
# @return [Hash] Hash representation
|
|
45
|
+
def collapse_element(element)
|
|
46
|
+
node_hash = collect_attributes(element)
|
|
41
47
|
|
|
42
48
|
if element.has_elements?
|
|
43
|
-
element
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
merge_texts!(hash, element)
|
|
49
|
+
collect_child_elements(element, node_hash)
|
|
50
|
+
add_text_content(node_hash, element) unless whitespace_only?(element)
|
|
51
|
+
elsif node_hash.empty? || !whitespace_only?(element)
|
|
52
|
+
add_text_content(node_hash, element)
|
|
48
53
|
end
|
|
54
|
+
|
|
55
|
+
node_hash
|
|
49
56
|
end
|
|
50
57
|
|
|
51
|
-
#
|
|
58
|
+
# Collect all attributes from an element into a hash
|
|
52
59
|
#
|
|
53
|
-
#
|
|
54
|
-
#
|
|
55
|
-
#
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
if element.has_text?
|
|
59
|
-
# must use value to prevent double-escaping
|
|
60
|
-
texts = element.texts.map(&:value).join
|
|
61
|
-
merge!(hash, MultiXml::CONTENT_ROOT, texts)
|
|
62
|
-
else
|
|
63
|
-
hash
|
|
64
|
-
end
|
|
60
|
+
# @api private
|
|
61
|
+
# @param element [REXML::Element] Element with attributes
|
|
62
|
+
# @return [Hash] Hash of attribute name-value pairs
|
|
63
|
+
def collect_attributes(element)
|
|
64
|
+
element.attributes.each_with_object({}) { |(name, value), hash| hash[name] = value }
|
|
65
65
|
end
|
|
66
66
|
|
|
67
|
-
#
|
|
68
|
-
# already exists and the existing value associated with key is not
|
|
69
|
-
# an Array, it will be wrapped in an Array. Then the new value is
|
|
70
|
-
# appended to that Array.
|
|
67
|
+
# Collect all child elements into a hash
|
|
71
68
|
#
|
|
72
|
-
#
|
|
73
|
-
#
|
|
74
|
-
#
|
|
75
|
-
#
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
hash
|
|
69
|
+
# @api private
|
|
70
|
+
# @param element [REXML::Element] Parent element
|
|
71
|
+
# @param node_hash [Hash] Hash to populate
|
|
72
|
+
# @return [void]
|
|
73
|
+
def collect_child_elements(element, node_hash)
|
|
74
|
+
element.each_element { |child| element_to_hash(node_hash, child) }
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
# Add text content from an element to a hash
|
|
78
|
+
#
|
|
79
|
+
# @api private
|
|
80
|
+
# @param hash [Hash] Target hash
|
|
81
|
+
# @param element [REXML::Element] Element with text
|
|
82
|
+
# @return [Hash] Updated hash
|
|
83
|
+
def add_text_content(hash, element)
|
|
84
|
+
return hash unless element.has_text?
|
|
85
|
+
|
|
86
|
+
text = element.texts.map(&:value).join
|
|
87
|
+
add_to_hash(hash, TEXT_CONTENT_KEY, text)
|
|
91
88
|
end
|
|
92
89
|
|
|
93
|
-
#
|
|
94
|
-
# Returns an empty Hash if node has no attributes.
|
|
90
|
+
# Add a value to a hash, handling duplicates as arrays
|
|
95
91
|
#
|
|
96
|
-
#
|
|
97
|
-
#
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
92
|
+
# @api private
|
|
93
|
+
# @param hash [Hash] Target hash
|
|
94
|
+
# @param key [String] Key to add
|
|
95
|
+
# @param value [Object] Value to add
|
|
96
|
+
# @return [Hash] Updated hash
|
|
97
|
+
def add_to_hash(hash, key, value)
|
|
98
|
+
existing = hash[key]
|
|
99
|
+
hash[key] = if existing
|
|
100
|
+
existing.is_a?(Array) ? existing << value : [existing, value]
|
|
101
|
+
elsif value.is_a?(Array)
|
|
102
|
+
[value]
|
|
103
|
+
else
|
|
104
|
+
value
|
|
105
|
+
end
|
|
106
|
+
hash
|
|
102
107
|
end
|
|
103
108
|
|
|
104
|
-
#
|
|
109
|
+
# Check if element contains only whitespace text
|
|
105
110
|
#
|
|
106
|
-
#
|
|
107
|
-
#
|
|
108
|
-
|
|
111
|
+
# @api private
|
|
112
|
+
# @param element [REXML::Element] Element to check
|
|
113
|
+
# @return [Boolean] true if whitespace only
|
|
114
|
+
def whitespace_only?(element)
|
|
109
115
|
element.texts.join.strip.empty?
|
|
110
116
|
end
|
|
111
117
|
end
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
require "cgi/escape"
|
|
2
|
+
|
|
3
|
+
module MultiXml
|
|
4
|
+
module Parsers
|
|
5
|
+
# Shared SAX handler logic for building hash trees from XML events
|
|
6
|
+
#
|
|
7
|
+
# This module provides the core stack-based parsing logic used by both
|
|
8
|
+
# NokogiriSax and LibxmlSax parsers. Including classes must implement
|
|
9
|
+
# the callback methods that their respective SAX libraries expect.
|
|
10
|
+
#
|
|
11
|
+
# @api private
|
|
12
|
+
module SaxHandler
|
|
13
|
+
# Initialize the handler state
|
|
14
|
+
#
|
|
15
|
+
# @api private
|
|
16
|
+
# @return [void]
|
|
17
|
+
def initialize_handler
|
|
18
|
+
@result = {}
|
|
19
|
+
@stack = [@result]
|
|
20
|
+
@pending_attrs = []
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
# Get the parsed result
|
|
24
|
+
#
|
|
25
|
+
# @api private
|
|
26
|
+
# @return [Hash] the parsed hash
|
|
27
|
+
attr_reader :result
|
|
28
|
+
|
|
29
|
+
private
|
|
30
|
+
|
|
31
|
+
# Get the current element hash
|
|
32
|
+
#
|
|
33
|
+
# @api private
|
|
34
|
+
# @return [Hash] current hash being built
|
|
35
|
+
def current = @stack.last
|
|
36
|
+
|
|
37
|
+
# Handle start of an element by pushing onto the stack
|
|
38
|
+
#
|
|
39
|
+
# @api private
|
|
40
|
+
# @param name [String] Element name
|
|
41
|
+
# @param attrs [Hash, Array] Element attributes
|
|
42
|
+
# @return [void]
|
|
43
|
+
def handle_start_element(name, attrs)
|
|
44
|
+
child = {TEXT_CONTENT_KEY => +""}
|
|
45
|
+
add_child_to_current(name, child)
|
|
46
|
+
@stack << child
|
|
47
|
+
@pending_attrs << normalize_attrs(attrs)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# Handle end of an element by applying attributes and popping the stack
|
|
51
|
+
#
|
|
52
|
+
# @api private
|
|
53
|
+
# @return [void]
|
|
54
|
+
def handle_end_element
|
|
55
|
+
apply_attributes(@pending_attrs.pop)
|
|
56
|
+
strip_whitespace_content
|
|
57
|
+
@stack.pop
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
# Append text to the current element's content
|
|
61
|
+
#
|
|
62
|
+
# @api private
|
|
63
|
+
# @param text [String] Text to append
|
|
64
|
+
# @return [void]
|
|
65
|
+
def append_text(text)
|
|
66
|
+
current[TEXT_CONTENT_KEY] << text
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
# Add a child hash to the current element
|
|
70
|
+
#
|
|
71
|
+
# @api private
|
|
72
|
+
# @param name [String] Child element name
|
|
73
|
+
# @param child [Hash] Child hash to add
|
|
74
|
+
# @return [void]
|
|
75
|
+
def add_child_to_current(name, child)
|
|
76
|
+
existing = current[name]
|
|
77
|
+
current[name] = case existing
|
|
78
|
+
when Array then existing << child
|
|
79
|
+
when Hash then [existing, child]
|
|
80
|
+
else child
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# Normalize attributes to a hash
|
|
85
|
+
#
|
|
86
|
+
# @api private
|
|
87
|
+
# @param attrs [Hash, Array] Attributes as hash or array of pairs
|
|
88
|
+
# @return [Hash] Normalized attributes hash
|
|
89
|
+
def normalize_attrs(attrs)
|
|
90
|
+
attrs.is_a?(Hash) ? attrs : attrs.to_h
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
# Apply pending attributes to the current element
|
|
94
|
+
#
|
|
95
|
+
# @api private
|
|
96
|
+
# @param attrs [Hash] Attributes to apply
|
|
97
|
+
# @return [void]
|
|
98
|
+
def apply_attributes(attrs)
|
|
99
|
+
attrs.each do |name, value|
|
|
100
|
+
unescaped = CGI.unescapeHTML(value)
|
|
101
|
+
existing = current[name]
|
|
102
|
+
current[name] = existing ? [unescaped, existing] : unescaped
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
# Remove empty or whitespace-only text content
|
|
107
|
+
#
|
|
108
|
+
# @api private
|
|
109
|
+
# @return [void]
|
|
110
|
+
def strip_whitespace_content
|
|
111
|
+
content = current[TEXT_CONTENT_KEY]
|
|
112
|
+
should_remove = content.empty? || (current.size > 1 && content.strip.empty?)
|
|
113
|
+
current.delete(TEXT_CONTENT_KEY) if should_remove
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|