xml_node_stream 1.0.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +30 -0
- data/README.md +139 -0
- data/VERSION +1 -0
- data/lib/xml_node_stream/http_stream.rb +179 -0
- data/lib/xml_node_stream/node.rb +98 -47
- data/lib/xml_node_stream/parser/base.rb +49 -12
- data/lib/xml_node_stream/parser/libxml_parser.rb +36 -9
- data/lib/xml_node_stream/parser/nokogiri_parser.rb +42 -12
- data/lib/xml_node_stream/parser/rexml_parser.rb +35 -8
- data/lib/xml_node_stream/parser.rb +54 -29
- data/lib/xml_node_stream/selector.rb +144 -34
- data/lib/xml_node_stream.rb +18 -5
- data/xml_node_stream.gemspec +39 -0
- metadata +46 -88
- data/README.rdoc +0 -61
- data/Rakefile +0 -44
- data/spec/node_spec.rb +0 -140
- data/spec/parser_spec.rb +0 -148
- data/spec/selector_spec.rb +0 -73
- data/spec/spec_helper.rb +0 -2
- data/spec/test.xml +0 -57
- data/spec/xml_node_stream_spec.rb +0 -11
- /data/{MIT_LICENSE → MIT-LICENSE} +0 -0
|
@@ -1,6 +1,8 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
begin
|
|
2
|
-
require
|
|
3
|
-
|
|
4
|
+
require "libxml"
|
|
5
|
+
|
|
4
6
|
module XmlNodeStream
|
|
5
7
|
class Parser
|
|
6
8
|
# Wrapper for the LibXML SAX parser.
|
|
@@ -8,26 +10,51 @@ begin
|
|
|
8
10
|
include LibXML::XML::SaxParser::Callbacks
|
|
9
11
|
include Base
|
|
10
12
|
|
|
11
|
-
|
|
13
|
+
# Parse the input stream using LibXML.
|
|
14
|
+
#
|
|
15
|
+
# @param io [IO] the input stream to parse
|
|
16
|
+
# @return [void]
|
|
17
|
+
def parse_stream(io)
|
|
12
18
|
context = LibXML::XML::Parser::Context.io(io)
|
|
13
19
|
parser = LibXML::XML::SaxParser.new(context)
|
|
14
20
|
parser.callbacks = self
|
|
15
21
|
parser.parse
|
|
16
22
|
end
|
|
17
|
-
|
|
18
|
-
|
|
23
|
+
|
|
24
|
+
# Handle LibXML start element callback.
|
|
25
|
+
#
|
|
26
|
+
# @param name [String] the element name
|
|
27
|
+
# @param attributes [Hash] the element attributes
|
|
28
|
+
# @return [void]
|
|
29
|
+
# @api private
|
|
30
|
+
def on_start_element(name, attributes)
|
|
19
31
|
do_start_element(name, attributes)
|
|
20
32
|
end
|
|
21
33
|
|
|
22
|
-
|
|
34
|
+
# Handle LibXML end element callback.
|
|
35
|
+
#
|
|
36
|
+
# @param name [String] the element name
|
|
37
|
+
# @return [void]
|
|
38
|
+
# @api private
|
|
39
|
+
def on_end_element(name)
|
|
23
40
|
do_end_element(name)
|
|
24
41
|
end
|
|
25
42
|
|
|
26
|
-
|
|
43
|
+
# Handle LibXML character data callback.
|
|
44
|
+
#
|
|
45
|
+
# @param characters [String] the character data
|
|
46
|
+
# @return [void]
|
|
47
|
+
# @api private
|
|
48
|
+
def on_characters(characters)
|
|
27
49
|
do_characters(characters)
|
|
28
50
|
end
|
|
29
51
|
|
|
30
|
-
|
|
52
|
+
# Handle LibXML CDATA block callback.
|
|
53
|
+
#
|
|
54
|
+
# @param characters [String] the CDATA content
|
|
55
|
+
# @return [void]
|
|
56
|
+
# @api private
|
|
57
|
+
def on_cdata_block(characters)
|
|
31
58
|
do_cdata_block(characters)
|
|
32
59
|
end
|
|
33
60
|
end
|
|
@@ -41,4 +68,4 @@ rescue LoadError
|
|
|
41
68
|
end
|
|
42
69
|
end
|
|
43
70
|
end
|
|
44
|
-
end
|
|
71
|
+
end
|
|
@@ -1,24 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
begin
|
|
2
|
-
require
|
|
3
|
-
|
|
4
|
+
require "nokogiri"
|
|
5
|
+
|
|
4
6
|
module XmlNodeStream
|
|
5
7
|
class Parser
|
|
6
8
|
# Wrapper for the Nokogiri SAX parser.
|
|
7
9
|
class NokogiriParser
|
|
8
10
|
include Base
|
|
9
11
|
|
|
10
|
-
|
|
12
|
+
# Parse the input stream using Nokogiri.
|
|
13
|
+
#
|
|
14
|
+
# @param io [IO] the input stream to parse
|
|
15
|
+
# @return [void]
|
|
16
|
+
def parse_stream(io)
|
|
11
17
|
listener = Listener.new(self)
|
|
12
18
|
parser = Nokogiri::XML::SAX::Parser.new(listener)
|
|
13
19
|
parser.parse(io)
|
|
14
20
|
end
|
|
15
|
-
|
|
21
|
+
|
|
16
22
|
class Listener < Nokogiri::XML::SAX::Document
|
|
17
|
-
|
|
23
|
+
# Initialize the Nokogiri listener.
|
|
24
|
+
#
|
|
25
|
+
# @param parser [NokogiriParser] the parser instance
|
|
26
|
+
def initialize(parser)
|
|
18
27
|
@parser = parser
|
|
19
28
|
end
|
|
20
|
-
|
|
21
|
-
|
|
29
|
+
|
|
30
|
+
# Handle Nokogiri start element callback.
|
|
31
|
+
#
|
|
32
|
+
# @param name [String] the element name
|
|
33
|
+
# @param attributes [Array] the element attributes
|
|
34
|
+
# @return [void]
|
|
35
|
+
# @api private
|
|
36
|
+
def start_element(name, attributes = [])
|
|
22
37
|
attributes_hash = {}
|
|
23
38
|
if attributes.first.is_a?(Array)
|
|
24
39
|
# Newer style where attributes are passed as an array of arrays
|
|
@@ -27,20 +42,35 @@ begin
|
|
|
27
42
|
end
|
|
28
43
|
else
|
|
29
44
|
# Old style where attributes are passed as a flat array
|
|
30
|
-
(attributes.size / 2).times{|i| attributes_hash[attributes[i * 2]] = attributes[(i * 2) + 1]}
|
|
45
|
+
(attributes.size / 2).times { |i| attributes_hash[attributes[i * 2]] = attributes[(i * 2) + 1] }
|
|
31
46
|
end
|
|
32
47
|
@parser.do_start_element(name, attributes_hash)
|
|
33
48
|
end
|
|
34
49
|
|
|
35
|
-
|
|
50
|
+
# Handle Nokogiri end element callback.
|
|
51
|
+
#
|
|
52
|
+
# @param name [String] the element name
|
|
53
|
+
# @return [void]
|
|
54
|
+
# @api private
|
|
55
|
+
def end_element(name)
|
|
36
56
|
@parser.do_end_element(name)
|
|
37
57
|
end
|
|
38
58
|
|
|
39
|
-
|
|
59
|
+
# Handle Nokogiri character data callback.
|
|
60
|
+
#
|
|
61
|
+
# @param characters [String] the character data
|
|
62
|
+
# @return [void]
|
|
63
|
+
# @api private
|
|
64
|
+
def characters(characters)
|
|
40
65
|
@parser.do_characters(characters)
|
|
41
66
|
end
|
|
42
67
|
|
|
43
|
-
|
|
68
|
+
# Handle Nokogiri CDATA block callback.
|
|
69
|
+
#
|
|
70
|
+
# @param characters [String] the CDATA content
|
|
71
|
+
# @return [void]
|
|
72
|
+
# @api private
|
|
73
|
+
def cdata_block(characters)
|
|
44
74
|
@parser.do_cdata_block(characters)
|
|
45
75
|
end
|
|
46
76
|
end
|
|
@@ -55,4 +85,4 @@ rescue LoadError
|
|
|
55
85
|
end
|
|
56
86
|
end
|
|
57
87
|
end
|
|
58
|
-
end
|
|
88
|
+
end
|
|
@@ -1,7 +1,9 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
begin
|
|
2
|
-
require
|
|
3
|
-
require
|
|
4
|
-
|
|
4
|
+
require "rexml/document"
|
|
5
|
+
require "rexml/streamlistener"
|
|
6
|
+
|
|
5
7
|
module XmlNodeStream
|
|
6
8
|
class Parser
|
|
7
9
|
# Wrapper for the REXML SAX parser.
|
|
@@ -9,24 +11,49 @@ begin
|
|
|
9
11
|
include REXML::StreamListener
|
|
10
12
|
include Base
|
|
11
13
|
|
|
12
|
-
|
|
14
|
+
# Parse the input stream using REXML.
|
|
15
|
+
#
|
|
16
|
+
# @param io [IO] the input stream to parse
|
|
17
|
+
# @return [void]
|
|
18
|
+
def parse_stream(io)
|
|
13
19
|
parser = REXML::Parsers::StreamParser.new(io, self)
|
|
14
20
|
parser.parse
|
|
15
21
|
end
|
|
16
22
|
|
|
17
|
-
|
|
23
|
+
# Handle REXML tag start callback.
|
|
24
|
+
#
|
|
25
|
+
# @param name [String] the element name
|
|
26
|
+
# @param attributes [Hash] the element attributes
|
|
27
|
+
# @return [void]
|
|
28
|
+
# @api private
|
|
29
|
+
def tag_start(name, attributes)
|
|
18
30
|
do_start_element(name, attributes)
|
|
19
31
|
end
|
|
20
32
|
|
|
21
|
-
|
|
33
|
+
# Handle REXML tag end callback.
|
|
34
|
+
#
|
|
35
|
+
# @param name [String] the element name
|
|
36
|
+
# @return [void]
|
|
37
|
+
# @api private
|
|
38
|
+
def tag_end(name)
|
|
22
39
|
do_end_element(name)
|
|
23
40
|
end
|
|
24
41
|
|
|
25
|
-
|
|
42
|
+
# Handle REXML text callback.
|
|
43
|
+
#
|
|
44
|
+
# @param content [String] the text content
|
|
45
|
+
# @return [void]
|
|
46
|
+
# @api private
|
|
47
|
+
def text(content)
|
|
26
48
|
do_characters(content)
|
|
27
49
|
end
|
|
28
50
|
|
|
29
|
-
|
|
51
|
+
# Handle REXML CDATA callback.
|
|
52
|
+
#
|
|
53
|
+
# @param content [String] the CDATA content
|
|
54
|
+
# @return [void]
|
|
55
|
+
# @api private
|
|
56
|
+
def cdata(content)
|
|
30
57
|
do_cdata_block(content)
|
|
31
58
|
end
|
|
32
59
|
end
|
|
@@ -1,70 +1,95 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
require
|
|
4
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require_relative "parser/base"
|
|
5
|
+
require_relative "http_stream"
|
|
5
6
|
|
|
6
7
|
module XmlNodeStream
|
|
7
8
|
# The abstract parser class that wraps the actual parser implementation.
|
|
8
9
|
class Parser
|
|
9
|
-
|
|
10
10
|
SUPPORTED_PARSERS = [:nokogiri, :libxml, :rexml]
|
|
11
|
-
|
|
11
|
+
|
|
12
|
+
@parser = :rexml
|
|
13
|
+
|
|
12
14
|
class << self
|
|
13
15
|
# Set the parser implementation. The parser argument should be one of :nokogiri, :libxml, or :rexml. If this method
|
|
14
16
|
# is not called, it will default to :rexml which is the slowest choice possible. If you set the parser to one of the
|
|
15
17
|
# other values, though, you'll need to make sure you have the nokogiri gem or libxml-ruby gem installed.
|
|
16
|
-
|
|
17
|
-
|
|
18
|
+
#
|
|
19
|
+
# @param parser [Symbol, String] the parser name (:nokogiri, :libxml, or :rexml)
|
|
20
|
+
# @return [Symbol] the parser name
|
|
21
|
+
# @raise [ArgumentError] if parser is not one of the supported parsers
|
|
22
|
+
def parser_name=(parser)
|
|
23
|
+
parser_sym = parser&.to_sym
|
|
18
24
|
raise ArgumentError.new("must be one of #{SUPPORTED_PARSERS.inspect}") unless SUPPORTED_PARSERS.include?(parser_sym)
|
|
25
|
+
|
|
19
26
|
@parser_name = parser_sym
|
|
20
27
|
end
|
|
21
|
-
|
|
28
|
+
|
|
22
29
|
# Get the name of the current parser.
|
|
30
|
+
#
|
|
31
|
+
# @return [Symbol] the current parser name
|
|
23
32
|
def parser_name
|
|
24
33
|
@parser_name ||= :rexml
|
|
25
34
|
end
|
|
26
|
-
|
|
35
|
+
|
|
27
36
|
# Parse the document specified in io. This can be either a Stream, URI, Pathname, or String. If it is a String,
|
|
28
37
|
# it can either be an XML document, file system path, or URI. The parser will figure it out. If a block is given,
|
|
29
38
|
# it will be yielded to with each node as it is parsed.
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
39
|
+
#
|
|
40
|
+
# @param io [IO, String, URI, Pathname] the input source to parse
|
|
41
|
+
# @yield [Node] each node as it is parsed
|
|
42
|
+
# @return [Node] the root node of the parsed document
|
|
43
|
+
def parse(io, &block)
|
|
44
|
+
close_stream = true
|
|
45
|
+
io = URI.parse(io) if io.is_a?(String) && io.match?(%r{\Ahttp(s)?://})
|
|
46
|
+
|
|
47
|
+
if io.is_a?(String) && io.match?(/<[^>]+>/m)
|
|
48
|
+
io = StringIO.new(io)
|
|
49
|
+
elsif io.is_a?(String)
|
|
50
|
+
unless File.exist?(io)
|
|
51
|
+
raise ArgumentError.new("File not found: #{io}")
|
|
37
52
|
end
|
|
38
|
-
|
|
53
|
+
io = File.open(io, "r:UTF-8")
|
|
39
54
|
elsif io.is_a?(Pathname)
|
|
40
|
-
|
|
41
|
-
|
|
55
|
+
unless io.exist?
|
|
56
|
+
raise ArgumentError.new("File not found: #{io}")
|
|
57
|
+
end
|
|
58
|
+
io = io.open("r:UTF-8")
|
|
42
59
|
elsif io.is_a?(URI)
|
|
43
|
-
io = io
|
|
44
|
-
|
|
60
|
+
io = HttpStream.new(io)
|
|
61
|
+
else
|
|
62
|
+
close_stream = false
|
|
45
63
|
end
|
|
46
64
|
|
|
47
65
|
begin
|
|
48
66
|
parser = parser_class(parser_name).new(&block)
|
|
49
67
|
parser.parse_stream(io)
|
|
50
|
-
|
|
68
|
+
parser.root
|
|
51
69
|
ensure
|
|
52
|
-
|
|
70
|
+
if close_stream
|
|
71
|
+
begin
|
|
72
|
+
io.close
|
|
73
|
+
rescue
|
|
74
|
+
# Ignore errors during close to ensure cleanup completes
|
|
75
|
+
nil
|
|
76
|
+
end
|
|
77
|
+
end
|
|
53
78
|
end
|
|
54
79
|
end
|
|
55
|
-
|
|
80
|
+
|
|
56
81
|
protected
|
|
57
|
-
|
|
58
|
-
def parser_class
|
|
82
|
+
|
|
83
|
+
def parser_class(class_symbol)
|
|
59
84
|
@loaded_parsers ||= {}
|
|
60
85
|
klass = @loaded_parsers[class_symbol]
|
|
61
86
|
unless klass
|
|
62
|
-
require File.expand_path(File.join(File.dirname(__FILE__),
|
|
87
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "parser", "#{class_symbol}_parser"))
|
|
63
88
|
class_name = "#{class_symbol.to_s.capitalize}Parser"
|
|
64
89
|
klass = const_get(class_name)
|
|
65
90
|
@loaded_parsers[class_symbol] = klass
|
|
66
91
|
end
|
|
67
|
-
|
|
92
|
+
klass
|
|
68
93
|
end
|
|
69
94
|
end
|
|
70
95
|
end
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set"
|
|
4
|
+
|
|
1
5
|
module XmlNodeStream
|
|
2
6
|
# Partial implementation of XPath selectors. Only abbreviated paths and the text() function are supported. The rest of XPath
|
|
3
7
|
# is unnecessary in the context of a Ruby application since XPath is also a programming language. If you really need an XPath
|
|
@@ -13,59 +17,165 @@ module XmlNodeStream
|
|
|
13
17
|
# * /library/books/book - find all book elements with the full path /library/books/book
|
|
14
18
|
# * author/text() - get the text values of all author child elements
|
|
15
19
|
class Selector
|
|
20
|
+
XPATH_SEGMENT_REGEX = /\A(\.\.?|\*|[a-zA-Z_][\w-]*|text\(\))(\|((\.\.?|\*|[a-zA-Z_][\w-]*|text\(\))))*\z/
|
|
21
|
+
|
|
16
22
|
# Create a selector. Path should be an abbreviated XPath string.
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
or_paths.each do |matcher_path|
|
|
25
|
-
part_matchers << Matcher.new(matcher_path)
|
|
26
|
-
end
|
|
27
|
-
end
|
|
23
|
+
#
|
|
24
|
+
# @param path [String] the XPath selector string
|
|
25
|
+
# @raise [ArgumentError] if the path is invalid
|
|
26
|
+
def initialize(path)
|
|
27
|
+
raise ArgumentError, "XPath pattern cannot be empty" if path.nil? || path.empty?
|
|
28
|
+
|
|
29
|
+
@parts = tokenize_path(path)
|
|
28
30
|
end
|
|
29
|
-
|
|
31
|
+
|
|
30
32
|
# Apply the selector to the current node. Note, if your path started with a /, it will be applied
|
|
31
33
|
# to the root node.
|
|
32
|
-
|
|
34
|
+
#
|
|
35
|
+
# @param node [Node] the node to apply the selector to
|
|
36
|
+
# @return [Array<Node>] the matching nodes
|
|
37
|
+
def find(node)
|
|
33
38
|
matched = [node]
|
|
34
39
|
@parts.each do |part_matchers|
|
|
35
40
|
context = matched
|
|
41
|
+
context_set = context.to_set
|
|
36
42
|
matched = []
|
|
43
|
+
|
|
37
44
|
part_matchers.each do |matcher|
|
|
38
|
-
matched.concat(matcher.select(context))
|
|
45
|
+
matched.concat(matcher.select(context, context_set))
|
|
39
46
|
end
|
|
47
|
+
|
|
40
48
|
break if matched.empty?
|
|
41
49
|
end
|
|
42
|
-
|
|
50
|
+
matched
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
private
|
|
54
|
+
|
|
55
|
+
# Tokenize the XPath into parts using a simple lexer approach
|
|
56
|
+
#
|
|
57
|
+
# @param path [String] the XPath string to tokenize
|
|
58
|
+
# @return [Array<Array<Matcher>>] array of matcher arrays
|
|
59
|
+
# @raise [ArgumentError] if the path is malformed
|
|
60
|
+
def tokenize_path(path)
|
|
61
|
+
# Check for invalid patterns upfront
|
|
62
|
+
raise ArgumentError, "Invalid XPath pattern: #{path} (triple slash not allowed)" if path.include?("///")
|
|
63
|
+
|
|
64
|
+
parts = []
|
|
65
|
+
i = 0
|
|
66
|
+
path_length = path.length
|
|
67
|
+
|
|
68
|
+
while i < path_length
|
|
69
|
+
# Skip leading slash for absolute paths
|
|
70
|
+
if i == 0 && path[i] == "/"
|
|
71
|
+
parts << [Matcher.new("")]
|
|
72
|
+
i += 1
|
|
73
|
+
next
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# Look for // (descendant operator)
|
|
77
|
+
if i < path_length - 1 && path[i] == "/" && path[i + 1] == "/"
|
|
78
|
+
i += 2
|
|
79
|
+
# Check if there's a name after //
|
|
80
|
+
name_match = path[i..].match(/\A([a-zA-Z_][\w-]*)/)
|
|
81
|
+
if name_match
|
|
82
|
+
parts << [Matcher.new("%#{name_match[1]}")]
|
|
83
|
+
i += name_match[1].length
|
|
84
|
+
elsif i >= path_length
|
|
85
|
+
# // at end of path is invalid
|
|
86
|
+
raise ArgumentError, "Invalid XPath pattern: #{path} (// cannot be at end)"
|
|
87
|
+
else
|
|
88
|
+
parts << [Matcher.new("%")]
|
|
89
|
+
end
|
|
90
|
+
next
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
# Regular path segment
|
|
94
|
+
if path[i] == "/"
|
|
95
|
+
i += 1
|
|
96
|
+
next
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
# Extract the segment (until next / or end)
|
|
100
|
+
segment_end = i
|
|
101
|
+
in_parens = false
|
|
102
|
+
while segment_end < path_length
|
|
103
|
+
char = path[segment_end]
|
|
104
|
+
if char == "("
|
|
105
|
+
in_parens = true
|
|
106
|
+
elsif char == ")"
|
|
107
|
+
in_parens = false
|
|
108
|
+
elsif char == "/" && !in_parens
|
|
109
|
+
break
|
|
110
|
+
elsif char == "[" || char == "@"
|
|
111
|
+
raise ArgumentError, "Invalid XPath pattern: #{path} (predicates and attributes not supported)"
|
|
112
|
+
end
|
|
113
|
+
segment_end += 1
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
segment = path[i...segment_end]
|
|
117
|
+
raise ArgumentError, "Invalid XPath pattern: #{path} (empty segment)" if segment.empty? && i > 0
|
|
118
|
+
|
|
119
|
+
i = segment_end
|
|
120
|
+
|
|
121
|
+
# Validate segment format
|
|
122
|
+
unless segment.match?(XPATH_SEGMENT_REGEX)
|
|
123
|
+
raise ArgumentError, "Invalid XPath pattern: #{path} (invalid segment: #{segment})"
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# Handle | (OR operator) within segment
|
|
127
|
+
or_paths = segment.split("|")
|
|
128
|
+
part_matchers = or_paths.map { |matcher_path| Matcher.new(matcher_path) }
|
|
129
|
+
parts << part_matchers
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
parts
|
|
43
133
|
end
|
|
44
|
-
|
|
134
|
+
|
|
45
135
|
# Match a partial path to a node.
|
|
46
136
|
class Matcher
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
when
|
|
54
|
-
|
|
55
|
-
when
|
|
56
|
-
|
|
57
|
-
when
|
|
58
|
-
|
|
59
|
-
when
|
|
60
|
-
|
|
137
|
+
# Create a new Matcher.
|
|
138
|
+
#
|
|
139
|
+
# @param path [String] the path pattern to match
|
|
140
|
+
def initialize(path)
|
|
141
|
+
@path = path
|
|
142
|
+
@extractor = case path
|
|
143
|
+
when "text()"
|
|
144
|
+
lambda { |node, context_set| node.value unless node.value.nil? || node.value.empty? }
|
|
145
|
+
when "%"
|
|
146
|
+
lambda { |node, context_set| node.descendants }
|
|
147
|
+
when "*"
|
|
148
|
+
lambda { |node, context_set| node.children }
|
|
149
|
+
when "."
|
|
150
|
+
lambda { |node, context_set| node }
|
|
151
|
+
when ".."
|
|
152
|
+
lambda { |node, context_set| node.parent || [] }
|
|
153
|
+
when ""
|
|
154
|
+
lambda { |node, context_set|
|
|
155
|
+
root = Node.new(nil)
|
|
156
|
+
root.children << node.root
|
|
157
|
+
root
|
|
158
|
+
}
|
|
159
|
+
when /^%(.+)$/ # descendants with name filter: %name
|
|
160
|
+
name = $1
|
|
161
|
+
lambda { |node, context_set| node.descendants.select { |d| d.name == name } }
|
|
61
162
|
else
|
|
62
|
-
|
|
163
|
+
lambda { |node, context_set|
|
|
164
|
+
# Only return children matching the name
|
|
165
|
+
# Don't include children that are already in the context
|
|
166
|
+
node.children.select { |child| child.name == @path && !context_set&.include?(child) }
|
|
167
|
+
}
|
|
63
168
|
end
|
|
64
169
|
end
|
|
65
|
-
|
|
170
|
+
|
|
66
171
|
# Select all nodes that match a partial path.
|
|
67
|
-
|
|
68
|
-
|
|
172
|
+
#
|
|
173
|
+
# @param context_nodes [Array<Node>] the nodes to select from
|
|
174
|
+
# @param context_set [Set<Node>, nil] optional set version of context_nodes for performance
|
|
175
|
+
# @return [Array<Node>] the matching nodes
|
|
176
|
+
def select(context_nodes, context_set = nil)
|
|
177
|
+
context_set ||= context_nodes.to_set
|
|
178
|
+
context_nodes.collect { |node| @extractor.call(node, context_set) if node.is_a?(Node) }.flatten.compact.uniq
|
|
69
179
|
end
|
|
70
180
|
end
|
|
71
181
|
end
|
data/lib/xml_node_stream.rb
CHANGED
|
@@ -1,10 +1,23 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
require
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pathname"
|
|
4
|
+
require "uri"
|
|
5
|
+
|
|
6
|
+
require_relative "xml_node_stream/node"
|
|
7
|
+
require_relative "xml_node_stream/parser"
|
|
8
|
+
require_relative "xml_node_stream/selector"
|
|
4
9
|
|
|
5
10
|
module XmlNodeStream
|
|
11
|
+
VERSION = File.read(File.expand_path("../VERSION", __dir__)).strip
|
|
12
|
+
|
|
6
13
|
# Helper method to parse XML. See Parser#parse for details.
|
|
7
|
-
|
|
8
|
-
|
|
14
|
+
#
|
|
15
|
+
# @param io [IO, String, URI, Pathname] the input source to parse
|
|
16
|
+
# @yield [Node] each node as it is parsed
|
|
17
|
+
# @return [Node] the root node of the parsed document
|
|
18
|
+
class << self
|
|
19
|
+
def parse(io, &block)
|
|
20
|
+
Parser.parse(io, &block)
|
|
21
|
+
end
|
|
9
22
|
end
|
|
10
23
|
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
Gem::Specification.new do |spec|
|
|
2
|
+
spec.name = "xml_node_stream"
|
|
3
|
+
spec.version = File.read(File.expand_path("../VERSION", __FILE__)).strip
|
|
4
|
+
spec.authors = ["Brian Durand"]
|
|
5
|
+
spec.email = ["bbdurand@gmail.com"]
|
|
6
|
+
|
|
7
|
+
spec.summary = "Memory-efficient XML parser that reduces memory allocation when parsing large XML documents while maintaining a simple, easy-to-use interface."
|
|
8
|
+
|
|
9
|
+
spec.homepage = "https://github.com/bdurand/xml_node_stream"
|
|
10
|
+
spec.license = "MIT"
|
|
11
|
+
|
|
12
|
+
spec.metadata = {
|
|
13
|
+
"homepage_uri" => spec.homepage,
|
|
14
|
+
"source_code_uri" => spec.homepage,
|
|
15
|
+
"changelog_uri" => "#{spec.homepage}/blob/main/CHANGELOG.md"
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
# Specify which files should be added to the gem when it is released.
|
|
19
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
|
20
|
+
ignore_files = %w[
|
|
21
|
+
.
|
|
22
|
+
AGENTS.md
|
|
23
|
+
Appraisals
|
|
24
|
+
Gemfile
|
|
25
|
+
Gemfile.lock
|
|
26
|
+
Rakefile
|
|
27
|
+
bin/
|
|
28
|
+
gemfiles/
|
|
29
|
+
spec/
|
|
30
|
+
benchmark/
|
|
31
|
+
]
|
|
32
|
+
spec.files = Dir.chdir(File.expand_path("..", __FILE__)) do
|
|
33
|
+
`git ls-files -z`.split("\x0").reject { |f| ignore_files.any? { |path| f.start_with?(path) } }
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
spec.require_paths = ["lib"]
|
|
37
|
+
|
|
38
|
+
spec.add_development_dependency "bundler"
|
|
39
|
+
end
|