xml_node_stream 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,8 @@
1
+ # frozen_string_literal: true
2
+
1
3
  begin
2
- require 'libxml'
3
-
4
+ require "libxml"
5
+
4
6
  module XmlNodeStream
5
7
  class Parser
6
8
  # Wrapper for the LibXML SAX parser.
@@ -8,26 +10,51 @@ begin
8
10
  include LibXML::XML::SaxParser::Callbacks
9
11
  include Base
10
12
 
11
- def parse_stream (io)
13
+ # Parse the input stream using LibXML.
14
+ #
15
+ # @param io [IO] the input stream to parse
16
+ # @return [void]
17
+ def parse_stream(io)
12
18
  context = LibXML::XML::Parser::Context.io(io)
13
19
  parser = LibXML::XML::SaxParser.new(context)
14
20
  parser.callbacks = self
15
21
  parser.parse
16
22
  end
17
-
18
- def on_start_element (name, attributes)
23
+
24
+ # Handle LibXML start element callback.
25
+ #
26
+ # @param name [String] the element name
27
+ # @param attributes [Hash] the element attributes
28
+ # @return [void]
29
+ # @api private
30
+ def on_start_element(name, attributes)
19
31
  do_start_element(name, attributes)
20
32
  end
21
33
 
22
- def on_end_element (name)
34
+ # Handle LibXML end element callback.
35
+ #
36
+ # @param name [String] the element name
37
+ # @return [void]
38
+ # @api private
39
+ def on_end_element(name)
23
40
  do_end_element(name)
24
41
  end
25
42
 
26
- def on_characters (characters)
43
+ # Handle LibXML character data callback.
44
+ #
45
+ # @param characters [String] the character data
46
+ # @return [void]
47
+ # @api private
48
+ def on_characters(characters)
27
49
  do_characters(characters)
28
50
  end
29
51
 
30
- def on_cdata_block (characters)
52
+ # Handle LibXML CDATA block callback.
53
+ #
54
+ # @param characters [String] the CDATA content
55
+ # @return [void]
56
+ # @api private
57
+ def on_cdata_block(characters)
31
58
  do_cdata_block(characters)
32
59
  end
33
60
  end
@@ -41,4 +68,4 @@ rescue LoadError
41
68
  end
42
69
  end
43
70
  end
44
- end
71
+ end
@@ -1,24 +1,39 @@
1
+ # frozen_string_literal: true
2
+
1
3
  begin
2
- require 'nokogiri'
3
-
4
+ require "nokogiri"
5
+
4
6
  module XmlNodeStream
5
7
  class Parser
6
8
  # Wrapper for the Nokogiri SAX parser.
7
9
  class NokogiriParser
8
10
  include Base
9
11
 
10
- def parse_stream (io)
12
+ # Parse the input stream using Nokogiri.
13
+ #
14
+ # @param io [IO] the input stream to parse
15
+ # @return [void]
16
+ def parse_stream(io)
11
17
  listener = Listener.new(self)
12
18
  parser = Nokogiri::XML::SAX::Parser.new(listener)
13
19
  parser.parse(io)
14
20
  end
15
-
21
+
16
22
  class Listener < Nokogiri::XML::SAX::Document
17
- def initialize (parser)
23
+ # Initialize the Nokogiri listener.
24
+ #
25
+ # @param parser [NokogiriParser] the parser instance
26
+ def initialize(parser)
18
27
  @parser = parser
19
28
  end
20
-
21
- def start_element (name, attributes = [])
29
+
30
+ # Handle Nokogiri start element callback.
31
+ #
32
+ # @param name [String] the element name
33
+ # @param attributes [Array] the element attributes
34
+ # @return [void]
35
+ # @api private
36
+ def start_element(name, attributes = [])
22
37
  attributes_hash = {}
23
38
  if attributes.first.is_a?(Array)
24
39
  # Newer style where attributes are passed as an array of arrays
@@ -27,20 +42,35 @@ begin
27
42
  end
28
43
  else
29
44
  # Old style where attributes are passed as a flat array
30
- (attributes.size / 2).times{|i| attributes_hash[attributes[i * 2]] = attributes[(i * 2) + 1]}
45
+ (attributes.size / 2).times { |i| attributes_hash[attributes[i * 2]] = attributes[(i * 2) + 1] }
31
46
  end
32
47
  @parser.do_start_element(name, attributes_hash)
33
48
  end
34
49
 
35
- def end_element (name)
50
+ # Handle Nokogiri end element callback.
51
+ #
52
+ # @param name [String] the element name
53
+ # @return [void]
54
+ # @api private
55
+ def end_element(name)
36
56
  @parser.do_end_element(name)
37
57
  end
38
58
 
39
- def characters (characters)
59
+ # Handle Nokogiri character data callback.
60
+ #
61
+ # @param characters [String] the character data
62
+ # @return [void]
63
+ # @api private
64
+ def characters(characters)
40
65
  @parser.do_characters(characters)
41
66
  end
42
67
 
43
- def cdata_block (characters)
68
+ # Handle Nokogiri CDATA block callback.
69
+ #
70
+ # @param characters [String] the CDATA content
71
+ # @return [void]
72
+ # @api private
73
+ def cdata_block(characters)
44
74
  @parser.do_cdata_block(characters)
45
75
  end
46
76
  end
@@ -55,4 +85,4 @@ rescue LoadError
55
85
  end
56
86
  end
57
87
  end
58
- end
88
+ end
@@ -1,7 +1,9 @@
1
+ # frozen_string_literal: true
2
+
1
3
  begin
2
- require 'rexml/document'
3
- require 'rexml/streamlistener'
4
-
4
+ require "rexml/document"
5
+ require "rexml/streamlistener"
6
+
5
7
  module XmlNodeStream
6
8
  class Parser
7
9
  # Wrapper for the REXML SAX parser.
@@ -9,24 +11,49 @@ begin
9
11
  include REXML::StreamListener
10
12
  include Base
11
13
 
12
- def parse_stream (io)
14
+ # Parse the input stream using REXML.
15
+ #
16
+ # @param io [IO] the input stream to parse
17
+ # @return [void]
18
+ def parse_stream(io)
13
19
  parser = REXML::Parsers::StreamParser.new(io, self)
14
20
  parser.parse
15
21
  end
16
22
 
17
- def tag_start (name, attributes)
23
+ # Handle REXML tag start callback.
24
+ #
25
+ # @param name [String] the element name
26
+ # @param attributes [Hash] the element attributes
27
+ # @return [void]
28
+ # @api private
29
+ def tag_start(name, attributes)
18
30
  do_start_element(name, attributes)
19
31
  end
20
32
 
21
- def tag_end (name)
33
+ # Handle REXML tag end callback.
34
+ #
35
+ # @param name [String] the element name
36
+ # @return [void]
37
+ # @api private
38
+ def tag_end(name)
22
39
  do_end_element(name)
23
40
  end
24
41
 
25
- def text (content)
42
+ # Handle REXML text callback.
43
+ #
44
+ # @param content [String] the text content
45
+ # @return [void]
46
+ # @api private
47
+ def text(content)
26
48
  do_characters(content)
27
49
  end
28
50
 
29
- def cdata (content)
51
+ # Handle REXML CDATA callback.
52
+ #
53
+ # @param content [String] the CDATA content
54
+ # @return [void]
55
+ # @api private
56
+ def cdata(content)
30
57
  do_cdata_block(content)
31
58
  end
32
59
  end
@@ -1,70 +1,95 @@
1
- require 'open-uri'
2
- require 'rubygems'
3
- require 'pathname'
4
- require File.expand_path(File.join(File.dirname(__FILE__), 'parser', 'base'))
1
+ # frozen_string_literal: true
2
+
3
+ require "net/http"
4
+ require_relative "parser/base"
5
+ require_relative "http_stream"
5
6
 
6
7
  module XmlNodeStream
7
8
  # The abstract parser class that wraps the actual parser implementation.
8
9
  class Parser
9
-
10
10
  SUPPORTED_PARSERS = [:nokogiri, :libxml, :rexml]
11
-
11
+
12
+ @parser = :rexml
13
+
12
14
  class << self
13
15
  # Set the parser implementation. The parser argument should be one of :nokogiri, :libxml, or :rexml. If this method
14
16
  # is not called, it will default to :rexml which is the slowest choice possible. If you set the parser to one of the
15
17
  # other values, though, you'll need to make sure you have the nokogiri gem or libxml-ruby gem installed.
16
- def parser_name= (parser)
17
- parser_sym = parser.to_sym
18
+ #
19
+ # @param parser [Symbol, String] the parser name (:nokogiri, :libxml, or :rexml)
20
+ # @return [Symbol] the parser name
21
+ # @raise [ArgumentError] if parser is not one of the supported parsers
22
+ def parser_name=(parser)
23
+ parser_sym = parser&.to_sym
18
24
  raise ArgumentError.new("must be one of #{SUPPORTED_PARSERS.inspect}") unless SUPPORTED_PARSERS.include?(parser_sym)
25
+
19
26
  @parser_name = parser_sym
20
27
  end
21
-
28
+
22
29
  # Get the name of the current parser.
30
+ #
31
+ # @return [Symbol] the current parser name
23
32
  def parser_name
24
33
  @parser_name ||= :rexml
25
34
  end
26
-
35
+
27
36
  # Parse the document specified in io. This can be either a Stream, URI, Pathname, or String. If it is a String,
28
37
  # it can be either an XML document, a file system path, or a URI. The parser will figure it out. If a block is given,
29
38
  # it will be yielded to with each node as it is parsed.
30
- def parse (io, &block)
31
- close_stream = false
32
- if io.is_a?(String)
33
- if io.include?('<') and io.include?('>')
34
- io = StringIO.new(io)
35
- else
36
- io = open(io)
39
+ #
40
+ # @param io [IO, String, URI, Pathname] the input source to parse
41
+ # @yield [Node] each node as it is parsed
42
+ # @return [Node] the root node of the parsed document
43
+ def parse(io, &block)
44
+ close_stream = true
45
+ io = URI.parse(io) if io.is_a?(String) && io.match?(%r{\Ahttp(s)?://})
46
+
47
+ if io.is_a?(String) && io.match?(/<[^>]+>/m)
48
+ io = StringIO.new(io)
49
+ elsif io.is_a?(String)
50
+ unless File.exist?(io)
51
+ raise ArgumentError.new("File not found: #{io}")
37
52
  end
38
- close_stream = true
53
+ io = File.open(io, "r:UTF-8")
39
54
  elsif io.is_a?(Pathname)
40
- io = io.open
41
- close_stream = true
55
+ unless io.exist?
56
+ raise ArgumentError.new("File not found: #{io}")
57
+ end
58
+ io = io.open("r:UTF-8")
42
59
  elsif io.is_a?(URI)
43
- io = io.open
44
- close_stream = true
60
+ io = HttpStream.new(io)
61
+ else
62
+ close_stream = false
45
63
  end
46
64
 
47
65
  begin
48
66
  parser = parser_class(parser_name).new(&block)
49
67
  parser.parse_stream(io)
50
- return parser.root
68
+ parser.root
51
69
  ensure
52
- io.close if close_stream
70
+ if close_stream
71
+ begin
72
+ io.close
73
+ rescue
74
+ # Ignore errors during close to ensure cleanup completes
75
+ nil
76
+ end
77
+ end
53
78
  end
54
79
  end
55
-
80
+
56
81
  protected
57
-
58
- def parser_class (class_symbol)
82
+
83
+ def parser_class(class_symbol)
59
84
  @loaded_parsers ||= {}
60
85
  klass = @loaded_parsers[class_symbol]
61
86
  unless klass
62
- require File.expand_path(File.join(File.dirname(__FILE__), 'parser', "#{class_symbol}_parser"))
87
+ require File.expand_path(File.join(File.dirname(__FILE__), "parser", "#{class_symbol}_parser"))
63
88
  class_name = "#{class_symbol.to_s.capitalize}Parser"
64
89
  klass = const_get(class_name)
65
90
  @loaded_parsers[class_symbol] = klass
66
91
  end
67
- return klass
92
+ klass
68
93
  end
69
94
  end
70
95
  end
@@ -1,3 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+
1
5
  module XmlNodeStream
2
6
  # Partial implementation of XPath selectors. Only abbreviated paths and the text() function are supported. The rest of XPath
3
7
  # is unnecessary in the context of a Ruby application since XPath is also a programming language. If you really need an XPath
@@ -13,59 +17,165 @@ module XmlNodeStream
13
17
  # * /library/books/book - find all book elements with the full path /library/books/book
14
18
  # * author/text() - get the text values of all author child elements
15
19
  class Selector
20
+ XPATH_SEGMENT_REGEX = /\A(\.\.?|\*|[a-zA-Z_][\w-]*|text\(\))(\|((\.\.?|\*|[a-zA-Z_][\w-]*|text\(\))))*\z/
21
+
16
22
  # Create a selector. Path should be an abbreviated XPath string.
17
- def initialize (path)
18
- @parts = []
19
- path.gsub('//', '/%/').split('/').each do |part_path|
20
- part_matchers = []
21
- @parts << part_matchers
22
- or_paths = part_path.split('|')
23
- or_paths << "" if or_paths.empty?
24
- or_paths.each do |matcher_path|
25
- part_matchers << Matcher.new(matcher_path)
26
- end
27
- end
23
+ #
24
+ # @param path [String] the XPath selector string
25
+ # @raise [ArgumentError] if the path is invalid
26
+ def initialize(path)
27
+ raise ArgumentError, "XPath pattern cannot be empty" if path.nil? || path.empty?
28
+
29
+ @parts = tokenize_path(path)
28
30
  end
29
-
31
+
30
32
  # Apply the selector to the current node. Note, if your path started with a /, it will be applied
31
33
  # to the root node.
32
- def find (node)
34
+ #
35
+ # @param node [Node] the node to apply the selector to
36
+ # @return [Array<Node>] the matching nodes
37
+ def find(node)
33
38
  matched = [node]
34
39
  @parts.each do |part_matchers|
35
40
  context = matched
41
+ context_set = context.to_set
36
42
  matched = []
43
+
37
44
  part_matchers.each do |matcher|
38
- matched.concat(matcher.select(context))
45
+ matched.concat(matcher.select(context, context_set))
39
46
  end
47
+
40
48
  break if matched.empty?
41
49
  end
42
- return matched
50
+ matched
51
+ end
52
+
53
+ private
54
+
55
+ # Tokenize the XPath into parts using a simple lexer approach
56
+ #
57
+ # @param path [String] the XPath string to tokenize
58
+ # @return [Array<Array<Matcher>>] array of matcher arrays
59
+ # @raise [ArgumentError] if the path is malformed
60
+ def tokenize_path(path)
61
+ # Check for invalid patterns upfront
62
+ raise ArgumentError, "Invalid XPath pattern: #{path} (triple slash not allowed)" if path.include?("///")
63
+
64
+ parts = []
65
+ i = 0
66
+ path_length = path.length
67
+
68
+ while i < path_length
69
+ # Skip leading slash for absolute paths
70
+ if i == 0 && path[i] == "/"
71
+ parts << [Matcher.new("")]
72
+ i += 1
73
+ next
74
+ end
75
+
76
+ # Look for // (descendant operator)
77
+ if i < path_length - 1 && path[i] == "/" && path[i + 1] == "/"
78
+ i += 2
79
+ # Check if there's a name after //
80
+ name_match = path[i..].match(/\A([a-zA-Z_][\w-]*)/)
81
+ if name_match
82
+ parts << [Matcher.new("%#{name_match[1]}")]
83
+ i += name_match[1].length
84
+ elsif i >= path_length
85
+ # // at end of path is invalid
86
+ raise ArgumentError, "Invalid XPath pattern: #{path} (// cannot be at end)"
87
+ else
88
+ parts << [Matcher.new("%")]
89
+ end
90
+ next
91
+ end
92
+
93
+ # Regular path segment
94
+ if path[i] == "/"
95
+ i += 1
96
+ next
97
+ end
98
+
99
+ # Extract the segment (until next / or end)
100
+ segment_end = i
101
+ in_parens = false
102
+ while segment_end < path_length
103
+ char = path[segment_end]
104
+ if char == "("
105
+ in_parens = true
106
+ elsif char == ")"
107
+ in_parens = false
108
+ elsif char == "/" && !in_parens
109
+ break
110
+ elsif char == "[" || char == "@"
111
+ raise ArgumentError, "Invalid XPath pattern: #{path} (predicates and attributes not supported)"
112
+ end
113
+ segment_end += 1
114
+ end
115
+
116
+ segment = path[i...segment_end]
117
+ raise ArgumentError, "Invalid XPath pattern: #{path} (empty segment)" if segment.empty? && i > 0
118
+
119
+ i = segment_end
120
+
121
+ # Validate segment format
122
+ unless segment.match?(XPATH_SEGMENT_REGEX)
123
+ raise ArgumentError, "Invalid XPath pattern: #{path} (invalid segment: #{segment})"
124
+ end
125
+
126
+ # Handle | (OR operator) within segment
127
+ or_paths = segment.split("|")
128
+ part_matchers = or_paths.map { |matcher_path| Matcher.new(matcher_path) }
129
+ parts << part_matchers
130
+ end
131
+
132
+ parts
43
133
  end
44
-
134
+
45
135
  # Match a partial path to a node.
46
136
  class Matcher
47
- def initialize (path)
48
- case path
49
- when 'text()'
50
- @extractor = lambda{|node| node.value}
51
- when '%'
52
- @extractor = lambda{|node| node.descendants}
53
- when '*'
54
- @extractor = lambda{|node| node.children}
55
- when '.'
56
- @extractor = lambda{|node| node}
57
- when '..'
58
- @extractor = lambda{|node| node.parent ? node.parent : []}
59
- when ''
60
- @extractor = lambda{|node| root = Node.new(nil); root.children << node.root; root}
137
+ # Create a new Matcher.
138
+ #
139
+ # @param path [String] the path pattern to match
140
+ def initialize(path)
141
+ @path = path
142
+ @extractor = case path
143
+ when "text()"
144
+ lambda { |node, context_set| node.value unless node.value.nil? || node.value.empty? }
145
+ when "%"
146
+ lambda { |node, context_set| node.descendants }
147
+ when "*"
148
+ lambda { |node, context_set| node.children }
149
+ when "."
150
+ lambda { |node, context_set| node }
151
+ when ".."
152
+ lambda { |node, context_set| node.parent || [] }
153
+ when ""
154
+ lambda { |node, context_set|
155
+ root = Node.new(nil)
156
+ root.children << node.root
157
+ root
158
+ }
159
+ when /^%(.+)$/ # descendants with name filter: %name
160
+ name = $1
161
+ lambda { |node, context_set| node.descendants.select { |d| d.name == name } }
61
162
  else
62
- @extractor = lambda{|node| node.children.select{|child| child.name == path}}
163
+ lambda { |node, context_set|
164
+ # Only return children matching the name
165
+ # Don't include children that are already in the context
166
+ node.children.select { |child| child.name == @path && !context_set&.include?(child) }
167
+ }
63
168
  end
64
169
  end
65
-
170
+
66
171
  # Select all nodes that match a partial path.
67
- def select (context_nodes)
68
- context_nodes.collect{|node| @extractor.call(node) if node.is_a?(Node)}.flatten
172
+ #
173
+ # @param context_nodes [Array<Node>] the nodes to select from
174
+ # @param context_set [Set<Node>, nil] optional set version of context_nodes for performance
175
+ # @return [Array<Node>] the matching nodes
176
+ def select(context_nodes, context_set = nil)
177
+ context_set ||= context_nodes.to_set
178
+ context_nodes.collect { |node| @extractor.call(node, context_set) if node.is_a?(Node) }.flatten.compact.uniq
69
179
  end
70
180
  end
71
181
  end
@@ -1,10 +1,23 @@
1
- require File.expand_path(File.join(File.dirname(__FILE__), 'xml_node_stream', 'node'))
2
- require File.expand_path(File.join(File.dirname(__FILE__), 'xml_node_stream', 'parser'))
3
- require File.expand_path(File.join(File.dirname(__FILE__), 'xml_node_stream', 'selector'))
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+ require "uri"
5
+
6
+ require_relative "xml_node_stream/node"
7
+ require_relative "xml_node_stream/parser"
8
+ require_relative "xml_node_stream/selector"
4
9
 
5
10
  module XmlNodeStream
11
+ VERSION = File.read(File.expand_path("../VERSION", __dir__)).strip
12
+
6
13
  # Helper method to parse XML. See Parser#parse for details.
7
- def self.parse (io, &block)
8
- Parser.parse(io, &block)
14
+ #
15
+ # @param io [IO, String, URI, Pathname] the input source to parse
16
+ # @yield [Node] each node as it is parsed
17
+ # @return [Node] the root node of the parsed document
18
+ class << self
19
+ def parse(io, &block)
20
+ Parser.parse(io, &block)
21
+ end
9
22
  end
10
23
  end
@@ -0,0 +1,39 @@
1
+ Gem::Specification.new do |spec|
2
+ spec.name = "xml_node_stream"
3
+ spec.version = File.read(File.expand_path("../VERSION", __FILE__)).strip
4
+ spec.authors = ["Brian Durand"]
5
+ spec.email = ["bbdurand@gmail.com"]
6
+
7
+ spec.summary = "Memory-efficient XML parser that reduces memory allocation when parsing large XML documents while maintaining a simple, easy-to-use interface."
8
+
9
+ spec.homepage = "https://github.com/bdurand/xml_node_stream"
10
+ spec.license = "MIT"
11
+
12
+ spec.metadata = {
13
+ "homepage_uri" => spec.homepage,
14
+ "source_code_uri" => spec.homepage,
15
+ "changelog_uri" => "#{spec.homepage}/blob/main/CHANGELOG.md"
16
+ }
17
+
18
+ # Specify which files should be added to the gem when it is released.
19
+ # `git ls-files -z` lists the files in the gem's repository that have been added to git.
20
+ ignore_files = %w[
21
+ .
22
+ AGENTS.md
23
+ Appraisals
24
+ Gemfile
25
+ Gemfile.lock
26
+ Rakefile
27
+ bin/
28
+ gemfiles/
29
+ spec/
30
+ benchmark/
31
+ ]
32
+ spec.files = Dir.chdir(File.expand_path("..", __FILE__)) do
33
+ `git ls-files -z`.split("\x0").reject { |f| ignore_files.any? { |path| f.start_with?(path) } }
34
+ end
35
+
36
+ spec.require_paths = ["lib"]
37
+
38
+ spec.add_development_dependency "bundler"
39
+ end