RubyGems - xml_node_stream - Versions diffs - 1.0.2 → 2.0.0 - Mend

xml_node_stream 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +30 -0
data/README.md +139 -0
data/VERSION +1 -0
data/lib/xml_node_stream/http_stream.rb +179 -0
data/lib/xml_node_stream/node.rb +98 -47
data/lib/xml_node_stream/parser/base.rb +49 -12
data/lib/xml_node_stream/parser/libxml_parser.rb +36 -9
data/lib/xml_node_stream/parser/nokogiri_parser.rb +42 -12
data/lib/xml_node_stream/parser/rexml_parser.rb +35 -8
data/lib/xml_node_stream/parser.rb +54 -29
data/lib/xml_node_stream/selector.rb +144 -34
data/lib/xml_node_stream.rb +18 -5
data/xml_node_stream.gemspec +39 -0
metadata +46 -88
data/README.rdoc +0 -61
data/Rakefile +0 -44
data/spec/node_spec.rb +0 -140
data/spec/parser_spec.rb +0 -148
data/spec/selector_spec.rb +0 -73
data/spec/spec_helper.rb +0 -2
data/spec/test.xml +0 -57
data/spec/xml_node_stream_spec.rb +0 -11
/data/{MIT_LICENSE → MIT-LICENSE} +0 -0

data/lib/xml_node_stream/parser/libxml_parser.rb CHANGED Viewed

@@ -1,6 +1,8 @@
+# frozen_string_literal: true
 begin
-  require 'libxml'
+  require "libxml"
   module XmlNodeStream
     class Parser
       # Wrapper for the LibXML SAX parser.
@@ -8,26 +10,51 @@ begin
         include LibXML::XML::SaxParser::Callbacks
         include Base
-        def parse_stream (io)
+        # Parse the input stream using LibXML.
+        #
+        # @param io [IO] the input stream to parse
+        # @return [void]
+        def parse_stream(io)
           context = LibXML::XML::Parser::Context.io(io)
           parser = LibXML::XML::SaxParser.new(context)
           parser.callbacks = self
           parser.parse
         end
-        def on_start_element (name, attributes)
+        # Handle LibXML start element callback.
+        #
+        # @param name [String] the element name
+        # @param attributes [Hash] the element attributes
+        # @return [void]
+        # @api private
+        def on_start_element(name, attributes)
           do_start_element(name, attributes)
         end
-        def on_end_element (name)
+        # Handle LibXML end element callback.
+        #
+        # @param name [String] the element name
+        # @return [void]
+        # @api private
+        def on_end_element(name)
           do_end_element(name)
         end
-        def on_characters (characters)
+        # Handle LibXML character data callback.
+        #
+        # @param characters [String] the character data
+        # @return [void]
+        # @api private
+        def on_characters(characters)
           do_characters(characters)
         end
-        def on_cdata_block (characters)
+        # Handle LibXML CDATA block callback.
+        #
+        # @param characters [String] the CDATA content
+        # @return [void]
+        # @api private
+        def on_cdata_block(characters)
           do_cdata_block(characters)
         end
       end
@@ -41,4 +68,4 @@ rescue LoadError
       end
     end
   end
-end
+end

data/lib/xml_node_stream/parser/nokogiri_parser.rb CHANGED Viewed

@@ -1,24 +1,39 @@
+# frozen_string_literal: true
 begin
-  require 'nokogiri'
+  require "nokogiri"
   module XmlNodeStream
     class Parser
       # Wrapper for the Nokogiri SAX parser.
       class NokogiriParser
         include Base
-        def parse_stream (io)
+        # Parse the input stream using Nokogiri.
+        #
+        # @param io [IO] the input stream to parse
+        # @return [void]
+        def parse_stream(io)
           listener = Listener.new(self)
           parser = Nokogiri::XML::SAX::Parser.new(listener)
           parser.parse(io)
         end
         class Listener < Nokogiri::XML::SAX::Document
-          def initialize (parser)
+          # Initialize the Nokogiri listener.
+          #
+          # @param parser [NokogiriParser] the parser instance
+          def initialize(parser)
             @parser = parser
           end
-          def start_element (name, attributes = [])
+          # Handle Nokogiri start element callback.
+          #
+          # @param name [String] the element name
+          # @param attributes [Array] the element attributes
+          # @return [void]
+          # @api private
+          def start_element(name, attributes = [])
             attributes_hash = {}
             if attributes.first.is_a?(Array)
               # Newer style where attributes are passed as an array of arrays
@@ -27,20 +42,35 @@ begin
               end
             else
               # Old style where attributes are passed as a flat array
-              (attributes.size / 2).times{|i| attributes_hash[attributes[i * 2]] = attributes[(i * 2) + 1]}
+              (attributes.size / 2).times { |i| attributes_hash[attributes[i * 2]] = attributes[(i * 2) + 1] }
             end
             @parser.do_start_element(name, attributes_hash)
           end
-          def end_element (name)
+          # Handle Nokogiri end element callback.
+          #
+          # @param name [String] the element name
+          # @return [void]
+          # @api private
+          def end_element(name)
             @parser.do_end_element(name)
           end
-          def characters (characters)
+          # Handle Nokogiri character data callback.
+          #
+          # @param characters [String] the character data
+          # @return [void]
+          # @api private
+          def characters(characters)
             @parser.do_characters(characters)
           end
-          def cdata_block (characters)
+          # Handle Nokogiri CDATA block callback.
+          #
+          # @param characters [String] the CDATA content
+          # @return [void]
+          # @api private
+          def cdata_block(characters)
             @parser.do_cdata_block(characters)
           end
         end
@@ -55,4 +85,4 @@ rescue LoadError
       end
     end
   end
-end
+end

data/lib/xml_node_stream/parser/rexml_parser.rb CHANGED Viewed

@@ -1,7 +1,9 @@
+# frozen_string_literal: true
 begin
-  require 'rexml/document'
-  require 'rexml/streamlistener'
+  require "rexml/document"
+  require "rexml/streamlistener"
   module XmlNodeStream
     class Parser
       # Wrapper for the REXML SAX parser.
@@ -9,24 +11,49 @@ begin
         include REXML::StreamListener
         include Base
-        def parse_stream (io)
+        # Parse the input stream using REXML.
+        #
+        # @param io [IO] the input stream to parse
+        # @return [void]
+        def parse_stream(io)
           parser = REXML::Parsers::StreamParser.new(io, self)
           parser.parse
         end
-        def tag_start (name, attributes)
+        # Handle REXML tag start callback.
+        #
+        # @param name [String] the element name
+        # @param attributes [Hash] the element attributes
+        # @return [void]
+        # @api private
+        def tag_start(name, attributes)
           do_start_element(name, attributes)
         end
-        def tag_end (name)
+        # Handle REXML tag end callback.
+        #
+        # @param name [String] the element name
+        # @return [void]
+        # @api private
+        def tag_end(name)
           do_end_element(name)
         end
-        def text (content)
+        # Handle REXML text callback.
+        #
+        # @param content [String] the text content
+        # @return [void]
+        # @api private
+        def text(content)
           do_characters(content)
         end
-        def cdata (content)
+        # Handle REXML CDATA callback.
+        #
+        # @param content [String] the CDATA content
+        # @return [void]
+        # @api private
+        def cdata(content)
           do_cdata_block(content)
         end
       end

data/lib/xml_node_stream/parser.rb CHANGED Viewed

@@ -1,70 +1,95 @@
-require 'open-uri'
-require 'rubygems'
-require 'pathname'
-require File.expand_path(File.join(File.dirname(__FILE__), 'parser', 'base'))
+# frozen_string_literal: true
+require "net/http"
+require_relative "parser/base"
+require_relative "http_stream"
 module XmlNodeStream
   # The abstract parser class that wraps the actual parser implementation.
   class Parser
     SUPPORTED_PARSERS = [:nokogiri, :libxml, :rexml]
+    @parser = :rexml
     class << self
       # Set the parser implementation. The parser argument should be one of :nokogiri, :libxml, or :rexml. If this method
       # is not called, it will default to :rexml which is the slowest choice possible. If you set the parser to one of the
       # other values, though, you'll need to make sure you have the nokogiri gem or libxml-ruby gem installed.
-      def parser_name= (parser)
-        parser_sym = parser.to_sym
+      #
+      # @param parser [Symbol, String] the parser name (:nokogiri, :libxml, or :rexml)
+      # @return [Symbol] the parser name
+      # @raise [ArgumentError] if parser is not one of the supported parsers
+      def parser_name=(parser)
+        parser_sym = parser&.to_sym
         raise ArgumentError.new("must be one of #{SUPPORTED_PARSERS.inspect}") unless SUPPORTED_PARSERS.include?(parser_sym)
         @parser_name = parser_sym
       end
       # Get the name of the current parser.
+      #
+      # @return [Symbol] the current parser name
       def parser_name
         @parser_name ||= :rexml
       end
       # Parse the document specified in io. This can be either a Stream, URI, Pathname, or String. If it is a String,
       # it can be either an XML document, a file system path, or a URI. The parser will figure it out. If a block is given,
       # it will be yielded to with each node as it is parsed.
-      def parse (io, &block)
-        close_stream = false
-        if io.is_a?(String)
-          if io.include?('<') and io.include?('>')
-            io = StringIO.new(io)
-          else
-            io = open(io)
+      #
+      # @param io [IO, String, URI, Pathname] the input source to parse
+      # @yield [Node] each node as it is parsed
+      # @return [Node] the root node of the parsed document
+      def parse(io, &block)
+        close_stream = true
+        io = URI.parse(io) if io.is_a?(String) && io.match?(%r{\Ahttp(s)?://})
+        if io.is_a?(String) && io.match?(/<[^>]+>/m)
+          io = StringIO.new(io)
+        elsif io.is_a?(String)
+          unless File.exist?(io)
+            raise ArgumentError.new("File not found: #{io}")
           end
-          close_stream = true
+          io = File.open(io, "r:UTF-8")
         elsif io.is_a?(Pathname)
-          io = io.open
-          close_stream = true
+          unless io.exist?
+            raise ArgumentError.new("File not found: #{io}")
+          end
+          io = io.open("r:UTF-8")
         elsif io.is_a?(URI)
-          io = io.open
-          close_stream = true
+          io = HttpStream.new(io)
+        else
+          close_stream = false
         end
         begin
           parser = parser_class(parser_name).new(&block)
           parser.parse_stream(io)
-          return parser.root
+          parser.root
         ensure
-          io.close if close_stream
+          if close_stream
+            begin
+              io.close
+            rescue
+              # Ignore errors during close to ensure cleanup completes
+              nil
+            end
+          end
         end
       end
       protected
-      def parser_class (class_symbol)
+      def parser_class(class_symbol)
         @loaded_parsers ||= {}
         klass = @loaded_parsers[class_symbol]
         unless klass
-          require File.expand_path(File.join(File.dirname(__FILE__), 'parser', "#{class_symbol}_parser"))
+          require File.expand_path(File.join(File.dirname(__FILE__), "parser", "#{class_symbol}_parser"))
           class_name = "#{class_symbol.to_s.capitalize}Parser"
           klass = const_get(class_name)
           @loaded_parsers[class_symbol] = klass
         end
-        return klass
+        klass
       end
     end
   end

data/lib/xml_node_stream/selector.rb CHANGED Viewed

@@ -1,3 +1,7 @@
+# frozen_string_literal: true
+require "set"
 module XmlNodeStream
   # Partial implementation of XPath selectors. Only abbreviated paths and the text() function are supported. The rest of XPath
   # is unnecessary in the context of a Ruby application since XPath is also a programming language. If you really need an XPath
@@ -13,59 +17,165 @@ module XmlNodeStream
   # * /library/books/book - find all book elements with the full path /library/books/book
   # * author/text() - get the text values of all author child elements
   class Selector
+    XPATH_SEGMENT_REGEX = /\A(\.\.?|\*|[a-zA-Z_][\w-]*|text\(\))(\|((\.\.?|\*|[a-zA-Z_][\w-]*|text\(\))))*\z/
     # Create a selector. Path should be an abbreviated XPath string.
-    def initialize (path)
-      @parts = []
-      path.gsub('//', '/%/').split('/').each do |part_path|
-        part_matchers = []
-        @parts << part_matchers
-        or_paths = part_path.split('|')
-        or_paths << "" if or_paths.empty?
-        or_paths.each do |matcher_path|
-          part_matchers << Matcher.new(matcher_path)
-        end
-      end
+    #
+    # @param path [String] the XPath selector string
+    # @raise [ArgumentError] if the path is invalid
+    def initialize(path)
+      raise ArgumentError, "XPath pattern cannot be empty" if path.nil? || path.empty?
+      @parts = tokenize_path(path)
     end
     # Apply the selector to the current node. Note, if your path started with a /, it will be applied
     # to the root node.
-    def find (node)
+    #
+    # @param node [Node] the node to apply the selector to
+    # @return [Array<Node>] the matching nodes
+    def find(node)
       matched = [node]
       @parts.each do |part_matchers|
         context = matched
+        context_set = context.to_set
         matched = []
         part_matchers.each do |matcher|
-          matched.concat(matcher.select(context))
+          matched.concat(matcher.select(context, context_set))
         end
         break if matched.empty?
       end
-      return matched
+      matched
+    end
+    private
+    # Tokenize the XPath into parts using a simple lexer approach
+    #
+    # @param path [String] the XPath string to tokenize
+    # @return [Array<Array<Matcher>>] array of matcher arrays
+    # @raise [ArgumentError] if the path is malformed
+    def tokenize_path(path)
+      # Check for invalid patterns upfront
+      raise ArgumentError, "Invalid XPath pattern: #{path} (triple slash not allowed)" if path.include?("///")
+      parts = []
+      i = 0
+      path_length = path.length
+      while i < path_length
+        # Skip leading slash for absolute paths
+        if i == 0 && path[i] == "/"
+          parts << [Matcher.new("")]
+          i += 1
+          next
+        end
+        # Look for // (descendant operator)
+        if i < path_length - 1 && path[i] == "/" && path[i + 1] == "/"
+          i += 2
+          # Check if there's a name after //
+          name_match = path[i..].match(/\A([a-zA-Z_][\w-]*)/)
+          if name_match
+            parts << [Matcher.new("%#{name_match[1]}")]
+            i += name_match[1].length
+          elsif i >= path_length
+            # // at end of path is invalid
+            raise ArgumentError, "Invalid XPath pattern: #{path} (// cannot be at end)"
+          else
+            parts << [Matcher.new("%")]
+          end
+          next
+        end
+        # Regular path segment
+        if path[i] == "/"
+          i += 1
+          next
+        end
+        # Extract the segment (until next / or end)
+        segment_end = i
+        in_parens = false
+        while segment_end < path_length
+          char = path[segment_end]
+          if char == "("
+            in_parens = true
+          elsif char == ")"
+            in_parens = false
+          elsif char == "/" && !in_parens
+            break
+          elsif char == "[" || char == "@"
+            raise ArgumentError, "Invalid XPath pattern: #{path} (predicates and attributes not supported)"
+          end
+          segment_end += 1
+        end
+        segment = path[i...segment_end]
+        raise ArgumentError, "Invalid XPath pattern: #{path} (empty segment)" if segment.empty? && i > 0
+        i = segment_end
+        # Validate segment format
+        unless segment.match?(XPATH_SEGMENT_REGEX)
+          raise ArgumentError, "Invalid XPath pattern: #{path} (invalid segment: #{segment})"
+        end
+        # Handle | (OR operator) within segment
+        or_paths = segment.split("|")
+        part_matchers = or_paths.map { |matcher_path| Matcher.new(matcher_path) }
+        parts << part_matchers
+      end
+      parts
     end
     # Match a partial path to a node.
     class Matcher
-      def initialize (path)
-        case path
-        when 'text()'
-          @extractor = lambda{|node| node.value}
-        when '%'
-          @extractor = lambda{|node| node.descendants}
-        when '*'
-          @extractor = lambda{|node| node.children}
-        when '.'
-          @extractor = lambda{|node| node}
-        when '..'
-          @extractor = lambda{|node| node.parent ? node.parent : []}
-        when ''
-          @extractor = lambda{|node| root = Node.new(nil); root.children << node.root; root}
+      # Create a new Matcher.
+      #
+      # @param path [String] the path pattern to match
+      def initialize(path)
+        @path = path
+        @extractor = case path
+        when "text()"
+          lambda { |node, context_set| node.value unless node.value.nil? || node.value.empty? }
+        when "%"
+          lambda { |node, context_set| node.descendants }
+        when "*"
+          lambda { |node, context_set| node.children }
+        when "."
+          lambda { |node, context_set| node }
+        when ".."
+          lambda { |node, context_set| node.parent || [] }
+        when ""
+          lambda { |node, context_set|
+            root = Node.new(nil)
+            root.children << node.root
+            root
+          }
+        when /^%(.+)$/  # descendants with name filter: %name
+          name = $1
+          lambda { |node, context_set| node.descendants.select { |d| d.name == name } }
         else
-          @extractor = lambda{|node| node.children.select{|child| child.name == path}}
+          lambda { |node, context_set|
+            # Only return children matching the name
+            # Don't include children that are already in the context
+            node.children.select { |child| child.name == @path && !context_set&.include?(child) }
+          }
         end
       end
       # Select all nodes that match a partial path.
-      def select (context_nodes)
-        context_nodes.collect{|node| @extractor.call(node) if node.is_a?(Node)}.flatten
+      #
+      # @param context_nodes [Array<Node>] the nodes to select from
+      # @param context_set [Set<Node>, nil] optional set version of context_nodes for performance
+      # @return [Array<Node>] the matching nodes
+      def select(context_nodes, context_set = nil)
+        context_set ||= context_nodes.to_set
+        context_nodes.collect { |node| @extractor.call(node, context_set) if node.is_a?(Node) }.flatten.compact.uniq
       end
     end
   end

data/lib/xml_node_stream.rb CHANGED Viewed

@@ -1,10 +1,23 @@
-require File.expand_path(File.join(File.dirname(__FILE__), 'xml_node_stream', 'node'))
-require File.expand_path(File.join(File.dirname(__FILE__), 'xml_node_stream', 'parser'))
-require File.expand_path(File.join(File.dirname(__FILE__), 'xml_node_stream', 'selector'))
+# frozen_string_literal: true
+require "pathname"
+require "uri"
+require_relative "xml_node_stream/node"
+require_relative "xml_node_stream/parser"
+require_relative "xml_node_stream/selector"
 module XmlNodeStream
+  VERSION = File.read(File.expand_path("../VERSION", __dir__)).strip
   # Helper method to parse XML. See Parser#parse for details.
-  def self.parse (io, &block)
-    Parser.parse(io, &block)
+  #
+  # @param io [IO, String, URI, Pathname] the input source to parse
+  # @yield [Node] each node as it is parsed
+  # @return [Node] the root node of the parsed document
+  class << self
+    def parse(io, &block)
+      Parser.parse(io, &block)
+    end
   end
 end

data/xml_node_stream.gemspec ADDED Viewed

@@ -0,0 +1,39 @@
+Gem::Specification.new do |spec|
+  spec.name = "xml_node_stream"
+  spec.version = File.read(File.expand_path("../VERSION", __FILE__)).strip
+  spec.authors = ["Brian Durand"]
+  spec.email = ["bbdurand@gmail.com"]
+  spec.summary = "Memory-efficient XML parser that reduces memory allocation when parsing large XML documents while maintaining a simple, easy-to-use interface."
+  spec.homepage = "https://github.com/bdurand/xml_node_stream"
+  spec.license = "MIT"
+  spec.metadata = {
+    "homepage_uri" => spec.homepage,
+    "source_code_uri" => spec.homepage,
+    "changelog_uri" => "#{spec.homepage}/blob/main/CHANGELOG.md"
+  }
+  # Specify which files should be added to the gem when it is released.
+  # `git ls-files -z` lists the files tracked by git; those not matched by ignore_files are packaged in the gem.
+  ignore_files = %w[
+    .
+    AGENTS.md
+    Appraisals
+    Gemfile
+    Gemfile.lock
+    Rakefile
+    bin/
+    gemfiles/
+    spec/
+    benchmark/
+  ]
+  spec.files = Dir.chdir(File.expand_path("..", __FILE__)) do
+    `git ls-files -z`.split("\x0").reject { |f| ignore_files.any? { |path| f.start_with?(path) } }
+  end
+  spec.require_paths = ["lib"]
+  spec.add_development_dependency "bundler"
+end