RubyGems - canon - Versions diffs - 0.2.2 → 0.2.4 - Mend

canon 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

checksums.yaml +4 -4
data/.rubocop_todo.yml +31 -149
data/README.adoc +9 -0
data/docs/advanced/semantic-diff-report.adoc +31 -0
data/docs/features/configuration-profiles.adoc +4 -2
data/docs/features/match-options/html-policies.adoc +2 -0
data/docs/features/match-options/index.adoc +40 -0
data/docs/guides/choosing-configuration.adoc +12 -1
data/docs/reference/cli-options.adoc +3 -0
data/docs/reference/options-across-interfaces.adoc +7 -1
data/docs/understanding/formats/html.adoc +9 -2
data/lib/canon/cli.rb +4 -0
data/lib/canon/commands/diff_command.rb +1 -0
data/lib/canon/comparison/comparison_result.rb +79 -0
data/lib/canon/comparison/html_comparator.rb +92 -11
data/lib/canon/comparison/markup_comparator.rb +19 -0
data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
data/lib/canon/comparison/match_options.rb +23 -2
data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
data/lib/canon/comparison/xml_comparator/child_comparison.rb +6 -0
data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
data/lib/canon/comparison/xml_comparator.rb +80 -4
data/lib/canon/comparison/xml_node_comparison.rb +29 -3
data/lib/canon/comparison.rb +84 -22
data/lib/canon/config/env_schema.rb +2 -1
data/lib/canon/config/profiles/metanorma.yml +3 -0
data/lib/canon/config.rb +51 -5
data/lib/canon/diff/diff_classifier.rb +18 -2
data/lib/canon/diff/diff_line_builder.rb +9 -8
data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +65 -17
data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +17 -0
data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
data/lib/canon/diff_formatter.rb +57 -173
data/lib/canon/html/data_model.rb +10 -4
data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/c14n.rb +59 -5
data/lib/canon/xml/element_matcher.rb +3 -0
data/lib/canon/xml/node.rb +8 -1
data/lib/canon/xml/nodes/comment_node.rb +4 -0
data/lib/canon/xml/nodes/element_node.rb +4 -0
data/lib/canon/xml/nodes/text_node.rb +4 -0
data/lib/canon/xml/sax_builder.rb +11 -2
data/lib/canon/xml/xpath_engine.rb +238 -0
metadata +6 -2

data/lib/canon/tree_diff/adapters/html_adapter.rb CHANGED Viewed

@@ -200,6 +200,19 @@ module Canon
           whitespace_sensitive_tags.include?(element.name.downcase)
         end
+        # Check if a text value is formatting-only whitespace
+        #
+        # Formatting whitespace contains newlines (indentation between
+        # block elements) and is safe to strip. Pure spaces/tabs without
+        # newlines may be semantically significant between inline elements
+        # and are preserved.
+        #
+        # @param text [String] Text value to check
+        # @return [Boolean] True if formatting-only whitespace
+        def formatting_whitespace?(text)
+          text.match?(/\A[\s\p{Zs}]*\z/) && text.include?("\n")
+        end
         # Build Nokogiri element from TreeNode
         #
         # @param tree_node [Core::TreeNode] Tree node
@@ -270,15 +283,51 @@ module Canon
             source_node: element_node, # Preserve reference to Canon node
           )
+          # Skip whitespace-only text children UNLESS this element is
+          # whitespace-sensitive (pre, code, textarea, script, style).
+          # Layout whitespace between block-level children is not
+          # semantically meaningful and preserving it causes the
+          # position-based tree matcher to misalign siblings, producing
+          # spurious NORMATIVE diffs around self-closing tags. This
+          # mirrors XMLAdapter's behavior and the DOM-diff path's
+          # remove_whitespace_only_text_nodes filter.
+          #
+          # HTML distinguishes between formatting whitespace (newlines +
+          # indentation between block elements) and inline whitespace
+          # (spaces between inline elements like <span>). Only formatting
+          # whitespace is stripped — inline spaces are semantically
+          # significant because they render as visible gaps.
+          skip_ws_text = !whitespace_sensitive?(element_node)
           # Process children recursively
           element_node.children.each do |child|
+            next if skip_ws_text && whitespace_only_text?(child)
             child_tree = to_tree(child)
-            tree_node.add_child(child_tree) if child_tree
+            next if child_tree.nil?
+            if child_tree.label == "text" && !whitespace_sensitive?(element_node) && formatting_whitespace?(child_tree.value)
+              next
+            end
+            tree_node.add_child(child_tree)
           end
           tree_node
         end
+        # Check if a Canon::Xml::Nodes node is a whitespace-only text node
+        #
+        # @param node [Canon::Xml::Nodes::Node] Node to check
+        # @return [Boolean] true if node is a TextNode containing only whitespace
+        def whitespace_only_text?(node)
+          return false unless node.is_a?(Canon::Xml::Nodes::TextNode)
+          # Uses \p{Zs} for Unicode space separators (em/en/thin spaces)
+          # plus ASCII whitespace -- same regex as XMLAdapter.
+          node.value.to_s.match?(/\A[\s\p{Zs}]*\z/)
+        end
         # Convert Canon::Xml::Nodes::TextNode to TreeNode
         #
         # @param text_node [Canon::Xml::Nodes::TextNode] Text node
@@ -287,7 +336,11 @@ module Canon
           # Extract text value
           text_value = text_node.value.to_s
-          # Return nil for empty text (don't strip for HTML)
+          # Return nil for truly empty text. Whitespace-only text nodes are
+          # filtered at the parent ElementNode level in
+          # to_tree_from_canon_element so that whitespace-sensitive
+          # containers (pre, code, textarea, script, style) retain their
+          # whitespace content.
           return nil if text_value.empty?
           Core::TreeNode.new(

data/lib/canon/tree_diff/tree_diff_integrator.rb CHANGED Viewed

@@ -188,7 +188,7 @@ module Canon
       # @return [Integer, nil] Max node count
       def get_max_node_count
         # Get from options if provided, otherwise use default
-        @options[:max_node_count] || 30_000
+        @options[:max_node_count] || 100_000
       end
     end
   end

data/lib/canon/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Canon
-  VERSION = "0.2.2"
+  VERSION = "0.2.4"
 end

data/lib/canon/xml/c14n.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 require_relative "data_model"
 require_relative "processor"
+require_relative "xpath_engine"
 module Canon
   module Xml
@@ -21,15 +22,68 @@ module Canon
         processor.process(root_node)
       end
-      # Canonicalize a document subset (for future implementation)
+      # Canonicalize a document subset selected by XPath expression.
+      #
+      # Implements W3C C14N 1.1 subset canonicalization:
+      # 1. Evaluates XPath against the document tree
+      # 2. Marks matched nodes as the node-set
+      # 3. Renders canonical form for only the selected nodes,
+      #    with namespace and attribute inheritance from excluded ancestors
+      #
       # @param xml [String] XML document as string
       # @param xpath [String] XPath expression for subset selection
       # @param with_comments [Boolean] Include comments in canonical form
       # @return [String] Canonical form in UTF-8
-      def self.canonicalize_subset(xml, _xpath, with_comments: false)
-        # TODO: Implement XPath-based subset selection
-        # For now, just canonicalize the whole document
-        canonicalize(xml, with_comments: with_comments)
+      def self.canonicalize_subset(xml, xpath, with_comments: false)
+        root_node = DataModel.from_xml(xml)
+        # Mark all nodes as NOT in the node-set initially
+        mark_all_nodes(root_node, false)
+        # Evaluate XPath and mark matched nodes
+        matched = XPathEngine.evaluate(root_node, xpath)
+        # If the XPath matches no nodes, fall back to full canonicalization
+        if matched.empty?
+          mark_all_nodes(root_node, true)
+        else
+          # Mark matched nodes and their descendants; the root is also marked
+          mark_subset(root_node, matched)
+        end
+        # Process to canonical form
+        processor = Processor.new(with_comments: with_comments)
+        processor.process(root_node)
+      end
+      class << self
+        private
+        # Recursively set in_node_set on all nodes
+        def mark_all_nodes(node, value)
+          node.in_node_set = value
+          node.children.each { |child| mark_all_nodes(child, value) }
+        end
+        # Mark matched nodes and all required supporting nodes.
+        #
+        # Per W3C C14N 1.1, only nodes in the node-set are rendered.
+        # Ancestors not in the node-set become "omitted ancestors" —
+        # the Processor handles namespace/attribute inheritance from them.
+        def mark_subset(root_node, matched)
+          # Mark matched nodes and their descendants
+          matched.each do |node|
+            mark_node_and_descendants(node)
+          end
+          # Root node is always in the set so processing starts
+          root_node.in_node_set = true
+        end
+        def mark_node_and_descendants(node)
+          node.in_node_set = true
+          node.children.each { |child| mark_node_and_descendants(child) }
+        end
       end
     end
   end

data/lib/canon/xml/element_matcher.rb CHANGED Viewed

@@ -134,6 +134,9 @@ module Canon
       # Match children recursively
       def match_children(children1, children2, path)
+        # FAST PATH: Same array object means all children match
+        return if children1.equal?(children2)
         # Filter to only element nodes
         elems1 = children1.select { |n| n.node_type == :element }
         elems2 = children2.select { |n| n.node_type == :element }

data/lib/canon/xml/node.rb CHANGED Viewed

@@ -17,13 +17,20 @@ module Canon
       end
       def in_node_set?
-        @in_node_set ||= true
+        instance_variable_defined?(:@in_node_set) ? @in_node_set : true
       end
       def in_node_set=(value)
         @in_node_set = value
       end
+      # Return the text content of this node and all descendants.
+      # ElementNode concatenates children's text_content; other nodes
+      # (TextNode, CommentNode, etc.) return their value.
+      def text_content
+        children.map(&:text_content).join
+      end
       protected
       attr_writer :parent

data/lib/canon/xml/nodes/comment_node.rb CHANGED Viewed

@@ -21,6 +21,10 @@ module Canon
         def node_type
           :comment
         end
+        def text_content
+          @value
+        end
       end
     end
   end

data/lib/canon/xml/nodes/element_node.rb CHANGED Viewed

@@ -54,6 +54,10 @@ module Canon
         def node_info
           "name: #{name} namespace_uri: #{namespace_uri} prefix: #{prefix}"
         end
+        def to_s
+          "<#{qname}>"
+        end
       end
     end
   end

data/lib/canon/xml/nodes/text_node.rb CHANGED Viewed

@@ -29,6 +29,10 @@ module Canon
         def node_type
           :text
         end
+        def text_content
+          @value
+        end
       end
     end
   end

data/lib/canon/xml/sax_builder.rb CHANGED Viewed

@@ -182,8 +182,17 @@ strip_doctype: false)
         # Skip whitespace-only text nodes unless:
         # 1. preserve_whitespace is true, OR
-        # 2. The content contains CR (from &#xD; entities) which must be preserved for C14N
-        if !@preserve_whitespace && decoded_string.strip.empty? && parent.node_type == :element && !decoded_string.include?("\r")
+        # 2. The content contains CR (from &#xD; entities) which must be preserved for C14N, OR
+        # 3. The content contains non-ASCII whitespace (NBSP U+00A0, ideographic
+        #    space U+3000, etc.) — those are semantically meaningful content,
+        #    not pretty-print indentation, and must survive parsing so the
+        #    comparator can detect Unicode whitespace-type differences.
+        #
+        # Strip only when the node is pure ASCII whitespace (space, tab, CR, LF).
+        # This lets pretty-printed fixtures work (indent nodes stripped) while
+        # preserving NBSP-only text nodes.
+        if !@preserve_whitespace && decoded_string.gsub(/[ \t\r\n]/,
+                                                        "").empty? && parent.node_type == :element && !decoded_string.include?("\r")
           # Only skip if parent is an element (not root)
           return
         end

data/lib/canon/xml/xpath_engine.rb ADDED Viewed

@@ -0,0 +1,238 @@
+# frozen_string_literal: true
+module Canon
+  module Xml
+    # XPath evaluation engine for C14N subset selection.
+    #
+    # Supports a focused subset of XPath 1.0 sufficient for W3C C14N
+    # subset canonicalization:
+    #
+    # - Absolute paths: /root/child, /root/child[1]
+    # - Descendant-or-self: //element, //ns:element
+    # - Predicates: [1] (position), [@attr], [@attr='value']
+    # - Wildcards: *
+    # - Union: expr1 | expr2
+    #
+    # Not supported (not needed for C14N subset):
+    # - Axes other than child and descendant-or-self
+    # - Functions (last(), position(), etc.)
+    # - Variables
+    #
+    class XPathEngine
+      # Evaluate an XPath expression against a data model tree.
+      #
+      # @param root [Nodes::RootNode] Root of the data model tree
+      # @param xpath [String] XPath expression
+      # @return [Array<Node>] Matched nodes (document order is not guaranteed
+      #   for unions or multi-step `//` paths)
+      def self.evaluate(root, xpath)
+        new(root).evaluate(xpath)
+      end
+      def initialize(root)
+        @root = root
+      end
+      # Evaluate an XPath expression and return matched nodes.
+      #
+      # @param xpath [String] XPath expression
+      # @return [Array<Node>] Matched nodes (document order is not guaranteed
+      #   for unions or multi-step `//` paths)
+      def evaluate(xpath)
+        # Handle union operator (|)
+        if xpath.include?("|")
+          xpath.split("|").flat_map { |expr| evaluate(expr.strip) }.uniq
+        else
+          evaluate_path(xpath.strip)
+        end
+      end
+      private
+      def evaluate_path(xpath)
+        if xpath.start_with?("//")
+          # Descendant-or-self: anywhere in the tree
+          evaluate_descendant(xpath[2..])
+        elsif xpath.start_with?("/")
+          # Absolute path
+          evaluate_absolute(xpath[1..])
+        else
+          # Relative path — treat as descendant
+          evaluate_descendant(xpath)
+        end
+      end
+      def evaluate_absolute(path)
+        return [] if path.empty?
+        steps = parse_steps(path)
+        return [] if steps.empty?
+        # Start from root's children
+        current_nodes = @root.children
+        apply_steps(current_nodes, steps)
+      end
+      def evaluate_descendant(path)
+        steps = parse_steps(path)
+        return [] if steps.empty?
+        # Collect all descendant element nodes
+        all_elements = []
+        collect_elements(@root, all_elements)
+        # For each element, try to match the full path starting there
+        result = []
+        all_elements.each do |element|
+          first_step = steps.first
+          next unless step_matches?(element, first_step)
+          if steps.length == 1
+            result << element
+          else
+            remaining = steps[1..]
+            matched = apply_steps(element.children, remaining)
+            result.concat(matched)
+          end
+        end
+        result.uniq
+      end
+      def collect_elements(node, result)
+        node.children.each do |child|
+          next unless child.is_a?(Nodes::ElementNode)
+          result << child
+          collect_elements(child, result)
+        end
+      end
+      def apply_steps(nodes, steps)
+        return nodes if steps.empty?
+        step = steps.first
+        remaining = steps[1..]
+        matched = nodes.select { |n| step_matches?(n, step) }
+        if remaining.empty?
+          matched
+        else
+          matched.flat_map do |node|
+            apply_steps(node.children, remaining)
+          end
+        end
+      end
+      def step_matches?(node, step)
+        return false unless node.is_a?(Nodes::ElementNode)
+        name_matches?(node, step[:name]) &&
+          predicates_match?(node, step[:predicates])
+      end
+      def name_matches?(node, name)
+        return true if name == "*"
+        # Handle prefixed names (ns:element)
+        if name.include?(":")
+          prefix, local = name.split(":", 2)
+          node.prefix == prefix && node.name == local
+        else
+          node.name == name
+        end
+      end
+      def predicates_match?(node, predicates)
+        return true if predicates.empty?
+        predicates.all? { |pred| predicate_matches?(node, pred) }
+      end
+      def predicate_matches?(node, pred)
+        case pred[:type]
+        when :position
+          # [1] — position among siblings with same name
+          position_predicate?(node, pred[:value])
+        when :attribute_exists
+          # [@attr]
+          node.attribute_nodes.any? { |a| a.local_name == pred[:name] }
+        when :attribute_value
+          # [@attr='value']
+          node.attribute_nodes.any? do |a|
+            a.local_name == pred[:name] && a.value == pred[:value]
+          end
+        else
+          false
+        end
+      end
+      def position_predicate?(node, position)
+        siblings = node.parent&.children&.select do |n|
+          n.is_a?(Nodes::ElementNode) && n.name == node.name
+        end || []
+        idx = siblings.index(node)
+        idx && (idx + 1) == position
+      end
+      # Parse a path string into an array of steps.
+      #
+      # @param path [String] XPath path (without leading /)
+      # @return [Array<Hash>] Array of { name:, predicates: }
+      def parse_steps(path)
+        steps = []
+        scanner = StringScanner.new(path)
+        until scanner.eos?
+          scanner.skip(/\s+/)
+          break if scanner.eos?
+          # Skip /
+          scanner.scan(%r{/})
+          name = scan_name(scanner)
+          break if name.nil?
+          predicates = scan_predicates(scanner)
+          steps << { name: name, predicates: predicates }
+        end
+        steps
+      end
+      def scan_name(scanner)
+        scanner.scan(%r{[a-zA-Z_][\w:.-]*|\*})
+      end
+      def scan_predicates(scanner) # rubocop:disable Metrics/AbcSize
+        predicates = []
+        while scanner.scan(/\[/) # rubocop:disable Style/RedundantRegexpArgument
+          scanner.skip(/\s*/)
+          pred = scan_predicate(scanner)
+          scanner.skip(/\s*/)
+          scanner.scan(/\]/) # rubocop:disable Style/RedundantRegexpArgument
+          predicates << pred if pred
+        end
+        predicates
+      end
+      def scan_predicate(scanner)
+        if scanner.scan(/(\d+)/)
+          { type: :position, value: scanner[1].to_i }
+        elsif scanner.scan(/@/)
+          name = scanner.scan(/[a-zA-Z_][\w.-]*/)
+          if scanner.scan(/=/) # rubocop:disable Style/RedundantRegexpArgument
+            # Remove surrounding quotes if present
+            scanner.scan(/['"]/)
+            value = scanner.scan(/[^'"\]]+/)
+            scanner.scan(/['"]/)
+            { type: :attribute_value, name: name, value: value }
+          else
+            { type: :attribute_exists, name: name }
+          end
+        end
+      end
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: canon
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.2.4
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-04-18 00:00:00.000000000 Z
+date: 2026-04-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: diff-lcs
@@ -296,10 +296,12 @@ files:
 - lib/canon/diff_formatter/by_line/simple_formatter.rb
 - lib/canon/diff_formatter/by_line/xml_formatter.rb
 - lib/canon/diff_formatter/by_line/yaml_formatter.rb
+- lib/canon/diff_formatter/by_line_formatter.rb
 - lib/canon/diff_formatter/by_object/base_formatter.rb
 - lib/canon/diff_formatter/by_object/json_formatter.rb
 - lib/canon/diff_formatter/by_object/xml_formatter.rb
 - lib/canon/diff_formatter/by_object/yaml_formatter.rb
+- lib/canon/diff_formatter/by_object_formatter.rb
 - lib/canon/diff_formatter/character_map.yml
 - lib/canon/diff_formatter/debug_output.rb
 - lib/canon/diff_formatter/diff_detail_formatter.rb
@@ -309,6 +311,7 @@ files:
 - lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb
 - lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb
 - lib/canon/diff_formatter/legend.rb
+- lib/canon/diff_formatter/pretty_diff_formatter.rb
 - lib/canon/diff_formatter/theme.rb
 - lib/canon/errors.rb
 - lib/canon/formatters/html4_formatter.rb
@@ -376,6 +379,7 @@ files:
 - lib/canon/xml/sax_builder.rb
 - lib/canon/xml/whitespace_normalizer.rb
 - lib/canon/xml/xml_base_handler.rb
+- lib/canon/xml/xpath_engine.rb
 - lib/tasks/benchmark_runner.rb
 - lib/tasks/performance.rake
 - lib/tasks/performance_comparator.rb