RubyGems - canon - Versions diffs - 0.2.3 → 0.2.5 - Mend

canon 0.2.3 → 0.2.5

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

checksums.yaml +4 -4
data/.rubocop_todo.yml +31 -149
data/README.adoc +9 -0
data/docs/advanced/semantic-diff-report.adoc +96 -0
data/docs/features/configuration-profiles.adoc +4 -2
data/docs/features/diff-formatting/index.adoc +3 -0
data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
data/docs/features/match-options/html-policies.adoc +2 -0
data/docs/features/match-options/index.adoc +40 -0
data/docs/guides/choosing-configuration.adoc +12 -1
data/docs/reference/cli-options.adoc +3 -0
data/docs/reference/environment-variables.adoc +3 -1
data/docs/reference/options-across-interfaces.adoc +7 -1
data/docs/understanding/formats/html.adoc +9 -2
data/lib/canon/cli.rb +4 -0
data/lib/canon/commands/diff_command.rb +1 -0
data/lib/canon/comparison/comparison_result.rb +95 -2
data/lib/canon/comparison/html_comparator.rb +96 -11
data/lib/canon/comparison/markup_comparator.rb +68 -71
data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
data/lib/canon/comparison/match_options.rb +23 -2
data/lib/canon/comparison/node_inspector.rb +103 -0
data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
data/lib/canon/comparison/xml_comparator/child_comparison.rb +133 -55
data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
data/lib/canon/comparison/xml_comparator.rb +174 -7
data/lib/canon/comparison/xml_node_comparison.rb +48 -66
data/lib/canon/comparison.rb +143 -22
data/lib/canon/config/env_schema.rb +2 -1
data/lib/canon/config/profiles/metanorma.yml +3 -0
data/lib/canon/config.rb +51 -5
data/lib/canon/diff/diff_classifier.rb +55 -41
data/lib/canon/diff/diff_line_builder.rb +9 -8
data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +184 -26
data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +92 -4
data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
data/lib/canon/diff_formatter.rb +128 -175
data/lib/canon/html/data_model.rb +10 -4
data/lib/canon/pretty_printer/html.rb +76 -14
data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/c14n.rb +59 -5
data/lib/canon/xml/data_model.rb +13 -1
data/lib/canon/xml/element_matcher.rb +3 -0
data/lib/canon/xml/node.rb +23 -1
data/lib/canon/xml/nodes/comment_node.rb +4 -0
data/lib/canon/xml/nodes/element_node.rb +4 -0
data/lib/canon/xml/nodes/text_node.rb +4 -0
data/lib/canon/xml/sax_builder.rb +29 -2
data/lib/canon/xml/xpath_engine.rb +238 -0
metadata +9 -2

data/lib/canon/comparison/markup_comparator.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require_relative "../comparison" # Load base module with constants
+require_relative "node_inspector"
 require_relative "../diff/diff_node"
 require_relative "../diff/path_builder"
@@ -87,23 +88,20 @@ module Canon
           return nil if node.nil?
           # Canon::Xml::Node types
-          if node.is_a?(Canon::Xml::Nodes::RootNode)
+          case node
+          when Canon::Xml::Nodes::RootNode
             # Serialize all children of root
             node.children.map { |child| serialize_node(child) }.join
-          elsif node.is_a?(Canon::Xml::Nodes::ElementNode)
+          when Canon::Xml::Nodes::ElementNode
             serialize_element_node(node)
-          elsif node.is_a?(Canon::Xml::Nodes::TextNode)
+          when Canon::Xml::Nodes::TextNode
             # Use original text (with entity references) if available,
             # otherwise fall back to value (decoded text)
             node.original || node.value
-          elsif node.is_a?(Canon::Xml::Nodes::CommentNode)
+          when Canon::Xml::Nodes::CommentNode
             "<!--#{node.value}-->"
-          elsif node.is_a?(Canon::Xml::Nodes::ProcessingInstructionNode)
+          when Canon::Xml::Nodes::ProcessingInstructionNode
             "<?#{node.target} #{node.data}?>"
-          elsif node.respond_to?(:to_xml)
-            node.to_xml
-          elsif node.respond_to?(:to_html)
-            node.to_html
           else
             node.to_s
           end
@@ -121,8 +119,8 @@ module Canon
             node.attribute_nodes.to_h do |attr|
               [attr.name, attr.value]
             end
-          # Nokogiri nodes
-          elsif node.respond_to?(:attributes)
+          # Nokogiri elements
+          elsif node.is_a?(Nokogiri::XML::Element)
             node.attributes.to_h do |_, attr|
               [attr.name, attr.value]
             end
@@ -182,6 +180,25 @@ module Canon
           return false unless text_node?(node) && node.parent
           return false unless MatchOptions.normalize_text(node_text(node)).empty?
+          # NBSP (U+00A0) is never insignificant whitespace —
+          # it always renders as a visible non-breaking space.
+          # For HTML: always preserve NBSP nodes.
+          # For XML with whitespace_type: :strict: preserve NBSP nodes so
+          # different Unicode whitespace types remain distinguishable.
+          format = opts[:format] || match_opts[:format]
+          whitespace_type = match_opts[:whitespace_type] || :strict
+          if (%i[html html4
+                 html5].include?(format) || whitespace_type == :strict) && WhitespaceSensitivity.contains_nbsp?(node_text(node))
+            return false
+          end
+          if %i[html html4
+                html5].include?(format) && WhitespaceSensitivity.inline_whitespace_significant?(node)
+            # Whitespace between inline element siblings is semantically
+            # significant (renders as a visible gap) and must not be stripped.
+            return false
+          end
           return true unless WhitespaceSensitivity.whitespace_preserved?(
             node.parent, match_opts
           )
@@ -208,8 +225,8 @@ module Canon
         def same_node_type?(node1, node2)
           return false if node1.class != node2.class
-          # For Nokogiri/Canon::Xml nodes, check node type
-          if node1.respond_to?(:node_type) && node2.respond_to?(:node_type)
+          case node1
+          when Canon::Xml::Node, Nokogiri::XML::Node
             node1.node_type == node2.node_type
           else
             true
@@ -226,20 +243,7 @@ module Canon
         # @param node [Object] Node to check
         # @return [Boolean] true if node is a comment
         def comment_node?(node)
-          return true if node.respond_to?(:comment?) && node.comment?
-          return true if node.respond_to?(:node_type) && node.node_type == :comment
-          # HTML comments are parsed as TEXT nodes by Nokogiri
-          # Check if this is a text node with HTML comment content
-          if text_node?(node)
-            text = node_text(node)
-            # Strip whitespace and backslashes for comparison
-            # Nokogiri escapes HTML comments as "<\\!-- comment -->" in full documents
-            text_stripped = text.to_s.strip.gsub("\\", "")
-            return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
-          end
-          false
+          NodeInspector.comment_node?(node)
         end
         # Check if a node is a text node
@@ -247,9 +251,7 @@ module Canon
         # @param node [Object] Node to check
         # @return [Boolean] true if node is a text node
         def text_node?(node)
-          (node.respond_to?(:text?) && node.text? &&
-            !node.respond_to?(:element?)) ||
-            (node.respond_to?(:node_type) && node.node_type == :text)
+          NodeInspector.text_node?(node)
         end
         # Get text content from a node
@@ -257,15 +259,7 @@ module Canon
         # @param node [Object] Node to get text from
         # @return [String] Text content
         def node_text(node)
-          # Canon::Xml::Node TextNode uses .value
-          if node.respond_to?(:value)
-            node.value.to_s
-          # Nokogiri nodes use .content
-          elsif node.respond_to?(:content)
-            node.content.to_s
-          else
-            node.to_s
-          end
+          NodeInspector.text_content(node)
         end
         # Check if difference between two texts is only whitespace
@@ -309,7 +303,7 @@ module Canon
           if diff1 == Canon::Comparison::MISSING_NODE && diff2 == Canon::Comparison::MISSING_NODE
             "element structure mismatch (children differ)"
           else
-            "#{diff1} vs #{diff2}"
+            Canon::Comparison.code_pair_label(diff1, diff2)
           end
         end
@@ -352,26 +346,18 @@ module Canon
         def extract_text_content_from_node(node)
           return nil if node.nil?
-          # For Canon::Xml::Nodes::TextNode
-          return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)
-          # For XML/HTML nodes with text_content method
-          return node.text_content if node.respond_to?(:text_content)
-          # For nodes with text method
-          return node.text if node.respond_to?(:text)
-          # For nodes with content method (Moxml::Text)
-          return node.content if node.respond_to?(:content)
-          # For nodes with value method (other types)
-          return node.value if node.respond_to?(:value)
-          # For simple text nodes or strings
-          return node.to_s if node.is_a?(String)
-          # For other node types, try to_s
-          node.to_s
+          case node
+          when Canon::Xml::Nodes::TextNode
+            node.value
+          when Canon::Xml::Node
+            node.text_content
+          when Nokogiri::XML::Node
+            node.content.to_s
+          when String
+            node
+          else
+            node.to_s
+          end
         rescue StandardError
           nil
         end
@@ -425,26 +411,37 @@ module Canon
         # Determine the appropriate dimension for a node type
         #
+        # Used by ChildComparison to tag per-child orphan diffs with a
+        # dimension that matches what the node *is*, so the formatter
+        # renders correctly.  An element orphan tagged :text_content
+        # would otherwise route through PR #126's one-sided text
+        # formatter and render as +text ""+ instead of as the actual
+        # element (see lutaml/canon#125 follow-up).
+        #
         # @param node [Object] The node to check
         # @return [Symbol] The dimension symbol
         def determine_node_dimension(node)
-          # Canon::Xml::Node types
-          if node.respond_to?(:node_type) && node.node_type.is_a?(Symbol)
+          case node
+          when Canon::Xml::Node
             case node.node_type
+            when :element then :element_structure
             when :comment then :comments
             when :text, :cdata then :text_content
             when :processing_instruction then :processing_instructions
             else :text_content
             end
-          # Moxml/Nokogiri types
-          elsif node.respond_to?(:comment?) && node.comment?
-            :comments
-          elsif node.respond_to?(:text?) && node.text?
-            :text_content
-          elsif node.respond_to?(:cdata?) && node.cdata?
-            :text_content
-          elsif node.respond_to?(:processing_instruction?) && node.processing_instruction?
-            :processing_instructions
+          when Nokogiri::XML::Node
+            if node.comment?
+              :comments
+            elsif node.text? || node.cdata?
+              :text_content
+            elsif node.processing_instruction?
+              :processing_instructions
+            elsif node.element?
+              :element_structure
+            else
+              :text_content
+            end
           else
             :text_content
           end

data/lib/canon/comparison/match_options/base_resolver.rb CHANGED Viewed

@@ -146,6 +146,7 @@ module Canon
               respect_xml_space
               pretty_printed_expected
               pretty_printed_received
+              whitespace_type
             ]
             match_options.each do |dimension, behavior|

data/lib/canon/comparison/match_options/xml_resolver.rb CHANGED Viewed

@@ -24,6 +24,7 @@ module Canon
             attribute_values: :strict,
             element_position: :ignore,
             comments: :ignore,
+            whitespace_type: :strict,
           },
           xml: {
             preprocessing: :none,
@@ -34,6 +35,7 @@ module Canon
             attribute_values: :strict,
             element_position: :strict,
             comments: :strict,
+            whitespace_type: :strict,
           },
         }.freeze
@@ -51,6 +53,7 @@ module Canon
             attribute_values: :strict,
             element_position: :strict,
             comments: :strict,
+            whitespace_type: :strict,
           },
           # Rendered: Match rendered output (HTML default)
@@ -64,6 +67,7 @@ module Canon
             attribute_values: :strict,
             element_position: :strict,
             comments: :ignore,
+            whitespace_type: :strict,
           },
           # HTML4: Match HTML4 rendered output
@@ -77,6 +81,7 @@ module Canon
             attribute_values: :normalize,
             element_position: :ignore,
             comments: :ignore,
+            whitespace_type: :strict,
           },
           # HTML5: Match HTML5 rendered output (same as rendered)
@@ -89,6 +94,7 @@ module Canon
             attribute_values: :strict,
             element_position: :ignore,
             comments: :ignore,
+            whitespace_type: :strict,
           },
           # Spec-friendly: Formatting doesn't matter
@@ -102,6 +108,7 @@ module Canon
             attribute_values: :normalize,
             element_position: :ignore,
             comments: :ignore,
+            whitespace_type: :strict,
           },
           # Content-only: Only content matters
@@ -114,6 +121,7 @@ module Canon
             attribute_values: :normalize,
             element_position: :ignore,
             comments: :ignore,
+            whitespace_type: :strict,
           },
         }.freeze

data/lib/canon/comparison/match_options.rb CHANGED Viewed

@@ -69,13 +69,18 @@ module Canon
         # @param text1 [String] First text
         # @param text2 [String] Second text
         # @param behavior [Symbol] Match behavior (:strict, :normalize, :ignore)
+        # @param whitespace_type [Symbol] Whitespace type handling (:strict, :normalize)
         # @return [Boolean] true if texts match according to behavior
-        def match_text?(text1, text2, behavior)
+        def match_text?(text1, text2, behavior, whitespace_type: :strict)
           case behavior
           when :strict
             text1 == text2
           when :normalize
-            normalize_text(text1) == normalize_text(text2)
+            if whitespace_type == :normalize
+              normalize_text(text1) == normalize_text(text2)
+            else
+              normalize_text_preserving_type(text1) == normalize_text_preserving_type(text2)
+            end
           when :ignore
             true
           else
@@ -101,6 +106,22 @@ module Canon
             .strip # Remove leading/trailing whitespace
         end
+        # Normalize text preserving Unicode whitespace type distinctions.
+        #
+        # Only ASCII whitespace (space, tab, newline, etc.) is collapsed.
+        # Unicode whitespace (NBSP, ideographic space, etc.) is preserved,
+        # so different whitespace types remain distinguishable.
+        #
+        # @param text [String] Text to normalize
+        # @return [String] Normalized text with preserved whitespace types
+        def normalize_text_preserving_type(text)
+          return "" if text.nil?
+          text.to_s
+            .gsub(/[ \t\r\n\f\v]+/, " ") # Collapse only ASCII whitespace
+            .strip
+        end
         # Process attribute value according to match behavior
         #
         # @param value [String] Attribute value to process

data/lib/canon/comparison/node_inspector.rb ADDED Viewed

@@ -0,0 +1,103 @@
+# frozen_string_literal: true
+module Canon
+  module Comparison
+    # Single source of truth for cross-backend node type operations.
+    #
+    # The comparison pipeline handles nodes from two backends:
+    # * Canon::Xml::Node (+ RootNode, ElementNode, TextNode, etc.) —
+    #   custom DOM built by SAX builder and DataModel.
+    # * Nokogiri::XML::Node (+ subclasses) — native Nokogiri nodes used
+    #   by the HTML comparator and some legacy paths.
+    #
+    # Every method here dispatches on type via +case/when+ (+is_a?+).
+    # No +respond_to?+ — the types are known at every call site.
+    module NodeInspector
+      CANON_TEXT_TYPE = :text
+      NOKOGIRI_TEXT_TYPE = defined?(Nokogiri::XML::Node::TEXT_NODE) ? Nokogiri::XML::Node::TEXT_NODE : 3
+      # True when +node+ is a text node (whitespace, content, etc.).
+      def self.text_node?(node)
+        case node
+        when Canon::Xml::Node
+          node.node_type == CANON_TEXT_TYPE
+        when Nokogiri::XML::Node
+          node.node_type == NOKOGIRI_TEXT_TYPE
+        else
+          false
+        end
+      end
+      # Extract the text content of +node+ as a String.
+      def self.text_content(node)
+        case node
+        when Canon::Xml::Node
+          node.value.to_s
+        when Nokogiri::XML::Node
+          node.content.to_s
+        else
+          node.to_s
+        end
+      end
+      # True when +node+ is a text node whose content is whitespace-only.
+      # Empty-string text nodes return false — those represent genuine
+      # empty-vs-content asymmetry, not pretty-print indentation.
+      def self.whitespace_only_text?(node)
+        return false unless text_node?(node)
+        text = text_content(node)
+        !text.empty? && text.strip.empty?
+      end
+      # True when +node+ is a comment node.
+      # For HTML, also detects comments that Nokogiri parses as TEXT nodes
+      # (content like "<!-- comment -->" or escaped "<\\!-- comment -->").
+      def self.comment_node?(node)
+        case node
+        when Canon::Xml::Node
+          node.node_type == :comment
+        when Nokogiri::XML::Node
+          return true if node.comment?
+          # HTML comments are parsed as TEXT nodes by Nokogiri
+          if node.text?
+            text_stripped = text_content(node).to_s.strip.gsub("\\", "")
+            return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
+          end
+          false
+        else
+          false
+        end
+      end
+      # True when +node+ is an element node.
+      def self.element_node?(node)
+        case node
+        when Canon::Xml::Node
+          node.node_type == :element
+        when Nokogiri::XML::Node
+          node.element?
+        else
+          false
+        end
+      end
+      # Extract parse-time errors carried on a node or its owning document.
+      # Returns an Array of Strings.
+      def self.parse_errors(node)
+        case node
+        when nil
+          []
+        when Canon::Xml::Node
+          errors = node.parse_errors
+          Array(errors).map(&:to_s)
+        when Nokogiri::XML::Document, Nokogiri::HTML5::Document
+          Array(node.errors).map(&:to_s)
+        else
+          []
+        end
+      end
+    end
+  end
+end

data/lib/canon/comparison/whitespace_sensitivity.rb CHANGED Viewed

@@ -50,6 +50,15 @@ module Canon
       # HTML elements where every whitespace character is significant.
       HTML_PRESERVE_ELEMENTS = %w[pre code textarea script style].freeze
+      # HTML inline elements — whitespace between these is semantically
+      # significant (renders as a visible space).  Whitespace-only text
+      # nodes that sit between two inline siblings must not be stripped.
+      INLINE_ELEMENTS = %w[
+        a abbr acronym b bdo big br button cite code dfn em i img input kbd
+        label map object output q s samp select small span strong sub sup
+        time tt u var wbr
+      ].freeze
       class << self
         # Classify the whitespace behaviour for an element using ancestor walk.
         #
@@ -213,6 +222,69 @@ module Canon
             .include?(element_name.to_sym)
         end
+        # Check if a whitespace-only text node sits between two inline element
+        # siblings, making the whitespace semantically significant.
+        #
+        # In HTML rendering, a space between <span>A</span> <span>B</span>
+        # produces visible output.  Stripping such nodes produces false
+        # equivalence.
+        #
+        # Works with any parent type (element, DocumentFragment, RootNode)
+        # since the check is about sibling context, not parent type.
+        #
+        # @param text_node [Object] Text node (Nokogiri or Canon::Xml::Node)
+        # @return [Boolean] true if whitespace is between inline siblings
+        def inline_whitespace_significant?(text_node)
+          return false unless text_node.respond_to?(:parent)
+          parent = text_node.parent
+          return false unless parent
+          return false unless parent.respond_to?(:children)
+          siblings = parent.children
+          idx = siblings.index(text_node)
+          return false unless idx
+          # Look at the IMMEDIATE non-whitespace-text neighbour on each
+          # side. Whitespace at a block boundary is collapsed per CSS,
+          # so both immediate neighbours must be inline for the
+          # whitespace to be significant. Walking all siblings (the
+          # earlier behaviour) misclassified whitespace at a block
+          # boundary as significant whenever any inline element existed
+          # elsewhere among the siblings.
+          prev_neighbour = nearest_non_whitespace_sibling(siblings, idx, -1)
+          next_neighbour = nearest_non_whitespace_sibling(siblings, idx,  1)
+          inline_element?(prev_neighbour) && inline_element?(next_neighbour)
+        end
+        # Walk outward from +idx+ in +direction+ (+1 forward, -1 back),
+        # skipping whitespace-only text nodes, and return the first
+        # non-whitespace sibling found.  Returns nil if none.
+        def nearest_non_whitespace_sibling(siblings, idx, direction)
+          i = idx + direction
+          while i >= 0 && i < siblings.length
+            s = siblings[i]
+            unless s.respond_to?(:text?) && s.text? &&
+                s.respond_to?(:content) && s.content.to_s.strip.empty?
+              return s
+            end
+            i += direction
+          end
+          nil
+        end
+        # Check if text content contains a non-breaking space (U+00A0).
+        # NBSP is NOT collapsible whitespace in HTML — it always renders as
+        # a visible space and must never be stripped.
+        #
+        # @param text [String] Text content to check
+        # @return [Boolean] true if text contains U+00A0
+        def contains_nbsp?(text)
+          text.to_s.include?("\u00A0")
+        end
         private
         # Build the Set of preserve whitespace element names (strings).
@@ -336,6 +408,30 @@ module Canon
           # Nokogiri compatibility
           parent.respond_to?(:node_type) && parent.node_type == :element
         end
+        # Get the parent element of a text node, or nil.
+        # Works with both Nokogiri and Canon::Xml::Node types.
+        def parent_element_of(text_node)
+          return nil unless text_node.respond_to?(:parent)
+          parent = text_node.parent
+          return nil unless parent
+          if parent.is_a?(Canon::Xml::Nodes::ElementNode)
+            parent
+          elsif parent.respond_to?(:element?) && parent.element?
+            parent
+          elsif parent.respond_to?(:node_type) && parent.node_type == :element
+            parent
+          end
+        end
+        # Check if a node is an HTML inline element.
+        def inline_element?(node)
+          return false unless node.respond_to?(:name)
+          INLINE_ELEMENTS.include?(node.name.to_s.downcase)
+        end
       end
     end
   end