RubyGems - canon - Versions diffs - 0.2.3 → 0.2.5 - Mend

canon 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

checksums.yaml +4 -4
data/.rubocop_todo.yml +31 -149
data/README.adoc +9 -0
data/docs/advanced/semantic-diff-report.adoc +96 -0
data/docs/features/configuration-profiles.adoc +4 -2
data/docs/features/diff-formatting/index.adoc +3 -0
data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
data/docs/features/match-options/html-policies.adoc +2 -0
data/docs/features/match-options/index.adoc +40 -0
data/docs/guides/choosing-configuration.adoc +12 -1
data/docs/reference/cli-options.adoc +3 -0
data/docs/reference/environment-variables.adoc +3 -1
data/docs/reference/options-across-interfaces.adoc +7 -1
data/docs/understanding/formats/html.adoc +9 -2
data/lib/canon/cli.rb +4 -0
data/lib/canon/commands/diff_command.rb +1 -0
data/lib/canon/comparison/comparison_result.rb +95 -2
data/lib/canon/comparison/html_comparator.rb +96 -11
data/lib/canon/comparison/markup_comparator.rb +68 -71
data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
data/lib/canon/comparison/match_options.rb +23 -2
data/lib/canon/comparison/node_inspector.rb +103 -0
data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
data/lib/canon/comparison/xml_comparator/child_comparison.rb +133 -55
data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
data/lib/canon/comparison/xml_comparator.rb +174 -7
data/lib/canon/comparison/xml_node_comparison.rb +48 -66
data/lib/canon/comparison.rb +143 -22
data/lib/canon/config/env_schema.rb +2 -1
data/lib/canon/config/profiles/metanorma.yml +3 -0
data/lib/canon/config.rb +51 -5
data/lib/canon/diff/diff_classifier.rb +55 -41
data/lib/canon/diff/diff_line_builder.rb +9 -8
data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +184 -26
data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +92 -4
data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
data/lib/canon/diff_formatter.rb +128 -175
data/lib/canon/html/data_model.rb +10 -4
data/lib/canon/pretty_printer/html.rb +76 -14
data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/c14n.rb +59 -5
data/lib/canon/xml/data_model.rb +13 -1
data/lib/canon/xml/element_matcher.rb +3 -0
data/lib/canon/xml/node.rb +23 -1
data/lib/canon/xml/nodes/comment_node.rb +4 -0
data/lib/canon/xml/nodes/element_node.rb +4 -0
data/lib/canon/xml/nodes/text_node.rb +4 -0
data/lib/canon/xml/sax_builder.rb +29 -2
data/lib/canon/xml/xpath_engine.rb +238 -0
metadata +9 -2

data/lib/canon/comparison/xml_comparator.rb CHANGED Viewed

@@ -63,6 +63,18 @@ module Canon
         # @return [Boolean, Array] true if equivalent, or array of diffs if
         #   verbose
         def equivalent?(n1, n2, opts = {}, child_opts = {})
+          # FAST PATH: Object identity - same object is always equivalent
+          # Skip when semantic_diff is requested (caller needs tree diff metadata)
+          if n1.equal?(n2) && !opts.dig(:match, :semantic_diff)
+            return build_trivial_equivalent_result(n1, n2, opts)
+          end
+          # FAST PATH: String content equality - identical strings are equivalent
+          # Skip in verbose mode since caller may need full metadata (e.g. tree_diff statistics)
+          if !opts[:verbose] && n1.is_a?(String) && n2.is_a?(String) && n1 == n2
+            return true
+          end
           opts = DEFAULT_OPTS.merge(opts)
           # Resolve match options with format-specific defaults
@@ -92,8 +104,15 @@ module Canon
           # Create child_opts with resolved options
           child_opts = opts.merge(child_opts)
-          # Determine if we should preserve whitespace during parsing
-          # When structural_whitespace is :strict, preserve all whitespace-only text nodes
+          # Determine if we should preserve whitespace during parsing.
+          # Only structural_whitespace: :strict forces whitespace-only text
+          # nodes to survive parsing.  whitespace_type is about distinguishing
+          # Unicode whitespace *types* in surviving text-node content, and
+          # does NOT require indent text nodes to be kept — libxml's NOBLANKS
+          # only strips pure-ASCII whitespace-only nodes, so NBSP-only nodes
+          # survive regardless.  Coupling whitespace_type: :strict to
+          # parsing-time preservation made pretty-printed fixtures produce
+          # spurious element-position diffs (issue #112).
           preserve_whitespace = match_opts_hash[:structural_whitespace] == :strict
           # Parse nodes if they are strings, applying preprocessing if needed
@@ -141,6 +160,8 @@ module Canon
               format: :xml,
               match_options: match_opts_hash,
               algorithm: :dom,
+              parse_errors_expected: Comparison.parse_errors_for(node1),
+              parse_errors_received: Comparison.parse_errors_for(node2),
             )
           elsif result != Comparison::EQUIVALENT && !differences.empty?
             # Non-verbose mode: check equivalence
@@ -203,6 +224,8 @@ module Canon
               format: :xml,
               match_options: match_opts_hash.merge(strategy.metadata),
               algorithm: :semantic,
+              parse_errors_expected: Comparison.parse_errors_for(node1),
+              parse_errors_received: Comparison.parse_errors_for(node2),
             )
           else
             # Simple boolean result - equivalent if no normative differences
@@ -218,8 +241,59 @@ module Canon
                                                  preserve_whitespace: preserve_whitespace)
         end
+        # Build result for trivially equivalent inputs (same object or identical strings)
+        #
+        # Returns plain `true` in non-verbose mode, or a ComparisonResult in verbose mode.
+        #
+        # @param n1 [Object] First input
+        # @param n2 [Object] Second input
+        # @param opts [Hash] Raw options (before merge with DEFAULT_OPTS)
+        # @return [Boolean, ComparisonResult]
+        def build_trivial_equivalent_result(n1, n2, opts)
+          return true unless opts[:verbose]
+          # Parse nodes for verbose display
+          preserve_whitespace = true
+          node1 = parse_node(n1, :none,
+                             preserve_whitespace: preserve_whitespace)
+          node2 = parse_node(n2, :none,
+                             preserve_whitespace: preserve_whitespace)
+          preprocessed = [
+            serialize_node(node1).gsub("><", ">\n<"),
+            serialize_node(node2).gsub("><", ">\n<"),
+          ]
+          original1 = if n1.is_a?(String)
+                        n1
+                      elsif n1.respond_to?(:to_xml)
+                        n1.to_xml
+                      else
+                        n1.to_s
+                      end
+          original2 = if n2.is_a?(String)
+                        n2
+                      elsif n2.respond_to?(:to_xml)
+                        n2.to_xml
+                      else
+                        n2.to_s
+                      end
+          ComparisonResult.new(
+            differences: [],
+            preprocessed_strings: preprocessed,
+            original_strings: [original1, original2],
+            format: :xml,
+            match_options: {},
+            algorithm: :dom,
+          )
+        end
+        public
         # Main comparison dispatcher
         def compare_nodes(n1, n2, opts, child_opts, diff_children, differences)
+          # FAST PATH: Object identity - same object is always equivalent
+          return Comparison::EQUIVALENT if n1.equal?(n2)
           # Handle DocumentFragment nodes - compare their children instead
           if n1.is_a?(Nokogiri::XML::DocumentFragment) &&
               n2.is_a?(Nokogiri::XML::DocumentFragment)
@@ -305,7 +379,6 @@ module Canon
         end
         # Public comparison methods - exposed for XmlNodeComparison module
-        public
         # Compare two element nodes
         def compare_element_nodes(n1, n2, opts, child_opts, diff_children,
@@ -380,8 +453,10 @@ module Canon
           raw_differs = text1 != text2
           # Check if matches according to behavior
+          whitespace_type = match_opts[:whitespace_type] || :strict
           matches_per_behavior = MatchOptions.match_text?(text1, text2,
-                                                          behavior)
+                                                          behavior,
+                                                          whitespace_type: whitespace_type)
           # Determine the correct dimension for this difference
           # - If text_content is :strict, ALL differences use :text_content dimension
@@ -597,9 +672,16 @@ differences)
                         else
                           " (namespace: #{ns})"
                         end
-              return "element '#{node.name}'#{ns_info}: #{diff1} vs #{diff2}"
+              label = Canon::Comparison.code_pair_label(diff1, diff2)
+              return "element '#{node.name}'#{ns_info}: #{label}"
             elsif node.respond_to?(:name) && !node.respond_to?(:namespace_uri)
-              return "element missing: #{node}"
+              # TextNode and other nodes without namespace_uri
+              display = if node.respond_to?(:value) && node.node_type == :text
+                          "\"#{truncate_text(node.value)}\""
+                        else
+                          node.name.to_s
+                        end
+              return "element missing: #{display}"
             end
           end
@@ -617,6 +699,10 @@ differences)
             return build_text_diff_reason(text1, text2)
           end
+          if dimension == :whitespace_adjacency
+            return build_whitespace_adjacency_reason(node1, node2)
+          end
           # For attribute values differences, show the actual values
           if dimension == :attribute_values
             attrs1 = extract_attributes(node1)
@@ -633,8 +719,17 @@ differences)
           if diff1 == Canon::Comparison::MISSING_NODE && diff2 == Canon::Comparison::MISSING_NODE
             "element structure mismatch (children differ)"
+          elsif dimension == :element_structure &&
+              diff1 == Canon::Comparison::UNEQUAL_ELEMENTS &&
+              diff2 == Canon::Comparison::UNEQUAL_ELEMENTS &&
+              (node1.is_a?(Canon::Xml::Node) || node1.is_a?(Nokogiri::XML::Node)) &&
+              (node2.is_a?(Canon::Xml::Node) || node2.is_a?(Nokogiri::XML::Node)) &&
+              node1.name && node2.name && node1.name != node2.name
+            # Most common case: differing element names.  Surface the
+            # actual names rather than a generic "elements differ".
+            "different element name (<#{node1.name}> vs <#{node2.name}>)"
           else
-            "#{diff1} vs #{diff2}"
+            Canon::Comparison.code_pair_label(diff1, diff2)
           end
         end
@@ -748,6 +843,78 @@ differences)
           "Text: \"#{vis1}\" vs \"#{vis2}\""
         end
+        # Build a Reason line for a +:whitespace_adjacency+ diff (#137).
+        # Names which side carries the whitespace, the adjacency position
+        # relative to content neighbours, and surfaces the whitespace
+        # with visible markers.
+        def build_whitespace_adjacency_reason(node1, node2)
+          text1 = extract_text_from_node(node1)
+          text2 = extract_text_from_node(node2)
+          ni = NodeInspector
+          ws_on_first = ni.whitespace_only_text?(node1) &&
+            !ni.whitespace_only_text?(node2)
+          ws_on_second = ni.whitespace_only_text?(node2) &&
+            !ni.whitespace_only_text?(node1)
+          if ws_on_first
+            ws_text = text1
+            content_text = text2
+            present_side = "EXPECTED"
+            absent_side = "ACTUAL"
+            ws_node = node1
+          elsif ws_on_second
+            ws_text = text2
+            content_text = text1
+            present_side = "ACTUAL"
+            absent_side = "EXPECTED"
+            ws_node = node2
+          else
+            return build_text_diff_reason(text1, text2)
+          end
+          position = whitespace_adjacency_position(ws_node)
+          ws_vis = visualize_whitespace(ws_text)
+          content_vis = content_text ? visualize_whitespace(truncate_text(content_text)) : "(none)"
+          "Whitespace #{position} \"#{content_vis}\": " \
+            "present on #{present_side} (\"#{ws_vis}\"), absent on #{absent_side}"
+        end
+        def whitespace_adjacency_position(ws_node)
+          return :isolated unless ws_node.is_a?(Canon::Xml::Node) ||
+            ws_node.is_a?(Nokogiri::XML::Node)
+          parent = ws_node.parent
+          return :isolated if parent.nil?
+          siblings = parent.children
+          idx = siblings.index(ws_node)
+          return :isolated unless idx
+          before = sibling_with_content?(siblings, idx, -1)
+          after = sibling_with_content?(siblings, idx, 1)
+          if before && after then :surrounding
+          elsif before then :following
+          elsif after then :preceding
+          else :isolated
+          end
+        end
+        def sibling_with_content?(siblings, idx, direction)
+          i = idx + direction
+          while i >= 0 && i < siblings.length
+            s = siblings[i]
+            is_ws_text = NodeInspector.text_node?(s) &&
+              NodeInspector.text_content(s).strip.empty?
+            return true unless is_ws_text
+            i += direction
+          end
+          false
+        end
         # Check if text is only whitespace
         #
         # @param text [String] Text to check

data/lib/canon/comparison/xml_node_comparison.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # frozen_string_literal: true
+require_relative "node_inspector"
 module Canon
   module Comparison
     # XML Node Comparison Utilities
@@ -180,13 +182,9 @@ differences)
       # @return [Symbol] Comparison result constant
       def self.dispatch_by_node_type(node1, node2, opts, child_opts,
 diff_children, differences)
-        # Canon::Xml::Node types use .node_type method that returns symbols
-        # Nokogiri also has .node_type but returns integers, so check for Symbol
-        if node1.respond_to?(:node_type) && node2.respond_to?(:node_type) &&
-            node1.node_type.is_a?(Symbol) && node2.node_type.is_a?(Symbol)
+        if node1.is_a?(Canon::Xml::Node) && node2.is_a?(Canon::Xml::Node)
           dispatch_canon_node_type(node1, node2, opts, child_opts,
                                    diff_children, differences)
-        # Moxml/Nokogiri types use .element?, .text?, etc. methods
         else
           dispatch_legacy_node_type(node1, node2, opts, child_opts,
                                     diff_children, differences)
@@ -232,6 +230,17 @@ diff_children, differences)
         return false unless text_node?(node) && node.parent
         return false unless MatchOptions.normalize_text(node_text(node)).empty?
+        # HTML-specific: NBSP (U+00A0) is never insignificant whitespace —
+        # it always renders as a visible non-breaking space.
+        format = opts[:format] || match_opts[:format]
+        if %i[html html4 html5].include?(format)
+          return false if WhitespaceSensitivity.contains_nbsp?(node_text(node))
+          # Whitespace between inline element siblings is semantically
+          # significant (renders as a visible gap) and must not be stripped.
+          return false if WhitespaceSensitivity.inline_whitespace_significant?(node)
+        end
         return true unless WhitespaceSensitivity.whitespace_preserved?(
           node.parent, match_opts
         )
@@ -275,8 +284,8 @@ diff_children, differences)
       def self.same_node_type?(node1, node2)
         return false if node1.class != node2.class
-        # For Nokogiri/Canon::Xml nodes, check node type
-        if node1.respond_to?(:node_type) && node2.respond_to?(:node_type)
+        case node1
+        when Canon::Xml::Node, Nokogiri::XML::Node
           node1.node_type == node2.node_type
         else
           true
@@ -294,34 +303,13 @@ diff_children, differences)
       # @param check_children [Boolean] Whether to check child nodes
       # @return [Boolean] true if node is a comment
       def self.comment_node?(node, check_children: false)
-        result = false
-        return true if node.respond_to?(:comment?) && node.comment?
-        return true if node.respond_to?(:node_type) && node.node_type == :comment
-        if node.is_a?(Nokogiri::XML::Element) && !node.children.empty? && check_children
-          node.children.each do |child|
-            # Recursively check child nodes for comments
-            # limit depth to avoid infinite recursion
-            # in case of circular structures (if any)
-            if comment_node?(child, check_children: false)
-              result = true
-              break
-            end
-          end
-        end
-        return true if result
-        # HTML comments are parsed as TEXT nodes by Nokogiri
-        # Check if this is a text node with HTML comment content
-        if text_node?(node)
-          text = node_text(node)
-          # Strip whitespace and backslashes for comparison
-          # Nokogiri escapes HTML comments as "<\\!-- comment -->" in full documents
-          text_stripped = text.to_s.strip.gsub("\\", "")
-          return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
-        end
+        return true if NodeInspector.comment_node?(node)
-        result
+        if check_children && node.is_a?(Nokogiri::XML::Element) && !node.children.empty?
+          node.children.any? { |child| NodeInspector.comment_node?(child) }
+        else
+          false
+        end
       end
       # Check if a node is a text node
@@ -329,9 +317,7 @@ diff_children, differences)
       # @param node [Object] Node to check
       # @return [Boolean] true if node is a text node
       def self.text_node?(node)
-        (node.respond_to?(:text?) && node.text? &&
-          !node.respond_to?(:element?)) ||
-          (node.respond_to?(:node_type) && node.node_type == :text)
+        NodeInspector.text_node?(node)
       end
       # Extract text content from a node
@@ -341,15 +327,7 @@ diff_children, differences)
       def self.node_text(node)
         return "" unless node
-        if node.respond_to?(:content)
-          node.content.to_s
-        elsif node.respond_to?(:text)
-          node.text.to_s
-        elsif node.respond_to?(:value)
-          node.value.to_s
-        else
-          ""
-        end
+        NodeInspector.text_content(node)
       end
       # Dispatch by Canon::Xml::Node type
@@ -385,21 +363,26 @@ diff_children, differences)
         # Import XmlComparator to use its comparison methods
         require_relative "xml_comparator"
-        if node1.respond_to?(:element?) && node1.element?
-          XmlComparator.compare_element_nodes(node1, node2, opts, child_opts,
-                                              diff_children, differences)
-        elsif node1.respond_to?(:text?) && node1.text?
-          XmlComparator.compare_text_nodes(node1, node2, opts, differences)
-        elsif node1.respond_to?(:comment?) && node1.comment?
-          XmlComparator.compare_comment_nodes(node1, node2, opts, differences)
-        elsif node1.respond_to?(:cdata?) && node1.cdata?
-          XmlComparator.compare_text_nodes(node1, node2, opts, differences)
-        elsif node1.respond_to?(:processing_instruction?) && node1.processing_instruction?
-          XmlComparator.compare_processing_instruction_nodes(node1, node2,
-                                                             opts, differences)
-        elsif node1.respond_to?(:root)
+        case node1
+        when Nokogiri::XML::Document
           XmlComparator.compare_document_nodes(node1, node2, opts, child_opts,
                                                diff_children, differences)
+        when Nokogiri::XML::Node
+          if node1.element?
+            XmlComparator.compare_element_nodes(node1, node2, opts, child_opts,
+                                                diff_children, differences)
+          elsif node1.text?
+            XmlComparator.compare_text_nodes(node1, node2, opts, differences)
+          elsif node1.comment?
+            XmlComparator.compare_comment_nodes(node1, node2, opts, differences)
+          elsif node1.cdata?
+            XmlComparator.compare_text_nodes(node1, node2, opts, differences)
+          elsif node1.processing_instruction?
+            XmlComparator.compare_processing_instruction_nodes(node1, node2,
+                                                               opts, differences)
+          else
+            Comparison::EQUIVALENT
+          end
         else
           Comparison::EQUIVALENT
         end
@@ -431,10 +414,11 @@ differences)
       # @param node [Canon::Xml::Node, Object] Node to serialize
       # @return [String] XML string representation
       def self.serialize_node_to_xml(node)
-        if node.is_a?(Canon::Xml::Nodes::RootNode)
+        case node
+        when Canon::Xml::Nodes::RootNode
           # Serialize all children of root
           node.children.map { |child| serialize_node_to_xml(child) }.join
-        elsif node.is_a?(Canon::Xml::Nodes::ElementNode)
+        when Canon::Xml::Nodes::ElementNode
           # Serialize element with attributes and children
           attrs = node.attribute_nodes.map do |a|
             " #{a.name}=\"#{a.value}\""
@@ -448,14 +432,12 @@ differences)
           else
             "<#{node.name}#{attrs}>#{children_xml}</#{node.name}>"
           end
-        elsif node.is_a?(Canon::Xml::Nodes::TextNode)
+        when Canon::Xml::Nodes::TextNode
           node.value
-        elsif node.is_a?(Canon::Xml::Nodes::CommentNode)
+        when Canon::Xml::Nodes::CommentNode
           "<!--#{node.value}-->"
-        elsif node.is_a?(Canon::Xml::Nodes::ProcessingInstructionNode)
+        when Canon::Xml::Nodes::ProcessingInstructionNode
           "<?#{node.target} #{node.data}?>"
-        elsif node.respond_to?(:to_xml)
-          node.to_xml
         else
           node.to_s
         end

data/lib/canon/comparison.rb CHANGED Viewed

@@ -122,6 +122,65 @@ module Canon
     UNEQUAL_TYPES = 15
     UNEQUAL_PRIMITIVES = 16
+    # Human-readable labels for the integer comparison-result constants
+    # above.  Used by the diff reason builders so user-facing reason text
+    # never leaks raw numeric codes (e.g. "7 vs 7" — see lutaml/canon#127).
+    # String diff codes (e.g. "position 3" emitted by ChildComparison)
+    # pass through +code_label+ unchanged.
+    CODE_LABELS = {
+      EQUIVALENT => "equivalent",
+      MISSING_ATTRIBUTE => "missing attribute",
+      MISSING_NODE => "missing",
+      UNEQUAL_ATTRIBUTES => "attributes differ",
+      UNEQUAL_COMMENTS => "comments differ",
+      UNEQUAL_DOCUMENTS => "documents differ",
+      UNEQUAL_ELEMENTS => "elements differ",
+      UNEQUAL_NODES_TYPES => "node types differ",
+      UNEQUAL_TEXT_CONTENTS => "text content differs",
+      MISSING_HASH_KEY => "missing hash key",
+      UNEQUAL_HASH_VALUES => "hash values differ",
+      UNEQUAL_HASH_KEY_ORDER => "hash key order differs",
+      UNEQUAL_ARRAY_LENGTHS => "array lengths differ",
+      UNEQUAL_ARRAY_ELEMENTS => "array elements differ",
+      UNEQUAL_TYPES => "types differ",
+      UNEQUAL_PRIMITIVES => "primitives differ",
+    }.freeze
+    # Translate a comparison result code (Integer constant or String label
+    # like "position 3") into a human-readable reason fragment.  Unknown
+    # values pass through via +to_s+ as a defensive fallback.
+    #
+    # @param code [Integer, String] Comparison result code
+    # @return [String] Human-readable label
+    def self.code_label(code)
+      return code if code.is_a?(String)
+      CODE_LABELS[code] || code.to_s
+    end
+    # Build a "diff1 [vs diff2]" reason fragment that never leaks raw
+    # integer constants.  When both codes are equal, returns the single
+    # label (e.g. "elements differ") rather than "elements differ vs
+    # elements differ".  See lutaml/canon#127.
+    #
+    # @param diff1 [Integer, String] First diff code
+    # @param diff2 [Integer, String] Second diff code
+    # @return [String] Reason fragment
+    def self.code_pair_label(diff1, diff2)
+      return code_label(diff1) if diff1 == diff2
+      "#{code_label(diff1)} vs #{code_label(diff2)}"
+    end
+    # Extract parse-time errors from a parsed-tree or Nokogiri fragment.
+    # Delegates to NodeInspector for cross-backend type dispatch.
+    #
+    # @param node [Object, nil] Parsed node
+    # @return [Array<String>] Parse errors as strings (empty by default)
+    def self.parse_errors_for(node)
+      NodeInspector.parse_errors(node)
+    end
     class << self
       # Auto-detect format and compare two objects
       #
@@ -144,6 +203,35 @@ module Canon
         dom_diff(obj1, obj2, opts)
       end
+      # Summarize the first difference between two documents.
+      #
+      # Returns a human-readable string describing the first difference
+      # when documents differ, or "Equivalent" when they match.
+      # This is a convenience wrapper around +equivalent?+ with +verbose: true+.
+      #
+      # @param obj1 [Object] First object to compare
+      # @param obj2 [Object] Second object to compare
+      # @param opts [Hash] Comparison options (same as +equivalent?+)
+      # @return [String] Summary string
+      #
+      # @example
+      #   Canon::Comparison.summarize("<p>Hello</p>", "<p>World</p>")
+      #   # => "Not equivalent: text content differs at /p[1] (Hello vs World)"
+      #
+      #   Canon::Comparison.summarize("<p>Hello</p>", "<p>Hello</p>")
+      #   # => "Equivalent"
+      def summarize(obj1, obj2, opts = {})
+        result = equivalent?(obj1, obj2, opts.merge(verbose: true))
+        if result.is_a?(ComparisonResult)
+          result.summary
+        elsif result == true
+          "Equivalent"
+        else
+          "Not equivalent"
+        end
+      end
       # Define a custom comparison profile with DSL syntax
       #
       # @param name [Symbol] Profile name
@@ -602,26 +690,26 @@ module Canon
             # parsers can mutate the DOM).
             opts[:_original_str1] = obj1.dup if obj1.is_a?(String)
             opts[:_original_str2] = obj2.dup if obj2.is_a?(String)
-            if opts[:format] == :html5
-              # HTML5 fragment parsing is safe — it normalizes without
-              # destructive content-model mutations.
-              obj1 = HtmlParser.parse(obj1, :html5) if obj1.is_a?(String)
-              obj2 = HtmlParser.parse(obj2, :html5) if obj2.is_a?(String)
-            else
-              # HTML4 fragment parsing mutates the DOM (strips <body>
-              # attributes, re-parents <h1> content, etc.).  Use XML
-              # fragment parsing which preserves structure faithfully.
-              if obj1.is_a?(String)
-                obj1 = Nokogiri::XML.fragment(
-                  strip_xml_preamble(obj1),
-                )
-              end
-              if obj2.is_a?(String)
-                obj2 = Nokogiri::XML.fragment(
-                  strip_xml_preamble(obj2),
-                )
-              end
-            end
+            # Parse all HTML formats (:html, :html4, :html5) with
+            # Nokogiri::HTML5 so that html4 and html5 share HTML's
+            # whitespace-sensitivity semantics (issue #118).
+            #
+            # The previous html/html4 branch used Nokogiri::XML.fragment
+            # to dodge Nokogiri::HTML4.fragment's destructive DOM
+            # mutations. That avoided one problem but introduced a
+            # bigger one: XML whitespace rules were being applied to
+            # HTML content. HTML's content model — identical between
+            # HTML4 and HTML5 — treats whitespace-only text between
+            # block-level children as insignificant; XML treats every
+            # whitespace text node as significant. Routing html4 input
+            # through an XML parser therefore made
+            # be_html4_equivalent_to reject inputs that
+            # be_html5_equivalent_to (correctly) accepts.
+            # Nokogiri::HTML5.fragment is non-destructive (the original
+            # HTML4.fragment concern does not apply to it) and applies
+            # HTML's content model uniformly.
+            obj1 = HtmlParser.parse(obj1, :html5) if obj1.is_a?(String)
+            obj2 = HtmlParser.parse(obj2, :html5) if obj2.is_a?(String)
           end
         else
           format1 = FormatDetector.detect(obj1)
@@ -662,8 +750,14 @@ module Canon
         # but defined in config
         if Canon::Config.instance.respond_to?(comparison_format)
           format_config = Canon::Config.instance.public_send(comparison_format)
-          if opts[:match_profile].nil? && format_config.match.profile
-            opts[:match_profile] = format_config.match.profile
+          if opts[:global_profile].nil? && format_config.match.profile
+            # Config-sourced profile has *global* priority (applied before
+            # global_options), so that YAML profile_options like
+            # whitespace_type: :normalize can override the built-in profile
+            # (e.g. :spec_friendly)'s whitespace_type: :strict.  Writing to
+            # :match_profile here gave the config profile per-call priority,
+            # which incorrectly overrode the YAML's own overrides.
+            opts[:global_profile] = format_config.match.profile
           end
           # Pass YAML profile's extra match options (e.g., preserve_whitespace_elements)
           # that are stored in MatchConfig's resolver but not exposed via the
@@ -701,6 +795,33 @@ module Canon
         str
       end
+      # Decode HTML named entities (&nbsp; etc.) into literal characters
+      # or numeric character references so that Nokogiri::XML.fragment
+      # (which only understands the five XML entities) preserves them
+      # as text nodes instead of silently dropping them.
+      #
+      # Uses Nokogiri's HTML4 parser to resolve the entities — the
+      # text is extracted from a fragment so no structural tags are added.
+      #
+      # @param str [String] HTML string potentially containing named entities
+      # @return [String] String with named entities replaced by characters
+      def decode_html_entities(str)
+        # Fast path: skip if no ampersands present
+        return str unless str.include?("&")
+        # Parse as HTML fragment to resolve named entities, then
+        # re-serialize as text.  This converts &nbsp; → U+00A0, etc.
+        doc = Nokogiri::HTML4.fragment(str)
+        # Serialize back, preserving the resolved characters.
+        # to_html re-encodes characters, so use inner_html which
+        # keeps the character form.
+        doc.inner_html
+        # If the serialization re-encoded characters as entities,
+        # that's fine — the XML parser understands numeric refs like &#160;
+      end
       # Detect the format of an object (delegates to FormatDetector)
       #
       # @param obj [Object] Object to detect format of

data/lib/canon/config/env_schema.rb CHANGED Viewed

@@ -14,6 +14,7 @@ module Canon
         show_diffs: :symbol,
         verbose_diff: :boolean,
         algorithm: :symbol,
+        parser: :symbol,
         show_raw_inputs: :boolean,
         show_raw_expected: :boolean,
         show_raw_received: :boolean,
@@ -66,7 +67,7 @@ module Canon
         def all_diff_attributes
           %i[mode use_color context_lines grouping_lines show_diffs
-             verbose_diff algorithm show_raw_inputs show_raw_expected show_raw_received
+             verbose_diff algorithm parser show_raw_inputs show_raw_expected show_raw_received
              show_preprocessed_inputs show_preprocessed_expected show_preprocessed_received
              show_prettyprint_inputs show_prettyprint_expected show_prettyprint_received
              show_line_numbered_inputs character_visualization

data/lib/canon/config/profiles/metanorma.yml CHANGED Viewed

@@ -28,6 +28,9 @@ formats:
   xml:
     match:
       profile: spec_friendly
+      # Treat different Unicode whitespace types (space, NBSP, ideographic space, etc.)
+      # as equivalent — useful for spec comparisons where whitespace type doesn't matter
+      whitespace_type: :normalize
       # Elements where whitespace is PRESERVED exactly (no manipulation)
       # All whitespace characters are significant in these elements
       preserve_whitespace_elements: