RubyGems - canon - Versions diffs - 0.2.4 → 0.2.5 - Mend

canon 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

checksums.yaml +4 -4
data/docs/advanced/semantic-diff-report.adoc +65 -0
data/docs/features/diff-formatting/index.adoc +3 -0
data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
data/docs/reference/environment-variables.adoc +3 -1
data/lib/canon/comparison/comparison_result.rb +16 -2
data/lib/canon/comparison/html_comparator.rb +4 -0
data/lib/canon/comparison/markup_comparator.rb +49 -71
data/lib/canon/comparison/node_inspector.rb +103 -0
data/lib/canon/comparison/xml_comparator/child_comparison.rb +127 -55
data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
data/lib/canon/comparison/xml_comparator.rb +94 -3
data/lib/canon/comparison/xml_node_comparison.rb +37 -81
data/lib/canon/comparison.rb +59 -0
data/lib/canon/diff/diff_classifier.rb +37 -39
data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +119 -9
data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +75 -4
data/lib/canon/diff_formatter.rb +71 -2
data/lib/canon/pretty_printer/html.rb +76 -14
data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/data_model.rb +13 -1
data/lib/canon/xml/node.rb +15 -0
data/lib/canon/xml/sax_builder.rb +18 -0
metadata +5 -2

data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 # frozen_string_literal: true
+require "nokogiri"
 require_relative "../../xml/namespace_helper"
 module Canon
@@ -260,12 +261,15 @@ module Canon
           end
         end
-        # Serialize a Canon Xml node tree as compact XML for display.
+        # Serialize a node tree as compact XML for display.
         #
         # Produces a human-readable inline XML string without namespace
         # declarations and without indentation — suitable for use in Semantic
-        # Diff Report entries.  Only handles Canon::Xml::Nodes types; for any
-        # other node (Nokogiri, etc.) falls back to +get_node_text+.
+        # Diff Report entries.  Handles both +Canon::Xml::Nodes+ types and
+        # Nokogiri XML/HTML nodes (the html DOM comparison path uses
+        # Nokogiri nodes, so element-structure diffs originating there must
+        # be rendered structurally too — see issue #120).  For any other
+        # node type, falls back to +get_node_text+.
         #
         # @param node [Object] Node to serialize
         # @return [String] Compact XML string
@@ -294,12 +298,79 @@ module Canon
           when Canon::Xml::Nodes::CommentNode
             text = node.respond_to?(:value) ? node.value.to_s : ""
             "<!--#{CGI.escapeHTML(text)}-->"
+          when Nokogiri::XML::Text, Nokogiri::XML::CDATA
+            CGI.escapeHTML(node.content.to_s)
+          when Nokogiri::XML::Comment
+            "<!--#{CGI.escapeHTML(node.content.to_s)}-->"
+          when Nokogiri::XML::Element
+            tag = node.name.to_s
+            attrs = node.attribute_nodes.map do |a|
+              " #{a.name}=\"#{CGI.escapeHTML(a.value.to_s)}\""
+            end.join
+            children_xml = node.children.map do |c|
+              serialize_node_compact(c)
+            end.join
+            if children_xml.empty?
+              "<#{tag}#{attrs}/>"
+            else
+              "<#{tag}#{attrs}>#{children_xml}</#{tag}>"
+            end
           else
-            # Nokogiri nodes or other unknown types — fall back to text extraction
+            # Unknown node types — fall back to text extraction
             get_node_text(node)
           end
         end
+        # Serialize a node's open tag only — name + attributes, no children,
+        # no closing tag.  Used by +format_text_content_one_sided+ to render
+        # a brief parent-element context hint (e.g. +<div id="A">+) for a
+        # one-sided text diff, instead of the full ancestor subtree that
+        # +serialize_node_compact+ would produce.  See lutaml/canon#125.
+        #
+        # @param node [Object] Element node to serialize
+        # @return [String] Open-tag string, or "" for non-elements / nil
+        def self.serialize_open_tag(node)
+          require "cgi" # loaded on demand; supplies CGI.escapeHTML for attribute values
+          return "" unless node # nil (or false) yields the empty string
+          case node
+          when Canon::Xml::Nodes::ElementNode # Canon data-model element
+            tag = node.name.to_s
+            attrs = node.attribute_nodes.map do |attr|
+              " #{attr.name}=\"#{CGI.escapeHTML(attr.value.to_s)}\"" # leading space separates each attribute
+            end.join
+            "<#{tag}#{attrs}>" # open tag only: no children, no closing tag
+          when Nokogiri::XML::Element # Nokogiri element: rendered exactly like the Canon branch
+            tag = node.name.to_s
+            attrs = node.attribute_nodes.map do |a|
+              " #{a.name}=\"#{CGI.escapeHTML(a.value.to_s)}\"" # values are HTML-escaped, names are not
+            end.join
+            "<#{tag}#{attrs}>"
+          else
+            "" # any other node type has no open tag to show
+          end
+        end
+        # Return the raw text content of a text node without stripping
+        # whitespace.  +get_node_text+ strips ASCII whitespace, which
+        # destroys whitespace-only payloads that callers (e.g. one-sided
+        # text-content diff rendering) need to display verbatim.
+        #
+        # @param node [Object] Text node
+        # @return [String] Raw text content, or "" if not a text-bearing node
+        def self.raw_text_value(node)
+          return "" unless node # nil (or false) carries no text
+          case node
+          when Canon::Xml::Node # NOTE(review): matches every Canon node, not only text nodes — confirm all subclasses respond to #value
+            node.value.to_s # returned verbatim, no strip
+          when Nokogiri::XML::Node
+            node.content.to_s # returned verbatim, no strip
+          else
+            "" # unrecognised node type
+          end
+        end
         # Return the best display string for a node.
         #
         # When +compact: true+ and the node is a Canon ElementNode, returns a

data/lib/canon/diff_formatter.rb CHANGED Viewed

@@ -366,8 +366,13 @@ module Canon
     # @param actual [Object] Actual value
     # @return [String] Formatted diff output
     def format_comparison_result(comparison_result, expected, actual)
-      # Detect format from expected content
-      format = Canon::Comparison::FormatDetector.detect(expected)
+      # Prefer the matcher-supplied format (e.g. :html4 from
+      # be_html4_equivalent_to). Auto-detection from the expected string
+      # cannot distinguish HTML from XML for fragments like
+      # `<div class="x"></div>` and would mis-route HTML fixtures
+      # through the XML pretty-printer (issue #135).
+      format = (comparison_result.is_a?(Canon::Comparison::ComparisonResult) && comparison_result.format) ||
+        Canon::Comparison::FormatDetector.detect(expected)
       formatter_options = {
         use_color: @use_color,
@@ -392,6 +397,18 @@ module Canon
         output << "" # Blank line for spacing
       end
+      # Parse-error banner.  When libxml flagged any errors during
+      # parsing, surface them at the top of the report so the user
+      # is not left chasing diffs that describe a partial tree.
+      # See lutaml/canon#130.
+      if comparison_result.is_a?(Canon::Comparison::ComparisonResult) &&
+          comparison_result.parse_errors?
+        output << format_parse_error_banner(
+          comparison_result.parse_errors_expected,
+          comparison_result.parse_errors_received,
+        )
+      end
       # 1. CANON VERBOSE tables (ONLY if CANON_VERBOSE=1)
       verbose_tables = DebugOutput.verbose_tables_only(
         comparison_result,
@@ -507,6 +524,53 @@ module Canon
     private
+    # Render the parse-error banner that appears at the top of the
+    # diff report when libxml flagged any errors during parsing.
+    # Names the offending side(s) and warns that the diff below
+    # describes the parsed tree, not the input.  See lutaml/canon#130.
+    #
+    # @param errors_expected [Array<String>] Errors from the expected side
+    # @param errors_received [Array<String>] Errors from the received side
+    # @return [String] Multi-line banner
+    def format_parse_error_banner(errors_expected, errors_received)
+      lines = [] # banner is built line by line and joined at the end
+      rule = "=" * 70 # 70-character rule framing the title and closing the banner
+      lines << colorize(rule, :yellow, :bold)
+      lines << colorize("  ⚠️  PARSE ERRORS", :yellow, :bold)
+      lines << colorize(rule, :yellow, :bold)
+      if errors_expected.any? # section is omitted when this side has no errors
+        lines << colorize("  Expected side:", :yellow, :bold)
+        errors_expected.each do |err|
+          lines << "    #{colorize(err, :red)}" # one indented line per error
+        end
+      end
+      if errors_received.any? # section is omitted when this side has no errors
+        lines << colorize("  Received side:", :yellow, :bold)
+        errors_received.each do |err|
+          lines << "    #{colorize(err, :red)}" # one indented line per error
+        end
+      end
+      lines << "" # blank separator before the advisory text
+      lines << colorize(
+        "  ⚠️  The diff below describes the parsed tree, not the input.",
+        :yellow,
+      )
+      lines << colorize(
+        "      Content that the parser could not represent has been",
+        :yellow,
+      )
+      lines << colorize(
+        "      dropped and may appear as \"missing\" in the report.",
+        :yellow,
+      )
+      lines << colorize(rule, :yellow, :bold)
+      lines << "" # trailing empty entry makes the joined banner end with a newline
+      lines.join("\n")
+    end
     # Normalize content for display in diffs
     #
     # @param expected [Object] Expected value
@@ -850,6 +914,7 @@ module Canon
         collapse_whitespace_elements: @collapse_whitespace_elements,
         strip_whitespace_elements: @strip_whitespace_elements,
         sort_attributes: @pretty_printer_sort_attributes,
+        html_mode: %i[html html4 html5].include?(format),
       }
       printer_expected = Canon::PrettyPrinter::XmlNormalized.new(
@@ -931,9 +996,13 @@ module Canon
       if %i[html html4 html5].include?(format)
         require "canon/pretty_printer/html"
+        # Fixture-ready mode actually indents (libxml FORMAT save flag,
+        # combined with AS_XHTML).  The default mode is structurally faithful
+        # but does not indent on HTML5 input -- see lutaml/canon#133.
         printer = Canon::PrettyPrinter::Html.new(
           indent: @pretty_printer_indent,
           indent_type: indent_type_str,
+          fixture_ready: true,
         )
       elsif format == :xml
         require "canon/pretty_printer/xml"

data/lib/canon/pretty_printer/html.rb CHANGED Viewed

@@ -1,19 +1,43 @@
 # frozen_string_literal: true
 require "nokogiri"
+require "stringio"
+require_relative "html_void_elements"
 module Canon
   module PrettyPrinter
-    # Pretty printer for HTML with consistent indentation
+    # Pretty printer for HTML with consistent indentation.
+    #
+    # Two modes:
+    #
+    # 1. Default mode (+fixture_ready: false+): retains the existing
+    #    behaviour for callers that use the pretty-printer as a
+    #    structural normaliser (the canon round-trip tests, the
+    #    diff-pipeline +apply_pretty_print+ stage, etc).  These callers
+    #    do not require actual indentation; they require structural
+    #    equivalence to the input.
+    #
+    # 2. Fixture-ready mode (+fixture_ready: true+): emits
+    #    actually-indented XHTML-shaped output via libxml's +FORMAT+
+    #    save flag.  Used by +DiffFormatter#prettyprint_for_display+
+    #    (the +CANON_<FORMAT>_DIFF_SHOW_PRETTYPRINT_RECEIVED+ surface)
+    #    so the user can read or paste the formatted output directly
+    #    into a fixture heredoc.  Output is XHTML-shaped (void
+    #    elements self-closed, non-void paired) via the +AS_XHTML+
+    #    save flag; the +NO_DECLARATION+ flag suppresses the
+    #    +<?xml ...?>+ prefix.
+    #
+    # See lutaml/canon#133, lutaml/canon#135.
     class Html
-      def initialize(indent: 2, indent_type: "space")
+      def initialize(indent: 2, indent_type: "space", fixture_ready: false)
         @indent = indent.to_i
         @indent_type = indent_type
+        @fixture_ready = fixture_ready
       end
-      # Pretty print HTML with consistent indentation
       def format(html_string)
-        # Detect if this is XHTML or HTML
+        return format_fixture_ready(html_string) if @fixture_ready
         if xhtml?(html_string)
           format_as_xhtml(html_string)
         else
@@ -24,34 +48,72 @@ module Canon
       private
       def xhtml?(html_string)
-        # Check for XHTML DOCTYPE or xmlns attribute
         html_string.include?("XHTML") ||
           html_string.include?('xmlns="http://www.w3.org/1999/xhtml"')
       end
       def format_as_xhtml(html_string)
-        # Parse as XML for XHTML
         doc = Nokogiri::XML(html_string, &:noblanks)
-        # Use Nokogiri's built-in pretty printing
-        if @indent_type == "tab"
-          doc.to_xml(indent: 1, indent_text: "\t", encoding: "UTF-8")
-        else
-          doc.to_xml(indent: @indent, encoding: "UTF-8")
-        end
+        out = if @indent_type == "tab"
+                doc.to_xml(indent: 1, indent_text: "\t", encoding: "UTF-8")
+              else
+                doc.to_xml(indent: @indent, encoding: "UTF-8")
+              end
+        expand_non_void_self_closing(out)
       end
       def format_as_html(html_string)
-        # Parse as HTML5
         doc = Nokogiri::HTML5(html_string)
-        # Use Nokogiri's built-in pretty printing
         if @indent_type == "tab"
           doc.to_html(indent: 1, indent_text: "\t", encoding: "UTF-8")
         else
           doc.to_html(indent: @indent, encoding: "UTF-8")
         end
       end
+      # Fixture-ready serialisation: parse with Nokogiri::HTML5 (so we
+      # get permissive recovery on real-world Word / XHTML5 / HTML5
+      # input shapes), then write through libxml's XML writer with
+      # +FORMAT+ + +AS_XHTML+ + +NO_DECLARATION+.  +FORMAT+ inserts
+      # indentation; +AS_XHTML+ produces well-shaped output (void
+      # elements self-closed, non-void paired); +NO_DECLARATION+
+      # suppresses the +<?xml ...?>+ prefix.
+      def format_fixture_ready(html_string)
+        doc = Nokogiri::HTML5(html_string)
+        io = StringIO.new # in-memory sink that write_to serialises into
+        if @indent_type == "tab" # tab mode: one tab per level; @indent is not consulted
+          doc.write_to(io, save_with: fixture_ready_save_options,
+                           indent: 1, indent_text: "\t")
+        else # otherwise @indent units per level — presumably spaces via Nokogiri's default indent_text; verify
+          doc.write_to(io, save_with: fixture_ready_save_options,
+                           indent: @indent)
+        end
+        io.string # the serialised document as a String
+      end
+      def fixture_ready_save_options # bitmask passed as save_with: by format_fixture_ready
+        Nokogiri::XML::Node::SaveOptions::FORMAT | # insert indentation
+          Nokogiri::XML::Node::SaveOptions::AS_XHTML | # XHTML-shaped output
+          Nokogiri::XML::Node::SaveOptions::NO_DECLARATION # no <?xml ...?> prefix
+      end
+      # Rewrite +<tag …/>+ into +<tag …></tag>+ for every element name
+      # that is not an HTML5 void element. +<a/>+ is illegal HTML;
+      # void tags like +<br/>+ and +<img …/>+ pass through unchanged.
+      def expand_non_void_self_closing(html)
+        html.gsub(%r{<([A-Za-z][A-Za-z0-9:_-]*)((?:\s+[^<>"]*(?:"[^"]*"[^<>"]*)*)?)/>}) do
+          name = ::Regexp.last_match(1) # tag name
+          attrs = ::Regexp.last_match(2) # attribute text incl. leading whitespace; "" when the tag has none; only double-quoted values are recognised
+          if HtmlVoidElements.void?(name)
+            "<#{name}#{attrs}/>" # void element: self-closing form is kept
+          else
+            "<#{name}#{attrs}></#{name}>" # non-void element: rewritten with an explicit end tag
+          end
+        end
+      end
     end
   end
 end

data/lib/canon/pretty_printer/html_void_elements.rb ADDED Viewed

@@ -0,0 +1,20 @@
+# frozen_string_literal: true
+require "set"
+module Canon
+  module PrettyPrinter
+    # The 14 HTML5 void elements — those whose start tag may stand alone
+    # (with no end tag) and which cannot have any content. Every other
+    # element with no children must be written as +<tag></tag>+ in HTML;
+    # writing +<a/>+ is illegal HTML and is parsed as +<a>+ (start tag only).
+    module HtmlVoidElements
+      VOID = Set.new(%w[area base br col embed hr img input link meta param
+                        source track wbr]).freeze # lower-case names; the Set itself is frozen
+      def self.void?(name) # true when +name+ is one of the VOID tag names
+        VOID.include?(name.to_s.downcase) # to_s + downcase: the lookup is case-insensitive and accepts non-String names
+      end
+    end
+  end
+end

data/lib/canon/pretty_printer/xml_normalized.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require "nokogiri"
+require_relative "html_void_elements"
 module Canon
   module PrettyPrinter
@@ -133,12 +134,14 @@ module Canon
                      collapse_whitespace_elements: [],
                      strip_whitespace_elements: [],
                      pretty_printed: false,
-                     sort_attributes: false)
+                     sort_attributes: false,
+                     html_mode: false)
         @indent = indent.to_i
         @indent_char = indent_type == "tab" ? "\t" : " "
         @vis_map = visualization_map || default_vis_map
         @pretty_printed = pretty_printed
         @sort_attributes = sort_attributes
+        @html_mode = html_mode
         @strict_ws  = Set.new((preserve_whitespace_elements || []).map(&:to_s))
         @norm_ws    = Set.new((collapse_whitespace_elements || []).map(&:to_s))
@@ -151,10 +154,10 @@ module Canon
       # @return [String] Serialized XML, one node per line, with content
       #   whitespace visualized at line boundaries
       def format(xml_string)
-        doc = Nokogiri::XML(xml_string)
+        doc = @html_mode ? Nokogiri::HTML5(xml_string) : Nokogiri::XML(xml_string)
         lines = []
-        if doc.version
+        if !@html_mode && doc.version
           enc = doc.encoding ? " encoding=\"#{doc.encoding}\"" : ""
           lines << "<?xml version=\"#{doc.version}\"#{enc}?>"
         end
@@ -198,6 +201,10 @@ module Canon
         children = node.children.reject { |c| c.text? && c.content.empty? }
         if children.empty?
+          if @html_mode && !HtmlVoidElements.void?(node.name)
+            return "#{ind(depth)}#{open_tag(node)}</#{node.name}>"
+          end
           return "#{ind(depth)}#{open_tag(node,
                                           self_close: true)}"
         end

data/lib/canon/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Canon
-  VERSION = "0.2.4"
+  VERSION = "0.2.5"
 end

data/lib/canon/xml/data_model.rb CHANGED Viewed

@@ -31,7 +31,19 @@ module Canon
         check_for_relative_namespace_uris(doc)
         # Convert to XPath data model
-        build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
+        result = build_from_nokogiri(doc,
+                                     preserve_whitespace: preserve_whitespace)
+        # Carry libxml's parse errors on the resulting tree so the diff
+        # report can surface them (see lutaml/canon#130).  libxml's
+        # FATAL conditions (e.g. duplicate attributes) silently drop
+        # content from the parse tree; without surfacing the error
+        # list, downstream diffs describe the partial tree, not the
+        # input.
+        errors = Array(doc.errors).map(&:to_s)
+        result.parse_errors = errors if errors.any?
+        result
       end
       # Normalize XML string encoding to UTF-8

data/lib/canon/xml/node.rb CHANGED Viewed

@@ -24,6 +24,21 @@ module Canon
         @in_node_set = value
       end
+      # Parse-time errors carried alongside the node tree, captured at
+      # parse boundaries (Canon::Xml::DataModel.from_xml, etc.) so the
+      # diff report can surface libxml-level FATAL conditions that
+      # would otherwise be silently swallowed and produce misleading
+      # diffs against a partially-loaded tree.  See lutaml/canon#130.
+      #
+      # @return [Array<String>] Parse errors as strings (empty by default)
+      def parse_errors
+        @parse_errors || [] # never nil: a new empty array when nothing has been assigned
+      end
+      def parse_errors=(value) # attaches parse errors to this node
+        @parse_errors = Array(value) # Kernel#Array coercion: nil becomes [], a single value becomes [value]
+      end
       # Return the text content of this node and all descendants.
       # ElementNode concatenates children's text_content; other nodes
       # (TextNode, CommentNode, etc.) return their value.

data/lib/canon/xml/sax_builder.rb CHANGED Viewed

@@ -93,6 +93,23 @@ strip_doctype: false)
         # Track in-scope namespaces at each level
         # Each entry is a hash of prefix => uri
         @namespace_stack = [build_initial_namespaces]
+        # Captured libxml errors during SAX parsing.  Surfaced on the
+        # resulting RootNode so the diff report can warn the user
+        # when a FATAL parse error has caused content loss
+        # (see lutaml/canon#130).
+        @parse_errors = []
+      end
+      # SAX callbacks for libxml errors and warnings.  Without these
+      # overrides the default handlers swallow the events; with them,
+      # libxml's "Attribute xml:lang redefined" and similar messages
+      # land in @parse_errors and ride through to ComparisonResult.
+      def error(string)
+        @parse_errors << string.to_s.strip # message stored with surrounding whitespace removed
+      end
+      def warning(string)
+        @parse_errors << string.to_s.strip # warnings go into the same list as errors, so the two are not told apart afterwards
       end
       # Called when an element starts
@@ -229,6 +246,7 @@ strip_doctype: false)
         # followed by PIs and comments outside the document element
         # (C14N requires this ordering)
         reorder_children(@root)
+        @root.parse_errors = @parse_errors if @parse_errors.any?
         @root
       end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: canon
 version: !ruby/object:Gem::Version
-  version: 0.2.4
+  version: 0.2.5
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-04-27 00:00:00.000000000 Z
+date: 2026-05-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: diff-lcs
@@ -173,6 +173,7 @@ files:
 - docs/features/diff-formatting/index.adoc
 - docs/features/diff-formatting/pretty-diff-mode.adoc
 - docs/features/diff-formatting/themes.adoc
+- docs/features/diff-formatting/whitespace-adjacency.adoc
 - docs/features/environment-configuration/index.adoc
 - docs/features/environment-configuration/override-system.adoc
 - docs/features/environment-configuration/size-limits.adoc
@@ -244,6 +245,7 @@ files:
 - lib/canon/comparison/match_options/json_resolver.rb
 - lib/canon/comparison/match_options/xml_resolver.rb
 - lib/canon/comparison/match_options/yaml_resolver.rb
+- lib/canon/comparison/node_inspector.rb
 - lib/canon/comparison/profile_definition.rb
 - lib/canon/comparison/ruby_object_comparator.rb
 - lib/canon/comparison/strategies/base_match_strategy.rb
@@ -326,6 +328,7 @@ files:
 - lib/canon/options/cli_generator.rb
 - lib/canon/options/registry.rb
 - lib/canon/pretty_printer/html.rb
+- lib/canon/pretty_printer/html_void_elements.rb
 - lib/canon/pretty_printer/json.rb
 - lib/canon/pretty_printer/xml.rb
 - lib/canon/pretty_printer/xml_normalized.rb