RubyGems - canon - Versions diffs - 0.1.5 → 0.1.7 - Mend

canon 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (136) hide show

checksums.yaml +4 -4
data/.rubocop_todo.yml +163 -67
data/README.adoc +400 -7
data/docs/Gemfile +9 -0
data/docs/INDEX.adoc +99 -182
data/docs/_config.yml +100 -0
data/docs/advanced/diff-classification.adoc +547 -0
data/docs/advanced/diff-pipeline.adoc +358 -0
data/docs/advanced/index.adoc +214 -0
data/docs/advanced/semantic-diff-report.adoc +390 -0
data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
data/docs/features/diff-formatting/display-filtering.adoc +472 -0
data/docs/features/diff-formatting/index.adoc +140 -0
data/docs/features/environment-configuration/index.adoc +327 -0
data/docs/features/environment-configuration/override-system.adoc +436 -0
data/docs/features/environment-configuration/size-limits.adoc +273 -0
data/docs/features/index.adoc +173 -0
data/docs/features/input-validation/index.adoc +521 -0
data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
data/docs/features/match-options/html-policies.adoc +312 -0
data/docs/features/match-options/index.adoc +621 -0
data/docs/getting-started/index.adoc +83 -0
data/docs/getting-started/quick-start.adoc +76 -0
data/docs/guides/choosing-configuration.adoc +689 -0
data/docs/guides/index.adoc +181 -0
data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
data/docs/interfaces/index.adoc +101 -0
data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
data/docs/lychee.toml +65 -0
data/docs/reference/cli-options.adoc +418 -0
data/docs/reference/environment-variables.adoc +375 -0
data/docs/reference/index.adoc +204 -0
data/docs/reference/options-across-interfaces.adoc +417 -0
data/docs/understanding/algorithms/dom-diff.adoc +389 -0
data/docs/understanding/algorithms/index.adoc +314 -0
data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
data/docs/understanding/architecture.adoc +447 -0
data/docs/understanding/comparison-pipeline.adoc +317 -0
data/docs/understanding/formats/html.adoc +380 -0
data/docs/understanding/formats/index.adoc +261 -0
data/docs/understanding/formats/json.adoc +390 -0
data/docs/understanding/formats/xml.adoc +366 -0
data/docs/understanding/formats/yaml.adoc +504 -0
data/docs/understanding/index.adoc +130 -0
data/lib/canon/cli.rb +42 -1
data/lib/canon/commands/diff_command.rb +108 -23
data/lib/canon/comparison/compare_profile.rb +101 -0
data/lib/canon/comparison/comparison_result.rb +41 -2
data/lib/canon/comparison/html_comparator.rb +292 -71
data/lib/canon/comparison/html_compare_profile.rb +117 -0
data/lib/canon/comparison/match_options.rb +42 -4
data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
data/lib/canon/comparison/xml_comparator.rb +695 -91
data/lib/canon/comparison.rb +207 -2
data/lib/canon/config/env_provider.rb +71 -0
data/lib/canon/config/env_schema.rb +58 -0
data/lib/canon/config/override_resolver.rb +55 -0
data/lib/canon/config/type_converter.rb +59 -0
data/lib/canon/config.rb +158 -29
data/lib/canon/data_model.rb +29 -0
data/lib/canon/diff/diff_classifier.rb +74 -14
data/lib/canon/diff/diff_context_builder.rb +41 -0
data/lib/canon/diff/diff_line.rb +18 -2
data/lib/canon/diff/diff_node.rb +18 -3
data/lib/canon/diff/diff_node_mapper.rb +71 -12
data/lib/canon/diff/formatting_detector.rb +53 -0
data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
data/lib/canon/diff_formatter/debug_output.rb +7 -1
data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
data/lib/canon/diff_formatter/legend.rb +42 -0
data/lib/canon/diff_formatter.rb +78 -9
data/lib/canon/errors.rb +56 -0
data/lib/canon/formatters/html_formatter_base.rb +35 -1
data/lib/canon/formatters/json_formatter.rb +3 -0
data/lib/canon/formatters/yaml_formatter.rb +3 -0
data/lib/canon/html/data_model.rb +229 -0
data/lib/canon/html.rb +9 -0
data/lib/canon/options/cli_generator.rb +70 -0
data/lib/canon/options/registry.rb +234 -0
data/lib/canon/rspec_matchers.rb +34 -13
data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
data/lib/canon/tree_diff/core/matching.rb +241 -0
data/lib/canon/tree_diff/core/node_signature.rb +164 -0
data/lib/canon/tree_diff/core/node_weight.rb +135 -0
data/lib/canon/tree_diff/core/tree_node.rb +450 -0
data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
data/lib/canon/tree_diff/operation_converter.rb +631 -0
data/lib/canon/tree_diff/operations/operation.rb +92 -0
data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
data/lib/canon/tree_diff.rb +33 -0
data/lib/canon/validators/json_validator.rb +3 -1
data/lib/canon/validators/yaml_validator.rb +3 -1
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/data_model.rb +22 -23
data/lib/canon/xml/element_matcher.rb +128 -20
data/lib/canon/xml/namespace_helper.rb +110 -0
data/lib/canon.rb +3 -0
metadata +81 -23
data/_config.yml +0 -116
data/docs/ADVANCED_TOPICS.adoc +0 -20
data/docs/BASIC_USAGE.adoc +0 -16
data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
data/docs/DIFF_ARCHITECTURE.adoc +0 -435
data/docs/DIFF_FORMATTING.adoc +0 -540
data/docs/FORMATS.adoc +0 -447
data/docs/INPUT_VALIDATION.adoc +0 -477
data/docs/MATCH_ARCHITECTURE.adoc +0 -463
data/docs/MATCH_OPTIONS.adoc +0 -719
data/docs/MODES.adoc +0 -432
data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
data/docs/OPTIONS.adoc +0 -1387
data/docs/PREPROCESSING.adoc +0 -491
data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
data/docs/UNDERSTANDING_CANON.adoc +0 -17

data/lib/canon/comparison/html_comparator.rb CHANGED Viewed

@@ -1,11 +1,16 @@
 # frozen_string_literal: true
 require "nokogiri"
+require_relative "../comparison" # Load base module with constants first
 require_relative "xml_comparator"
 require_relative "match_options"
 require_relative "comparison_result"
+require_relative "compare_profile"
+require_relative "html_compare_profile"
 require_relative "../diff/diff_node"
 require_relative "../diff/diff_classifier"
+require_relative "strategies/match_strategy_factory"
+require_relative "../html/data_model"
 module Canon
   module Comparison
@@ -59,24 +64,41 @@ module Canon
             global_options: opts[:global_options],
           )
+          # Parse nodes to detect HTML version before creating profile
+          # We need to parse early to know if we're dealing with HTML4 or HTML5
+          node1 = parse_node(html1, match_opts_hash[:preprocessing],
+                             match_opts_hash)
+          node2 = parse_node(html2, match_opts_hash[:preprocessing],
+                             match_opts_hash)
+          # Detect HTML version from parsed nodes
+          html_version = detect_html_version_from_node(node1)
+          # Create HTML-specific compare profile
+          compare_profile = HtmlCompareProfile.new(
+            match_opts_hash,
+            html_version: html_version,
+          )
           # Wrap in ResolvedMatchOptions for DiffClassifier
           match_opts = Canon::Comparison::ResolvedMatchOptions.new(
             match_opts_hash,
             format: :html,
+            compare_profile: compare_profile,
           )
           # Store resolved match options hash for use in comparison logic
           opts[:match_opts] = match_opts_hash
+          # Use tree diff if semantic_diff option is enabled
+          if match_opts.semantic_diff?
+            return perform_semantic_tree_diff(html1, html2, opts,
+                                              match_opts_hash)
+          end
           # Create child_opts with resolved options
           child_opts = opts.merge(child_opts)
-          # Parse nodes if they are strings, applying preprocessing if needed
-          node1 = parse_node(html1, match_opts_hash[:preprocessing],
-                             match_opts_hash)
-          node2 = parse_node(html2, match_opts_hash[:preprocessing],
-                             match_opts_hash)
           # Serialize preprocessed nodes for diff display (avoid re-preprocessing)
           preprocessed_str1 = serialize_for_display(node1)
           preprocessed_str2 = serialize_for_display(node2)
@@ -86,11 +108,19 @@ module Canon
           # DocumentFragment nodes need special handling - compare their children
           # instead of the fragment nodes themselves
-          if node1.is_a?(Nokogiri::HTML4::DocumentFragment) &&
-              node2.is_a?(Nokogiri::HTML4::DocumentFragment)
-            # Compare children of fragments
-            children1 = node1.children.to_a
-            children2 = node2.children.to_a
+          if (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
+              node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
+              (node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
+              node2.is_a?(Nokogiri::XML::DocumentFragment))
+            # Compare children of fragments - filter them first
+            all_children1 = node1.children.to_a
+            all_children2 = node2.children.to_a
+            # Filter children based on match options (e.g., ignore comments)
+            children1 = XmlComparator.send(:filter_children, all_children1,
+                                           opts)
+            children2 = XmlComparator.send(:filter_children, all_children2,
+                                           opts)
             if children1.length != children2.length
               result = Comparison::UNEQUAL_ELEMENTS
@@ -129,79 +159,205 @@ module Canon
               format: :html,
               html_version: detect_html_version_from_node(node1),
               match_options: match_opts_hash,
+              algorithm: :dom,
             )
+          elsif result != Comparison::EQUIVALENT && !differences.empty?
+            # Non-verbose mode: check equivalence
+            # If comparison found differences, classify them to determine if normative
+            classifier = Canon::Diff::DiffClassifier.new(match_opts)
+            classifier.classify_all(differences.select do |d|
+              d.is_a?(Canon::Diff::DiffNode)
+            end)
+            # Equivalent if no normative differences (matches semantic algorithm)
+            differences.none?(&:normative?)
           else
+            # Either equivalent or no differences tracked
             result == Comparison::EQUIVALENT
           end
         end
         private
+        # Perform semantic tree diff using SemanticTreeMatchStrategy
+        #
+        # @param html1 [String, Nokogiri::HTML::Document] First HTML
+        # @param html2 [String, Nokogiri::HTML::Document] Second HTML
+        # @param opts [Hash] Comparison options
+        # @param match_opts_hash [Hash] Resolved match options
+        # @return [Boolean, ComparisonResult] Result of tree diff comparison
+        def perform_semantic_tree_diff(html1, html2, opts, match_opts_hash)
+          # Parse both inputs with parse_node_for_semantic, which applies the
+          # requested preprocessing to the string form before building the tree
+          node1 = parse_node_for_semantic(html1,
+                                          match_opts_hash[:preprocessing])
+          node2 = parse_node_for_semantic(html2,
+                                          match_opts_hash[:preprocessing])
+          # Build the HTML match strategy via the factory from the resolved options
+          strategy = Strategies::MatchStrategyFactory.create(
+            format: :html,
+            match_options: match_opts_hash,
+          )
+          # Hand the parsed nodes to the strategy as-is; the result is the list of differences
+          differences = strategy.match(node1, node2)
+          # Result shape depends on opts[:verbose]
+          if opts[:verbose]
+            # Ask the strategy for the strings to show in the diff display
+            preprocessed = strategy.preprocess_for_display(node1, node2)
+            # No detection happens on this path: the version is fixed to :html5
+            html_version = :html5
+            # Verbose: ComparisonResult with strategy.metadata merged into match_options
+            ComparisonResult.new(
+              differences: differences,
+              preprocessed_strings: preprocessed,
+              format: :html,
+              html_version: html_version,
+              match_options: match_opts_hash.merge(strategy.metadata),
+              algorithm: :semantic,
+            )
+          else
+            # Non-verbose: boolean, true when no difference is normative
+            differences.none?(&:normative?)
+          end
+        end
+        # Parse node as fragment to preserve actual content
+        # Parses with Nokogiri::XML.fragment (no HTML4/HTML5 fragment parser is invoked)
+        #
+        # @param node [String, Nokogiri node] Node to parse
+        # @param preprocessing [Symbol] Preprocessing mode
+        # @param match_opts [Hash] Match options
+        # @return [Nokogiri::XML::DocumentFragment] Parsed fragment
+        def parse_node_as_fragment(node, preprocessing = :none, match_opts = {})
+          # Early return for anything that is_a? XML::DocumentFragment; note this skips the :rendered step below
+          if node.is_a?(Nokogiri::XML::DocumentFragment)
+            return node
+          end
+          # Otherwise obtain an HTML string: to_s for HTML4/HTML5 fragments, the string itself, else to_html
+          # NOTE(review): if Nokogiri's HTML fragment classes subclass XML::DocumentFragment, the guard above already returned them - confirm
+          html_string = if node.is_a?(Nokogiri::HTML4::DocumentFragment) ||
+              node.is_a?(Nokogiri::HTML5::DocumentFragment)
+                          node.to_s # Use to_s to avoid re-inserting meta tags
+                        elsif node.is_a?(String)
+                          node
+                        else
+                          node.to_html
+                        end
+          # Use XML fragment parser to preserve structure without auto-generated elements
+          # This avoids both HTML4's meta tag insertion and HTML5's tag stripping
+          # See: https://stackoverflow.com/questions/25998824/stop-nokogiri-from-adding-doctype-and-meta-tags
+          frag = Nokogiri::XML.fragment(html_string)
+          # Only :rendered is applied here; any other preprocessing value leaves the fragment as parsed
+          if preprocessing == :rendered
+            normalize_html_style_script_comments(frag)
+            normalize_rendered_whitespace(frag, match_opts)
+            remove_whitespace_only_text_nodes(frag)
+          end
+          frag
+        end
+        # Parse HTML for semantic tree diff using Canon::Html::DataModel
+        # Returns Canon::Xml::Node for preprocessing preservation
+        #
+        # @param html [String, Object] HTML to parse
+        # @param preprocessing [Symbol] Preprocessing mode
+        # @return [Canon::Xml::Node] Parsed Canon node
+        def parse_node_for_semantic(html, preprocessing = :none)
+          # Already a Canon::Xml::Node: returned untouched (no preprocessing is applied)
+          return html if html.is_a?(Canon::Xml::Node)
+          # Obtain a string: the input itself, else to_html, else to_s (the raise is only reached when none apply)
+          html_string = if html.is_a?(String)
+                          html
+                        elsif html.respond_to?(:to_html)
+                          html.to_html
+                        elsif html.respond_to?(:to_s)
+                          html.to_s
+                        else
+                          raise Canon::Error,
+                                "Unable to convert HTML to string: #{html.class}"
+                        end
+          # Remove every <!DOCTYPE ...> declaration (case-insensitive) and trim surrounding whitespace
+          html_string = html_string.gsub(/<!DOCTYPE[^>]*>/i, "").strip
+          # Apply the string-level preprocessing chosen by the caller before parsing
+          processed_html = case preprocessing
+                           when :normalize
+                             # Strip each line and drop lines that end up empty
+                             html_string.lines.map(&:strip).reject(&:empty?).join("\n")
+                           when :c14n
+                             # Canonicalize with comments excluded (with_comments: false)
+                             Canon::Xml::C14n.canonicalize(html_string,
+                                                           with_comments: false)
+                           when :format
+                             # Pretty format as HTML
+                             Canon.format(html_string, :html)
+                           else
+                             # :none, :rendered or any other value - string is used unchanged
+                             html_string
+                           end
+          # Build the Canon node tree from the processed string
+          # via Canon::Html::DataModel.from_html
+          Canon::Html::DataModel.from_html(processed_html)
+        end
         # Parse a node from string or return as-is
         # Applies preprocessing transformation before parsing if specified
+        # For DOM comparison, returns Nokogiri nodes (not Canon::Xml::Node)
         def parse_node(node, preprocessing = :none, match_opts = {})
           # If already a Nokogiri node, check for incompatible XML documents
-          # Only raise error for non-string incompatible formats
           unless node.is_a?(String)
             # Detect if this is an XML document (not HTML)
-            # Strings are allowed since they can be wrapped/parsed as needed
             if is_xml_document?(node)
               raise Canon::CompareFormatMismatchError.new(:xml, :html)
             end
-            # For :rendered preprocessing, apply normalization even to pre-parsed nodes
-            if preprocessing == :rendered
-              # If already a DocumentFragment with :rendered, just normalize it
-              if node.is_a?(Nokogiri::HTML4::DocumentFragment) ||
-                  node.is_a?(Nokogiri::HTML5::DocumentFragment) ||
-                  node.is_a?(Nokogiri::XML::DocumentFragment)
-                # Normalize whitespace directly without re-parsing
-                normalize_html_style_script_comments(node)
-                normalize_rendered_whitespace(node, match_opts)
-                return node
+            # Normalize HTML documents to fragments to avoid DTD differences
+            # This ensures comparing string with document works correctly
+            if node.is_a?(Nokogiri::HTML::Document) ||
+                node.is_a?(Nokogiri::HTML4::Document) ||
+                node.is_a?(Nokogiri::HTML5::Document)
+              # Get root element and create fragment from its outer HTML
+              # This avoids DOCTYPE and other document-level nodes
+              root = node.at_css("html") || node.root
+              if root
+                node = Nokogiri::XML.fragment(root.to_html)
               end
+            end
-              # Normalize whitespace directly without re-parsing
-              normalize_html_style_script_comments(node)
-              normalize_rendered_whitespace(node, match_opts)
-              return node
+            # For :rendered preprocessing with Nokogiri nodes
+            if preprocessing == :rendered
+              # Normalize and return
+              frag = node.is_a?(Nokogiri::XML::DocumentFragment) ? node : Nokogiri::XML.fragment(node.to_html)
+              normalize_html_style_script_comments(frag)
+              normalize_rendered_whitespace(frag, match_opts)
+              remove_whitespace_only_text_nodes(frag)
+              return frag
             end
-            # For other preprocessing, just return the node (including DocumentFragments)
+            # Return Nokogiri node (now normalized if it was a document)
             return node
           end
           # Check if string contains XML declaration but is actually HTML
-          # Nokogiri::HTML4.to_s adds <?xml...?> but the content is still HTML
-          # Check if this is actually HTML content after the declaration
-          # Look for <html tag which indicates HTML
           if node.strip.start_with?("<?xml") && !node.match?(/<html[\s>]/i)
             # No <html> tag, this is likely pure XML
             raise Canon::CompareFormatMismatchError.new(:xml, :html)
           end
-          # Has <?xml but also <html> tag, so it's HTML with XML declaration
-          # (common output from Nokogiri::HTML4#to_s)
-          # For :rendered preprocessing, handle separately to avoid double-parsing
-          if preprocessing == :rendered
-            # Check if this is a full HTML document or a fragment
-            # Use full document parsing if it has <html> tag
-            if node.match?(/<html[\s>]/i)
-              doc = Nokogiri::HTML(node, &:noblanks)
-              normalize_html_style_script_comments(doc)
-              normalize_rendered_whitespace(doc, match_opts)
-              remove_whitespace_only_text_nodes(doc)
-              return doc
-            else
-              # Use fragment for partial HTML
-              frag = Nokogiri::HTML4.fragment(node)
-              normalize_html_style_script_comments(frag)
-              normalize_rendered_whitespace(frag, match_opts)
-              remove_whitespace_only_text_nodes(frag)
-              return frag
-            end
-          end
+          # Strip DOCTYPE declarations from HTML strings
+          # This normalizes parsed HTML (which includes DOCTYPE) with raw HTML strings
+          node = node.gsub(/<!DOCTYPE[^>]*>/i, "").strip
           # Apply preprocessing to HTML string before parsing
           html_string = case preprocessing
@@ -216,15 +372,52 @@ module Canon
                           # Pretty format the HTML
                           Canon.format(node, :html)
                         else
-                          # :none or unrecognized - use as-is
+                          # :none, :rendered or unrecognized - use as-is
                           node
                         end
-          # Use Nokogiri for HTML and normalize style/script comments
-          # Use noblanks to prevent Nokogiri from adding structural whitespace
-          doc = Nokogiri::HTML(html_string, &:noblanks)
-          normalize_html_style_script_comments(doc)
-          doc
+          # Parse as Nokogiri fragment for DOM comparison
+          # Use XML fragment parser to avoid auto-inserted meta tags
+          frag = Nokogiri::XML.fragment(html_string)
+          # Apply :rendered preprocessing if needed
+          if preprocessing == :rendered
+            normalize_html_style_script_comments(frag)
+            normalize_rendered_whitespace(frag, match_opts)
+            remove_whitespace_only_text_nodes(frag)
+          end
+          frag
+        end
+        # Normalize HTML comments within style and script tags for DataModel nodes
+        def normalize_html_style_script_comments_datamodel(root)
+          # Thin entry point: the recursive walk lives in find_and_normalize_style_script
+          find_and_normalize_style_script(root)
+        end
+        def find_and_normalize_style_script(node)
+          return unless node.respond_to?(:children)
+          node.children.each do |child|
+            next unless child.is_a?(Canon::Xml::Nodes::ElementNode)
+            # style/script elements (name matched case-insensitively) get their text children cleaned
+            if %w[style script].include?(child.name.downcase)
+              # Only direct TextNode children are rewritten; other child types are skipped
+              child.children.each do |text_child|
+                next unless text_child.is_a?(Canon::Xml::Nodes::TextNode)
+                # Delete <!-- ... --> spans (multi-line, non-greedy) and trim the remainder
+                normalized = text_child.value.gsub(/<!--.*?-->/m, "").strip
+                # Mutates the node in place by writing @value directly (no setter is used)
+                text_child.instance_variable_set(:@value, normalized)
+              end
+            end
+            # Recurse into every element child, including style/script themselves
+            find_and_normalize_style_script(child)
+          end
+        end
         # Detect HTML version from content
@@ -244,12 +437,12 @@ module Canon
           end
         end
-        # Detect HTML version from Nokogiri node
+        # Detect HTML version from node
         #
-        # @param node [Nokogiri::XML::Node] Nokogiri HTML node
+        # @param node [Canon::Xml::Node, Nokogiri::XML::Node] HTML node
         # @return [Symbol] :html5 or :html4
         def detect_html_version_from_node(node)
-          # Check node type
+          # Check node type for Nokogiri
           if node.is_a?(Nokogiri::HTML5::Document) ||
               node.is_a?(Nokogiri::HTML5::DocumentFragment)
             :html5
@@ -257,20 +450,27 @@ module Canon
               node.is_a?(Nokogiri::HTML4::DocumentFragment)
             :html4
           else
-            # Default to HTML4 for compatibility
-            :html4
+            # Default to HTML5 for Canon::Xml::Node and unknown types
+            :html5
           end
         end
         # Serialize node to string for diff display
         # This ensures the displayed diff matches what was compared
         #
-        # @param node [Nokogiri::HTML::Document] Parsed HTML node
+        # @param node [Canon::Xml::Node, Nokogiri::HTML::Document] Parsed node
         # @return [String] Serialized HTML string
         def serialize_for_display(node)
-          # Get string representation with formatting for line-by-line diffs
-          # Use to_html which preserves line structure for diff display
-          node.to_html
+          # Use XmlComparator's serializer for Canon::Xml::Node
+          if node.is_a?(Canon::Xml::Node)
+            XmlComparator.send(:serialize_node_to_xml, node)
+          elsif node.respond_to?(:to_html)
+            node.to_html
+          elsif node.respond_to?(:to_xml)
+            node.to_xml
+          else
+            node.to_s
+          end
         end
         # Normalize HTML comments within style and script tags
@@ -301,14 +501,25 @@ module Canon
         #
         # @param doc [Nokogiri::HTML::Document] Document to normalize
         # @param match_opts [Hash] Match options to respect during normalization
-        def normalize_rendered_whitespace(doc, match_opts = {})
+        # @param compare_profile [HtmlCompareProfile] Optional profile for whitespace rules
+        def normalize_rendered_whitespace(doc, match_opts = {},
+compare_profile = nil)
           # If text_content is :strict, don't normalize ANY text content
           # This allows users to explicitly request strict text matching
           return if match_opts[:text_content] == :strict
           # Elements where whitespace is significant - don't normalize
-          # This is an HTML rendering rule, not a match option
-          preserve_whitespace = %w[pre code textarea script style]
+          # Use profile if available, otherwise use default list
+          preserve_whitespace = if compare_profile.is_a?(HtmlCompareProfile)
+                                  # Profile handles HTML-specific whitespace rules
+                                  %w[pre code textarea script
+                                     style].select do |elem|
+                                    compare_profile.preserve_whitespace?(elem)
+                                  end
+                                else
+                                  # Fallback to default list
+                                  %w[pre code textarea script style]
+                                end
           # Walk all text nodes
           doc.xpath(".//text()").each do |text_node|
@@ -360,8 +571,18 @@ module Canon
         # Remove whitespace-only text nodes from the document
         # These are typically insignificant in HTML rendering (e.g., between
         # block elements)
+        #
+        # CRITICAL: Do NOT remove whitespace-only text nodes from whitespace-sensitive
+        # elements like <pre>, <code>, <textarea>, <script>, <style>
         def remove_whitespace_only_text_nodes(doc)
+          # Elements where whitespace is significant - don't remove whitespace-only nodes
+          preserve_whitespace = %w[pre code textarea script style]
           doc.xpath(".//text()").each do |text_node|
+            # CRITICAL: Skip if this text node is inside a whitespace-preserving element
+            parent = text_node.parent
+            next if ancestor_preserves_whitespace?(parent, preserve_whitespace)
             # Remove if the text is only whitespace (after normalization)
             if text_node.content.strip.empty?
               text_node.remove

data/lib/canon/comparison/html_compare_profile.rb ADDED Viewed

@@ -0,0 +1,117 @@
+# frozen_string_literal: true
+require_relative "compare_profile"
+module Canon
+  module Comparison
+    # HtmlCompareProfile extends CompareProfile with HTML-specific comparison policies
+    #
+    # HTML has different semantics than XML:
+    # 1. Comments are presentational (default to :ignore unless explicitly :strict)
+    # 2. Whitespace preservation required in specific elements (see preserve_whitespace?)
+    # 3. Case sensitivity differs between HTML4 and HTML5 (see case_sensitive?)
+    # 4. Self-closing tags handled differently (not implemented by any method in this class)
+    #
+    # This class provides HTML-specific policy decisions while maintaining
+    # the separation of concerns established by CompareProfile.
+    class HtmlCompareProfile < CompareProfile
+      attr_reader :html_version
+      # @param match_options [ResolvedMatchOptions, Hash] The match options to use
+      # @param html_version [Symbol] The HTML version (:html4 or :html5); defaults to :html5
+      def initialize(match_options, html_version: :html5)
+        super(match_options)
+        @html_version = html_version
+      end
+      # Override for HTML-specific comment handling
+      #
+      # In HTML, comments are presentational content (not part of the DOM semantics)
+      # unless explicitly set to :strict. This differs from XML where comments
+      # may carry semantic meaning.
+      #
+      # HTML default for comments is :ignore, so comments don't affect equivalence
+      # unless the user explicitly sets comments: :strict
+      #
+      # @param dimension [Symbol] The match dimension to check
+      # @return [Boolean] true if differences affect equivalence
+      def affects_equivalence?(dimension)
+        # Comments in HTML: default is :ignore (presentational)
+        # Only affect equivalence if explicitly set to :strict
+        if dimension == :comments
+          # Hash options: the :comments key must be present AND equal to :strict
+          if match_options.is_a?(Hash)
+            # If comments key doesn't exist, default to false (HTML default: ignore)
+            return false unless match_options.key?(:comments)
+            # If key exists, check if it's :strict
+            return match_options[:comments] == :strict
+          elsif match_options.respond_to?(:behavior_for)
+            behavior = behavior_for(dimension)
+            # Non-Hash options: behavior_for is invoked on this profile, not on match_options; only :strict counts
+            return behavior == :strict
+          end
+          # Neither a Hash nor behavior_for-capable: comments never affect equivalence
+          return false
+        end
+        # Every other dimension is delegated unchanged to CompareProfile via super
+        super
+      end
+      # Check if whitespace should be preserved for a given element
+      #
+      # HTML has specific elements where whitespace is significant:
+      # <pre>, <code>, <textarea>, <script>, <style>
+      #
+      # @param element_name [String] The element name to check (converted with to_s and downcased)
+      # @return [Boolean] true if whitespace should be preserved
+      def preserve_whitespace?(element_name)
+        whitespace_sensitive_elements.include?(element_name.to_s.downcase)
+      end
+      # Check if element names should be compared case-sensitively
+      # Policy encoded here: HTML4 is case-insensitive, HTML5 is case-sensitive.
+      # Only :html5 yields true; :html4 and any other html_version value yield false.
+      #
+      # @return [Boolean] true if case-sensitive comparison
+      def case_sensitive?
+        @html_version == :html5
+      end
+      private
+      # Elements where whitespace is semantically significant in HTML
+      # @return [Array<String>] List of element names (a new array on every call)
+      def whitespace_sensitive_elements
+        %w[pre code textarea script style]
+      end
+      # Check whether the resolved behavior for a dimension is :strict (not called within this class)
+      # @param dimension [Symbol] The match dimension
+      # @return [Boolean] true when behavior_for(dimension) is :strict (explicitness itself is not checked)
+      def explicitly_strict?(dimension)
+        behavior_for(dimension) == :strict
+      end
+      # Check if an option was explicitly provided in match_options (not called within this class)
+      # @param dimension [Symbol] The match dimension
+      # @return [Boolean] true if option was explicitly set (for non-Hash options: true whenever [] does not raise)
+      def has_explicit_option?(dimension)
+        if match_options.is_a?(Hash)
+          match_options.key?(dimension)
+        elsif match_options.respond_to?(:[])
+          # Non-Hash indexable options: the looked-up value is discarded; only a raised StandardError yields false
+          begin
+            match_options[dimension]
+            true
+          rescue StandardError
+            false
+          end
+        else
+          false
+        end
+      end
+    end
+  end
+end