RubyGems - canon - Versions diffs - 0.1.13 → 0.1.14 - Mend

canon 0.1.13 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

checksums.yaml +4 -4
data/.rubocop_todo.yml +69 -21
data/README.adoc +41 -0
data/docs/interfaces/ruby-api/index.adoc +26 -0
data/docs/understanding/formats/xml.adoc +25 -0
data/lib/canon/color_detector.rb +16 -13
data/lib/canon/comparison/dimensions/text_content_dimension.rb +9 -1
data/lib/canon/comparison/html_comparator.rb +89 -35
data/lib/canon/comparison/html_parser.rb +22 -0
data/lib/canon/comparison/markup_comparator.rb +39 -4
data/lib/canon/comparison/profile_definition.rb +1 -1
data/lib/canon/comparison/xml_comparator/child_comparison.rb +87 -9
data/lib/canon/comparison/xml_comparator.rb +44 -7
data/lib/canon/comparison/xml_node_comparison.rb +107 -9
data/lib/canon/comparison.rb +44 -0
data/lib/canon/config/env_schema.rb +2 -1
data/lib/canon/config.rb +10 -0
data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +9 -0
data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +4 -0
data/lib/canon/formatters/xml_formatter.rb +20 -0
data/lib/canon/html/data_model.rb +26 -4
data/lib/canon/rspec_matchers.rb +15 -0
data/lib/canon/tree_diff/adapters/html_adapter.rb +20 -2
data/lib/canon/tree_diff/matchers/hash_matcher.rb +12 -2
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/element_matcher.rb +70 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 60a7f65c6d95c4c10672244fe19a027022aadb8de563d9f3ac58da151085e883
-  data.tar.gz: 47f17024dd3d1a7055cef281439038d42bfb92527765f92d68392fbb14390d39
+  metadata.gz: 81995d22ec29adb9b2fb60f0ed8bc0219fe28e468c89a2001901b0f4521c757b
+  data.tar.gz: fabc6e6c77e92848783e747459377caa787330d3360f83f544b6372cc68ba227
 SHA512:
-  metadata.gz: 526cfa7a890447be2abc8bc358ac96a67a58ed6cb8016beebb65d087e44de48ef742c90be9ab50960c2b3d543bc7e3a7118af88a4f02af0a3e85eeea83161f14
-  data.tar.gz: 95e16e97ee9b71a1f4d220bc3c4f004f3f39f76d1b2631c68e3a5805b9f9aea4390bab9f9537dc0be254c690e6fcfa4146497ce8867bb15a28fec85387d1a0d1
+  metadata.gz: d33e2fcd54ae3b5cab9fdcfe980b1a8d1f2f97b1389ea430ecfda093b275e77f93e49e5a4f1171797df3fcb7f8d0ef28654301dbb846c16a4eb751018ea10129
+  data.tar.gz: 85ffc85bf577b631c9aee7e16f81e0be163de2025154dd197962943c533cd3a1aa0d79799d084194780ff8654e766b0ac3ac36b635425d47e64009c71a4edb6d

data/.rubocop_todo.yml CHANGED Viewed

@@ -1,6 +1,6 @@
 # This configuration was generated by
 # `rubocop --auto-gen-config`
-# on 2026-01-21 09:17:44 UTC using RuboCop version 1.81.7.
+# on 2026-02-17 14:18:53 UTC using RuboCop version 1.81.7.
 # The point is for the user to remove these configuration records
 # one by one as the offenses are removed from the code base.
 # Note that changes in the inspected code, or installation of new
@@ -12,13 +12,52 @@ Gemspec/RequiredRubyVersion:
   Exclude:
     - 'canon.gemspec'
-# Offense count: 700
+# Offense count: 1
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: EnforcedStyle, IndentationWidth.
+# SupportedStyles: with_first_argument, with_fixed_indentation
+Layout/ArgumentAlignment:
+  Exclude:
+    - 'lib/canon/xml/element_matcher.rb'
+# Offense count: 23
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: EnforcedStyleAlignWith.
+# SupportedStylesAlignWith: either, start_of_block, start_of_line
+Layout/BlockAlignment:
+  Exclude:
+    - 'spec/canon/fixtures/isodoc_spec.rb'
+    - 'spec/canon/table_class_attribute_bug_spec.rb'
+# Offense count: 23
+# This cop supports safe autocorrection (--autocorrect).
+Layout/BlockEndNewline:
+  Exclude:
+    - 'spec/canon/fixtures/isodoc_spec.rb'
+    - 'spec/canon/table_class_attribute_bug_spec.rb'
+# Offense count: 46
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: Width, AllowedPatterns.
+Layout/IndentationWidth:
+  Exclude:
+    - 'spec/canon/fixtures/isodoc_spec.rb'
+    - 'spec/canon/table_class_attribute_bug_spec.rb'
+# Offense count: 780
 # This cop supports safe autocorrection (--autocorrect).
 # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, IgnoreCopDirectives, AllowedPatterns, SplitStrings.
 # URISchemes: http, https
 Layout/LineLength:
   Enabled: false
+# Offense count: 1
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: AllowInHeredoc.
+Layout/TrailingWhitespace:
+  Exclude:
+    - 'lib/canon/xml/element_matcher.rb'
 # Offense count: 48
 # Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
 Lint/DuplicateBranch:
@@ -48,44 +87,45 @@ Lint/UnreachableCode:
   Exclude:
     - 'lib/canon/diff_formatter/debug_output.rb'
-# Offense count: 6
+# Offense count: 7
 # This cop supports safe autocorrection (--autocorrect).
 # Configuration parameters: AllowUnusedKeywordArguments, IgnoreEmptyMethods, IgnoreNotImplementedMethods, NotImplementedExceptions.
 # NotImplementedExceptions: NotImplementedError
 Lint/UnusedMethodArgument:
   Exclude:
+    - 'lib/canon/comparison.rb'
     - 'lib/canon/diff/path_builder.rb'
     - 'lib/canon/diff_formatter/by_line/base_formatter.rb'
     - 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
     - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
-# Offense count: 209
+# Offense count: 215
 # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
 Metrics/AbcSize:
   Enabled: false
-# Offense count: 20
+# Offense count: 21
 # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns, inherit_mode.
 # AllowedMethods: refine
 Metrics/BlockLength:
   Max: 84
-# Offense count: 177
+# Offense count: 183
 # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
 Metrics/CyclomaticComplexity:
   Enabled: false
-# Offense count: 363
+# Offense count: 369
 # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
 Metrics/MethodLength:
-  Max: 110
+  Max: 115
 # Offense count: 44
 # Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
 Metrics/ParameterLists:
   Max: 9
-# Offense count: 143
+# Offense count: 149
 # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
 Metrics/PerceivedComplexity:
   Enabled: false
@@ -119,12 +159,13 @@ Naming/VariableNumber:
     - 'lib/canon/comparison/markup_comparator.rb'
     - 'lib/canon/comparison/xml_comparator/diff_node_builder.rb'
-# Offense count: 2
+# Offense count: 13
 # Configuration parameters: MinSize.
 Performance/CollectionLiteralInLoop:
   Exclude:
     - 'lib/canon/comparison/html_comparator.rb'
     - 'lib/canon/xml/xml_base_handler.rb'
+    - 'spec/canon/table_class_attribute_bug_spec.rb'
 # Offense count: 68
 # Configuration parameters: Prefixes, AllowedPatterns.
@@ -132,7 +173,7 @@ Performance/CollectionLiteralInLoop:
 RSpec/ContextWording:
   Enabled: false
-# Offense count: 25
+# Offense count: 27
 # Configuration parameters: IgnoredMetadata.
 RSpec/DescribeClass:
   Enabled: false
@@ -143,13 +184,7 @@ RSpec/DescribeMethod:
     - 'spec/canon/comparison/multiple_differences_spec.rb'
     - 'spec/canon/diff_formatter/character_map_customization_spec.rb'
-# Offense count: 1
-# This cop supports safe autocorrection (--autocorrect).
-RSpec/EmptyHook:
-  Exclude:
-    - 'spec/canon/color_detector_spec.rb'
-# Offense count: 679
+# Offense count: 696
 # Configuration parameters: CountAsOne.
 RSpec/ExampleLength:
   Max: 67
@@ -201,11 +236,11 @@ RSpec/MultipleDescribes:
   Exclude:
     - 'spec/canon/comparison/match_options_spec.rb'
-# Offense count: 522
+# Offense count: 536
 RSpec/MultipleExpectations:
   Max: 15
-# Offense count: 69
+# Offense count: 70
 # Configuration parameters: AllowSubject.
 RSpec/MultipleMemoizedHelpers:
   Max: 13
@@ -224,12 +259,13 @@ RSpec/NamedSubject:
 RSpec/NestedGroups:
   Max: 4
-# Offense count: 10
+# Offense count: 11
 # Configuration parameters: AllowedPatterns.
 # AllowedPatterns: ^expect_, ^assert_
 RSpec/NoExpectationExample:
   Exclude:
     - 'spec/canon/context_grouping_spec.rb'
+    - 'spec/canon/fixtures/isodoc_spec.rb'
     - 'spec/canon/informative_diffs_debug_spec.rb'
     - 'spec/canon/isodoc_blockquotes_spec.rb'
     - 'spec/canon/match_scenarios_spec.rb'
@@ -257,6 +293,18 @@ RSpec/VerifiedDoubles:
     - 'spec/canon/diff/xml_serialization_formatter_spec.rb'
     - 'spec/canon/tree_diff/operation_converter_spec.rb'
+# Offense count: 44
+# This cop supports safe autocorrection (--autocorrect).
+# Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
+# SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
+# ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
+# FunctionalMethods: let, let!, subject, watch
+# AllowedMethods: lambda, proc, it
+Style/BlockDelimiters:
+  Exclude:
+    - 'spec/canon/fixtures/isodoc_spec.rb'
+    - 'spec/canon/table_class_attribute_bug_spec.rb'
 # Offense count: 1
 # This cop supports safe autocorrection (--autocorrect).
 # Configuration parameters: EnforcedStyle, AllowComments.

data/README.adoc CHANGED Viewed

@@ -16,6 +16,47 @@ Key features:
 * **Multiple interfaces**: Ruby API, CLI, RSpec matchers
 * **Smart diff output**: By-line or by-object modes with syntax highlighting
+== When to use formatting vs comparison
+Canon provides two main APIs with different purposes:
+*Use `Canon.format` for formatting/canonicalization:*
+* Pretty-printing XML/JSON/YAML for display
+* Canonicalizing documents for storage
+* Normalizing formatting
+*Use `Canon::Comparison.equivalent?` for semantic comparison:*
+* Test assertions
+* Document equivalence checking
+* Diff generation
+[IMPORTANT]
+====
+Do NOT use `Canon.format_xml` output for string comparison in tests.
+The formatting process changes line counts and formatting, which causes
+false test failures.
+Use `Canon::Comparison.equivalent?` instead, which performs semantic
+comparison and properly handles XML declarations.
+====
+[example]
+====
+[source,ruby]
+----
+# WRONG - formatting changes line counts
+expect(Canon.format_xml(actual)).to eq(expected_formatted)
+# RIGHT - semantic comparison ignores formatting differences
+expect(Canon::Comparison.equivalent?(actual, expected, format: :xml)).to be true
+# BEST - use RSpec matchers
+expect(actual).to be_xml_equivalent_to(expected)
+----
+====
 == Installation
 Add to your application's Gemfile:

data/docs/interfaces/ruby-api/index.adoc CHANGED Viewed

@@ -18,6 +18,32 @@ For command-line usage, see link:../cli/[CLI documentation].
 For RSpec testing, see link:../rspec/[RSpec documentation].
+== Choosing the right API
+Canon provides two main categories of APIs with different purposes.
+=== Formatting APIs
+Use `Canon.format` or `Canon.format_xml` when you need to:
+* Pretty-print documents for display
+* Canonicalize documents for storage
+* Normalize document formatting
+NOTE: XML declarations are preserved in pretty-print mode and removed in
+canonicalization mode.
+=== Comparison APIs
+Use `Canon::Comparison.equivalent?` when you need to:
+* Compare documents semantically
+* Generate diffs
+* Make test assertions
+NOTE: XML declarations are stripped during preprocessing for semantic comparison.
+Documents with and without XML declarations are considered equivalent.
 == General
 Canon provides a unified Ruby API for working with XML, HTML, JSON, and YAML

data/docs/understanding/formats/xml.adoc CHANGED Viewed

@@ -183,6 +183,31 @@ configures preprocessing, match options, diff algorithm, and formatting.
 == XML-specific features
+=== XML declaration handling
+The XML declaration (`<?xml version="1.0" encoding="UTF-8"?>`) is handled
+differently depending on the operation:
+[cols="2,3"]
+|===
+| Operation | XML Declaration
+| `Canon.format_xml` (pretty)
+| Preserved
+| `Canon.format_xml` (c14n)
+| Removed (per W3C C14N spec)
+| `Canon::Comparison.equivalent?`
+| Stripped during preprocessing
+| RSpec matchers
+| Stripped during preprocessing
+|===
+This means documents with and without XML declarations are considered
+equivalent when using the comparison API.
 === Comment handling
 XML comments are preserved in canonical form unless `--with-comments` is explicitly set.

data/lib/canon/color_detector.rb CHANGED Viewed

@@ -81,24 +81,27 @@ module Canon
       #
       # @return [Boolean] true if colors appear to be supported
       def detect_from_env
-        # Check for known color-capable terminals
-        colorterm = ENV["COLORTERM"]
-        return true if COLOR_TERM_VALUES.include?(colorterm)
         # Check TERM variable
         term = ENV["TERM"]
-        if term
+        if term && NO_COLOR_TERMS.any? { |t| term.include?(t) }
           # Known no-color terminals
-          return false if NO_COLOR_TERMS.any? { |t| term.include?(t) }
+          return false
+        end
+        # Check CI environments
+        # Some CI systems support colors, others don't
+        return detect_ci_colors if ci_environment?
+        if term
           # Known color-capable terminals
           return true if COLOR_TERM_SUFFIXES.any? { |s| term.end_with?(s) }
           # Most modern terminals support basic ANSI colors
           return true unless term.empty? || term == "unknown"
         end
-        # Check CI environments
-        # Some CI systems support colors, others don't
-        return detect_ci_colors if ci_environment?
+        # Check for known color-capable terminals
+        colorterm = ENV["COLORTERM"]
+        return true if COLOR_TERM_VALUES.include?(colorterm)
         # Default: assume colors are supported on modern terminals
         # This is a safe default for most use cases
@@ -123,16 +126,16 @@ module Canon
       #
       # @return [Boolean] true if CI environment likely supports colors
       def detect_ci_colors
+        # Most modern CI systems support ANSI colors
+        # Only disable for explicitly known non-color CI
+        return false if ENV["TERM"] == "dumb"
         # GitHub Actions explicitly supports colors
         return true if ENV["GITHUB_ACTIONS"]
         # TeamCity supports colors with specific env var
         return true if ENV["TEAMCITY_VERSION"]
-        # Most modern CI systems support ANSI colors
-        # Only disable for explicitly known non-color CI
-        return false if ENV["TERM"] == "dumb"
         # Default to supporting colors in CI
         true
       end

data/lib/canon/comparison/dimensions/text_content_dimension.rb CHANGED Viewed

@@ -44,12 +44,20 @@ module Canon
         # Normalized text comparison
         #
         # Collapses whitespace and compares.
+        # Two whitespace-only strings that both normalize to empty are equivalent.
         #
         # @param text1 [String, nil] First text
         # @param text2 [String, nil] Second text
         # @return [Boolean] true if normalized texts are equal
         def compare_normalize(text1, text2)
-          normalize_text(text1) == normalize_text(text2)
+          normalized1 = normalize_text(text1)
+          normalized2 = normalize_text(text2)
+          # Both empty after normalization = equivalent
+          # This handles whitespace-only text nodes that normalize to empty
+          return true if normalized1.empty? && normalized2.empty?
+          normalized1 == normalized2
         end
         private

data/lib/canon/comparison/html_comparator.rb CHANGED Viewed

@@ -60,6 +60,11 @@ module Canon
         def equivalent?(html1, html2, opts = {}, child_opts = {})
           opts = DEFAULT_OPTS.merge(opts)
+          # Capture original HTML strings BEFORE any parsing/transformation
+          # These are used for display to preserve original formatting
+          original_str1 = extract_original_string(html1)
+          original_str2 = extract_original_string(html2)
           # Resolve match options with format-specific defaults
           match_opts_hash = MatchOptions::Xml.resolve(
             format: :html,
@@ -117,41 +122,14 @@ module Canon
           # This is a SAFETY CHECK for legacy cases where Nokogiri nodes might still be used
           # The main path (parse_node) now returns Canon::Xml::Nodes::RootNode, so this
           # check should rarely trigger, but we keep it for robustness
-          if (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
-              node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
-              (node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
-              node2.is_a?(Nokogiri::XML::DocumentFragment))
-            # Compare children of fragments - filter them first
-            all_children1 = node1.children.to_a
-            all_children2 = node2.children.to_a
-            # Filter children based on match options (e.g., ignore comments)
-            children1 = XmlNodeComparison.filter_children(all_children1, opts)
-            children2 = XmlNodeComparison.filter_children(all_children2, opts)
-            if children1.length != children2.length
-              result = Comparison::UNEQUAL_ELEMENTS
-            elsif children1.empty?
-              result = Comparison::EQUIVALENT
-            else
-              # Compare each pair of children
-              result = Comparison::EQUIVALENT
-              children1.zip(children2).each do |child1, child2|
-                child_result = XmlNodeComparison.compare_nodes(child1, child2,
-                                                               opts, child_opts,
-                                                               diff_children,
-                                                               differences)
-                if child_result != Comparison::EQUIVALENT
-                  result = child_result
-                  break
-                end
-              end
-            end
-          else
-            result = XmlNodeComparison.compare_nodes(node1, node2, opts,
+          result = if fragment_nodes?(node1, node2)
+                     compare_fragment_children(node1, node2, opts, child_opts,
+                                               diff_children, differences)
+                   else
+                     XmlNodeComparison.compare_nodes(node1, node2, opts,
                                                      child_opts, diff_children,
                                                      differences)
-          end
+                   end
           # Classify DiffNodes as normative/informative if we have verbose output
           if opts[:verbose] && !differences.empty?
@@ -165,6 +143,7 @@ module Canon
             ComparisonResult.new(
               differences: differences,
               preprocessed_strings: [preprocessed_str1, preprocessed_str2],
+              original_strings: [original_str1, original_str2],
               format: :html,
               html_version: detect_html_version_from_node(node1),
               match_options: match_opts_hash,
@@ -187,6 +166,53 @@ module Canon
         private
+        # Check if both nodes are document fragments
+        #
+        # @param node1 [Object] First node
+        # @param node2 [Object] Second node
+        # @return [Boolean] true if both are document fragments
+        def fragment_nodes?(node1, node2)
+          (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
+           node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
+            (node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
+             node2.is_a?(Nokogiri::XML::DocumentFragment))
+        end
+        # Compare children of document fragments
+        #
+        # @param node1 [Nokogiri::DocumentFragment] First fragment
+        # @param node2 [Nokogiri::DocumentFragment] Second fragment
+        # @param opts [Hash] Comparison options
+        # @param child_opts [Hash] Child comparison options
+        # @param diff_children [Boolean] Whether to diff children
+        # @param differences [Array] Array to append differences to
+        # @return [Symbol] Comparison result constant
+        def compare_fragment_children(node1, node2, opts, child_opts,
+                                      diff_children, differences)
+          all_children1 = node1.children.to_a
+          all_children2 = node2.children.to_a
+          children1 = XmlNodeComparison.filter_children(all_children1, opts)
+          children2 = XmlNodeComparison.filter_children(all_children2, opts)
+          if children1.length != children2.length
+            return Comparison::UNEQUAL_ELEMENTS
+          elsif children1.empty?
+            return Comparison::EQUIVALENT
+          end
+          # Compare each pair of children
+          children1.zip(children2).each do |child1, child2|
+            child_result = XmlNodeComparison.compare_nodes(child1, child2,
+                                                           opts, child_opts,
+                                                           diff_children,
+                                                           differences)
+            return child_result if child_result != Comparison::EQUIVALENT
+          end
+          Comparison::EQUIVALENT
+        end
         # Perform semantic tree diff using SemanticTreeMatchStrategy
         #
         # @param html1 [String, Nokogiri::HTML::Document] First HTML
@@ -195,6 +221,11 @@ module Canon
         # @param match_opts_hash [Hash] Resolved match options
         # @return [Boolean, ComparisonResult] Result of tree diff comparison
         def perform_semantic_tree_diff(html1, html2, opts, match_opts_hash)
+          # Capture original HTML strings BEFORE any parsing/transformation
+          # These are used for display to preserve original formatting
+          original_str1 = extract_original_string(html1)
+          original_str2 = extract_original_string(html2)
           # Parse to Canon::Xml::Node (preserves preprocessing)
           # For HTML, we parse as XML to get Canon::Xml::Node structure
           node1 = parse_node_for_semantic(html1,
@@ -223,6 +254,7 @@ module Canon
             ComparisonResult.new(
               differences: differences,
               preprocessed_strings: preprocessed,
+              original_strings: [original_str1, original_str2],
               format: :html,
               html_version: html_version,
               match_options: match_opts_hash.merge(strategy.metadata),
@@ -343,7 +375,7 @@ module Canon
           # If already a Nokogiri node, check for incompatible XML documents
           unless node.is_a?(String)
             # Detect if this is an XML document (not HTML)
-            if is_xml_document?(node)
+            if xml_document?(node)
               raise Canon::CompareFormatMismatchError.new(:xml, :html)
             end
@@ -508,6 +540,28 @@ module Canon
           end
         end
+        # Extract original HTML string from various input types
+        # This preserves the original formatting without minification
+        #
+        # @param html [String, Nokogiri::Node, Canon::Xml::Node] Input HTML
+        # @return [String] Original HTML string
+        def extract_original_string(html)
+          if html.is_a?(String)
+            html
+          elsif html.is_a?(Canon::Xml::Node)
+            # Serialize Canon nodes to string
+            Canon::Xml::DataModel.serialize(html)
+          elsif html.respond_to?(:to_html)
+            # Nokogiri nodes - use to_html to preserve formatting
+            html.to_html
+          elsif html.respond_to?(:to_s)
+            html.to_s
+          else
+            raise Canon::Error,
+                  "Unable to extract original string from: #{html.class}"
+          end
+        end
         # Normalize HTML comments within style and script tags
         # Also removes whitespace-only CDATA children that Nokogiri creates
         def normalize_html_style_script_comments(doc)
@@ -637,7 +691,7 @@ compare_profile = nil)
         # Check if a node is an XML document (not HTML)
         # XML documents typically have XML processing instructions or are
         # instances of Nokogiri::XML::Document (not HTML variants)
-        def is_xml_document?(node)
+        def xml_document?(node)
           # Check if it's a pure XML document (not HTML4/HTML5 which also
           # inherit from XML::Document)
           # Check both Document and DocumentFragment variants

data/lib/canon/comparison/html_parser.rb CHANGED Viewed

@@ -25,6 +25,11 @@ module Canon
           return content unless content.is_a?(String)
           return content if already_parsed?(content)
+          # Normalize HTML to ensure consistent parsing by HTML4.fragment
+          # The key issue is that HTML4.fragment treats newlines after </head>
+          # differently than no newlines, causing inconsistent parsing
+          content = normalize_html_for_parsing(content)
           begin
             case format
             when :html5
@@ -74,6 +79,23 @@ module Canon
           # Check for the HTML5 DOCTYPE (exact, case-sensitive substring match)
           content.include?("<!DOCTYPE html>") ? :html5 : :html4
         end
+        # Normalize HTML to ensure consistent parsing by HTML4.fragment
+        #
+        # The key issue is that HTML4.fragment treats whitespace after </head>
+        # differently than no whitespace, causing inconsistent parsing:
+        # - "</head>\n<body>" parses to [body, ...] (body is treated as content)
+        # - "</head><body>" parses to [meta, div, ...] (wrapper tags stripped)
+        #
+        # This method normalizes the HTML to ensure consistent parsing.
+        #
+        # @param content [String] HTML content
+        # @return [String] Normalized HTML content
+        def normalize_html_for_parsing(content)
+          # Remove whitespace between </head> and <body> to ensure consistent parsing
+          # This makes formatted and minified HTML parse the same way
+          content.gsub(%r{</head>\s*<body>}i, "</head><body>")
+        end
       end
     end
   end