RubyGems - canon - Versions diffs - 0.2.3 → 0.2.5 - Mend

canon 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

checksums.yaml +4 -4
data/.rubocop_todo.yml +31 -149
data/README.adoc +9 -0
data/docs/advanced/semantic-diff-report.adoc +96 -0
data/docs/features/configuration-profiles.adoc +4 -2
data/docs/features/diff-formatting/index.adoc +3 -0
data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
data/docs/features/match-options/html-policies.adoc +2 -0
data/docs/features/match-options/index.adoc +40 -0
data/docs/guides/choosing-configuration.adoc +12 -1
data/docs/reference/cli-options.adoc +3 -0
data/docs/reference/environment-variables.adoc +3 -1
data/docs/reference/options-across-interfaces.adoc +7 -1
data/docs/understanding/formats/html.adoc +9 -2
data/lib/canon/cli.rb +4 -0
data/lib/canon/commands/diff_command.rb +1 -0
data/lib/canon/comparison/comparison_result.rb +95 -2
data/lib/canon/comparison/html_comparator.rb +96 -11
data/lib/canon/comparison/markup_comparator.rb +68 -71
data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
data/lib/canon/comparison/match_options.rb +23 -2
data/lib/canon/comparison/node_inspector.rb +103 -0
data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
data/lib/canon/comparison/xml_comparator/child_comparison.rb +133 -55
data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
data/lib/canon/comparison/xml_comparator.rb +174 -7
data/lib/canon/comparison/xml_node_comparison.rb +48 -66
data/lib/canon/comparison.rb +143 -22
data/lib/canon/config/env_schema.rb +2 -1
data/lib/canon/config/profiles/metanorma.yml +3 -0
data/lib/canon/config.rb +51 -5
data/lib/canon/diff/diff_classifier.rb +55 -41
data/lib/canon/diff/diff_line_builder.rb +9 -8
data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +184 -26
data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +92 -4
data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
data/lib/canon/diff_formatter.rb +128 -175
data/lib/canon/html/data_model.rb +10 -4
data/lib/canon/pretty_printer/html.rb +76 -14
data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/c14n.rb +59 -5
data/lib/canon/xml/data_model.rb +13 -1
data/lib/canon/xml/element_matcher.rb +3 -0
data/lib/canon/xml/node.rb +23 -1
data/lib/canon/xml/nodes/comment_node.rb +4 -0
data/lib/canon/xml/nodes/element_node.rb +4 -0
data/lib/canon/xml/nodes/text_node.rb +4 -0
data/lib/canon/xml/sax_builder.rb +29 -2
data/lib/canon/xml/xpath_engine.rb +238 -0
metadata +9 -2

data/docs/features/match-options/index.adoc CHANGED Viewed

@@ -92,6 +92,46 @@ Canon.equivalent?(
 `:ignore`:: Structural whitespace is completely ignored
+=== whitespace_type
+**Applies to**: XML, HTML
+**Purpose**: Controls whether different Unicode whitespace characters (space, NBSP, ideographic space, etc.) are treated as equivalent or distinct.
+**Behaviors**:
+`:strict`:: (default) Different Unicode whitespace types are significant.
+Space (U+0020) and NBSP (U+00A0) are treated as different characters.
+This is useful for catching accidental insertion of wrong whitespace types
+(e.g., a pasted NBSP where a regular space was intended).
+`:normalize`:: All Unicode whitespace characters are collapsed to a single space
+before comparison. Space, NBSP, ideographic space (U+3000), and other Unicode
+whitespace characters are treated as equivalent.
+.Using whitespace_type: :strict (default)
+[example]
+====
+[source,ruby]
+----
+# By default, space and NBSP are different
+xml1 = '<root><span>ISO</span> <span>712</span></root>'
+xml2 = '<root><span>ISO</span>&#xa0;<span>712</span></root>'
+Canon::Comparison.equivalent?(xml1, xml2,
+  match_profile: :spec_friendly
+)
+# => false (NBSP detected as different from space)
+# Opt into treating all whitespace types as equivalent
+Canon::Comparison.equivalent?(xml1, xml2,
+  match_profile: :spec_friendly,
+  match: { whitespace_type: :normalize }
+)
+# => true
+----
+====
 === Whitespace sensitivity at element level
 ==== General

data/docs/guides/choosing-configuration.adoc CHANGED Viewed

@@ -210,13 +210,24 @@ Canon::Comparison.equivalent?(doc1, doc2,
     structural_whitespace: :ignore,     # ignore, normalize, strict
     attribute_order: :ignore,           # ignore, strict (XML/HTML)
     attribute_values: :normalize,       # normalize, strict, ignore
-    comments: :ignore                   # ignore, normalize, strict
+    comments: :ignore,                  # ignore, normalize, strict
+    whitespace_type: :strict            # strict (default), normalize
   }
 )
 ----
 **Remember**: Match options behave differently with each algorithm! See link:../features/match-options/algorithm-specific-behavior.adoc[Algorithm-Specific Behavior].
+==== Whitespace Type Sensitivity
+By default, Canon distinguishes between different Unicode whitespace types
+(e.g. regular space U+0020 vs non-breaking space U+00A0 vs ideographic space
+U+3000). This catches accidental insertion of wrong whitespace characters.
+Use `whitespace_type: :normalize` when all Unicode whitespace variants should
+be treated as equivalent (e.g. when output from different tools may use
+different whitespace types for the same visual result).
 === Layer 4: Diff Formatting
 **Question**: How should differences be displayed?

data/docs/reference/cli-options.adoc CHANGED Viewed

@@ -145,6 +145,9 @@ Individual dimension control (overrides profile settings):
 |`--comments BEHAVIOR`
 |Comments: `strict`, `normalize`, `ignore`
+|`--whitespace-type BEHAVIOR`
+|Whitespace type sensitivity: `strict` (default), `normalize`
 |===
 See link:../features/match-options/[Match Options] for details.

data/docs/reference/environment-variables.adoc CHANGED Viewed

@@ -194,7 +194,9 @@ export CANON_JSON_FORMAT_PREPROCESSING=normalize
 |`CANON_SHOW_PRETTYPRINT_RECEIVED`
 |boolean
 |`false`
-|Show only the RECEIVED (actual) block in the fixture-ready pretty-printed section.  This is the most common fixture-update workflow: enable this option to get a copy-pasteable pretty-printed form of the generated output that can replace the old fixture heredoc.  Format-specific: `CANON_{FORMAT}_DIFF_SHOW_PRETTYPRINT_RECEIVED`
+|Show only the RECEIVED (actual) block in the fixture-ready pretty-printed section.  This is the most common fixture-update workflow: enable this option to get a copy-pasteable pretty-printed form of the generated output that can replace the old fixture heredoc.  Format-specific: `CANON_{FORMAT}_DIFF_SHOW_PRETTYPRINT_RECEIVED`.
+For HTML / HTML4 / HTML5 inputs, the pretty-printed output is XHTML-shaped: void elements are self-closed (`<br/>`, `<meta/>`), non-void elements are paired (`<a></a>`), and Nokogiri may add `xmlns="http://www.w3.org/1999/xhtml"` on `<html>` and an `xml:lang` mirror of `lang`.  This is a display-only serialisation chosen because libxml's `FORMAT` save flag (the only path that actually indents HTML5 input) requires the XHTML save mode -- `Nokogiri::HTML5#to_html` silently ignores its `indent:` keyword.  See lutaml/canon#133.
 |All formats (display only)
 |`CANON_COMPACT_SEMANTIC_REPORT`

data/docs/reference/options-across-interfaces.adoc CHANGED Viewed

@@ -223,9 +223,15 @@ Profile values: `strict`, `rendered`, `spec_friendly`, `content_only`
 |`match: { element_hierarchy: :strict }`
 |`config.canon.xml.match.options = { element_hierarchy: :strict }`
 |`CANON_ELEMENT_HIERARCHY=strict`
+|Whitespace Type
+|`--whitespace-type normalize`
+|`match: { whitespace_type: :normalize }`
+|`config.canon.xml.match.options = { whitespace_type: :normalize }`
+|`CANON_WHITESPACE_TYPE=normalize`
 |===
-Values: `strict`, `normalize`, `ignore` (or `strict`, `ignore` for structure/position/hierarchy)
+Values: `strict`, `normalize`, `ignore` (or `strict`, `ignore` for structure/position/hierarchy). `whitespace_type` values: `strict` (default), `normalize`
 ==== XML/HTML-Specific Match Dimensions

data/docs/understanding/formats/html.adoc CHANGED Viewed

@@ -19,7 +19,7 @@ Canon supports HTML 4, HTML5, and XHTML with automatic format detection.
 **Key features:**
 * Automatic HTML vs XHTML detection
-* HTML5 parser for modern HTML
+* HTML5 parser for HTML input regardless of declared version (HTML4 and HTML5 share the same content model and parsing whitespace rules — see <<html4-html5-parity>>)
 * XML parser for XHTML
 * Consistent attribute ordering
 * Whitespace normalization
@@ -203,9 +203,16 @@ Automatically detects HTML5, HTML4, or XHTML based on DOCTYPE and structure.
 ----
 ====
+[[html4-html5-parity]]
+=== HTML4 / HTML5 parity
+`be_html4_equivalent_to` and `be_html5_equivalent_to` apply the same whitespace-sensitivity rules. Whitespace sensitivity is a property of HTML's content model and is identical across the two HTML versions, so any input that compares equivalent under one matcher must compare equivalent under the other.
+Internally, both matchers parse input via `Nokogiri::HTML5.fragment`. (Earlier releases routed `:html` and `:html4` through `Nokogiri::XML.fragment`, which silently applied XML whitespace rules — meaning `be_html4_equivalent_to` could reject inputs that `be_html5_equivalent_to` correctly accepted.) See https://github.com/lutaml/canon/issues/118 for the full background.
 === Whitespace handling
-HTML whitespace is collapsed per CSS rendering rules. Empty text nodes between elements are removed.
+HTML whitespace is collapsed per CSS rendering rules. Empty text nodes between elements are removed. Whitespace-only text between two adjacent inline elements (`<span>A</span> <span>B</span>`) is preserved because it renders as a visible space; whitespace at a block boundary (between an inline element and a block element, or between two block siblings) is collapsed.
 .Whitespace handling example
 [example]

data/lib/canon/cli.rb CHANGED Viewed

@@ -218,6 +218,10 @@ module Canon
                   type: :string,
                   enum: %w[strict normalize ignore],
                   desc: "Comment matching: strict, normalize, or ignore"
+    method_option :whitespace_type,
+                  type: :string,
+                  enum: %w[strict normalize],
+                  desc: "Whitespace type sensitivity: strict (default) or normalize"
     method_option :show_diffs,
                   type: :string,
                   enum: %w[all normative informative],

data/lib/canon/commands/diff_command.rb CHANGED Viewed

@@ -143,6 +143,7 @@ module Canon
         dimensions = %i[
           text_content structural_whitespace attribute_whitespace
           attribute_order attribute_values comments key_order
+          whitespace_type
         ]
         dimensions.each_with_object({}) do |dim, opts|

data/lib/canon/comparison/comparison_result.rb CHANGED Viewed

@@ -6,7 +6,8 @@ module Canon
     # Provides methods to query equivalence based on normative diffs
     class ComparisonResult
       attr_reader :differences, :preprocessed_strings, :format, :html_version,
-                  :match_options, :algorithm, :original_strings
+                  :match_options, :algorithm, :original_strings,
+                  :parse_errors_expected, :parse_errors_received
       # @param differences [Array<DiffNode>] Array of difference nodes
       # @param preprocessed_strings [Array<String, String>] Pre-processed content for display
@@ -15,8 +16,11 @@ module Canon
       # @param match_options [Hash, nil] Resolved match options used for comparison
       # @param algorithm [Symbol] Diff algorithm used (:dom or :semantic)
       # @param original_strings [Array<String, String>, nil] Original unprocessed content for line diff
+      # @param parse_errors_expected [Array<String>, nil] Parser errors from the expected side
+      # @param parse_errors_received [Array<String>, nil] Parser errors from the received side
       def initialize(differences:, preprocessed_strings:, format:,
-html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil)
+html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil,
+parse_errors_expected: nil, parse_errors_received: nil)
         @differences = differences
         @preprocessed_strings = preprocessed_strings
         @original_strings = original_strings || preprocessed_strings
@@ -24,6 +28,16 @@ html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil)
         @html_version = html_version
         @match_options = match_options
         @algorithm = algorithm
+        @parse_errors_expected = Array(parse_errors_expected)
+        @parse_errors_received = Array(parse_errors_received)
+      end
+      # Whether either side reported parse errors.  Used by the diff
+      # formatter to decide whether to render the parse-error banner.
+      #
+      # @return [Boolean]
+      def parse_errors?
+        @parse_errors_expected.any? || @parse_errors_received.any?
       end
       # Check if documents are semantically equivalent (no normative diffs)
@@ -84,6 +98,30 @@ html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil)
         @match_options&.[](:tree_diff_operations) || []
       end
+      # Generate a human-readable summary of the first difference.
+      #
+      # When documents are equivalent, returns "Equivalent".
+      # When they differ, the difference to report is picked in this order:
+      # first normative difference, then first informative difference, then
+      # the first entry of the raw differences list.
+      #
+      # @return [String] "Equivalent", a "Not equivalent: ..." description, or
+      #   the bare string "Not equivalent" when no usable difference is found
+      def summary
+        return "Equivalent" if equivalent?
+        diff = normative_differences.first || informative_differences.first ||
+               @differences.first # rubocop:disable Layout/MultilineOperationIndentation
+        # Not equivalent, yet nothing to describe.
+        return "Not equivalent" unless diff
+        # Dispatch on the representation of the difference entry.
+        if diff.is_a?(Canon::Diff::DiffNode)
+          summarize_diff_node(diff)
+        elsif diff.is_a?(Hash)
+          summarize_legacy_hash(diff)
+        else
+          # Unrecognised entry type: fall back to the generic message.
+          "Not equivalent"
+        end
+      end
       # Generate formatted diff output
       #
       # @param use_color [Boolean] Whether to use ANSI color codes
@@ -116,6 +154,61 @@ show_diffs: :all, diff_mode: :separate, legacy_terminal: false)
           html_version: @html_version,
         )
       end
+      private
+      # Format a single DiffNode into a summary string.
+      #
+      # The result has the shape
+      # "Not equivalent: <reason> at <path> (<before> vs <after>)"; the
+      # " at <path>" part is omitted when the node has no path, and the
+      # parenthesised preview is omitted unless both serializations are set.
+      #
+      # @param diff [DiffNode] The difference to summarize
+      # @return [String] Human-readable summary
+      def summarize_diff_node(diff)
+        parts = ["Not equivalent:"]
+        # rubocop:disable Layout/SpaceBeforeInterpolation,Style/ConditionalAssignment
+        if diff.path
+          parts << "#{diff.reason} at #{diff.path}"
+        else
+          parts << diff.reason.to_s
+        end
+        # rubocop:enable Layout/SpaceBeforeInterpolation,Style/ConditionalAssignment
+        # Previews are whitespace-collapsed and length-limited by truncate_preview.
+        if diff.serialized_before && diff.serialized_after
+          before_preview = truncate_preview(diff.serialized_before)
+          after_preview = truncate_preview(diff.serialized_after)
+          parts << "(#{before_preview} vs #{after_preview})"
+        end
+        parts.join(" ")
+      end
+      # Format a legacy Hash difference into a summary string.
+      #
+      # Reads :path, :diff_code_description, :value1 and :value2.  The
+      # description/location part is added only when :path is set, and the
+      # value preview only when both :value1 and :value2 are truthy.
+      #
+      # NOTE(review): when :path is set but :diff_code_description is nil, the
+      # interpolation yields " at <path>" with an empty description -- confirm
+      # that legacy hashes always carry a description.
+      #
+      # @param diff [Hash] Legacy difference hash with :path, :value1, :value2
+      #   and :diff_code_description
+      # @return [String] Human-readable summary, or
+      #   "Not equivalent: values differ" when neither part could be built
+      def summarize_legacy_hash(diff)
+        parts = ["Not equivalent:"]
+        parts << "#{diff[:diff_code_description]} at #{diff[:path]}" if diff[:path]
+        if diff[:value1] && diff[:value2]
+          parts << "(#{truncate_preview(diff[:value1].to_s)} vs #{truncate_preview(diff[:value2].to_s)})"
+        end
+        # Only the "Not equivalent:" prefix present means nothing was added above.
+        parts.size > 1 ? parts.join(" ") : "Not equivalent: values differ"
+      end
+      # Truncate a string for preview display.
+      #
+      # Leading/trailing whitespace is stripped and every internal whitespace
+      # run is collapsed to a single space before the length check, so the
+      # preview never contains newlines.
+      #
+      # NOTE(review): assumes max_len >= 3; a smaller value makes the slice
+      # end negative, which counts from the end of the string instead.
+      #
+      # @param text [String] Text to truncate
+      # @param max_len [Integer] Maximum length of the result, including the
+      #   three-character ellipsis
+      # @return [String] Truncated text with ellipsis if needed
+      def truncate_preview(text, max_len = 40)
+        stripped = text.strip.gsub(/\s+/, " ")
+        if stripped.length > max_len
+          # Reserve three characters for the trailing "...".
+          "#{stripped[0...(max_len - 3)]}..."
+        else
+          stripped
+        end
+      end
     end
   end
 end

data/lib/canon/comparison/html_comparator.rb CHANGED Viewed

@@ -13,6 +13,7 @@ require_relative "../diff/diff_classifier"
 require_relative "strategies/match_strategy_factory"
 require_relative "../html/data_model"
 require_relative "xml_node_comparison"
+require_relative "xml_comparator/diff_node_builder"
 # Whitespace sensitivity module (single source of truth for sensitive elements)
 require_relative "whitespace_sensitivity"
@@ -150,6 +151,8 @@ module Canon
               html_version: detect_html_version_from_node(node1),
               match_options: match_opts_hash,
               algorithm: :dom,
+              parse_errors_expected: Comparison.parse_errors_for(node1),
+              parse_errors_received: Comparison.parse_errors_for(node2),
             )
           elsif result != Comparison::EQUIVALENT && !differences.empty?
             # Non-verbose mode: check equivalence
@@ -172,10 +175,42 @@ module Canon
         # @param node2 [Object] Second node
         # @return [Boolean] true if both are document fragments
         def fragment_nodes?(node1, node2)
-          (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
-           node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
-            (node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
-             node2.is_a?(Nokogiri::XML::DocumentFragment))
+          fragment_node?(node1) && fragment_node?(node2)
+        end
+        # Check if a single node is a recognised document fragment.
+        # All three Nokogiri fragment types (XML, HTML4, HTML5) must be
+        # accepted: dom_diff routes html/html4/html5 input through
+        # Nokogiri::HTML5.fragment per #118.
+        #
+        # @param node [Object] Node to test
+        # @return [Boolean] true if node is an XML, HTML4 or HTML5
+        #   document fragment
+        def fragment_node?(node)
+          node.is_a?(Nokogiri::XML::DocumentFragment) ||
+            node.is_a?(Nokogiri::HTML4::DocumentFragment) ||
+            node.is_a?(Nokogiri::HTML5::DocumentFragment)
+        end
+        # Record a DiffNode for a fragment-level child-count mismatch.
+        # Each surplus child becomes its own MISSING_NODE diff so the
+        # downstream report shows what was added or removed.
+        #
+        # Only the positional tail of the longer list (the entries past the
+        # length of the shorter one) is reported; children in the common
+        # prefix are not compared here.
+        #
+        # @param _node1 [Object] First fragment (unused)
+        # @param _node2 [Object] Second fragment (unused)
+        # @param children1 [Array] Filtered children of the first fragment
+        #   (presumably an Array -- confirm against filter_children)
+        # @param children2 [Array] Filtered children of the second fragment
+        # @param differences [Array] Collector the new DiffNodes are appended to
+        def record_fragment_length_mismatch(_node1, _node2, children1,
+                                            children2, differences)
+          # The visible caller invokes this only when the lengths differ, so
+          # the else branch means children2 is the longer list.
+          longer, shorter, side = if children1.length > children2.length
+                                    [children1, children2, :removed]
+                                  else
+                                    [children2, children1, :added]
+                                  end
+          # Endless range: everything from index shorter.length onwards.
+          longer[shorter.length...].each do |orphan|
+            # A surplus child from side 1 fills node1; otherwise it fills node2.
+            n1 = side == :removed ? orphan : nil
+            n2 = side == :removed ? nil    : orphan
+            differences <<
+              Canon::Comparison::DiffNodeBuilder.build(
+                node1: n1,
+                node2: n2,
+                diff1: Comparison::MISSING_NODE,
+                diff2: Comparison::MISSING_NODE,
+                dimension: :element_structure,
+              )
+          end
         end
         # Compare children of document fragments
@@ -196,6 +231,13 @@ module Canon
           children2 = XmlNodeComparison.filter_children(all_children2, opts)
           if children1.length != children2.length
+            # Record the length mismatch as a DiffNode so verbose mode
+            # surfaces it. Without this, equivalent? wraps an empty
+            # differences array and incorrectly reports the inputs as
+            # equivalent.
+            record_fragment_length_mismatch(node1, node2,
+                                            children1, children2,
+                                            differences)
             return Comparison::UNEQUAL_ELEMENTS
           elsif children1.empty?
             return Comparison::EQUIVALENT
@@ -260,6 +302,8 @@ module Canon
               html_version: html_version,
               match_options: match_opts_hash.merge(strategy.metadata),
               algorithm: :semantic,
+              parse_errors_expected: Comparison.parse_errors_for(node1),
+              parse_errors_received: Comparison.parse_errors_for(node2),
             )
           else
             # Simple boolean result - equivalent if no normative differences
@@ -291,10 +335,12 @@ module Canon
                           node.to_html
                         end
-          # Use XML fragment parser to preserve structure without auto-generated elements
-          # This avoids both HTML4's meta tag insertion and HTML5's tag stripping
-          # See: https://stackoverflow.com/questions/25998824/stop-nokogiri-from-adding-doctype-and-meta-tags
-          frag = Nokogiri::XML.fragment(html_string)
+          # Use XML fragment parser to preserve structure without auto-generated elements.
+          # Decode HTML named entities (&nbsp; etc.) to UTF-8 characters since XML
+          # parser only understands the five XML entities.
+          frag = Nokogiri::XML.fragment(
+            decode_html_named_entities(html_string),
+          )
           # Apply preprocessing if needed
           if preprocessing == :rendered
@@ -448,8 +494,12 @@ module Canon
                         end
           # Parse as Nokogiri fragment for DOM comparison
-          # Use XML fragment parser to avoid auto-inserted meta tags
-          frag = Nokogiri::XML.fragment(html_string)
+          # Use XML fragment parser to avoid auto-inserted meta tags.
+          # Decode HTML named entities (&nbsp; etc.) to UTF-8 characters since
+          # XML parser only understands the five XML entities.
+          frag = Nokogiri::XML.fragment(
+            decode_html_named_entities(html_string),
+          )
           # Apply post-parsing filtering for :normalize, :format, and :rendered preprocessing
           if %i[normalize format rendered].include?(preprocessing)
@@ -496,6 +546,33 @@ module Canon
         # Detect HTML version from content
         #
+        # Decode HTML named entities to their UTF-8 character equivalents.
+        # This is a targeted replacement that only changes entity references,
+        # preserving all tag structure. Needed because Nokogiri::XML.fragment
+        # only understands the five XML entities (&amp; &lt; &gt; &quot; &apos;).
+        #
+        # Only the fixed list of entities below is handled; any other named
+        # entity is left in the string unchanged. Matching is case-insensitive,
+        # so e.g. &NBSP; is replaced as well.
+        #
+        # NOTE(review): this comment block and method sit between the
+        # "Detect HTML version from content" header line and the @param/@return
+        # tags of detect_html_version, splitting that method's doc comment.
+        #
+        # @param str [String] HTML string possibly containing named entities
+        # @return [String] String with named entities replaced by UTF-8 chars;
+        #   the same object is returned when str contains no "&"
+        def decode_html_named_entities(str)
+          # Fast path: without an ampersand there is nothing to decode.
+          return str unless str.include?("&")
+          str.gsub(/&nbsp;/i, "\u00A0")
+            .gsub(/&ensp;/i, "\u2002")
+            .gsub(/&emsp;/i, "\u2003")
+            .gsub(/&thinsp;/i, "\u2009")
+            .gsub(/&copy;/i, "\u00A9")
+            .gsub(/&reg;/i, "\u00AE")
+            .gsub(/&trade;/i, "\u2122")
+            .gsub(/&mdash;/i, "\u2014")
+            .gsub(/&ndash;/i, "\u2013")
+            .gsub(/&lsquo;/i, "\u2018")
+            .gsub(/&rsquo;/i, "\u2019")
+            .gsub(/&ldquo;/i, "\u201C")
+            .gsub(/&rdquo;/i, "\u201D")
+            .gsub(/&bull;/i, "\u2022")
+            .gsub(/&hellip;/i, "\u2026")
+        end
         # @param content [String] HTML content
         # @return [Symbol] :html5 or :html4
         def detect_html_version(content)
@@ -721,8 +798,16 @@ compare_profile = nil)
             parent = text_node.parent
             next if ancestor_preserves_whitespace?(parent, preserve_whitespace)
+            content = text_node.content
+            # NBSP (U+00A0) is never insignificant — don't remove
+            next if content.include?("\u00A0")
+            # Whitespace between inline siblings is significant — don't remove
+            next if WhitespaceSensitivity.inline_whitespace_significant?(text_node)
             # Remove if the text is only whitespace (after normalization)
-            if text_node.content.strip.empty?
+            if content.strip.empty?
               text_node.remove
             end
           end