RubyGems - canon - Versions diffs - 0.1.8 → 0.1.9 - Mend

canon 0.1.8 → 0.1.9

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (98)

checksums.yaml +4 -4
data/.rubocop_todo.yml +112 -25
data/docs/Gemfile +1 -0
data/docs/_config.yml +90 -1
data/docs/advanced/diff-classification.adoc +82 -2
data/docs/features/match-options/index.adoc +239 -1
data/lib/canon/comparison/format_detector.rb +2 -1
data/lib/canon/comparison/html_comparator.rb +19 -8
data/lib/canon/comparison/html_compare_profile.rb +8 -2
data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
data/lib/canon/comparison/xml_comparator.rb +48 -23
data/lib/canon/comparison/xml_node_comparison.rb +25 -3
data/lib/canon/diff/diff_classifier.rb +101 -2
data/lib/canon/diff/formatting_detector.rb +1 -1
data/lib/canon/rspec_matchers.rb +37 -8
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/data_model.rb +24 -13
metadata +3 -78
data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
data/false_positive_analysis.txt +0 -0
data/file1.html +0 -1
data/file2.html +0 -1
data/old-docs/ADVANCED_TOPICS.adoc +0 -20
data/old-docs/BASIC_USAGE.adoc +0 -16
data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
data/old-docs/CLI.adoc +0 -497
data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
data/old-docs/DIFF_FORMATTING.adoc +0 -540
data/old-docs/DIFF_PARAMETERS.adoc +0 -261
data/old-docs/DOM_DIFF.adoc +0 -1017
data/old-docs/ENV_CONFIG.adoc +0 -876
data/old-docs/FORMATS.adoc +0 -867
data/old-docs/INPUT_VALIDATION.adoc +0 -477
data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
data/old-docs/MATCH_OPTIONS.adoc +0 -912
data/old-docs/MODES.adoc +0 -432
data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
data/old-docs/OPTIONS.adoc +0 -1387
data/old-docs/PREPROCESSING.adoc +0 -491
data/old-docs/README.old.adoc +0 -2831
data/old-docs/RSPEC.adoc +0 -814
data/old-docs/RUBY_API.adoc +0 -485
data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
data/old-docs/STRING_COMPARE.adoc +0 -345
data/old-docs/TMP.adoc +0 -3384
data/old-docs/TREE_DIFF.adoc +0 -1080
data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
data/old-docs/VERBOSE.adoc +0 -482
data/old-docs/VISUALIZATION_MAP.adoc +0 -625
data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
data/scripts/analyze_current_state.rb +0 -85
data/scripts/analyze_false_positives.rb +0 -114
data/scripts/analyze_remaining_failures.rb +0 -105
data/scripts/compare_current_failures.rb +0 -95
data/scripts/compare_dom_tree_diff.rb +0 -158
data/scripts/compare_failures.rb +0 -151
data/scripts/debug_attribute_extraction.rb +0 -66
data/scripts/debug_blocks_839.rb +0 -115
data/scripts/debug_meta_matching.rb +0 -52
data/scripts/debug_p_matching.rb +0 -192
data/scripts/debug_signature_matching.rb +0 -118
data/scripts/debug_sourcecode_124.rb +0 -32
data/scripts/debug_whitespace_sensitive.rb +0 -192
data/scripts/extract_false_positives.rb +0 -138
data/scripts/find_actual_false_positives.rb +0 -125
data/scripts/investigate_all_false_positives.rb +0 -161
data/scripts/investigate_batch1.rb +0 -127
data/scripts/investigate_classification.rb +0 -150
data/scripts/investigate_classification_detailed.rb +0 -190
data/scripts/investigate_common_failures.rb +0 -342
data/scripts/investigate_false_negative.rb +0 -80
data/scripts/investigate_false_positive.rb +0 -83
data/scripts/investigate_false_positives.rb +0 -227
data/scripts/investigate_false_positives_batch.rb +0 -163
data/scripts/investigate_mixed_content.rb +0 -125
data/scripts/investigate_remaining_16.rb +0 -214
data/scripts/run_single_test.rb +0 -29
data/scripts/test_all_false_positives.rb +0 -95
data/scripts/test_attribute_details.rb +0 -61
data/scripts/test_both_algorithms.rb +0 -49
data/scripts/test_both_simple.rb +0 -49
data/scripts/test_enhanced_semantic_output.rb +0 -125
data/scripts/test_readme_examples.rb +0 -131
data/scripts/test_semantic_tree_diff.rb +0 -99
data/scripts/test_semantic_ux_improvements.rb +0 -135
data/scripts/test_single_false_positive.rb +0 -119
data/scripts/test_size_limits.rb +0 -99
data/test_html_1.html +0 -21
data/test_html_2.html +0 -21
data/test_nokogiri.rb +0 -33
data/test_normalize.rb +0 -45

data/lib/canon/comparison/xml_comparator/node_type_comparator.rb CHANGED Viewed

@@ -23,7 +23,8 @@ module Canon
           # @param diff_children [Boolean] Whether to diff children
           # @param differences [Array] Array to collect differences
           # @return [Integer] Comparison result code
-          def compare(node1, node2, comparator, opts, child_opts, diff_children, differences)
+          def compare(node1, node2, comparator, opts, child_opts,
+diff_children, differences)
             # Dispatch based on node type
             # Canon::Xml::Node types use .node_type method that returns symbols
             # Nokogiri also has .node_type but returns integers, so check for Symbol
@@ -51,11 +52,14 @@ module Canon
               comparator.send(:compare_element_nodes, node1, node2, opts, child_opts,
                               diff_children, differences)
             when :text
-              comparator.send(:compare_text_nodes, node1, node2, opts, differences)
+              comparator.send(:compare_text_nodes, node1, node2, opts,
+                              differences)
             when :comment
-              comparator.send(:compare_comment_nodes, node1, node2, opts, differences)
+              comparator.send(:compare_comment_nodes, node1, node2, opts,
+                              differences)
             when :cdata
-              comparator.send(:compare_text_nodes, node1, node2, opts, differences)
+              comparator.send(:compare_text_nodes, node1, node2, opts,
+                              differences)
             when :processing_instruction
               comparator.send(:compare_processing_instruction_nodes, node1, node2, opts,
                               differences)
@@ -71,11 +75,14 @@ module Canon
               comparator.send(:compare_element_nodes, node1, node2, opts, child_opts,
                               diff_children, differences)
             elsif node1.respond_to?(:text?) && node1.text?
-              comparator.send(:compare_text_nodes, node1, node2, opts, differences)
+              comparator.send(:compare_text_nodes, node1, node2, opts,
+                              differences)
             elsif node1.respond_to?(:comment?) && node1.comment?
-              comparator.send(:compare_comment_nodes, node1, node2, opts, differences)
+              comparator.send(:compare_comment_nodes, node1, node2, opts,
+                              differences)
             elsif node1.respond_to?(:cdata?) && node1.cdata?
-              comparator.send(:compare_text_nodes, node1, node2, opts, differences)
+              comparator.send(:compare_text_nodes, node1, node2, opts,
+                              differences)
             elsif node1.respond_to?(:processing_instruction?) &&
                 node1.processing_instruction?
               comparator.send(:compare_processing_instruction_nodes, node1, node2, opts,

data/lib/canon/comparison/xml_comparator.rb CHANGED Viewed

@@ -18,6 +18,8 @@ require_relative "xml_comparator/namespace_comparator"
 require_relative "xml_comparator/node_type_comparator"
 require_relative "xml_comparator/child_comparison"
 require_relative "xml_comparator/diff_node_builder"
+# Whitespace sensitivity module
+require_relative "whitespace_sensitivity"
 module Canon
   module Comparison
@@ -90,9 +92,15 @@ module Canon
           # Create child_opts with resolved options
           child_opts = opts.merge(child_opts)
+          # Determine if we should preserve whitespace during parsing
+          # When structural_whitespace is :strict, preserve all whitespace-only text nodes
+          preserve_whitespace = match_opts_hash[:structural_whitespace] == :strict
           # Parse nodes if they are strings, applying preprocessing if needed
-          node1 = parse_node(n1, match_opts_hash[:preprocessing])
-          node2 = parse_node(n2, match_opts_hash[:preprocessing])
+          node1 = parse_node(n1, match_opts_hash[:preprocessing],
+                             preserve_whitespace: preserve_whitespace)
+          node2 = parse_node(n2, match_opts_hash[:preprocessing],
+                             preserve_whitespace: preserve_whitespace)
           # Store original strings for line diff display (before preprocessing)
           original1 = if n1.is_a?(String)
@@ -209,8 +217,9 @@ module Canon
         # Parse a node from string or return as-is
         # Applies preprocessing transformation before parsing if specified
         # Delegates to NodeParser module
-        def parse_node(node, preprocessing = :none)
-          XmlComparatorHelpers::NodeParser.parse(node, preprocessing)
+        def parse_node(node, preprocessing = :none, preserve_whitespace: false)
+          XmlComparatorHelpers::NodeParser.parse(node, preprocessing,
+                                                 preserve_whitespace: preserve_whitespace)
         end
         # Main comparison dispatcher
@@ -331,7 +340,8 @@ module Canon
           # For HTML, check if text node is inside whitespace-preserving element
           # If so, always use strict comparison regardless of text_content setting
-          if should_preserve_whitespace_strictly?(n1, n2)
+          sensitive_element = should_preserve_whitespace_strictly?(n1, n2, opts)
+          if sensitive_element
             behavior = :strict
           end
@@ -344,15 +354,23 @@ module Canon
           # Determine the correct dimension for this difference
           # - If text_content is :strict, ALL differences use :text_content dimension
-          # - If text_content is :normalize, whitespace-only diffs use :structural_whitespace
+          # - If text_content is :normalize, whitespace-only diffs could use :structural_whitespace
+          #   but we keep :text_content to ensure correct classification behavior
           # - Otherwise use :text_content
-          dimension = if behavior == :normalize && whitespace_only_difference?(
-            text1, text2
-          )
-                        :structural_whitespace
-                      else
-                        :text_content
-                      end
+          # However, if element is whitespace-sensitive (like <pre> in HTML),
+          # always use :text_content dimension regardless of behavior
+          #
+          # NOTE: We keep the dimension as :text_content even for whitespace-only diffs
+          # when text_content: :normalize. This ensures that the classification uses
+          # the text_content behavior (:normalize) instead of structural_whitespace
+          # behavior (:strict for XML), which would incorrectly mark the diff as normative.
+          if sensitive_element
+          # Whitespace-sensitive element: always use :text_content dimension
+          else
+            # Always use :text_content for text differences
+            # This ensures correct classification based on text_content behavior
+          end
+          dimension = :text_content
           # Create DiffNode in verbose mode when raw content differs
           # This ensures informative diffs are created even for :ignore/:normalize
@@ -368,17 +386,23 @@ module Canon
         # Check if whitespace should be preserved strictly for these text nodes
         # This applies to HTML elements like pre, code, textarea, script, style
-        def should_preserve_whitespace_strictly?(n1, n2)
-          # Only applies to Nokogiri nodes (HTML)
-          return false unless n1.respond_to?(:parent) && n2.respond_to?(:parent)
-          return false unless n1.parent.respond_to?(:name) && n2.parent.respond_to?(:name)
+        # and elements with xml:space="preserve" or in user-configured whitelist
+        def should_preserve_whitespace_strictly?(n1, n2, opts)
+          # Use WhitespaceSensitivity module to check if element is sensitive
+          # Check both n1 and n2 - if either is in a sensitive element, preserve strictly
+          if n1.respond_to?(:parent)
+            sensitivity_opts = { match_opts: opts[:match_opts] }
+            return true if WhitespaceSensitivity.element_sensitive?(n1,
+                                                                    sensitivity_opts)
+          end
-          # Elements where whitespace must be preserved in HTML
-          preserve_elements = %w[pre code textarea script style]
+          if n2.respond_to?(:parent)
+            sensitivity_opts = { match_opts: opts[:match_opts] }
+            return true if WhitespaceSensitivity.element_sensitive?(n2,
+                                                                    sensitivity_opts)
+          end
-          # Check if either node is inside a whitespace-preserving element
-          in_preserve_element?(n1, preserve_elements) ||
-            in_preserve_element?(n2, preserve_elements)
+          false
         end
         # Check if a node is inside a whitespace-preserving element
@@ -469,7 +493,8 @@ module Canon
         #
         # Delegates to ChildComparison module which handles both ElementMatcher
         # (semantic matching) and simple positional comparison.
-        def compare_children(n1, n2, opts, child_opts, diff_children, differences)
+        def compare_children(n1, n2, opts, child_opts, diff_children,
+differences)
           XmlComparatorHelpers::ChildComparison.compare(
             n1, n2, self, opts, child_opts, diff_children, differences
           )

data/lib/canon/comparison/xml_node_comparison.rb CHANGED Viewed

@@ -139,9 +139,13 @@ diff_children, differences)
         # Check structural_whitespace match option
         match_opts = opts[:match_opts]
-        # Filter out whitespace-only text nodes
-        if match_opts && %i[ignore
-                            normalize].include?(match_opts[:structural_whitespace]) && text_node?(node)
+        return false unless match_opts
+        # Filter out whitespace-only text nodes based on structural_whitespace setting
+        # - :ignore or :normalize: Filter all whitespace-only text nodes
+        # - :strict: Preserve all whitespace-only text nodes (don't filter any)
+        if text_node?(node) && %i[ignore
+                                  normalize].include?(match_opts[:structural_whitespace])
           text = node_text(node)
           return true if MatchOptions.normalize_text(text).empty?
         end
@@ -184,6 +188,24 @@ diff_children, differences)
           node.respond_to?(:node_type) && node.node_type == :text
       end
+      # Extract text content from a node
+      #
+      # @param node [Object] Node to extract text from
+      # @return [String] Text content
+      def self.node_text(node)
+        return "" unless node
+        if node.respond_to?(:content)
+          node.content.to_s
+        elsif node.respond_to?(:text)
+          node.text.to_s
+        elsif node.respond_to?(:value)
+          node.value.to_s
+        else
+          ""
+        end
+      end
       # Dispatch by Canon::Xml::Node type
       def self.dispatch_canon_node_type(node1, node2, opts, child_opts,
 diff_children, differences)

data/lib/canon/diff/diff_classifier.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 require_relative "formatting_detector"
 require_relative "../comparison/compare_profile"
+require_relative "../comparison/whitespace_sensitivity"
 module Canon
   module Diff
@@ -28,6 +29,28 @@ module Canon
       # @param diff_node [DiffNode] The diff node to classify
       # @return [DiffNode] The same diff node with normative/formatting attributes set
       def classify(diff_node)
+        # SPECIAL CASE: text_content with :normalize behavior
+        # When text_content is :normalize and the difference is formatting-only,
+        # it should be marked as non-normative (informative)
+        # This ensures that verbose and non-verbose modes give consistent results
+        #
+        # EXCEPTION: If the text node is inside a whitespace-sensitive element
+        # (like <pre>, <code>, <textarea> in HTML), don't apply formatting detection
+        # because whitespace should be preserved in these elements
+        #
+        # This check must come FIRST, before normative_dimension? is called,
+        # because normative_dimension? returns true for text_content: :normalize
+        # (since the dimension affects equivalence), which would prevent formatting
+        # detection from being applied.
+        if diff_node.dimension == :text_content &&
+            profile.send(:behavior_for, :text_content) == :normalize &&
+            !inside_whitespace_sensitive_element?(diff_node) &&
+            formatting_only_diff?(diff_node)
+          diff_node.formatting = true
+          diff_node.normative = false
+          return diff_node
+        end
         # FIRST: Determine if this dimension is normative based on CompareProfile
         # This respects the policy settings (strict/normalize/ignore)
         is_normative = profile.normative_dimension?(diff_node.dimension)
@@ -45,7 +68,7 @@ module Canon
           return diff_node
         end
-        # Otherwise, use the normative determination from CompareProfile
+        # THIRD: Apply the normative determination from CompareProfile
         diff_node.formatting = false
         diff_node.normative = is_normative
@@ -65,10 +88,86 @@ module Canon
       # @param diff_node [DiffNode] The diff node to check
       # @return [Boolean] true if formatting-only
       def formatting_only_diff?(diff_node)
+        # Only apply formatting detection to actual text content differences
+        # If the nodes are not text nodes (e.g., element nodes), don't apply formatting detection
+        node1 = diff_node.node1
+        node2 = diff_node.node2
+        # Check if both nodes are text nodes
+        # If not, this is not a formatting-only difference
+        return false unless text_node?(node1) && text_node?(node2)
         text1 = extract_text_content(diff_node.node1)
         text2 = extract_text_content(diff_node.node2)
-        FormattingDetector.formatting_only?(text1, text2)
+        # For text_content dimension, use normalized text comparison
+        # This handles cases like "" vs "   " (both normalize to "")
+        if diff_node.dimension == :text_content
+          normalized_equivalent?(text1, text2)
+        else
+          FormattingDetector.formatting_only?(text1, text2)
+        end
+      end
+      # Check if two texts are equivalent after normalization
+      # This detects formatting-only differences where normalized texts match
+      # @param text1 [String, nil] First text
+      # @param text2 [String, nil] Second text
+      # @return [Boolean] true if normalized texts are equivalent
+      def normalized_equivalent?(text1, text2)
+        return false if text1.nil? && text2.nil?
+        return false if text1.nil? || text2.nil?
+        # Use MatchOptions.normalize_text for consistency
+        normalized1 = Canon::Comparison::MatchOptions.normalize_text(text1)
+        normalized2 = Canon::Comparison::MatchOptions.normalize_text(text2)
+        # If normalized texts are equivalent but originals are different,
+        # it's a formatting-only difference
+        normalized1 == normalized2 && text1 != text2
+      end
+      # Check if a node is a text node
+      # @param node [Object] The node to check
+      # @return [Boolean] true if the node is a text node
+      def text_node?(node)
+        return false if node.nil?
+        # Canon::Xml::Nodes::TextNode
+        return true if node.is_a?(Canon::Xml::Nodes::TextNode)
+        # Nokogiri text nodes (node_type returns integer constant like 3)
+        return true if node.respond_to?(:node_type) &&
+                       node.node_type.is_a?(Integer) &&
+                       node.node_type == Nokogiri::XML::Node::TEXT_NODE
+        # Moxml text nodes (node_type returns symbol)
+        return true if node.respond_to?(:node_type) && node.node_type == :text
+        # String
+        return true if node.is_a?(String)
+        # Test doubles or objects with text node-like interface
+        # Check if it has a value method (contains text content)
+        return true if node.respond_to?(:value)
+        false
+      end
+      # Check if the text node is inside a whitespace-sensitive element
+      # @param diff_node [DiffNode] The diff node to check
+      # @return [Boolean] true if inside a whitespace-sensitive element
+      def inside_whitespace_sensitive_element?(diff_node)
+        # Get the text node (not the parent element)
+        node = diff_node.node1 || diff_node.node2
+        return false unless node
+        # WhitespaceSensitivity.element_sensitive? expects a text node
+        # and checks its parent element
+        # We need to pass the full options structure with :match_opts key
+        opts = { match_opts: @match_options.options }
+        Canon::Comparison::WhitespaceSensitivity.element_sensitive?(node, opts)
       end
       # Extract text content from a node for formatting comparison

data/lib/canon/diff/formatting_detector.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module Canon
       # @param line2 [String, nil] Second line to compare
       # @return [Boolean] true if lines differ only in formatting
       def self.formatting_only?(line1, line2)
-        # If both are nil or empty, not a formatting diff
+        # If both are nil or empty, not a formatting diff (no difference)
         return false if blank?(line1) && blank?(line2)
         # If only one is blank, it's not just formatting

data/lib/canon/rspec_matchers.rb CHANGED Viewed

@@ -63,6 +63,15 @@ module Canon
         self
       end
+      # Chain method for setting match options
+      # @param match_opts [Hash] match options
+      # @return [SerializationMatcher] self for chaining
+      def with_match(**match_opts)
+        @match ||= {}
+        @match = @match.merge(match_opts)
+        self
+      end
       def matches?(target)
         @target = target
@@ -252,12 +261,22 @@ module Canon
                                diff_algorithm: diff_algorithm)
     end
-    def be_yaml_equivalent_to(expected)
-      SerializationMatcher.new(expected, :yaml)
+    def be_yaml_equivalent_to(expected, match_profile: nil, match: nil,
+                              preprocessing: nil, diff_algorithm: nil)
+      SerializationMatcher.new(expected, :yaml,
+                               match_profile: match_profile,
+                               match: match,
+                               preprocessing: preprocessing,
+                               diff_algorithm: diff_algorithm)
     end
-    def be_json_equivalent_to(expected)
-      SerializationMatcher.new(expected, :json)
+    def be_json_equivalent_to(expected, match_profile: nil, match: nil,
+                              preprocessing: nil, diff_algorithm: nil)
+      SerializationMatcher.new(expected, :json,
+                               match_profile: match_profile,
+                               match: match,
+                               preprocessing: preprocessing,
+                               diff_algorithm: diff_algorithm)
     end
     def be_html_equivalent_to(expected, match_profile: nil, match: nil,
@@ -287,12 +306,22 @@ module Canon
                                diff_algorithm: diff_algorithm)
     end
-    def be_equivalent_to(expected)
-      SerializationMatcher.new(expected, nil)
+    def be_equivalent_to(expected, match_profile: nil, match: nil,
+                         preprocessing: nil, diff_algorithm: nil)
+      SerializationMatcher.new(expected, nil,
+                               match_profile: match_profile,
+                               match: match,
+                               preprocessing: preprocessing,
+                               diff_algorithm: diff_algorithm)
     end
-    def be_string_equivalent_to(expected)
-      SerializationMatcher.new(expected, :string)
+    def be_string_equivalent_to(expected, match_profile: nil, match: nil,
+                                 preprocessing: nil, diff_algorithm: nil)
+      SerializationMatcher.new(expected, :string,
+                               match_profile: match_profile,
+                               match: match,
+                               preprocessing: preprocessing,
+                               diff_algorithm: diff_algorithm)
     end
     if defined?(::RSpec) && ::RSpec.respond_to?(:configure)

data/lib/canon/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Canon
-  VERSION = "0.1.8"
+  VERSION = "0.1.9"
 end

data/lib/canon/xml/data_model.rb CHANGED Viewed

@@ -18,8 +18,9 @@ module Canon
       # Build XPath data model from XML string
       #
       # @param xml_string [String] XML content to parse
+      # @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
       # @return [Nodes::RootNode] Root of the data model tree
-      def self.from_xml(xml_string)
+      def self.from_xml(xml_string, preserve_whitespace: false)
         # Parse with Nokogiri
         doc = Nokogiri::XML(xml_string) do |config|
           config.nonet     # Disable network access
@@ -30,7 +31,7 @@ module Canon
         check_for_relative_namespace_uris(doc)
         # Convert to XPath data model
-        build_from_nokogiri(doc)
+        build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
       end
       # Alias for compatibility with base class interface
@@ -74,19 +75,21 @@ module Canon
       # Build XPath data model from Nokogiri document or fragment
       # rubocop:disable Metrics/MethodLength
-      def self.build_from_nokogiri(nokogiri_doc)
+      def self.build_from_nokogiri(nokogiri_doc, preserve_whitespace: false)
         root = Nodes::RootNode.new
         if nokogiri_doc.respond_to?(:root) && nokogiri_doc.root
           # For Documents (XML, HTML4, HTML5, Moxml): process the root element
-          root.add_child(build_element_node(nokogiri_doc.root))
+          root.add_child(build_element_node(nokogiri_doc.root,
+                                            preserve_whitespace: preserve_whitespace))
           # Process PIs and comments outside doc element
           nokogiri_doc.children.each do |child|
             next if child == nokogiri_doc.root
             next if child.is_a?(Nokogiri::XML::DTD)
-            node = build_node_from_nokogiri(child)
+            node = build_node_from_nokogiri(child,
+                                            preserve_whitespace: preserve_whitespace)
             root.add_child(node) if node
           end
         else
@@ -95,7 +98,8 @@ module Canon
           nokogiri_doc.children.each do |child|
             next if child.is_a?(Nokogiri::XML::DTD)
-            node = build_node_from_nokogiri(child)
+            node = build_node_from_nokogiri(child,
+                                            preserve_whitespace: preserve_whitespace)
             root.add_child(node) if node
           end
         end
@@ -104,12 +108,15 @@ module Canon
       end
       # Build node from Nokogiri node
-      def self.build_node_from_nokogiri(nokogiri_node)
+      def self.build_node_from_nokogiri(nokogiri_node,
+preserve_whitespace: false)
         case nokogiri_node
         when Nokogiri::XML::Element
-          build_element_node(nokogiri_node)
+          build_element_node(nokogiri_node,
+                             preserve_whitespace: preserve_whitespace)
         when Nokogiri::XML::Text
-          build_text_node(nokogiri_node)
+          build_text_node(nokogiri_node,
+                          preserve_whitespace: preserve_whitespace)
         when Nokogiri::XML::Comment
           build_comment_node(nokogiri_node)
         when Nokogiri::XML::ProcessingInstruction
@@ -119,7 +126,7 @@ module Canon
       # Build element node from Nokogiri element
       # rubocop:disable Metrics/MethodLength
-      def self.build_element_node(nokogiri_element)
+      def self.build_element_node(nokogiri_element, preserve_whitespace: false)
         element = Nodes::ElementNode.new(
           name: nokogiri_element.name,
           namespace_uri: nokogiri_element.namespace&.href,
@@ -134,7 +141,8 @@ module Canon
         # Build child nodes
         nokogiri_element.children.each do |child|
-          node = build_node_from_nokogiri(child)
+          node = build_node_from_nokogiri(child,
+                                          preserve_whitespace: preserve_whitespace)
           element.add_child(node) if node
         end
@@ -195,13 +203,16 @@ module Canon
       end
       # Build text node from Nokogiri text node
-      def self.build_text_node(nokogiri_text)
+      def self.build_text_node(nokogiri_text, preserve_whitespace: false)
         # XML text nodes: preserve all content including whitespace
         # Unlike HTML, XML treats all whitespace as significant
         content = nokogiri_text.content
         # Skip empty text nodes between elements (common formatting whitespace)
-        return nil if content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element)
+        # UNLESS preserve_whitespace is true (for structural_whitespace: :strict)
+        if !preserve_whitespace && content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element)
+          return nil
+        end
         # Nokogiri already handles CDATA conversion and entity resolution
         Nodes::TextNode.new(value: content)

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: canon
 version: !ruby/object:Gem::Version
-  version: 0.1.8
+  version: 0.1.9
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-01-18 00:00:00.000000000 Z
+date: 2026-01-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: diff-lcs
@@ -174,7 +174,6 @@ files:
 - docs/internals/diffnode-enrichment.adoc
 - docs/internals/index.adoc
 - docs/lychee.toml
-- docs/plans/2025-01-17-html-parser-selection-fix.adoc
 - docs/reference/cli-options.adoc
 - docs/reference/environment-variables.adoc
 - docs/reference/index.adoc
@@ -191,9 +190,6 @@ files:
 - docs/understanding/formats/yaml.adoc
 - docs/understanding/index.adoc
 - exe/canon
-- false_positive_analysis.txt
-- file1.html
-- file2.html
 - lib/canon.rb
 - lib/canon/cache.rb
 - lib/canon/cli.rb
@@ -230,6 +226,7 @@ files:
 - lib/canon/comparison/strategies/base_match_strategy.rb
 - lib/canon/comparison/strategies/match_strategy_factory.rb
 - lib/canon/comparison/strategies/semantic_tree_match_strategy.rb
+- lib/canon/comparison/whitespace_sensitivity.rb
 - lib/canon/comparison/xml_comparator.rb
 - lib/canon/comparison/xml_comparator/attribute_comparator.rb
 - lib/canon/comparison/xml_comparator/attribute_filter.rb
@@ -344,79 +341,7 @@ files:
 - lib/canon/xml/whitespace_normalizer.rb
 - lib/canon/xml/xml_base_handler.rb
 - lib/xml-c14n.rb
-- old-docs/ADVANCED_TOPICS.adoc
-- old-docs/BASIC_USAGE.adoc
-- old-docs/CHARACTER_VISUALIZATION.adoc
-- old-docs/CLI.adoc
-- old-docs/CUSTOMIZING_BEHAVIOR.adoc
-- old-docs/DIFF_ARCHITECTURE.adoc
-- old-docs/DIFF_FORMATTING.adoc
-- old-docs/DIFF_PARAMETERS.adoc
-- old-docs/DOM_DIFF.adoc
-- old-docs/ENV_CONFIG.adoc
-- old-docs/FORMATS.adoc
-- old-docs/INPUT_VALIDATION.adoc
-- old-docs/MATCHER_BEHAVIOR.adoc
-- old-docs/MATCH_ARCHITECTURE.adoc
-- old-docs/MATCH_OPTIONS.adoc
-- old-docs/MODES.adoc
-- old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc
-- old-docs/OPTIONS.adoc
-- old-docs/PREPROCESSING.adoc
-- old-docs/README.old.adoc
-- old-docs/RSPEC.adoc
-- old-docs/RUBY_API.adoc
-- old-docs/SEMANTIC_DIFF_REPORT.adoc
-- old-docs/SEMANTIC_TREE_DIFF.adoc
-- old-docs/STRING_COMPARE.adoc
-- old-docs/TMP.adoc
-- old-docs/TREE_DIFF.adoc
-- old-docs/UNDERSTANDING_CANON.adoc
-- old-docs/VERBOSE.adoc
-- old-docs/VISUALIZATION_MAP.adoc
-- old-docs/WHITESPACE_TREATMENT.adoc
-- scripts/analyze_current_state.rb
-- scripts/analyze_false_positives.rb
-- scripts/analyze_remaining_failures.rb
-- scripts/compare_current_failures.rb
-- scripts/compare_dom_tree_diff.rb
-- scripts/compare_failures.rb
-- scripts/debug_attribute_extraction.rb
-- scripts/debug_blocks_839.rb
-- scripts/debug_meta_matching.rb
-- scripts/debug_p_matching.rb
-- scripts/debug_signature_matching.rb
-- scripts/debug_sourcecode_124.rb
-- scripts/debug_whitespace_sensitive.rb
-- scripts/extract_false_positives.rb
-- scripts/find_actual_false_positives.rb
-- scripts/investigate_all_false_positives.rb
-- scripts/investigate_batch1.rb
-- scripts/investigate_classification.rb
-- scripts/investigate_classification_detailed.rb
-- scripts/investigate_common_failures.rb
-- scripts/investigate_false_negative.rb
-- scripts/investigate_false_positive.rb
-- scripts/investigate_false_positives.rb
-- scripts/investigate_false_positives_batch.rb
-- scripts/investigate_mixed_content.rb
-- scripts/investigate_remaining_16.rb
-- scripts/run_single_test.rb
-- scripts/test_all_false_positives.rb
-- scripts/test_attribute_details.rb
-- scripts/test_both_algorithms.rb
-- scripts/test_both_simple.rb
-- scripts/test_enhanced_semantic_output.rb
-- scripts/test_readme_examples.rb
-- scripts/test_semantic_tree_diff.rb
-- scripts/test_semantic_ux_improvements.rb
-- scripts/test_single_false_positive.rb
-- scripts/test_size_limits.rb
 - sig/xml/c14n.rbs
-- test_html_1.html
-- test_html_2.html
-- test_nokogiri.rb
-- test_normalize.rb
 homepage: https://github.com/lutaml/canon
 licenses:
 - BSD-2-Clause