RubyGems - canon - Versions diffs - 0.1.8 → 0.1.10 - Mend

canon 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

checksums.yaml +4 -4
data/.rubocop_todo.yml +83 -22
data/docs/Gemfile +1 -0
data/docs/_config.yml +90 -1
data/docs/advanced/diff-classification.adoc +196 -24
data/docs/features/match-options/index.adoc +239 -1
data/lib/canon/comparison/format_detector.rb +2 -1
data/lib/canon/comparison/html_comparator.rb +19 -8
data/lib/canon/comparison/html_compare_profile.rb +8 -2
data/lib/canon/comparison/markup_comparator.rb +109 -2
data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
data/lib/canon/comparison/xml_comparator.rb +240 -23
data/lib/canon/comparison/xml_node_comparison.rb +25 -3
data/lib/canon/diff/diff_classifier.rb +119 -5
data/lib/canon/diff/formatting_detector.rb +1 -1
data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
data/lib/canon/rspec_matchers.rb +37 -8
data/lib/canon/version.rb +1 -1
data/lib/canon/xml/data_model.rb +24 -13
metadata +4 -78
data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
data/false_positive_analysis.txt +0 -0
data/file1.html +0 -1
data/file2.html +0 -1
data/old-docs/ADVANCED_TOPICS.adoc +0 -20
data/old-docs/BASIC_USAGE.adoc +0 -16
data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
data/old-docs/CLI.adoc +0 -497
data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
data/old-docs/DIFF_FORMATTING.adoc +0 -540
data/old-docs/DIFF_PARAMETERS.adoc +0 -261
data/old-docs/DOM_DIFF.adoc +0 -1017
data/old-docs/ENV_CONFIG.adoc +0 -876
data/old-docs/FORMATS.adoc +0 -867
data/old-docs/INPUT_VALIDATION.adoc +0 -477
data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
data/old-docs/MATCH_OPTIONS.adoc +0 -912
data/old-docs/MODES.adoc +0 -432
data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
data/old-docs/OPTIONS.adoc +0 -1387
data/old-docs/PREPROCESSING.adoc +0 -491
data/old-docs/README.old.adoc +0 -2831
data/old-docs/RSPEC.adoc +0 -814
data/old-docs/RUBY_API.adoc +0 -485
data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
data/old-docs/STRING_COMPARE.adoc +0 -345
data/old-docs/TMP.adoc +0 -3384
data/old-docs/TREE_DIFF.adoc +0 -1080
data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
data/old-docs/VERBOSE.adoc +0 -482
data/old-docs/VISUALIZATION_MAP.adoc +0 -625
data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
data/scripts/analyze_current_state.rb +0 -85
data/scripts/analyze_false_positives.rb +0 -114
data/scripts/analyze_remaining_failures.rb +0 -105
data/scripts/compare_current_failures.rb +0 -95
data/scripts/compare_dom_tree_diff.rb +0 -158
data/scripts/compare_failures.rb +0 -151
data/scripts/debug_attribute_extraction.rb +0 -66
data/scripts/debug_blocks_839.rb +0 -115
data/scripts/debug_meta_matching.rb +0 -52
data/scripts/debug_p_matching.rb +0 -192
data/scripts/debug_signature_matching.rb +0 -118
data/scripts/debug_sourcecode_124.rb +0 -32
data/scripts/debug_whitespace_sensitive.rb +0 -192
data/scripts/extract_false_positives.rb +0 -138
data/scripts/find_actual_false_positives.rb +0 -125
data/scripts/investigate_all_false_positives.rb +0 -161
data/scripts/investigate_batch1.rb +0 -127
data/scripts/investigate_classification.rb +0 -150
data/scripts/investigate_classification_detailed.rb +0 -190
data/scripts/investigate_common_failures.rb +0 -342
data/scripts/investigate_false_negative.rb +0 -80
data/scripts/investigate_false_positive.rb +0 -83
data/scripts/investigate_false_positives.rb +0 -227
data/scripts/investigate_false_positives_batch.rb +0 -163
data/scripts/investigate_mixed_content.rb +0 -125
data/scripts/investigate_remaining_16.rb +0 -214
data/scripts/run_single_test.rb +0 -29
data/scripts/test_all_false_positives.rb +0 -95
data/scripts/test_attribute_details.rb +0 -61
data/scripts/test_both_algorithms.rb +0 -49
data/scripts/test_both_simple.rb +0 -49
data/scripts/test_enhanced_semantic_output.rb +0 -125
data/scripts/test_readme_examples.rb +0 -131
data/scripts/test_semantic_tree_diff.rb +0 -99
data/scripts/test_semantic_ux_improvements.rb +0 -135
data/scripts/test_single_false_positive.rb +0 -119
data/scripts/test_size_limits.rb +0 -99
data/test_html_1.html +0 -21
data/test_html_2.html +0 -21
data/test_nokogiri.rb +0 -33
data/test_normalize.rb +0 -45

data/docs/features/match-options/index.adoc CHANGED Viewed

@@ -45,10 +45,37 @@ Match dimensions are orthogonal aspects that can be configured independently.
 `:strict`:: Text must match exactly, character-for-character including all whitespace
-`:normalize`:: Whitespace is normalized (collapsed/trimmed) before comparison
+`:normalize`:: Whitespace is normalized (collapsed/trimmed) before comparison.
+Formatting-only differences (e.g., extra spaces around text) are classified as
+*informative* rather than normative. This means documents with only whitespace
+differences in text content are considered equivalent.
 `:ignore`:: Text content is completely ignored in comparison
+.Using text_content: :normalize
+[example]
+====
+[source,ruby]
+----
+# These are equivalent with :normalize
+# Whitespace differences are formatting-only (informative)
+Canon.equivalent?(
+  '<p>  text  </p>',
+  '<p>text</p>',
+  match: { text_content: :normalize }
+)
+# => true
+# These differ in :strict mode
+Canon.equivalent?(
+  '<p>  text  </p>',
+  '<p>text</p>',
+  match: { text_content: :strict }
+)
+# => false
+----
+====
 === structural_whitespace
 **Applies to**: All formats
@@ -63,6 +90,200 @@ Match dimensions are orthogonal aspects that can be configured independently.
 `:ignore`:: Structural whitespace is completely ignored
+=== Whitespace sensitivity at element level
+==== General
+In XML, whitespace sensitivity can vary by schema and element:
+* Elements that apply `xml:space="preserve"` are whitespace-sensitive.
+* Other elements may be defined as sensitive by schema (e.g. the
+`xs:whiteSpace` facet with `value="preserve"` in XML Schema) or by unannounced
+conventions, such as for mixed content.
+In HTML, elements like `<pre>` and `<code>` preserve whitespace, while others
+like `<div>` and `<p>` do not.
+In the unannounced cases, the developer must indicate which elements are
+whitespace-sensitive.
+In Canon, you can control whitespace sensitivity at the element level using
+`structural_whitespace: :strict` or `text_content: :normalize`.
+Element-level sensitivity controls both:
+* `structural_whitespace`: Whether whitespace between child elements of the
+element is preserved
+* `text_content`: Whether whitespace within text nodes of the element is
+normalized
+Options for controlling element-level sensitivity include:
+* **xml:space attribute** - XML standard for declaring whitespace sensitivity in documents
+* **whitelist/blacklist options** - User-specified element lists
+* **Format defaults** - HTML has built-in sensitive elements
+* **respect_xml_space option** - Control whether xml:space is honored
+For elements marked as sensitive, whitespace differences are always normative.
+For non-sensitive elements using `text_content: :normalize`, whitespace
+differences are classified as formatting-only (informative).
+==== xml:space attribute support
+The `xml:space` attribute is the XML standard way to declare whitespace
+sensitivity in XML instance documents:
+[source,xml]
+----
+<!-- Preserve whitespace in this element -->
+<code xml:space="preserve">
+  Indentation and newlines matter here
+</code>
+<!-- Use default behavior -->
+<text xml:space="default">
+  Whitespace handling follows configured behavior
+</text>
+----
+==== Whitelist and blacklist options
+You can explicitly specify which elements are whitespace-sensitive:
+[source,ruby]
+----
+# Specify elements that preserve whitespace
+Canon::Comparison.equivalent?(xml1, xml2,
+  match: {
+    structural_whitespace: :strict,
+    whitespace_sensitive_elements: [:pre, :code, :sample],
+    whitespace_insensitive_elements: [:p, :div]  # Override defaults/whitelist
+  }
+)
+----
+==== respect_xml_space option
+Control whether xml:space attributes in the document are honored:
+[source,ruby]
+----
+# Honor xml:space (default)
+Canon::Comparison.equivalent?(xml1, xml2,
+  match: {
+    structural_whitespace: :strict,
+    respect_xml_space: true  # Use xml:space attributes in document
+  }
+)
+# Ignore xml:space, use only user configuration
+Canon::Comparison.equivalent?(xml1, xml2,
+  match: {
+    structural_whitespace: :strict,
+    respect_xml_space: false  # Override document declarations
+  }
+)
+----
+==== Priority order
+When determining if an element is whitespace-sensitive, Canon uses this priority:
+[source]
+----
+1. respect_xml_space: false → User config only (ignore xml:space)
+   ↓
+2. User whitelist → Use whitelist (user explicitly declared)
+   ↓
+3. Format defaults → HTML: [:pre, :code, :textarea, :script, :style], XML: []
+   ↓
+4. User blacklist → Remove from defaults/whitelist
+   ↓
+5. xml:space="preserve" → Element is sensitive
+   ↓
+6. xml:space="default" → Use steps 1-4
+----
+==== Format-specific defaults
+**HTML**:: `[:pre, :code, :textarea, :script, :style]` - These elements preserve whitespace by HTML specification
+**XML**:: `[]` - No default whitespace-sensitive elements, purely user-controlled
+==== Examples
+.Using xml:space attribute
+[source,ruby]
+----
+xml1 = '<root><code xml:space="preserve">  indented  </code></root>'
+xml2 = '<root><code xml:space="preserve">indented</code></root>'
+# These are NOT equivalent (whitespace matters in xml:space="preserve")
+Canon::Comparison.equivalent?(xml1, xml2,
+  match: { structural_whitespace: :strict }
+)
+# => false
+----
+.Using whitelist
+[source,ruby]
+----
+# Make <p> elements whitespace-sensitive
+Canon::Comparison.equivalent?(xml1, xml2,
+  match: {
+    structural_whitespace: :strict,
+    whitespace_sensitive_elements: [:p, :pre]
+  }
+)
+----
+.Overriding HTML defaults
+[source,ruby]
+----
+# Make <script> NOT whitespace-sensitive (override HTML default)
+Canon::Comparison.equivalent?(html1, html2,
+  format: :html,
+  match: {
+    structural_whitespace: :strict,
+    whitespace_insensitive_elements: [:script]
+  }
+)
+----
+.Using text_content: :normalize with whitespace_insensitive_elements
+[source,ruby]
+----
+# HTML defaults: [:pre, :code, :textarea, :script, :style]
+# Excluding :code means it's no longer whitespace-sensitive
+html1 = '<root><pre>  indented  </pre><code>  code  </code></root>'
+html2 = '<root><pre>  indented  </pre><code>code</code></root>'
+# With :code blacklisted, whitespace in <code> is normalized (formatting-only)
+# HTML uses text_content: :normalize by default
+Canon::Comparison.equivalent?(html1, html2,
+  format: :html,
+  match: {
+    whitespace_insensitive_elements: [:code],
+  }
+)
+# => true (whitespace differences in <code> are formatting-only)
+# Without blacklisting, <code> is sensitive (whitespace matters)
+Canon::Comparison.equivalent?(html1, html2,
+  format: :html,
+  match: {
+    structural_whitespace: :strict,
+  }
+)
+# => false (whitespace in <code> is normative)
+----
 === attribute_whitespace
 **Applies to**: XML, HTML only
@@ -414,6 +635,23 @@ expect(actual).to be_xml_equivalent_to(expected,
     element_position: :ignore,
     element_hierarchy: :ignore
   )
+# Element-level whitespace sensitivity
+expect(actual).to be_xml_equivalent_to(expected,
+  match: { structural_whitespace: :strict }
+)
+  .with_options(
+    whitespace_sensitive_elements: [:pre, :code, :sample],
+    respect_xml_space: true
+  )
+# Override HTML default whitespace-sensitive elements
+expect(html).to be_html_equivalent_to(expected,
+  match: { structural_whitespace: :strict }
+)
+  .with_options(
+    whitespace_insensitive_elements: [:script, :style]
+  )
 ====
 == Comments dimension

data/lib/canon/comparison/format_detector.rb CHANGED Viewed

@@ -72,7 +72,8 @@ module Canon
           return :json if trimmed.start_with?("{", "[")
           # HTML indicators
-          return :html if trimmed.start_with?("<!DOCTYPE html", "<html", "<HTML")
+          return :html if trimmed.start_with?("<!DOCTYPE html", "<html",
+                                              "<HTML")
           # XML indicators - must start with < and end with >
           return :xml if trimmed.start_with?("<") && trimmed.end_with?(">")

data/lib/canon/comparison/html_comparator.rb CHANGED Viewed

@@ -13,6 +13,8 @@ require_relative "../diff/diff_classifier"
 require_relative "strategies/match_strategy_factory"
 require_relative "../html/data_model"
 require_relative "xml_node_comparison"
+# Whitespace sensitivity module (single source of truth for sensitive elements)
+require_relative "whitespace_sensitivity"
 module Canon
   module Comparison
@@ -542,16 +544,22 @@ compare_profile = nil)
           return if match_opts[:text_content] == :strict
           # Elements where whitespace is significant - don't normalize
-          # Use profile if available, otherwise use default list
+          # SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
+          # This ensures consistency between preprocessing and comparison logic
+          # SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
+          # This ensures consistency between preprocessing and comparison logic
           preserve_whitespace = if compare_profile.is_a?(HtmlCompareProfile)
                                   # Profile handles HTML-specific whitespace rules
-                                  %w[pre code textarea script
-                                     style].select do |elem|
-                                    compare_profile.preserve_whitespace?(elem)
-                                  end
+                                  # Get default list and filter by profile
+                                  WhitespaceSensitivity
+                                    .format_default_sensitive_elements(match_opts)
+                                    .select do |elem|
+                                      compare_profile.preserve_whitespace?(elem.to_s)
+                                    end
+                                    .map(&:to_s)
                                 else
-                                  # Fallback to default list
-                                  %w[pre code textarea script style]
+                                  # Use default list from WhitespaceSensitivity (single source of truth)
+                                  WhitespaceSensitivity.format_default_sensitive_elements(match_opts).map(&:to_s)
                                 end
           # Walk all text nodes
@@ -607,9 +615,12 @@ compare_profile = nil)
         #
         # CRITICAL: Do NOT remove whitespace-only text nodes from whitespace-sensitive
         # elements like <pre>, <code>, <textarea>, <script>, <style>
+        #
+        # SINGLE SOURCE OF TRUTH: Uses WhitespaceSensitivity.format_default_sensitive_elements
         def remove_whitespace_only_text_nodes(doc)
           # Elements where whitespace is significant - don't remove whitespace-only nodes
-          preserve_whitespace = %w[pre code textarea script style]
+          # SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
+          preserve_whitespace = WhitespaceSensitivity.format_default_sensitive_elements(format: :html).map(&:to_s)
           doc.xpath(".//text()").each do |text_node|
             # CRITICAL: Skip if this text node is inside a whitespace-preserving element

data/lib/canon/comparison/html_compare_profile.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 # frozen_string_literal: true
 require_relative "compare_profile"
+# Whitespace sensitivity module (single source of truth for sensitive elements)
+require_relative "whitespace_sensitivity"
 module Canon
   module Comparison
@@ -82,9 +84,13 @@ module Canon
       private
       # Elements where whitespace is semantically significant in HTML
-      # @return [Array<String>] List of element names
+      #
+      # SINGLE SOURCE OF TRUTH: Delegates to WhitespaceSensitivity.format_default_sensitive_elements
+      # This ensures consistency across the codebase.
+      #
+      # @return [Array<String>] List of element names (as strings)
       def whitespace_sensitive_elements
-        %w[pre code textarea script style]
+        WhitespaceSensitivity.format_default_sensitive_elements(format: @html_version).map(&:to_s)
       end
       # Check if a dimension is explicitly set to :strict

data/lib/canon/comparison/markup_comparator.rb CHANGED Viewed

@@ -239,9 +239,116 @@ module Canon
         # @param diff2 [Symbol] Difference type for node2
         # @param dimension [Symbol] The dimension of the difference
         # @return [String] Human-readable reason
-        def build_difference_reason(_node1, _node2, diff1, diff2, dimension)
+        def build_difference_reason(node1, node2, diff1, diff2, dimension)
+          # For attribute presence differences, show what attributes differ
+          if dimension == :attribute_presence
+            attrs1 = extract_attributes(node1)
+            attrs2 = extract_attributes(node2)
+            return build_attribute_difference_reason(attrs1, attrs2)
+          end
+          # For text content differences, show the actual text (truncated if needed)
+          if dimension == :text_content
+            text1 = extract_text_content_from_node(node1)
+            text2 = extract_text_content_from_node(node2)
+            return build_text_difference_reason(text1, text2)
+          end
           # Default reason - can be overridden in subclasses
-          "Difference in #{dimension}: #{diff1} vs #{diff2}"
+          "#{diff1} vs #{diff2}"
+        end
+        # Build a clear reason message for attribute presence differences
+        # Shows which attributes are only in node1, only in node2, or have different values
+        #
+        # @param attrs1 [Hash, nil] First node's attributes
+        # @param attrs2 [Hash, nil] Second node's attributes
+        # @return [String] Clear explanation of the attribute difference
+        def build_attribute_difference_reason(attrs1, attrs2)
+          return "#{attrs1&.keys&.size || 0} vs #{attrs2&.keys&.size || 0} attributes" unless attrs1 && attrs2
+          require "set"
+          keys1 = attrs1.keys.to_set
+          keys2 = attrs2.keys.to_set
+          only_in_1 = keys1 - keys2
+          only_in_2 = keys2 - keys1
+          common = keys1 & keys2
+          # Check if values differ for common keys
+          different_values = common.reject { |k| attrs1[k] == attrs2[k] }
+          parts = []
+          parts << "only in first: #{only_in_1.to_a.sort.join(', ')}" if only_in_1.any?
+          parts << "only in second: #{only_in_2.to_a.sort.join(', ')}" if only_in_2.any?
+          parts << "different values: #{different_values.sort.join(', ')}" if different_values.any?
+          if parts.empty?
+            "#{keys1.size} vs #{keys2.size} attributes (same names)"
+          else
+            parts.join("; ")
+          end
+        end
+        # Extract text content from a node for diff reason
+        #
+        # @param node [Object, nil] Node to extract text from
+        # @return [String, nil] Text content or nil
+        def extract_text_content_from_node(node)
+          return nil if node.nil?
+          # For Canon::Xml::Nodes::TextNode
+          return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)
+          # For XML/HTML nodes with text_content method
+          return node.text_content if node.respond_to?(:text_content)
+          # For nodes with text method
+          return node.text if node.respond_to?(:text)
+          # For nodes with content method (Moxml::Text)
+          return node.content if node.respond_to?(:content)
+          # For nodes with value method (other types)
+          return node.value if node.respond_to?(:value)
+          # For simple text nodes or strings
+          return node.to_s if node.is_a?(String)
+          # For other node types, try to_s
+          node.to_s
+        rescue StandardError
+          nil
+        end
+        # Build a clear reason message for text content differences
+        # Shows the actual text content (truncated if too long)
+        #
+        # @param text1 [String, nil] First text content
+        # @param text2 [String, nil] Second text content
+        # @return [String] Clear explanation of the text difference
+        def build_text_difference_reason(text1, text2)
+          # Handle nil cases
+          return "missing vs '#{truncate_text(text2)}'" if text1.nil? && text2
+          return "'#{truncate_text(text1)}' vs missing" if text1 && text2.nil?
+          return "both missing" if text1.nil? && text2.nil?
+          # Both have content - show truncated versions
+          "'#{truncate_text(text1)}' vs '#{truncate_text(text2)}'"
+        end
+        # Truncate text for display in reason messages
+        #
+        # @param text [String] Text to truncate
+        # @param max_length [Integer] Maximum length
+        # @return [String] Truncated text
+        def truncate_text(text, max_length = 40)
+          return "" if text.nil?
+          text = text.to_s
+          return text if text.length <= max_length
+          "#{text[0...max_length]}..."
         end
         # Serialize an element node to string

data/lib/canon/comparison/match_options/base_resolver.rb CHANGED Viewed

@@ -27,6 +27,9 @@ module Canon
             # Start with format-specific defaults
             options = format_defaults(format).dup
+            # Store format for later use (e.g., WhitespaceSensitivity needs it)
+            options[:format] = format
             # Apply global profile if specified
             if global_profile
               profile_opts = get_profile_options(global_profile)
@@ -111,12 +114,16 @@ module Canon
           def validate_match_options!(match_options)
             # Special options that don't need validation as dimensions
             special_options = %i[
+              format
               preprocessing
               semantic_diff
               similarity_threshold
               hash_matching
               similarity_matching
               propagation
+              whitespace_sensitive_elements
+              whitespace_insensitive_elements
+              respect_xml_space
             ]
             match_options.each do |dimension, behavior|