canon 0.1.9 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -568,9 +568,201 @@ differences)
568
568
  end
569
569
  end
570
570
 
571
+ # For attribute presence differences, show what attributes differ
572
+ if dimension == :attribute_presence
573
+ attrs1 = extract_attributes(node1)
574
+ attrs2 = extract_attributes(node2)
575
+ return build_attribute_diff_reason(attrs1, attrs2)
576
+ end
577
+
578
+ # For text content differences, show the actual text (truncated if needed)
579
+ if dimension == :text_content
580
+ text1 = extract_text_from_node(node1)
581
+ text2 = extract_text_from_node(node2)
582
+ return build_text_diff_reason(text1, text2)
583
+ end
584
+
571
585
  "#{diff1} vs #{diff2}"
572
586
  end
573
587
 
588
# Build a clear reason message for attribute presence differences.
#
# Reports which attribute names exist on only one side and which shared
# names carry different values. Names are ordered by their string form so
# that a mix of Symbol and String keys cannot make the sort raise.
#
# @param attrs1 [Hash, nil] First node's attributes
# @param attrs2 [Hash, nil] Second node's attributes
# @return [String] Clear explanation of the attribute difference
def build_attribute_diff_reason(attrs1, attrs2)
  # When either side has no attribute hash at all, only counts can be reported.
  unless attrs1 && attrs2
    count = ->(attrs) { attrs ? attrs.keys.size : 0 }
    return "#{count.call(attrs1)} vs #{count.call(attrs2)} attributes"
  end

  keys1 = attrs1.keys
  keys2 = attrs2.keys
  # Array#- and Array#& compare with hash/eql?, the same rule a Set would use.
  only_in_first = keys1 - keys2
  only_in_second = keys2 - keys1
  different_values = (keys1 & keys2).reject { |key| attrs1[key] == attrs2[key] }

  listing = ->(names) { names.sort_by(&:to_s).join(", ") }

  parts = []
  parts << "only in first: #{listing.call(only_in_first)}" unless only_in_first.empty?
  parts << "only in second: #{listing.call(only_in_second)}" unless only_in_second.empty?
  parts << "different values: #{listing.call(different_values)}" unless different_values.empty?
  return parts.join("; ") unless parts.empty?

  # Same names and same values: fall back to the plain counts.
  "#{keys1.size} vs #{keys2.size} attributes (same names)"
end
618
+
619
# Extract text from a node for diff reason messages.
#
# Readers are tried in a fixed order: text_content, content, text, value.
# `content` is tried before `text` to match
# XmlSerializationFormatter.extract_text_content, whose comment notes that
# Moxml::Text#text returns "" while #content holds the actual text.
#
# @param node [Object, nil] Node to extract text from
# @return [String, nil] Text content, or nil when the node is nil or any
#   reader raises (extraction is best-effort and never propagates errors)
def extract_text_from_node(node)
  return nil if node.nil?

  # Canon text nodes expose their text through #value; check respond_to?
  # first so the constant is only resolved for objects that have #value.
  return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)

  reader = %i[text_content content text value].find { |name| node.respond_to?(name) }
  # Plain strings and unknown node types have none of the readers: use to_s.
  reader ? node.public_send(reader) : node.to_s
rescue StandardError
  nil
end
649
+
650
# Build a clear reason message for text content differences.
#
# Three message shapes are produced:
# - one or both sides missing (nil): "missing vs '...'", "'...' vs missing",
#   or "both missing", with the present text truncated;
# - both sides whitespace-only: a per-character-kind summary of each side;
# - otherwise: both texts with whitespace made visible.
#
# @param text1 [String, nil] First text content
# @param text2 [String, nil] Second text content
# @return [String] Clear explanation of the text difference
def build_text_diff_reason(text1, text2)
  return "both missing" if text1.nil? && text2.nil?
  return "missing vs '#{truncate_text(text2)}'" if text1.nil?
  # Show the text that IS present (text1); text2 is nil on this branch.
  return "'#{truncate_text(text1)}' vs missing" if text2.nil?

  if whitespace_only?(text1) && whitespace_only?(text2)
    return "whitespace: #{describe_whitespace(text1)} vs #{describe_whitespace(text2)}"
  end

  # Render whitespace with the shared visualization symbols so differences
  # that are invisible in plain output can still be read.
  vis1 = visualize_whitespace(text1)
  vis2 = visualize_whitespace(text2)

  "Text: \"#{vis1}\" vs \"#{vis2}\""
end
673
+
674
# Tell whether a text consists solely of whitespace.
#
# nil is never whitespace-only; an empty string is.
#
# @param text [String, nil] Text to check (converted with to_s)
# @return [Boolean] true if nothing remains after stripping
def whitespace_only?(text)
  !text.nil? && text.to_s.strip.empty?
end
683
+
684
# Render a text with its whitespace made visible.
#
# Each character is looked up in character_visualization_map; characters
# without an entry are kept as they are.
#
# @param text [String, nil] Text to visualize
# @return [String] Text with visible whitespace markers ("" for nil)
def visualize_whitespace(text)
  return "" if text.nil?

  symbols = character_visualization_map
  text.each_char.map { |ch| symbols[ch] || ch }.join
end
698
+
699
# Character-to-symbol map used to make whitespace visible.
#
# Built once from canon/diff_formatter/character_map.yml and memoized in
# @character_visualization_map. The YAML file is read directly (and yaml is
# required lazily) to avoid a circular dependency.
#
# @return [Hash] Character to visualization symbol mapping
def character_visualization_map
  @character_visualization_map ||= begin
    require "yaml"
    lib_root = File.expand_path("../..", __dir__)
    entries = YAML.load_file(File.join(lib_root, "canon/diff_formatter/character_map.yml"))["characters"]

    entries.each_with_object({}) do |entry, map|
      codepoint = entry["unicode"]
      # An entry names its character either as a hex code point ("unicode")
      # or literally ("character", which covers \n, \t, etc.).
      key = codepoint ? [codepoint.to_i(16)].pack("U") : entry["character"]
      map[key] = entry["visualization"]
    end
  end
end
730
+
731
# Describe whitespace content in a readable way.
#
# Counts newlines, spaces and tabs separately. Any remaining characters
# (carriage returns, form feeds, ...) are reported as "other" so the
# breakdown always accounts for every character and is never empty.
#
# @param text [String, nil] Whitespace text
# @return [String] Description like "4 chars (2 newlines, 2 spaces)",
#   or "0 chars" for nil or empty text
def describe_whitespace(text)
  return "0 chars" if text.nil? || text.empty?

  counts = {
    "newlines" => text.count("\n"),
    "spaces" => text.count(" "),
    "tabs" => text.count("\t"),
  }
  # Whatever is not a newline, space or tab.
  counts["other"] = text.length - counts.values.sum

  breakdown = counts.select { |_label, total| total.positive? }
                    .map { |label, total| "#{total} #{label}" }
                    .join(", ")
  "#{text.length} chars (#{breakdown})"
end
751
+
752
# Shorten a text for display in reason messages.
#
# Texts longer than max_length are cut to max_length characters and get
# "..." appended (so the result can be up to three characters longer than
# max_length); shorter texts are returned as they are.
#
# @param text [String, nil] Text to truncate (converted with to_s)
# @param max_length [Integer] Maximum number of characters kept
# @return [String] Truncated text ("" for nil)
def truncate_text(text, max_length = 40)
  return "" if text.nil?

  string = text.to_s
  string.length > max_length ? "#{string[0...max_length]}..." : string
end
765
+
574
766
  # Compare namespace declarations (xmlns and xmlns:* attributes)
575
767
  # Delegates to XmlComparatorHelpers::NamespaceComparator
576
768
  def compare_namespace_declarations(n1, n2, opts, differences)
@@ -14,6 +14,9 @@ module Canon
14
14
  show_diffs: :symbol,
15
15
  verbose_diff: :boolean,
16
16
  algorithm: :symbol,
17
+ show_raw_inputs: :boolean,
18
+ show_preprocessed_inputs: :boolean,
19
+ show_line_numbered_inputs: :boolean,
17
20
 
18
21
  # MatchConfig attributes
19
22
  profile: :symbol,
@@ -42,7 +45,8 @@ module Canon
42
45
 
43
46
  # Names of all diff-related configuration attributes, as an array of symbols.
  # The added (+) lines extend the list with show_raw_inputs,
  # show_preprocessed_inputs and show_line_numbered_inputs, placed between
  # algorithm and the max_* limits.
  def all_diff_attributes
44
47
  %i[mode use_color context_lines grouping_lines show_diffs
45
- verbose_diff algorithm max_file_size max_node_count max_diff_lines]
48
+ verbose_diff algorithm show_raw_inputs show_preprocessed_inputs
49
+ show_line_numbered_inputs max_file_size max_node_count max_diff_lines]
46
50
  end
47
51
 
48
52
  def all_match_attributes
data/lib/canon/config.rb CHANGED
@@ -221,6 +221,30 @@ module Canon
221
221
  @resolver.set_programmatic(:verbose_diff, value)
222
222
  end
223
223
 
224
# Current value of the :show_raw_inputs diff option, as resolved by @resolver.
# @return [Object] result of @resolver.resolve(:show_raw_inputs)
+ def show_raw_inputs
225
+ @resolver.resolve(:show_raw_inputs)
226
+ end
227
+
228
# Set :show_raw_inputs programmatically through @resolver.
# @param value [Object] new value; presumably a boolean — confirm against the attribute type table
+ def show_raw_inputs=(value)
229
+ @resolver.set_programmatic(:show_raw_inputs, value)
230
+ end
231
+
232
# Current value of the :show_preprocessed_inputs diff option, as resolved by @resolver.
# @return [Object] result of @resolver.resolve(:show_preprocessed_inputs)
+ def show_preprocessed_inputs
233
+ @resolver.resolve(:show_preprocessed_inputs)
234
+ end
235
+
236
# Set :show_preprocessed_inputs programmatically through @resolver.
# @param value [Object] new value; presumably a boolean — confirm against the attribute type table
+ def show_preprocessed_inputs=(value)
237
+ @resolver.set_programmatic(:show_preprocessed_inputs, value)
238
+ end
239
+
240
# Current value of the :show_line_numbered_inputs diff option, as resolved by @resolver.
# @return [Object] result of @resolver.resolve(:show_line_numbered_inputs)
+ def show_line_numbered_inputs
241
+ @resolver.resolve(:show_line_numbered_inputs)
242
+ end
243
+
244
# Set :show_line_numbered_inputs programmatically through @resolver.
# @param value [Object] new value; presumably a boolean — confirm against the attribute type table
+ def show_line_numbered_inputs=(value)
245
+ @resolver.set_programmatic(:show_line_numbered_inputs, value)
246
+ end
247
+
224
248
  def algorithm
225
249
  @resolver.resolve(:algorithm)
226
250
  end
@@ -266,6 +290,9 @@ module Canon
266
290
  show_diffs: show_diffs,
267
291
  verbose_diff: verbose_diff,
268
292
  diff_algorithm: algorithm,
293
+ show_raw_inputs: show_raw_inputs,
294
+ show_preprocessed_inputs: show_preprocessed_inputs,
295
+ show_line_numbered_inputs: show_line_numbered_inputs,
269
296
  max_file_size: max_file_size,
270
297
  max_node_count: max_node_count,
271
298
  max_diff_lines: max_diff_lines,
@@ -283,6 +310,9 @@ module Canon
283
310
  show_diffs: :all,
284
311
  verbose_diff: false,
285
312
  algorithm: :dom,
313
+ show_raw_inputs: false,
314
+ show_preprocessed_inputs: false,
315
+ show_line_numbered_inputs: false,
286
316
  max_file_size: 5_242_880, # 5MB in bytes
287
317
  max_node_count: 10_000, # Maximum nodes in tree
288
318
  max_diff_lines: 10_000, # Maximum diff output lines
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "formatting_detector"
4
+ require_relative "xml_serialization_formatter"
4
5
  require_relative "../comparison/compare_profile"
5
6
  require_relative "../comparison/whitespace_sensitivity"
6
7
 
@@ -8,6 +9,11 @@ module Canon
8
9
  module Diff
9
10
  # Classifies DiffNodes as normative (affects equivalence) or informative (doesn't affect equivalence)
10
11
  # based on the match options in effect
12
+ #
13
+ # Classification hierarchy (three distinct kinds of differences):
14
+ # 1. Serialization formatting: XML syntax differences (always non-normative)
15
+ # 2. Content formatting: Whitespace differences in content (non-normative when normalized)
16
+ # 3. Normative: Semantic content differences (affect equivalence)
11
17
  class DiffClassifier
12
18
  attr_reader :match_options, :profile
13
19
 
@@ -25,11 +31,20 @@ module Canon
25
31
 
26
32
  # Classify a single DiffNode as normative or informative
27
33
  # Hierarchy: formatting-only < informative < normative
28
- # CompareProfile determines base classification, FormattingDetector refines informative differences
34
+ # CompareProfile determines base classification, XmlSerializationFormatter handles serialization formatting
29
35
  # @param diff_node [DiffNode] The diff node to classify
30
36
  # @return [DiffNode] The same diff node with normative/formatting attributes set
31
37
  def classify(diff_node)
32
- # SPECIAL CASE: text_content with :normalize behavior
38
+ # FIRST: Check for XML serialization-level formatting differences
39
+ # These are ALWAYS non-normative (formatting-only) regardless of match options
40
+ # Examples: self-closing tags (<tag/>) vs explicit closing tags (<tag></tag>)
41
+ if XmlSerializationFormatter.serialization_formatting?(diff_node)
42
+ diff_node.formatting = true
43
+ diff_node.normative = false
44
+ return diff_node
45
+ end
46
+
47
+ # SECOND: Handle content-level formatting for text_content with :normalize behavior
33
48
  # When text_content is :normalize and the difference is formatting-only,
34
49
  # it should be marked as non-normative (informative)
35
50
  # This ensures that verbose and non-verbose modes give consistent results
@@ -38,7 +53,7 @@ module Canon
38
53
  # (like <pre>, <code>, <textarea> in HTML), don't apply formatting detection
39
54
  # because whitespace should be preserved in these elements
40
55
  #
41
- # This check must come FIRST, before normative_dimension? is called,
56
+ # This check must come BEFORE normative_dimension? is called,
42
57
  # because normative_dimension? returns true for text_content: :normalize
43
58
  # (since the dimension affects equivalence), which would prevent formatting
44
59
  # detection from being applied.
@@ -51,11 +66,11 @@ module Canon
51
66
  return diff_node
52
67
  end
53
68
 
54
- # FIRST: Determine if this dimension is normative based on CompareProfile
69
+ # THIRD: Determine if this dimension is normative based on CompareProfile
55
70
  # This respects the policy settings (strict/normalize/ignore)
56
71
  is_normative = profile.normative_dimension?(diff_node.dimension)
57
72
 
58
- # SECOND: Check if FormattingDetector should be consulted
73
+ # FOURTH: Check if FormattingDetector should be consulted for non-normative dimensions
59
74
  # Only check for formatting-only when dimension is NOT normative
60
75
  # This ensures strict mode differences remain normative
61
76
  should_check_formatting = !is_normative &&
@@ -68,7 +83,7 @@ module Canon
68
83
  return diff_node
69
84
  end
70
85
 
71
- # THIRD: Apply the normative determination from CompareProfile
86
+ # FIFTH: Apply the normative determination from CompareProfile
72
87
  diff_node.formatting = false
73
88
  diff_node.normative = is_normative
74
89
 
@@ -127,33 +142,6 @@ module Canon
127
142
  normalized1 == normalized2 && text1 != text2
128
143
  end
129
144
 
130
- # Check if a node is a text node
131
- # @param node [Object] The node to check
132
- # @return [Boolean] true if the node is a text node
133
- def text_node?(node)
134
- return false if node.nil?
135
-
136
- # Canon::Xml::Nodes::TextNode
137
- return true if node.is_a?(Canon::Xml::Nodes::TextNode)
138
-
139
- # Nokogiri text nodes (node_type returns integer constant like 3)
140
- return true if node.respond_to?(:node_type) &&
141
- node.node_type.is_a?(Integer) &&
142
- node.node_type == Nokogiri::XML::Node::TEXT_NODE
143
-
144
- # Moxml text nodes (node_type returns symbol)
145
- return true if node.respond_to?(:node_type) && node.node_type == :text
146
-
147
- # String
148
- return true if node.is_a?(String)
149
-
150
- # Test doubles or objects with text node-like interface
151
- # Check if it has a value method (contains text content)
152
- return true if node.respond_to?(:value)
153
-
154
- false
155
- end
156
-
157
145
  # Check if the text node is inside a whitespace-sensitive element
158
146
  # @param diff_node [DiffNode] The diff node to check
159
147
  # @return [Boolean] true if inside a whitespace-sensitive element
@@ -200,6 +188,33 @@ module Canon
200
188
  # If extraction fails, return nil (not formatting-only)
201
189
  nil
202
190
  end
191
+
192
# Decide whether a node should be treated as a text node.
#
# Accepted, in order: Canon text nodes, nodes whose node_type is :text
# (Moxml) or the Nokogiri TEXT_NODE integer constant, plain Strings, and
# finally any object exposing #value (test doubles and text-like objects).
#
# @param node [Object] The node to check
# @return [Boolean] true if the node is a text node
def text_node?(node)
  return false if node.nil?
  return true if node.is_a?(Canon::Xml::Nodes::TextNode)

  if node.respond_to?(:node_type)
    type = node.node_type
    return true if type == :text
    # Only resolve the Nokogiri constant when node_type is an integer.
    return true if type.is_a?(Integer) && type == Nokogiri::XML::Node::TEXT_NODE
  end

  node.is_a?(String) || node.respond_to?(:value)
end
203
218
  end
204
219
  end
205
220
  end
@@ -0,0 +1,153 @@
1
+ # frozen_string_literal: true
2
+
3
module Canon
  module Diff
    # Detects and classifies XML serialization-level formatting differences.
    #
    # Serialization-level formatting differences are differences in XML syntax
    # that do not affect the semantic content of the document: they are
    # different valid ways to serialize the same content, and are therefore
    # ALWAYS non-normative (formatting-only) regardless of match options.
    #
    # Examples:
    # - Self-closing vs explicit closing tags: <tag/> vs <tag></tag>
    # - Attribute quote style: attr="value" vs attr='value' (parser-normalized)
    # - Whitespace within tags: <tag a="1" b="2"> vs <tag  a="1"   b="2" >
    #   (parser-normalized)
    #
    # Note: Some serialization differences are normalized away by XML parsers
    # (attribute quotes, tag spacing). This class focuses on differences that
    # survive parsing and comparison, such as self-closing vs explicit closing.
    class XmlSerializationFormatter
      # Returned by extract_text_content when reading a node raised. It is
      # neither nil nor string-like, so blank? is false for it and a node that
      # could not be read is never reported as a formatting-only difference.
      EXTRACTION_FAILED = Object.new.freeze
      private_constant :EXTRACTION_FAILED

      # Detect if a diff node represents an XML serialization formatting
      # difference.
      #
      # Currently only the :text_content dimension is handled; every other
      # dimension yields false.
      #
      # @param diff_node [DiffNode] The diff node to check (must respond to
      #   dimension, node1 and node2)
      # @return [Boolean] true if this is a serialization formatting difference
      def self.serialization_formatting?(diff_node)
        return false unless diff_node.dimension == :text_content

        empty_text_content_serialization_diff?(diff_node)
      end

      # Check if a text_content difference comes from the XML serialization
      # format, i.e. self-closing tags (<tag/>) vs explicit closing tags
      # (<tag></tag>), which produce different text node structures:
      # - Self-closing: no text node (nil)
      # - Explicit closing: empty or whitespace-only text node ("", " ", "\n")
      #
      # The difference qualifies when at least one node is present and every
      # present node is a text node whose text is blank.
      #
      # @param diff_node [DiffNode] The diff node to check
      # @return [Boolean] true if this is a serialization formatting difference
      def self.empty_text_content_serialization_diff?(diff_node)
        return false unless diff_node.dimension == :text_content

        present = [diff_node.node1, diff_node.node2].compact
        # Both nodes nil: there is no actual difference to classify.
        return false if present.empty?

        present.all? { |node| text_node?(node) && blank?(extract_text_content(node)) }
      end

      # Check if a value is blank (nil, empty, or whitespace-only).
      #
      # @param value [String, nil] Value to check
      # @return [Boolean] true if blank
      def self.blank?(value)
        return true if value.nil?
        return true if value.respond_to?(:empty?) && value.empty?

        value.respond_to?(:strip) && value.strip.empty?
      end

      # Check if a node is a text node.
      #
      # @param node [Object] The node to check
      # @return [Boolean] true if the node is a text node
      def self.text_node?(node)
        return false if node.nil?

        # Concrete text node classes (Moxml::Text is checked before the
        # generic node_type probing).
        return true if node.is_a?(Canon::Xml::Nodes::TextNode)
        return true if node.is_a?(Moxml::Text)

        # Nokogiri text nodes (node_type returns an integer constant like 3);
        # the Integer test keeps the Nokogiri constant from being resolved for
        # symbol node types.
        return true if node.respond_to?(:node_type) &&
          node.node_type.is_a?(Integer) &&
          node.node_type == Nokogiri::XML::Node::TEXT_NODE

        # Moxml adapters report the node type as a symbol.
        return true if node.respond_to?(:node_type) && node.node_type == :text

        # Plain strings, then test doubles / objects with a text-node-like
        # interface (anything exposing #value).
        node.is_a?(String) || node.respond_to?(:value)
      end

      # Extract text content from a node.
      #
      # Readers are tried in this order: text_content, content, text, value
      # (content before text, as Moxml::Text.text returns ""). Objects with
      # none of them are converted with to_s.
      #
      # @param node [Object] The node to extract text from
      # @return [String, nil, Object] The text content, nil for a nil node,
      #   or EXTRACTION_FAILED when reading the node raised
      def self.extract_text_content(node)
        return nil if node.nil?

        # For TextNode with value attribute (Canon::Xml::Nodes::TextNode)
        return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)

        reader = %i[text_content content text value].find { |name| node.respond_to?(name) }
        reader ? node.public_send(reader) : node.to_s
      rescue StandardError
        # A node that cannot be read must not count as a serialization
        # difference; nil would be treated as blank, so use the marker.
        EXTRACTION_FAILED
      end

      private_class_method :blank?, :text_node?, :extract_text_content,
                           :empty_text_content_serialization_diff?
    end
  end
end