canon 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +83 -22
  3. data/docs/Gemfile +1 -0
  4. data/docs/_config.yml +90 -1
  5. data/docs/advanced/diff-classification.adoc +196 -24
  6. data/docs/features/match-options/index.adoc +239 -1
  7. data/lib/canon/comparison/format_detector.rb +2 -1
  8. data/lib/canon/comparison/html_comparator.rb +19 -8
  9. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  10. data/lib/canon/comparison/markup_comparator.rb +109 -2
  11. data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
  12. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  13. data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
  14. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
  15. data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
  16. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
  17. data/lib/canon/comparison/xml_comparator.rb +240 -23
  18. data/lib/canon/comparison/xml_node_comparison.rb +25 -3
  19. data/lib/canon/diff/diff_classifier.rb +119 -5
  20. data/lib/canon/diff/formatting_detector.rb +1 -1
  21. data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
  22. data/lib/canon/rspec_matchers.rb +37 -8
  23. data/lib/canon/version.rb +1 -1
  24. data/lib/canon/xml/data_model.rb +24 -13
  25. metadata +4 -78
  26. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
  27. data/false_positive_analysis.txt +0 -0
  28. data/file1.html +0 -1
  29. data/file2.html +0 -1
  30. data/old-docs/ADVANCED_TOPICS.adoc +0 -20
  31. data/old-docs/BASIC_USAGE.adoc +0 -16
  32. data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
  33. data/old-docs/CLI.adoc +0 -497
  34. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  35. data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
  36. data/old-docs/DIFF_FORMATTING.adoc +0 -540
  37. data/old-docs/DIFF_PARAMETERS.adoc +0 -261
  38. data/old-docs/DOM_DIFF.adoc +0 -1017
  39. data/old-docs/ENV_CONFIG.adoc +0 -876
  40. data/old-docs/FORMATS.adoc +0 -867
  41. data/old-docs/INPUT_VALIDATION.adoc +0 -477
  42. data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
  43. data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
  44. data/old-docs/MATCH_OPTIONS.adoc +0 -912
  45. data/old-docs/MODES.adoc +0 -432
  46. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  47. data/old-docs/OPTIONS.adoc +0 -1387
  48. data/old-docs/PREPROCESSING.adoc +0 -491
  49. data/old-docs/README.old.adoc +0 -2831
  50. data/old-docs/RSPEC.adoc +0 -814
  51. data/old-docs/RUBY_API.adoc +0 -485
  52. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
  53. data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
  54. data/old-docs/STRING_COMPARE.adoc +0 -345
  55. data/old-docs/TMP.adoc +0 -3384
  56. data/old-docs/TREE_DIFF.adoc +0 -1080
  57. data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
  58. data/old-docs/VERBOSE.adoc +0 -482
  59. data/old-docs/VISUALIZATION_MAP.adoc +0 -625
  60. data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
  61. data/scripts/analyze_current_state.rb +0 -85
  62. data/scripts/analyze_false_positives.rb +0 -114
  63. data/scripts/analyze_remaining_failures.rb +0 -105
  64. data/scripts/compare_current_failures.rb +0 -95
  65. data/scripts/compare_dom_tree_diff.rb +0 -158
  66. data/scripts/compare_failures.rb +0 -151
  67. data/scripts/debug_attribute_extraction.rb +0 -66
  68. data/scripts/debug_blocks_839.rb +0 -115
  69. data/scripts/debug_meta_matching.rb +0 -52
  70. data/scripts/debug_p_matching.rb +0 -192
  71. data/scripts/debug_signature_matching.rb +0 -118
  72. data/scripts/debug_sourcecode_124.rb +0 -32
  73. data/scripts/debug_whitespace_sensitive.rb +0 -192
  74. data/scripts/extract_false_positives.rb +0 -138
  75. data/scripts/find_actual_false_positives.rb +0 -125
  76. data/scripts/investigate_all_false_positives.rb +0 -161
  77. data/scripts/investigate_batch1.rb +0 -127
  78. data/scripts/investigate_classification.rb +0 -150
  79. data/scripts/investigate_classification_detailed.rb +0 -190
  80. data/scripts/investigate_common_failures.rb +0 -342
  81. data/scripts/investigate_false_negative.rb +0 -80
  82. data/scripts/investigate_false_positive.rb +0 -83
  83. data/scripts/investigate_false_positives.rb +0 -227
  84. data/scripts/investigate_false_positives_batch.rb +0 -163
  85. data/scripts/investigate_mixed_content.rb +0 -125
  86. data/scripts/investigate_remaining_16.rb +0 -214
  87. data/scripts/run_single_test.rb +0 -29
  88. data/scripts/test_all_false_positives.rb +0 -95
  89. data/scripts/test_attribute_details.rb +0 -61
  90. data/scripts/test_both_algorithms.rb +0 -49
  91. data/scripts/test_both_simple.rb +0 -49
  92. data/scripts/test_enhanced_semantic_output.rb +0 -125
  93. data/scripts/test_readme_examples.rb +0 -131
  94. data/scripts/test_semantic_tree_diff.rb +0 -99
  95. data/scripts/test_semantic_ux_improvements.rb +0 -135
  96. data/scripts/test_single_false_positive.rb +0 -119
  97. data/scripts/test_size_limits.rb +0 -99
  98. data/test_html_1.html +0 -21
  99. data/test_html_2.html +0 -21
  100. data/test_nokogiri.rb +0 -33
  101. data/test_normalize.rb +0 -45
@@ -18,6 +18,8 @@ require_relative "xml_comparator/namespace_comparator"
18
18
  require_relative "xml_comparator/node_type_comparator"
19
19
  require_relative "xml_comparator/child_comparison"
20
20
  require_relative "xml_comparator/diff_node_builder"
21
+ # Whitespace sensitivity module
22
+ require_relative "whitespace_sensitivity"
21
23
 
22
24
  module Canon
23
25
  module Comparison
@@ -90,9 +92,15 @@ module Canon
90
92
  # Create child_opts with resolved options
91
93
  child_opts = opts.merge(child_opts)
92
94
 
95
+ # Determine if we should preserve whitespace during parsing
96
+ # When structural_whitespace is :strict, preserve all whitespace-only text nodes
97
+ preserve_whitespace = match_opts_hash[:structural_whitespace] == :strict
98
+
93
99
  # Parse nodes if they are strings, applying preprocessing if needed
94
- node1 = parse_node(n1, match_opts_hash[:preprocessing])
95
- node2 = parse_node(n2, match_opts_hash[:preprocessing])
100
+ node1 = parse_node(n1, match_opts_hash[:preprocessing],
101
+ preserve_whitespace: preserve_whitespace)
102
+ node2 = parse_node(n2, match_opts_hash[:preprocessing],
103
+ preserve_whitespace: preserve_whitespace)
96
104
 
97
105
  # Store original strings for line diff display (before preprocessing)
98
106
  original1 = if n1.is_a?(String)
@@ -209,8 +217,9 @@ module Canon
209
217
  # Parse a node from string or return as-is
210
218
  # Applies preprocessing transformation before parsing if specified
211
219
  # Delegates to NodeParser module
212
- def parse_node(node, preprocessing = :none)
213
- XmlComparatorHelpers::NodeParser.parse(node, preprocessing)
220
+ def parse_node(node, preprocessing = :none, preserve_whitespace: false)
221
+ XmlComparatorHelpers::NodeParser.parse(node, preprocessing,
222
+ preserve_whitespace: preserve_whitespace)
214
223
  end
215
224
 
216
225
  # Main comparison dispatcher
@@ -331,7 +340,8 @@ module Canon
331
340
 
332
341
  # For HTML, check if text node is inside whitespace-preserving element
333
342
  # If so, always use strict comparison regardless of text_content setting
334
- if should_preserve_whitespace_strictly?(n1, n2)
343
+ sensitive_element = should_preserve_whitespace_strictly?(n1, n2, opts)
344
+ if sensitive_element
335
345
  behavior = :strict
336
346
  end
337
347
 
@@ -344,15 +354,23 @@ module Canon
344
354
 
345
355
  # Determine the correct dimension for this difference
346
356
  # - If text_content is :strict, ALL differences use :text_content dimension
347
- # - If text_content is :normalize, whitespace-only diffs use :structural_whitespace
357
+ # - If text_content is :normalize, whitespace-only diffs could use :structural_whitespace
358
+ # but we keep :text_content to ensure correct classification behavior
348
359
  # - Otherwise use :text_content
349
- dimension = if behavior == :normalize && whitespace_only_difference?(
350
- text1, text2
351
- )
352
- :structural_whitespace
353
- else
354
- :text_content
355
- end
360
+ # However, if element is whitespace-sensitive (like <pre> in HTML),
361
+ # always use :text_content dimension regardless of behavior
362
+ #
363
+ # NOTE: We keep the dimension as :text_content even for whitespace-only diffs
364
+ # when text_content: :normalize. This ensures that the classification uses
365
+ # the text_content behavior (:normalize) instead of structural_whitespace
366
+ # behavior (:strict for XML), which would incorrectly mark the diff as normative.
367
+ if sensitive_element
368
+ # Whitespace-sensitive element: always use :text_content dimension
369
+ else
370
+ # Always use :text_content for text differences
371
+ # This ensures correct classification based on text_content behavior
372
+ end
373
+ dimension = :text_content
356
374
 
357
375
  # Create DiffNode in verbose mode when raw content differs
358
376
  # This ensures informative diffs are created even for :ignore/:normalize
@@ -368,17 +386,23 @@ module Canon
368
386
 
369
387
  # Check if whitespace should be preserved strictly for these text nodes
370
388
  # This applies to HTML elements like pre, code, textarea, script, style
371
- def should_preserve_whitespace_strictly?(n1, n2)
372
- # Only applies to Nokogiri nodes (HTML)
373
- return false unless n1.respond_to?(:parent) && n2.respond_to?(:parent)
374
- return false unless n1.parent.respond_to?(:name) && n2.parent.respond_to?(:name)
389
+ # and elements with xml:space="preserve" or in user-configured whitelist
390
+ def should_preserve_whitespace_strictly?(n1, n2, opts)
391
+ # Use WhitespaceSensitivity module to check if element is sensitive
392
+ # Check both n1 and n2 - if either is in a sensitive element, preserve strictly
393
+ if n1.respond_to?(:parent)
394
+ sensitivity_opts = { match_opts: opts[:match_opts] }
395
+ return true if WhitespaceSensitivity.element_sensitive?(n1,
396
+ sensitivity_opts)
397
+ end
375
398
 
376
- # Elements where whitespace must be preserved in HTML
377
- preserve_elements = %w[pre code textarea script style]
399
+ if n2.respond_to?(:parent)
400
+ sensitivity_opts = { match_opts: opts[:match_opts] }
401
+ return true if WhitespaceSensitivity.element_sensitive?(n2,
402
+ sensitivity_opts)
403
+ end
378
404
 
379
- # Check if either node is inside a whitespace-preserving element
380
- in_preserve_element?(n1, preserve_elements) ||
381
- in_preserve_element?(n2, preserve_elements)
405
+ false
382
406
  end
383
407
 
384
408
  # Check if a node is inside a whitespace-preserving element
@@ -469,7 +493,8 @@ module Canon
469
493
  #
470
494
  # Delegates to ChildComparison module which handles both ElementMatcher
471
495
  # (semantic matching) and simple positional comparison.
472
- def compare_children(n1, n2, opts, child_opts, diff_children, differences)
496
+ def compare_children(n1, n2, opts, child_opts, diff_children,
497
+ differences)
473
498
  XmlComparatorHelpers::ChildComparison.compare(
474
499
  n1, n2, self, opts, child_opts, diff_children, differences
475
500
  )
@@ -543,9 +568,201 @@ module Canon
543
568
  end
544
569
  end
545
570
 
571
+ # For attribute presence differences, show what attributes differ
572
+ if dimension == :attribute_presence
573
+ attrs1 = extract_attributes(node1)
574
+ attrs2 = extract_attributes(node2)
575
+ return build_attribute_diff_reason(attrs1, attrs2)
576
+ end
577
+
578
+ # For text content differences, show the actual text (truncated if needed)
579
+ if dimension == :text_content
580
+ text1 = extract_text_from_node(node1)
581
+ text2 = extract_text_from_node(node2)
582
+ return build_text_diff_reason(text1, text2)
583
+ end
584
+
546
585
  "#{diff1} vs #{diff2}"
547
586
  end
548
587
 
588
# Build a clear reason message for attribute presence differences.
#
# When both attribute hashes are given, the message lists (in this order)
# the names present only in the first hash, the names present only in the
# second hash, and the shared names whose values differ. Names are ordered
# by their string form, so String and Symbol keys may be mixed safely.
#
# @param attrs1 [Hash, nil] First node's attributes
# @param attrs2 [Hash, nil] Second node's attributes
# @return [String] Clear explanation of the attribute difference
def build_attribute_diff_reason(attrs1, attrs2)
  # Without both hashes only the attribute counts can be reported.
  unless attrs1 && attrs2
    count = ->(attrs) { attrs ? attrs.keys.size : 0 }
    return "#{count.call(attrs1)} vs #{count.call(attrs2)} attributes"
  end

  names1 = attrs1.keys
  names2 = attrs2.keys
  listing = ->(names) { names.sort_by(&:to_s).join(", ") }

  only_in_first = names1 - names2
  only_in_second = names2 - names1
  # A shared name counts as different only when its values are unequal.
  changed = (names1 & names2).reject { |name| attrs1[name] == attrs2[name] }

  parts = []
  parts << "only in first: #{listing.call(only_in_first)}" unless only_in_first.empty?
  parts << "only in second: #{listing.call(only_in_second)}" unless only_in_second.empty?
  parts << "different values: #{listing.call(changed)}" unless changed.empty?

  # Same names and same values: fall back to reporting the counts.
  return "#{names1.size} vs #{names2.size} attributes (same names)" if parts.empty?

  parts.join("; ")
end
618
+
619
# Extract text from a node for diff reason.
#
# Accessors are tried in a fixed order: Canon text nodes use #value first;
# any other node uses the first of #text_content, #text, #content, #value
# it responds to; everything else falls back to #to_s. Any StandardError
# raised while reading the node yields nil (best effort, as before).
#
# @param node [Object, nil] Node to extract text from
# @return [String, nil] Text content or nil
def extract_text_from_node(node)
  return nil if node.nil?

  # Canon::Xml::Nodes::TextNode keeps its text in #value; check it before
  # the generic accessors so it takes priority over #text_content.
  if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)
    return node.value
  end

  accessor = %i[text_content text content value].find do |name|
    node.respond_to?(name)
  end

  # Strings and unknown node types have none of the accessors above.
  accessor ? node.public_send(accessor) : node.to_s
rescue StandardError
  nil
end
649
+
650
# Build a clear reason message for text content differences.
#
# @param text1 [String, nil] First text content
# @param text2 [String, nil] Second text content
# @return [String] Clear explanation of the text difference
def build_text_diff_reason(text1, text2)
  # Handle a missing side first; the side that IS present is the one shown.
  return "both missing" if text1.nil? && text2.nil?
  return "missing vs '#{truncate_text(text2)}'" if text1.nil?
  return "'#{truncate_text(text1)}' vs missing" if text2.nil?

  # Two whitespace-only texts are summarised by character counts.
  if whitespace_only?(text1) && whitespace_only?(text2)
    return "whitespace: #{describe_whitespace(text1)} vs #{describe_whitespace(text2)}"
  end

  # Otherwise show both texts with whitespace rendered visibly.
  vis1 = visualize_whitespace(text1)
  vis2 = visualize_whitespace(text2)

  "Text: \"#{vis1}\" vs \"#{vis2}\""
end
673
+
674
# Check if text is only whitespace.
#
# nil is never whitespace-only; an empty string is.
#
# @param text [String, nil] Text to check
# @return [Boolean] true if whitespace-only
def whitespace_only?(text)
  !text.nil? && text.to_s.strip.empty?
end
683
+
684
# Make whitespace visible in text content.
#
# Each character that has an entry in character_visualization_map is
# replaced by that entry; every other character is kept unchanged.
#
# @param text [String, nil] Text to visualize
# @return [String] Text with visible whitespace markers ("" for nil)
def visualize_whitespace(text)
  return "" if text.nil?

  symbols = character_visualization_map
  text.each_char.map { |char| symbols[char] || char }.join
end
698
+
699
# Get the character visualization map.
#
# The map is built once from canon/diff_formatter/character_map.yml and
# memoised. The YAML file is read directly (and "yaml" is required lazily)
# to avoid a circular dependency.
#
# @return [Hash] Character to visualization symbol mapping
def character_visualization_map
  @character_visualization_map ||= begin
    require "yaml"
    map_path = File.join(File.expand_path("../..", __dir__),
                         "canon/diff_formatter/character_map.yml")
    entries = YAML.load_file(map_path)["characters"]

    entries.each_with_object({}) do |entry, map|
      # An entry names its character either by hex code point ("unicode")
      # or literally ("character", e.g. \n or \t).
      code_point = entry["unicode"]
      key = code_point ? [code_point.to_i(16)].pack("U") : entry["character"]
      map[key] = entry["visualization"]
    end
  end
end
730
+
731
# Describe whitespace content in a readable way.
#
# Newlines, spaces and tabs are counted individually; any remaining
# characters (for example carriage returns) are reported as "other" so the
# breakdown always adds up to the total and is never empty.
#
# @param text [String, nil] Whitespace text
# @return [String] Description like "4 chars (2 newlines, 2 spaces)"
def describe_whitespace(text)
  return "0 chars" if text.nil? || text.empty?

  counts = {
    "newlines" => text.count("\n"),
    "spaces" => text.count(" "),
    "tabs" => text.count("\t"),
  }
  # Whatever is not one of the three named kinds.
  counts["other"] = text.length - counts.values.sum

  parts = counts.select { |_label, count| count.positive? }
                .map { |label, count| "#{count} #{label}" }

  "#{text.length} chars (#{parts.join(', ')})"
end
751
+
752
# Truncate text for display in reason messages.
#
# Text longer than max_length is cut to max_length characters and suffixed
# with "..."; shorter text is returned as-is.
#
# @param text [#to_s, nil] Text to truncate
# @param max_length [Integer] Maximum length before truncation
# @return [String] Truncated text ("" for nil)
def truncate_text(text, max_length = 40)
  return "" if text.nil?

  str = text.to_s
  str.length > max_length ? "#{str[0...max_length]}..." : str
end
765
+
549
766
  # Compare namespace declarations (xmlns and xmlns:* attributes)
550
767
  # Delegates to XmlComparatorHelpers::NamespaceComparator
551
768
  def compare_namespace_declarations(n1, n2, opts, differences)
@@ -139,9 +139,13 @@ diff_children, differences)
139
139
 
140
140
  # Check structural_whitespace match option
141
141
  match_opts = opts[:match_opts]
142
- # Filter out whitespace-only text nodes
143
- if match_opts && %i[ignore
144
- normalize].include?(match_opts[:structural_whitespace]) && text_node?(node)
142
+ return false unless match_opts
143
+
144
+ # Filter out whitespace-only text nodes based on structural_whitespace setting
145
+ # - :ignore or :normalize: Filter all whitespace-only text nodes
146
+ # - :strict: Preserve all whitespace-only text nodes (don't filter any)
147
+ if text_node?(node) && %i[ignore
148
+ normalize].include?(match_opts[:structural_whitespace])
145
149
  text = node_text(node)
146
150
  return true if MatchOptions.normalize_text(text).empty?
147
151
  end
@@ -184,6 +188,24 @@ diff_children, differences)
184
188
  node.respond_to?(:node_type) && node.node_type == :text
185
189
  end
186
190
 
191
# Extract text content from a node.
#
# Uses the first of #content, #text, #value that the node responds to and
# converts the result with #to_s.
#
# @param node [Object, nil] Node to extract text from
# @return [String] Text content ("" when the node is missing or has no
#   known text accessor)
def self.node_text(node)
  return "" unless node

  reader = %i[content text value].find { |name| node.respond_to?(name) }
  reader ? node.public_send(reader).to_s : ""
end
208
+
187
209
  # Dispatch by Canon::Xml::Node type
188
210
  def self.dispatch_canon_node_type(node1, node2, opts, child_opts,
189
211
  diff_children, differences)
@@ -1,12 +1,19 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "formatting_detector"
4
+ require_relative "xml_serialization_formatter"
4
5
  require_relative "../comparison/compare_profile"
6
+ require_relative "../comparison/whitespace_sensitivity"
5
7
 
6
8
  module Canon
7
9
  module Diff
8
10
  # Classifies DiffNodes as normative (affects equivalence) or informative (doesn't affect equivalence)
9
11
  # based on the match options in effect
12
+ #
13
+ # Classification hierarchy (three distinct kinds of differences):
14
+ # 1. Serialization formatting: XML syntax differences (always non-normative)
15
+ # 2. Content formatting: Whitespace differences in content (non-normative when normalized)
16
+ # 3. Normative: Semantic content differences (affect equivalence)
10
17
  class DiffClassifier
11
18
  attr_reader :match_options, :profile
12
19
 
@@ -24,15 +31,46 @@ module Canon
24
31
 
25
32
  # Classify a single DiffNode as normative or informative
26
33
  # Hierarchy: formatting-only < informative < normative
27
- # CompareProfile determines base classification, FormattingDetector refines informative differences
34
+ # CompareProfile determines base classification, XmlSerializationFormatter handles serialization formatting
28
35
  # @param diff_node [DiffNode] The diff node to classify
29
36
  # @return [DiffNode] The same diff node with normative/formatting attributes set
30
37
  def classify(diff_node)
31
- # FIRST: Determine if this dimension is normative based on CompareProfile
38
+ # FIRST: Check for XML serialization-level formatting differences
39
+ # These are ALWAYS non-normative (formatting-only) regardless of match options
40
+ # Examples: self-closing tags (<tag/>) vs explicit closing tags (<tag></tag>)
41
+ if XmlSerializationFormatter.serialization_formatting?(diff_node)
42
+ diff_node.formatting = true
43
+ diff_node.normative = false
44
+ return diff_node
45
+ end
46
+
47
+ # SECOND: Handle content-level formatting for text_content with :normalize behavior
48
+ # When text_content is :normalize and the difference is formatting-only,
49
+ # it should be marked as non-normative (informative)
50
+ # This ensures that verbose and non-verbose modes give consistent results
51
+ #
52
+ # EXCEPTION: If the text node is inside a whitespace-sensitive element
53
+ # (like <pre>, <code>, <textarea> in HTML), don't apply formatting detection
54
+ # because whitespace should be preserved in these elements
55
+ #
56
+ # This check must come BEFORE normative_dimension? is called,
57
+ # because normative_dimension? returns true for text_content: :normalize
58
+ # (since the dimension affects equivalence), which would prevent formatting
59
+ # detection from being applied.
60
+ if diff_node.dimension == :text_content &&
61
+ profile.send(:behavior_for, :text_content) == :normalize &&
62
+ !inside_whitespace_sensitive_element?(diff_node) &&
63
+ formatting_only_diff?(diff_node)
64
+ diff_node.formatting = true
65
+ diff_node.normative = false
66
+ return diff_node
67
+ end
68
+
69
+ # THIRD: Determine if this dimension is normative based on CompareProfile
32
70
  # This respects the policy settings (strict/normalize/ignore)
33
71
  is_normative = profile.normative_dimension?(diff_node.dimension)
34
72
 
35
- # SECOND: Check if FormattingDetector should be consulted
73
+ # FOURTH: Check if FormattingDetector should be consulted for non-normative dimensions
36
74
  # Only check for formatting-only when dimension is NOT normative
37
75
  # This ensures strict mode differences remain normative
38
76
  should_check_formatting = !is_normative &&
@@ -45,7 +83,7 @@ module Canon
45
83
  return diff_node
46
84
  end
47
85
 
48
- # Otherwise, use the normative determination from CompareProfile
86
+ # FIFTH: Apply the normative determination from CompareProfile
49
87
  diff_node.formatting = false
50
88
  diff_node.normative = is_normative
51
89
 
@@ -65,10 +103,59 @@ module Canon
65
103
  # @param diff_node [DiffNode] The diff node to check
66
104
  # @return [Boolean] true if formatting-only
67
105
  def formatting_only_diff?(diff_node)
106
+ # Only apply formatting detection to actual text content differences
107
+ # If the nodes are not text nodes (e.g., element nodes), don't apply formatting detection
108
+ node1 = diff_node.node1
109
+ node2 = diff_node.node2
110
+
111
+ # Check if both nodes are text nodes
112
+ # If not, this is not a formatting-only difference
113
+ return false unless text_node?(node1) && text_node?(node2)
114
+
68
115
  text1 = extract_text_content(diff_node.node1)
69
116
  text2 = extract_text_content(diff_node.node2)
70
117
 
71
- FormattingDetector.formatting_only?(text1, text2)
118
+ # For text_content dimension, use normalized text comparison
119
+ # This handles cases like "" vs " " (both normalize to "")
120
+ if diff_node.dimension == :text_content
121
+ normalized_equivalent?(text1, text2)
122
+ else
123
+ FormattingDetector.formatting_only?(text1, text2)
124
+ end
125
+ end
126
+
127
# Check if two texts differ only in a way that normalization removes.
#
# Despite the name, identical texts return false: the result is true only
# when the raw texts differ AND their normalized forms match, i.e. the
# difference is formatting-only.
#
# @param text1 [String, nil] First text
# @param text2 [String, nil] Second text
# @return [Boolean] true if the texts differ but normalize to the same value
def normalized_equivalent?(text1, text2)
  # A missing side is never a formatting-only difference.
  return false if text1.nil? || text2.nil?

  # Use MatchOptions.normalize_text for consistency with the comparison.
  normalized1 = Canon::Comparison::MatchOptions.normalize_text(text1)
  normalized2 = Canon::Comparison::MatchOptions.normalize_text(text2)

  normalized1 == normalized2 && text1 != text2
end
144
+
145
# Check if the text node is inside a whitespace-sensitive element.
#
# Only one node is inspected: node1 when present, otherwise node2. The
# decision is delegated to WhitespaceSensitivity.element_sensitive?, which
# receives the node and the match options wrapped under :match_opts.
#
# @param diff_node [DiffNode] The diff node to check
# @return [Boolean] true if inside a whitespace-sensitive element
def inside_whitespace_sensitive_element?(diff_node)
  candidate = diff_node.node1 || diff_node.node2
  return false unless candidate

  Canon::Comparison::WhitespaceSensitivity.element_sensitive?(
    candidate, { match_opts: @match_options.options }
  )
end
73
160
 
74
161
  # Extract text content from a node for formatting comparison
@@ -101,6 +188,33 @@ module Canon
101
188
  # If extraction fails, return nil (not formatting-only)
102
189
  nil
103
190
  end
191
+
192
# Check if a node is a text node.
#
# Recognised, in order: Canon::Xml::Nodes::TextNode instances; nodes whose
# #node_type is the Integer Nokogiri::XML::Node::TEXT_NODE; nodes whose
# #node_type is :text; Strings; and finally any object responding to
# #value (test doubles or objects with a text node-like interface).
#
# @param node [Object, nil] The node to check
# @return [Boolean] true if the node is a text node
def text_node?(node)
  return false if node.nil?
  return true if node.is_a?(Canon::Xml::Nodes::TextNode)

  if node.respond_to?(:node_type)
    type = node.node_type
    # Integer node types are compared against Nokogiri's TEXT_NODE constant.
    return true if type.is_a?(Integer) && type == Nokogiri::XML::Node::TEXT_NODE
    # Symbol node types use :text.
    return true if type == :text
  end

  node.is_a?(String) || node.respond_to?(:value)
end
104
218
  end
105
219
  end
106
220
  end
@@ -11,7 +11,7 @@ module Canon
11
11
  # @param line2 [String, nil] Second line to compare
12
12
  # @return [Boolean] true if lines differ only in formatting
13
13
  def self.formatting_only?(line1, line2)
14
- # If both are nil or empty, not a formatting diff
14
+ # If both are nil or empty, not a formatting diff (no difference)
15
15
  return false if blank?(line1) && blank?(line2)
16
16
 
17
17
  # If only one is blank, it's not just formatting