canon 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +83 -22
- data/docs/Gemfile +1 -0
- data/docs/_config.yml +90 -1
- data/docs/advanced/diff-classification.adoc +196 -24
- data/docs/features/match-options/index.adoc +239 -1
- data/lib/canon/comparison/format_detector.rb +2 -1
- data/lib/canon/comparison/html_comparator.rb +19 -8
- data/lib/canon/comparison/html_compare_profile.rb +8 -2
- data/lib/canon/comparison/markup_comparator.rb +109 -2
- data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
- data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
- data/lib/canon/comparison/xml_comparator.rb +240 -23
- data/lib/canon/comparison/xml_node_comparison.rb +25 -3
- data/lib/canon/diff/diff_classifier.rb +119 -5
- data/lib/canon/diff/formatting_detector.rb +1 -1
- data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
- data/lib/canon/rspec_matchers.rb +37 -8
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +24 -13
- metadata +4 -78
- data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
- data/false_positive_analysis.txt +0 -0
- data/file1.html +0 -1
- data/file2.html +0 -1
- data/old-docs/ADVANCED_TOPICS.adoc +0 -20
- data/old-docs/BASIC_USAGE.adoc +0 -16
- data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
- data/old-docs/CLI.adoc +0 -497
- data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/old-docs/DIFF_FORMATTING.adoc +0 -540
- data/old-docs/DIFF_PARAMETERS.adoc +0 -261
- data/old-docs/DOM_DIFF.adoc +0 -1017
- data/old-docs/ENV_CONFIG.adoc +0 -876
- data/old-docs/FORMATS.adoc +0 -867
- data/old-docs/INPUT_VALIDATION.adoc +0 -477
- data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
- data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/old-docs/MATCH_OPTIONS.adoc +0 -912
- data/old-docs/MODES.adoc +0 -432
- data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/old-docs/OPTIONS.adoc +0 -1387
- data/old-docs/PREPROCESSING.adoc +0 -491
- data/old-docs/README.old.adoc +0 -2831
- data/old-docs/RSPEC.adoc +0 -814
- data/old-docs/RUBY_API.adoc +0 -485
- data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
- data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
- data/old-docs/STRING_COMPARE.adoc +0 -345
- data/old-docs/TMP.adoc +0 -3384
- data/old-docs/TREE_DIFF.adoc +0 -1080
- data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
- data/old-docs/VERBOSE.adoc +0 -482
- data/old-docs/VISUALIZATION_MAP.adoc +0 -625
- data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
- data/scripts/analyze_current_state.rb +0 -85
- data/scripts/analyze_false_positives.rb +0 -114
- data/scripts/analyze_remaining_failures.rb +0 -105
- data/scripts/compare_current_failures.rb +0 -95
- data/scripts/compare_dom_tree_diff.rb +0 -158
- data/scripts/compare_failures.rb +0 -151
- data/scripts/debug_attribute_extraction.rb +0 -66
- data/scripts/debug_blocks_839.rb +0 -115
- data/scripts/debug_meta_matching.rb +0 -52
- data/scripts/debug_p_matching.rb +0 -192
- data/scripts/debug_signature_matching.rb +0 -118
- data/scripts/debug_sourcecode_124.rb +0 -32
- data/scripts/debug_whitespace_sensitive.rb +0 -192
- data/scripts/extract_false_positives.rb +0 -138
- data/scripts/find_actual_false_positives.rb +0 -125
- data/scripts/investigate_all_false_positives.rb +0 -161
- data/scripts/investigate_batch1.rb +0 -127
- data/scripts/investigate_classification.rb +0 -150
- data/scripts/investigate_classification_detailed.rb +0 -190
- data/scripts/investigate_common_failures.rb +0 -342
- data/scripts/investigate_false_negative.rb +0 -80
- data/scripts/investigate_false_positive.rb +0 -83
- data/scripts/investigate_false_positives.rb +0 -227
- data/scripts/investigate_false_positives_batch.rb +0 -163
- data/scripts/investigate_mixed_content.rb +0 -125
- data/scripts/investigate_remaining_16.rb +0 -214
- data/scripts/run_single_test.rb +0 -29
- data/scripts/test_all_false_positives.rb +0 -95
- data/scripts/test_attribute_details.rb +0 -61
- data/scripts/test_both_algorithms.rb +0 -49
- data/scripts/test_both_simple.rb +0 -49
- data/scripts/test_enhanced_semantic_output.rb +0 -125
- data/scripts/test_readme_examples.rb +0 -131
- data/scripts/test_semantic_tree_diff.rb +0 -99
- data/scripts/test_semantic_ux_improvements.rb +0 -135
- data/scripts/test_single_false_positive.rb +0 -119
- data/scripts/test_size_limits.rb +0 -99
- data/test_html_1.html +0 -21
- data/test_html_2.html +0 -21
- data/test_nokogiri.rb +0 -33
- data/test_normalize.rb +0 -45
|
@@ -18,6 +18,8 @@ require_relative "xml_comparator/namespace_comparator"
|
|
|
18
18
|
require_relative "xml_comparator/node_type_comparator"
|
|
19
19
|
require_relative "xml_comparator/child_comparison"
|
|
20
20
|
require_relative "xml_comparator/diff_node_builder"
|
|
21
|
+
# Whitespace sensitivity module
|
|
22
|
+
require_relative "whitespace_sensitivity"
|
|
21
23
|
|
|
22
24
|
module Canon
|
|
23
25
|
module Comparison
|
|
@@ -90,9 +92,15 @@ module Canon
|
|
|
90
92
|
# Create child_opts with resolved options
|
|
91
93
|
child_opts = opts.merge(child_opts)
|
|
92
94
|
|
|
95
|
+
# Determine if we should preserve whitespace during parsing
|
|
96
|
+
# When structural_whitespace is :strict, preserve all whitespace-only text nodes
|
|
97
|
+
preserve_whitespace = match_opts_hash[:structural_whitespace] == :strict
|
|
98
|
+
|
|
93
99
|
# Parse nodes if they are strings, applying preprocessing if needed
|
|
94
|
-
node1 = parse_node(n1, match_opts_hash[:preprocessing]
|
|
95
|
-
|
|
100
|
+
node1 = parse_node(n1, match_opts_hash[:preprocessing],
|
|
101
|
+
preserve_whitespace: preserve_whitespace)
|
|
102
|
+
node2 = parse_node(n2, match_opts_hash[:preprocessing],
|
|
103
|
+
preserve_whitespace: preserve_whitespace)
|
|
96
104
|
|
|
97
105
|
# Store original strings for line diff display (before preprocessing)
|
|
98
106
|
original1 = if n1.is_a?(String)
|
|
@@ -209,8 +217,9 @@ module Canon
|
|
|
209
217
|
# Parse a node from string or return as-is
|
|
210
218
|
# Applies preprocessing transformation before parsing if specified
|
|
211
219
|
# Delegates to NodeParser module
|
|
212
|
-
def parse_node(node, preprocessing = :none)
|
|
213
|
-
XmlComparatorHelpers::NodeParser.parse(node, preprocessing
|
|
220
|
+
def parse_node(node, preprocessing = :none, preserve_whitespace: false)
  # Pure delegation: NodeParser receives the node, the preprocessing mode and
  # the whitespace-preservation flag unchanged.
  parser = XmlComparatorHelpers::NodeParser
  parser.parse(node, preprocessing, preserve_whitespace: preserve_whitespace)
end
|
|
215
224
|
|
|
216
225
|
# Main comparison dispatcher
|
|
@@ -331,7 +340,8 @@ module Canon
|
|
|
331
340
|
|
|
332
341
|
# For HTML, check if text node is inside whitespace-preserving element
|
|
333
342
|
# If so, always use strict comparison regardless of text_content setting
|
|
334
|
-
|
|
343
|
+
sensitive_element = should_preserve_whitespace_strictly?(n1, n2, opts)
|
|
344
|
+
if sensitive_element
|
|
335
345
|
behavior = :strict
|
|
336
346
|
end
|
|
337
347
|
|
|
@@ -344,15 +354,23 @@ module Canon
|
|
|
344
354
|
|
|
345
355
|
# Determine the correct dimension for this difference
|
|
346
356
|
# - If text_content is :strict, ALL differences use :text_content dimension
|
|
347
|
-
# - If text_content is :normalize, whitespace-only diffs use :structural_whitespace
|
|
357
|
+
# - If text_content is :normalize, whitespace-only diffs could use :structural_whitespace
|
|
358
|
+
# but we keep :text_content to ensure correct classification behavior
|
|
348
359
|
# - Otherwise use :text_content
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
360
|
+
# However, if element is whitespace-sensitive (like <pre> in HTML),
|
|
361
|
+
# always use :text_content dimension regardless of behavior
|
|
362
|
+
#
|
|
363
|
+
# NOTE: We keep the dimension as :text_content even for whitespace-only diffs
|
|
364
|
+
# when text_content: :normalize. This ensures that the classification uses
|
|
365
|
+
# the text_content behavior (:normalize) instead of structural_whitespace
|
|
366
|
+
# behavior (:strict for XML), which would incorrectly mark the diff as normative.
|
|
367
|
+
if sensitive_element
|
|
368
|
+
# Whitespace-sensitive element: always use :text_content dimension
|
|
369
|
+
else
|
|
370
|
+
# Always use :text_content for text differences
|
|
371
|
+
# This ensures correct classification based on text_content behavior
|
|
372
|
+
end
|
|
373
|
+
dimension = :text_content
|
|
356
374
|
|
|
357
375
|
# Create DiffNode in verbose mode when raw content differs
|
|
358
376
|
# This ensures informative diffs are created even for :ignore/:normalize
|
|
@@ -368,17 +386,23 @@ module Canon
|
|
|
368
386
|
|
|
369
387
|
# Check if whitespace should be preserved strictly for these text nodes
|
|
370
388
|
# This applies to HTML elements like pre, code, textarea, script, style
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
389
|
+
# and elements with xml:space="preserve" or in user-configured whitelist
|
|
390
|
+
def should_preserve_whitespace_strictly?(n1, n2, opts)
  # Strict handling applies as soon as EITHER node is reported as sensitive by
  # WhitespaceSensitivity. Nodes that do not expose #parent are skipped
  # without consulting the module.
  [n1, n2].any? do |candidate|
    next false unless candidate.respond_to?(:parent)

    WhitespaceSensitivity.element_sensitive?(
      candidate, { match_opts: opts[:match_opts] }
    )
  end
end
|
|
383
407
|
|
|
384
408
|
# Check if a node is inside a whitespace-preserving element
|
|
@@ -469,7 +493,8 @@ module Canon
|
|
|
469
493
|
#
|
|
470
494
|
# Delegates to ChildComparison module which handles both ElementMatcher
|
|
471
495
|
# (semantic matching) and simple positional comparison.
|
|
472
|
-
def compare_children(n1, n2, opts, child_opts, diff_children,
|
|
496
|
+
def compare_children(n1, n2, opts, child_opts, diff_children,
|
|
497
|
+
differences)
|
|
473
498
|
XmlComparatorHelpers::ChildComparison.compare(
|
|
474
499
|
n1, n2, self, opts, child_opts, diff_children, differences
|
|
475
500
|
)
|
|
@@ -543,9 +568,201 @@ module Canon
|
|
|
543
568
|
end
|
|
544
569
|
end
|
|
545
570
|
|
|
571
|
+
# For attribute presence differences, show what attributes differ
|
|
572
|
+
if dimension == :attribute_presence
|
|
573
|
+
attrs1 = extract_attributes(node1)
|
|
574
|
+
attrs2 = extract_attributes(node2)
|
|
575
|
+
return build_attribute_diff_reason(attrs1, attrs2)
|
|
576
|
+
end
|
|
577
|
+
|
|
578
|
+
# For text content differences, show the actual text (truncated if needed)
|
|
579
|
+
if dimension == :text_content
|
|
580
|
+
text1 = extract_text_from_node(node1)
|
|
581
|
+
text2 = extract_text_from_node(node2)
|
|
582
|
+
return build_text_diff_reason(text1, text2)
|
|
583
|
+
end
|
|
584
|
+
|
|
546
585
|
"#{diff1} vs #{diff2}"
|
|
547
586
|
end
|
|
548
587
|
|
|
588
|
+
# Build a clear reason message for attribute presence differences
|
|
589
|
+
#
|
|
590
|
+
# @param attrs1 [Hash, nil] First node's attributes
|
|
591
|
+
# @param attrs2 [Hash, nil] Second node's attributes
|
|
592
|
+
# @return [String] Clear explanation of the attribute difference
|
|
593
|
+
def build_attribute_diff_reason(attrs1, attrs2)
  # When either side has no attribute hash, only the counts can be reported.
  unless attrs1 && attrs2
    count1 = attrs1&.keys&.size || 0
    count2 = attrs2&.keys&.size || 0
    return "#{count1} vs #{count2} attributes"
  end

  names1 = attrs1.keys
  names2 = attrs2.keys

  # Hash keys are unique, so Array difference/intersection gives the same
  # membership as Set operations.
  changed = (names1 & names2).reject { |name| attrs1[name] == attrs2[name] }
  labelled = [
    ["only in first", names1 - names2],
    ["only in second", names2 - names1],
    ["different values", changed],
  ]

  messages = labelled
    .select { |_label, names| names.any? }
    .map { |label, names| "#{label}: #{names.sort.join(', ')}" }

  return "#{names1.size} vs #{names2.size} attributes (same names)" if messages.empty?

  messages.join("; ")
end
|
|
618
|
+
|
|
619
|
+
# Extract text from a node for diff reason
|
|
620
|
+
#
|
|
621
|
+
# @param node [Object, nil] Node to extract text from
|
|
622
|
+
# @return [String, nil] Text content or nil
|
|
623
|
+
def extract_text_from_node(node)
  return nil if node.nil?

  # Canon text nodes are read via #value before any generic accessor.
  if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)
    return node.value
  end

  # Otherwise the first accessor the node responds to wins, in this priority:
  # text_content, text, content, value. Anything else (including plain
  # Strings) falls back to #to_s.
  accessor = %i[text_content text content value].find do |name|
    node.respond_to?(name)
  end

  accessor ? node.public_send(accessor) : node.to_s
rescue StandardError
  # Best effort: any failure while extracting text is reported as "no text".
  nil
end
|
|
649
|
+
|
|
650
|
+
# Build a clear reason message for text content differences
|
|
651
|
+
#
|
|
652
|
+
# @param text1 [String, nil] First text content
|
|
653
|
+
# @param text2 [String, nil] Second text content
|
|
654
|
+
# @return [String] Clear explanation of the text difference
|
|
655
|
+
def build_text_diff_reason(text1, text2)
  # Missing-side cases. Each message quotes the side that IS present.
  # (The "text2 missing" message previously interpolated text2, which is nil
  # on that path, so it always rendered as '' and hid the first text.)
  return "both missing" if text1.nil? && text2.nil?
  return "missing vs '#{truncate_text(text2)}'" if text1.nil?
  return "'#{truncate_text(text1)}' vs missing" if text2.nil?

  # Two whitespace-only texts are summarised by character counts, since the
  # raw characters would be unreadable in a one-line reason.
  if whitespace_only?(text1) && whitespace_only?(text2)
    return "whitespace: #{describe_whitespace(text1)} vs #{describe_whitespace(text2)}"
  end

  # Otherwise show both texts with whitespace replaced by visible markers.
  vis1 = visualize_whitespace(text1)
  vis2 = visualize_whitespace(text2)

  "Text: \"#{vis1}\" vs \"#{vis2}\""
end
|
|
673
|
+
|
|
674
|
+
# Check if text is only whitespace
|
|
675
|
+
#
|
|
676
|
+
# @param text [String] Text to check
|
|
677
|
+
# @return [Boolean] true if whitespace-only
|
|
678
|
+
def whitespace_only?(text)
  # nil is treated as "no text", not as blank text. The empty string counts
  # as whitespace-only because nothing remains after stripping.
  !text.nil? && text.to_s.strip.empty?
end
|
|
683
|
+
|
|
684
|
+
# Make whitespace visible in text content
|
|
685
|
+
# Uses the existing character visualization map from DiffFormatter (single source of truth)
|
|
686
|
+
#
|
|
687
|
+
# @param text [String] Text to visualize
|
|
688
|
+
# @return [String] Text with visible whitespace markers
|
|
689
|
+
def visualize_whitespace(text)
  return "" if text.nil?

  symbols = character_visualization_map

  # Characters without a map entry pass through unchanged.
  text.each_char.map { |ch| symbols[ch] || ch }.join
end
|
|
698
|
+
|
|
699
|
+
# Get the character visualization map (lazy-loaded to avoid circular dependency)
|
|
700
|
+
#
|
|
701
|
+
# @return [Hash] Character to visualization symbol mapping
|
|
702
|
+
def character_visualization_map
  @character_visualization_map ||= begin
    # The YAML file is read directly (instead of going through the diff
    # formatter) to avoid a circular dependency; the result is memoized.
    require "yaml"
    yaml_path = File.join(File.expand_path("../..", __dir__),
                          "canon/diff_formatter/character_map.yml")
    entries = YAML.load_file(yaml_path)["characters"]

    entries.each_with_object({}) do |entry, map|
      # An entry names its character either as a hex code point ("unicode")
      # or literally ("character", used for \n, \t, etc.).
      code_point = entry["unicode"]
      key = if code_point
              [code_point.to_i(16)].pack("U")
            else
              entry["character"]
            end

      map[key] = entry["visualization"]
    end
  end
end
|
|
730
|
+
|
|
731
|
+
# Describe whitespace content in a readable way
|
|
732
|
+
#
|
|
733
|
+
# @param text [String] Whitespace text
|
|
734
|
+
# @return [String] Description like "4 chars (2 newlines, 2 spaces)"
|
|
735
|
+
def describe_whitespace(text)
  # Nothing to break down for nil or empty input.
  return "0 chars" if text.nil? || text.empty?

  newline_count = text.count("\n")
  space_count = text.count(" ")
  tab_count = text.count("\t")
  # Characters that are none of the above (e.g. "\r", "\f", "\v"). Without
  # this bucket such input produced an empty breakdown like "1 chars ()".
  other_count = text.length - newline_count - space_count - tab_count

  parts = []
  parts << "#{newline_count} newlines" if newline_count.positive?
  parts << "#{space_count} spaces" if space_count.positive?
  parts << "#{tab_count} tabs" if tab_count.positive?
  parts << "#{other_count} other" if other_count.positive?

  # text is non-empty here, so at least one bucket is populated.
  "#{text.length} chars (#{parts.join(', ')})"
end
|
|
751
|
+
|
|
752
|
+
# Truncate text for display in reason messages
|
|
753
|
+
#
|
|
754
|
+
# @param text [String] Text to truncate
|
|
755
|
+
# @param max_length [Integer] Maximum length
|
|
756
|
+
# @return [String] Truncated text
|
|
757
|
+
def truncate_text(text, max_length = 40)
  return "" if text.nil?

  string = text.to_s
  # Only text longer than the limit is cut; the "..." marker is appended on
  # top of the max_length kept characters.
  string.length > max_length ? "#{string[0...max_length]}..." : string
end
|
|
765
|
+
|
|
549
766
|
# Compare namespace declarations (xmlns and xmlns:* attributes)
|
|
550
767
|
# Delegates to XmlComparatorHelpers::NamespaceComparator
|
|
551
768
|
def compare_namespace_declarations(n1, n2, opts, differences)
|
|
@@ -139,9 +139,13 @@ diff_children, differences)
|
|
|
139
139
|
|
|
140
140
|
# Check structural_whitespace match option
|
|
141
141
|
match_opts = opts[:match_opts]
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
142
|
+
return false unless match_opts
|
|
143
|
+
|
|
144
|
+
# Filter out whitespace-only text nodes based on structural_whitespace setting
|
|
145
|
+
# - :ignore or :normalize: Filter all whitespace-only text nodes
|
|
146
|
+
# - :strict: Preserve all whitespace-only text nodes (don't filter any)
|
|
147
|
+
if text_node?(node) && %i[ignore
|
|
148
|
+
normalize].include?(match_opts[:structural_whitespace])
|
|
145
149
|
text = node_text(node)
|
|
146
150
|
return true if MatchOptions.normalize_text(text).empty?
|
|
147
151
|
end
|
|
@@ -184,6 +188,24 @@ diff_children, differences)
|
|
|
184
188
|
node.respond_to?(:node_type) && node.node_type == :text
|
|
185
189
|
end
|
|
186
190
|
|
|
191
|
+
# Extract text content from a node
|
|
192
|
+
#
|
|
193
|
+
# @param node [Object] Node to extract text from
|
|
194
|
+
# @return [String] Text content
|
|
195
|
+
def self.node_text(node)
  return "" unless node

  # The first accessor the node responds to wins, in this priority:
  # content, text, value. Nodes exposing none of them yield "".
  reader = %i[content text value].find { |name| node.respond_to?(name) }
  reader ? node.public_send(reader).to_s : ""
end
|
|
208
|
+
|
|
187
209
|
# Dispatch by Canon::Xml::Node type
|
|
188
210
|
def self.dispatch_canon_node_type(node1, node2, opts, child_opts,
|
|
189
211
|
diff_children, differences)
|
|
@@ -1,12 +1,19 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "formatting_detector"
|
|
4
|
+
require_relative "xml_serialization_formatter"
|
|
4
5
|
require_relative "../comparison/compare_profile"
|
|
6
|
+
require_relative "../comparison/whitespace_sensitivity"
|
|
5
7
|
|
|
6
8
|
module Canon
|
|
7
9
|
module Diff
|
|
8
10
|
# Classifies DiffNodes as normative (affects equivalence) or informative (doesn't affect equivalence)
|
|
9
11
|
# based on the match options in effect
|
|
12
|
+
#
|
|
13
|
+
# Classification hierarchy (three distinct kinds of differences):
|
|
14
|
+
# 1. Serialization formatting: XML syntax differences (always non-normative)
|
|
15
|
+
# 2. Content formatting: Whitespace differences in content (non-normative when normalized)
|
|
16
|
+
# 3. Normative: Semantic content differences (affect equivalence)
|
|
10
17
|
class DiffClassifier
|
|
11
18
|
attr_reader :match_options, :profile
|
|
12
19
|
|
|
@@ -24,15 +31,46 @@ module Canon
|
|
|
24
31
|
|
|
25
32
|
# Classify a single DiffNode as normative or informative
|
|
26
33
|
# Hierarchy: formatting-only < informative < normative
|
|
27
|
-
# CompareProfile determines base classification,
|
|
34
|
+
# CompareProfile determines base classification, XmlSerializationFormatter handles serialization formatting
|
|
28
35
|
# @param diff_node [DiffNode] The diff node to classify
|
|
29
36
|
# @return [DiffNode] The same diff node with normative/formatting attributes set
|
|
30
37
|
def classify(diff_node)
|
|
31
|
-
# FIRST:
|
|
38
|
+
# FIRST: Check for XML serialization-level formatting differences
|
|
39
|
+
# These are ALWAYS non-normative (formatting-only) regardless of match options
|
|
40
|
+
# Examples: self-closing tags (<tag/>) vs explicit closing tags (<tag></tag>)
|
|
41
|
+
if XmlSerializationFormatter.serialization_formatting?(diff_node)
|
|
42
|
+
diff_node.formatting = true
|
|
43
|
+
diff_node.normative = false
|
|
44
|
+
return diff_node
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# SECOND: Handle content-level formatting for text_content with :normalize behavior
|
|
48
|
+
# When text_content is :normalize and the difference is formatting-only,
|
|
49
|
+
# it should be marked as non-normative (informative)
|
|
50
|
+
# This ensures that verbose and non-verbose modes give consistent results
|
|
51
|
+
#
|
|
52
|
+
# EXCEPTION: If the text node is inside a whitespace-sensitive element
|
|
53
|
+
# (like <pre>, <code>, <textarea> in HTML), don't apply formatting detection
|
|
54
|
+
# because whitespace should be preserved in these elements
|
|
55
|
+
#
|
|
56
|
+
# This check must come BEFORE normative_dimension? is called,
|
|
57
|
+
# because normative_dimension? returns true for text_content: :normalize
|
|
58
|
+
# (since the dimension affects equivalence), which would prevent formatting
|
|
59
|
+
# detection from being applied.
|
|
60
|
+
if diff_node.dimension == :text_content &&
|
|
61
|
+
profile.send(:behavior_for, :text_content) == :normalize &&
|
|
62
|
+
!inside_whitespace_sensitive_element?(diff_node) &&
|
|
63
|
+
formatting_only_diff?(diff_node)
|
|
64
|
+
diff_node.formatting = true
|
|
65
|
+
diff_node.normative = false
|
|
66
|
+
return diff_node
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
# THIRD: Determine if this dimension is normative based on CompareProfile
|
|
32
70
|
# This respects the policy settings (strict/normalize/ignore)
|
|
33
71
|
is_normative = profile.normative_dimension?(diff_node.dimension)
|
|
34
72
|
|
|
35
|
-
#
|
|
73
|
+
# FOURTH: Check if FormattingDetector should be consulted for non-normative dimensions
|
|
36
74
|
# Only check for formatting-only when dimension is NOT normative
|
|
37
75
|
# This ensures strict mode differences remain normative
|
|
38
76
|
should_check_formatting = !is_normative &&
|
|
@@ -45,7 +83,7 @@ module Canon
|
|
|
45
83
|
return diff_node
|
|
46
84
|
end
|
|
47
85
|
|
|
48
|
-
#
|
|
86
|
+
# FIFTH: Apply the normative determination from CompareProfile
|
|
49
87
|
diff_node.formatting = false
|
|
50
88
|
diff_node.normative = is_normative
|
|
51
89
|
|
|
@@ -65,10 +103,59 @@ module Canon
|
|
|
65
103
|
# @param diff_node [DiffNode] The diff node to check
|
|
66
104
|
# @return [Boolean] true if formatting-only
|
|
67
105
|
def formatting_only_diff?(diff_node)
  first = diff_node.node1
  second = diff_node.node2

  # Formatting detection applies only when BOTH sides are text nodes; a diff
  # involving e.g. an element node is never formatting-only.
  return false if !text_node?(first) || !text_node?(second)

  text1 = extract_text_content(first)
  text2 = extract_text_content(second)

  # text_content diffs are compared after normalization, which handles cases
  # like "" vs " " (both normalize to ""). Other dimensions defer to
  # FormattingDetector.
  return normalized_equivalent?(text1, text2) if diff_node.dimension == :text_content

  FormattingDetector.formatting_only?(text1, text2)
end
|
|
126
|
+
|
|
127
|
+
# Check if two texts are equivalent after normalization
|
|
128
|
+
# This detects formatting-only differences where normalized texts match
|
|
129
|
+
# @param text1 [String, nil] First text
|
|
130
|
+
# @param text2 [String, nil] Second text
|
|
131
|
+
# @return [Boolean] true if normalized texts are equivalent
|
|
132
|
+
def normalized_equivalent?(text1, text2)
  # A missing text on either side (or both) is never a formatting-only diff.
  return false if [text1, text2].any?(&:nil?)

  # MatchOptions.normalize_text is used so this check normalizes text the
  # same way the comparison does.
  normalizer = Canon::Comparison::MatchOptions
  same_when_normalized =
    normalizer.normalize_text(text1) == normalizer.normalize_text(text2)

  # Formatting-only means: equal after normalization, different before it.
  same_when_normalized && text1 != text2
end
|
|
144
|
+
|
|
145
|
+
# Check if the text node is inside a whitespace-sensitive element
|
|
146
|
+
# @param diff_node [DiffNode] The diff node to check
|
|
147
|
+
# @return [Boolean] true if inside a whitespace-sensitive element
|
|
148
|
+
def inside_whitespace_sensitive_element?(diff_node)
  # Use whichever side of the diff is present, preferring the first.
  text_node = diff_node.node1 || diff_node.node2
  return false unless text_node

  # element_sensitive? is given the node itself plus the match options
  # wrapped under the :match_opts key.
  sensitivity_opts = { match_opts: @match_options.options }
  Canon::Comparison::WhitespaceSensitivity.element_sensitive?(
    text_node, sensitivity_opts
  )
end
|
|
73
160
|
|
|
74
161
|
# Extract text content from a node for formatting comparison
|
|
@@ -101,6 +188,33 @@ module Canon
|
|
|
101
188
|
# If extraction fails, return nil (not formatting-only)
|
|
102
189
|
nil
|
|
103
190
|
end
|
|
191
|
+
|
|
192
|
+
# Check if a node is a text node
|
|
193
|
+
# @param node [Object] The node to check
|
|
194
|
+
# @return [Boolean] true if the node is a text node
|
|
195
|
+
def text_node?(node)
  return false if node.nil?

  # Canon's own text node class.
  return true if node.is_a?(Canon::Xml::Nodes::TextNode)

  if node.respond_to?(:node_type)
    type = node.node_type
    # Nokogiri reports node types as Integer constants ...
    return true if type.is_a?(Integer) && type == Nokogiri::XML::Node::TEXT_NODE
    # ... while Moxml reports them as symbols.
    return true if type == :text
  end

  # Plain strings, and any object exposing #value (test doubles or other
  # text-like objects), are also treated as text nodes.
  node.is_a?(String) || node.respond_to?(:value)
end
|
|
104
218
|
end
|
|
105
219
|
end
|
|
106
220
|
end
|
|
@@ -11,7 +11,7 @@ module Canon
|
|
|
11
11
|
# @param line2 [String, nil] Second line to compare
|
|
12
12
|
# @return [Boolean] true if lines differ only in formatting
|
|
13
13
|
def self.formatting_only?(line1, line2)
|
|
14
|
-
# If both are nil or empty, not a formatting diff
|
|
14
|
+
# If both are nil or empty, not a formatting diff (no difference)
|
|
15
15
|
return false if blank?(line1) && blank?(line2)
|
|
16
16
|
|
|
17
17
|
# If only one is blank, it's not just formatting
|