canon 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +83 -22
- data/docs/Gemfile +1 -0
- data/docs/_config.yml +90 -1
- data/docs/advanced/diff-classification.adoc +196 -24
- data/docs/features/match-options/index.adoc +239 -1
- data/lib/canon/comparison/format_detector.rb +2 -1
- data/lib/canon/comparison/html_comparator.rb +19 -8
- data/lib/canon/comparison/html_compare_profile.rb +8 -2
- data/lib/canon/comparison/markup_comparator.rb +109 -2
- data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
- data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
- data/lib/canon/comparison/xml_comparator.rb +240 -23
- data/lib/canon/comparison/xml_node_comparison.rb +25 -3
- data/lib/canon/diff/diff_classifier.rb +119 -5
- data/lib/canon/diff/formatting_detector.rb +1 -1
- data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
- data/lib/canon/rspec_matchers.rb +37 -8
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +24 -13
- metadata +4 -78
- data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
- data/false_positive_analysis.txt +0 -0
- data/file1.html +0 -1
- data/file2.html +0 -1
- data/old-docs/ADVANCED_TOPICS.adoc +0 -20
- data/old-docs/BASIC_USAGE.adoc +0 -16
- data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
- data/old-docs/CLI.adoc +0 -497
- data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/old-docs/DIFF_FORMATTING.adoc +0 -540
- data/old-docs/DIFF_PARAMETERS.adoc +0 -261
- data/old-docs/DOM_DIFF.adoc +0 -1017
- data/old-docs/ENV_CONFIG.adoc +0 -876
- data/old-docs/FORMATS.adoc +0 -867
- data/old-docs/INPUT_VALIDATION.adoc +0 -477
- data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
- data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/old-docs/MATCH_OPTIONS.adoc +0 -912
- data/old-docs/MODES.adoc +0 -432
- data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/old-docs/OPTIONS.adoc +0 -1387
- data/old-docs/PREPROCESSING.adoc +0 -491
- data/old-docs/README.old.adoc +0 -2831
- data/old-docs/RSPEC.adoc +0 -814
- data/old-docs/RUBY_API.adoc +0 -485
- data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
- data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
- data/old-docs/STRING_COMPARE.adoc +0 -345
- data/old-docs/TMP.adoc +0 -3384
- data/old-docs/TREE_DIFF.adoc +0 -1080
- data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
- data/old-docs/VERBOSE.adoc +0 -482
- data/old-docs/VISUALIZATION_MAP.adoc +0 -625
- data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
- data/scripts/analyze_current_state.rb +0 -85
- data/scripts/analyze_false_positives.rb +0 -114
- data/scripts/analyze_remaining_failures.rb +0 -105
- data/scripts/compare_current_failures.rb +0 -95
- data/scripts/compare_dom_tree_diff.rb +0 -158
- data/scripts/compare_failures.rb +0 -151
- data/scripts/debug_attribute_extraction.rb +0 -66
- data/scripts/debug_blocks_839.rb +0 -115
- data/scripts/debug_meta_matching.rb +0 -52
- data/scripts/debug_p_matching.rb +0 -192
- data/scripts/debug_signature_matching.rb +0 -118
- data/scripts/debug_sourcecode_124.rb +0 -32
- data/scripts/debug_whitespace_sensitive.rb +0 -192
- data/scripts/extract_false_positives.rb +0 -138
- data/scripts/find_actual_false_positives.rb +0 -125
- data/scripts/investigate_all_false_positives.rb +0 -161
- data/scripts/investigate_batch1.rb +0 -127
- data/scripts/investigate_classification.rb +0 -150
- data/scripts/investigate_classification_detailed.rb +0 -190
- data/scripts/investigate_common_failures.rb +0 -342
- data/scripts/investigate_false_negative.rb +0 -80
- data/scripts/investigate_false_positive.rb +0 -83
- data/scripts/investigate_false_positives.rb +0 -227
- data/scripts/investigate_false_positives_batch.rb +0 -163
- data/scripts/investigate_mixed_content.rb +0 -125
- data/scripts/investigate_remaining_16.rb +0 -214
- data/scripts/run_single_test.rb +0 -29
- data/scripts/test_all_false_positives.rb +0 -95
- data/scripts/test_attribute_details.rb +0 -61
- data/scripts/test_both_algorithms.rb +0 -49
- data/scripts/test_both_simple.rb +0 -49
- data/scripts/test_enhanced_semantic_output.rb +0 -125
- data/scripts/test_readme_examples.rb +0 -131
- data/scripts/test_semantic_tree_diff.rb +0 -99
- data/scripts/test_semantic_ux_improvements.rb +0 -135
- data/scripts/test_single_false_positive.rb +0 -119
- data/scripts/test_size_limits.rb +0 -99
- data/test_html_1.html +0 -21
- data/test_html_2.html +0 -21
- data/test_nokogiri.rb +0 -33
- data/test_normalize.rb +0 -45
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
|
|
4
|
+
module Comparison
|
|
5
|
+
# Whitespace sensitivity utilities for element-level control
|
|
6
|
+
#
|
|
7
|
+
# This module provides logic to determine whether whitespace should be
|
|
8
|
+
# preserved during comparison based on:
|
|
9
|
+
# - Format-specific defaults (HTML has built-in sensitive elements)
|
|
10
|
+
# - User-configured whitelist (elements that care about whitespace)
|
|
11
|
+
# - User-configured blacklist (elements that don't care about whitespace)
|
|
12
|
+
# - xml:space attribute in the document itself
|
|
13
|
+
# - respect_xml_space flag (whether to honor or override xml:space)
|
|
14
|
+
#
|
|
15
|
+
# == Priority Order
|
|
16
|
+
#
|
|
17
|
+
# 1. respect_xml_space: false → User config only (ignore xml:space)
|
|
18
|
+
# 2. User whitelist → Use whitelist (user explicitly declared)
|
|
19
|
+
# 3. Format defaults → HTML: [:pre, :textarea, :script, :style], XML: []
|
|
20
|
+
# 4. User blacklist → Remove from defaults/whitelist
|
|
21
|
+
# 5. xml:space="preserve" → Element is sensitive
|
|
22
|
+
# 6. xml:space="default" → Use steps 1-4
|
|
23
|
+
#
|
|
24
|
+
# == Usage
|
|
25
|
+
#
|
|
26
|
+
# WhitespaceSensitivity.element_sensitive?(node, opts)
|
|
27
|
+
# => true if whitespace should be preserved for this element
|
|
28
|
+
module WhitespaceSensitivity
|
|
29
|
+
class << self
|
|
30
|
+
# Check if an element is whitespace-sensitive based on configuration
|
|
31
|
+
#
|
|
32
|
+
# @param node [Object] The element node to check
|
|
33
|
+
# @param opts [Hash] Comparison options containing match_opts
|
|
34
|
+
# @return [Boolean] true if whitespace should be preserved for this element
|
|
35
|
+
def element_sensitive?(node, opts)
|
|
36
|
+
match_opts = opts[:match_opts]
|
|
37
|
+
return false unless match_opts
|
|
38
|
+
return false unless text_node_parent?(node)
|
|
39
|
+
|
|
40
|
+
parent = node.parent
|
|
41
|
+
|
|
42
|
+
# 1. Check if we should ignore xml:space (user override)
|
|
43
|
+
if !respect_xml_space?(match_opts)
|
|
44
|
+
return user_config_sensitive?(parent, match_opts)
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# 2. Check xml:space="preserve" (document declaration)
|
|
48
|
+
return true if xml_space_preserve?(parent)
|
|
49
|
+
|
|
50
|
+
# 3. Check xml:space="default" (use configured behavior)
|
|
51
|
+
return false if xml_space_default?(parent)
|
|
52
|
+
|
|
53
|
+
# 4. Use user configuration + format defaults
|
|
54
|
+
configured_sensitive?(parent, match_opts)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# Check if whitespace-only text node should be filtered
|
|
58
|
+
#
|
|
59
|
+
# @param node [Object] The text node to check
|
|
60
|
+
# @param opts [Hash] Comparison options
|
|
61
|
+
# @return [Boolean] true if node should be preserved (not filtered)
|
|
62
|
+
def preserve_whitespace_node?(node, opts)
|
|
63
|
+
return false unless node.respond_to?(:parent)
|
|
64
|
+
return false unless node.parent
|
|
65
|
+
|
|
66
|
+
element_sensitive?(node, opts)
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
# Get format-specific default sensitive elements
|
|
70
|
+
#
|
|
71
|
+
# This is the SINGLE SOURCE OF TRUTH for default whitespace-sensitive
|
|
72
|
+
# elements. All other code should use this method to get the list.
|
|
73
|
+
#
|
|
74
|
+
# @param match_opts [Hash] Resolved match options
|
|
75
|
+
# @return [Array<Symbol>] Default sensitive element names
|
|
76
|
+
def format_default_sensitive_elements(match_opts)
|
|
77
|
+
format = match_opts[:format] || :xml
|
|
78
|
+
|
|
79
|
+
case format
|
|
80
|
+
when :html, :html4, :html5
|
|
81
|
+
# HTML specification: these elements preserve whitespace
|
|
82
|
+
%i[pre code textarea script style].freeze
|
|
83
|
+
when :xml
|
|
84
|
+
# XML has no default sensitive elements - purely user-controlled
|
|
85
|
+
[].freeze
|
|
86
|
+
else
|
|
87
|
+
[].freeze
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Check if an element is in the default sensitive list for its format
|
|
92
|
+
#
|
|
93
|
+
# Convenience method for checking element sensitivity without building
|
|
94
|
+
# the full list first.
|
|
95
|
+
#
|
|
96
|
+
# @param element_name [String, Symbol] The element name to check
|
|
97
|
+
# @param match_opts [Hash] Resolved match options
|
|
98
|
+
# @return [Boolean] true if element is in default sensitive list
|
|
99
|
+
def default_sensitive_element?(element_name, match_opts)
|
|
100
|
+
format_default_sensitive_elements(match_opts)
|
|
101
|
+
.include?(element_name.to_sym)
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
private
|
|
105
|
+
|
|
106
|
+
# Check if we should respect xml:space attribute
|
|
107
|
+
#
|
|
108
|
+
# @param match_opts [Hash] Resolved match options
|
|
109
|
+
# @return [Boolean] true if xml:space should be respected
|
|
110
|
+
def respect_xml_space?(match_opts)
|
|
111
|
+
if match_opts.key?(:respect_xml_space)
|
|
112
|
+
match_opts[:respect_xml_space]
|
|
113
|
+
else
|
|
114
|
+
true
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
# Check if xml:space="preserve" is set
|
|
119
|
+
#
|
|
120
|
+
# @param element [Object] The element to check
|
|
121
|
+
# @return [Boolean] true if xml:space="preserve"
|
|
122
|
+
def xml_space_preserve?(element)
|
|
123
|
+
if element.is_a?(Canon::Xml::Nodes::ElementNode)
|
|
124
|
+
# Check attribute_nodes for xml:space attribute
|
|
125
|
+
# xml:space is stored with name="space" and namespace_uri="http://www.w3.org/XML/1998/namespace"
|
|
126
|
+
element.attribute_nodes.any? do |attr|
|
|
127
|
+
attr.name == "space" &&
|
|
128
|
+
attr.namespace_uri == "http://www.w3.org/XML/1998/namespace" &&
|
|
129
|
+
attr.value == "preserve"
|
|
130
|
+
end
|
|
131
|
+
elsif element.respond_to?(:[])
|
|
132
|
+
element["xml:space"] == "preserve"
|
|
133
|
+
else
|
|
134
|
+
false
|
|
135
|
+
end
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
# Check if xml:space="default" is set
|
|
139
|
+
#
|
|
140
|
+
# @param element [Object] The element to check
|
|
141
|
+
# @return [Boolean] true if xml:space="default"
|
|
142
|
+
def xml_space_default?(element)
|
|
143
|
+
if element.is_a?(Canon::Xml::Nodes::ElementNode)
|
|
144
|
+
# Check attribute_nodes for xml:space attribute
|
|
145
|
+
# xml:space is stored with name="space" and namespace_uri="http://www.w3.org/XML/1998/namespace"
|
|
146
|
+
element.attribute_nodes.any? do |attr|
|
|
147
|
+
attr.name == "space" &&
|
|
148
|
+
attr.namespace_uri == "http://www.w3.org/XML/1998/namespace" &&
|
|
149
|
+
attr.value == "default"
|
|
150
|
+
end
|
|
151
|
+
elsif element.respond_to?(:[])
|
|
152
|
+
element["xml:space"] == "default"
|
|
153
|
+
else
|
|
154
|
+
false
|
|
155
|
+
end
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
# Check sensitivity based on user configuration
|
|
159
|
+
#
|
|
160
|
+
# @param element [Object] The element to check
|
|
161
|
+
# @param match_opts [Hash] Resolved match options
|
|
162
|
+
# @return [Boolean] true if element is in whitelist
|
|
163
|
+
def user_config_sensitive?(element, match_opts)
|
|
164
|
+
return false unless match_opts[:whitespace_sensitive_elements]
|
|
165
|
+
|
|
166
|
+
match_opts[:whitespace_sensitive_elements].include?(element.name.to_sym)
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
# Check sensitivity based on user config + format defaults
|
|
170
|
+
#
|
|
171
|
+
# @param element [Object] The element to check
|
|
172
|
+
# @param match_opts [Hash] Resolved match options
|
|
173
|
+
# @return [Boolean] true if element should be sensitive
|
|
174
|
+
def configured_sensitive?(element, match_opts)
|
|
175
|
+
# Start with format defaults
|
|
176
|
+
sensitive = format_default_sensitive_elements(match_opts).to_set
|
|
177
|
+
|
|
178
|
+
# Apply whitelist (adds to defaults)
|
|
179
|
+
if match_opts[:whitespace_sensitive_elements]
|
|
180
|
+
sensitive |= match_opts[:whitespace_sensitive_elements]
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
# Apply blacklist (removes from everything)
|
|
184
|
+
if match_opts[:whitespace_insensitive_elements]
|
|
185
|
+
sensitive -= match_opts[:whitespace_insensitive_elements]
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
sensitive.include?(element.name.to_sym)
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
# Check if node has a parent that's an element (not document root)
|
|
192
|
+
#
|
|
193
|
+
# @param node [Object] The node to check
|
|
194
|
+
# @return [Boolean] true if node has an element parent
|
|
195
|
+
def text_node_parent?(node)
|
|
196
|
+
return false unless node.respond_to?(:parent)
|
|
197
|
+
return false unless node.parent
|
|
198
|
+
|
|
199
|
+
parent = node.parent
|
|
200
|
+
return true if parent.respond_to?(:element?) && parent.element?
|
|
201
|
+
|
|
202
|
+
# Nokogiri compatibility
|
|
203
|
+
parent.respond_to?(:node_type) && parent.node_type == :element
|
|
204
|
+
end
|
|
205
|
+
end
|
|
206
|
+
end
|
|
207
|
+
end
|
|
208
|
+
end
|
|
@@ -26,7 +26,8 @@ module Canon
|
|
|
26
26
|
# @param diff_children [Boolean] Whether to diff children
|
|
27
27
|
# @param differences [Array] Array to collect differences
|
|
28
28
|
# @return [Integer] Comparison result code
|
|
29
|
-
def compare(node1, node2, comparator, opts, child_opts,
|
|
29
|
+
def compare(node1, node2, comparator, opts, child_opts,
|
|
30
|
+
diff_children, differences)
|
|
30
31
|
children1 = comparator.send(:filter_children, node1.children, opts)
|
|
31
32
|
children2 = comparator.send(:filter_children, node2.children, opts)
|
|
32
33
|
|
|
@@ -51,7 +52,9 @@ module Canon
|
|
|
51
52
|
# method that returns symbols, and only works with element nodes.
|
|
52
53
|
def can_use_element_matcher?(children1, children2)
|
|
53
54
|
!children1.empty? && !children2.empty? &&
|
|
54
|
-
children1.all?
|
|
55
|
+
children1.all? do |c|
|
|
56
|
+
c.is_a?(Canon::Xml::Node) && c.node_type == :element
|
|
57
|
+
end &&
|
|
55
58
|
children2.all? { |c| c.is_a?(Canon::Xml::Node) && c.node_type == :element }
|
|
56
59
|
end
|
|
57
60
|
|
|
@@ -140,7 +143,8 @@ module Canon
|
|
|
140
143
|
opts, child_opts, diff_children, differences)
|
|
141
144
|
# Length check
|
|
142
145
|
unless children1.length == children2.length
|
|
143
|
-
dimension = determine_dimension_for_mismatch(children1,
|
|
146
|
+
dimension = determine_dimension_for_mismatch(children1,
|
|
147
|
+
children2, comparator)
|
|
144
148
|
comparator.send(:add_difference, parent_node, parent_node,
|
|
145
149
|
Comparison::MISSING_NODE, Comparison::MISSING_NODE,
|
|
146
150
|
dimension, opts, differences)
|
|
@@ -167,15 +171,19 @@ module Canon
|
|
|
167
171
|
(0...max_len).each do |i|
|
|
168
172
|
if i >= children1.length
|
|
169
173
|
# Extra child in children2
|
|
170
|
-
dimension = comparator.send(:determine_node_dimension,
|
|
174
|
+
dimension = comparator.send(:determine_node_dimension,
|
|
175
|
+
children2[i])
|
|
171
176
|
break
|
|
172
177
|
elsif i >= children2.length
|
|
173
178
|
# Extra child in children1
|
|
174
|
-
dimension = comparator.send(:determine_node_dimension,
|
|
179
|
+
dimension = comparator.send(:determine_node_dimension,
|
|
180
|
+
children1[i])
|
|
175
181
|
break
|
|
176
|
-
elsif !comparator.send(:same_node_type?, children1[i],
|
|
182
|
+
elsif !comparator.send(:same_node_type?, children1[i],
|
|
183
|
+
children2[i])
|
|
177
184
|
# Different node types at same position
|
|
178
|
-
dimension = comparator.send(:determine_node_dimension,
|
|
185
|
+
dimension = comparator.send(:determine_node_dimension,
|
|
186
|
+
children1[i])
|
|
179
187
|
break
|
|
180
188
|
end
|
|
181
189
|
end
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "set"
|
|
3
4
|
require_relative "../../diff/diff_node"
|
|
4
5
|
require_relative "../../diff/path_builder"
|
|
5
6
|
require_relative "../../diff/node_serializer"
|
|
@@ -62,6 +63,21 @@ module Canon
|
|
|
62
63
|
end
|
|
63
64
|
end
|
|
64
65
|
|
|
66
|
+
# For attribute presence differences, show what attributes differ
|
|
67
|
+
if dimension == :attribute_presence
|
|
68
|
+
attrs1 = extract_attributes(node1)
|
|
69
|
+
attrs2 = extract_attributes(node2)
|
|
70
|
+
return build_attribute_difference_reason(attrs1, attrs2)
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# For text content differences, show the actual text (truncated if needed)
|
|
74
|
+
if dimension == :text_content
|
|
75
|
+
text1 = extract_text_content(node1)
|
|
76
|
+
text2 = extract_text_content(node2)
|
|
77
|
+
return build_text_difference_reason(text1, text2)
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# Default reason
|
|
65
81
|
"#{diff1} vs #{diff2}"
|
|
66
82
|
end
|
|
67
83
|
|
|
@@ -110,6 +126,98 @@ module Canon
|
|
|
110
126
|
|
|
111
127
|
Canon::Diff::NodeSerializer.extract_attributes(node)
|
|
112
128
|
end
|
|
129
|
+
|
|
130
|
+
# Build a clear reason message for attribute presence differences
|
|
131
|
+
# Shows which attributes are only in node1, only in node2, or different values
|
|
132
|
+
#
|
|
133
|
+
# @param attrs1 [Hash, nil] First node's attributes
|
|
134
|
+
# @param attrs2 [Hash, nil] Second node's attributes
|
|
135
|
+
# @return [String] Clear explanation of the attribute difference
|
|
136
|
+
def self.build_attribute_difference_reason(attrs1, attrs2)
|
|
137
|
+
return "#{attrs1&.keys&.size || 0} vs #{attrs2&.keys&.size || 0} attributes" unless attrs1 && attrs2
|
|
138
|
+
|
|
139
|
+
keys1 = attrs1.keys.to_set
|
|
140
|
+
keys2 = attrs2.keys.to_set
|
|
141
|
+
|
|
142
|
+
only_in_1 = keys1 - keys2
|
|
143
|
+
only_in_2 = keys2 - keys1
|
|
144
|
+
common = keys1 & keys2
|
|
145
|
+
|
|
146
|
+
# Check if values differ for common keys
|
|
147
|
+
different_values = common.reject { |k| attrs1[k] == attrs2[k] }
|
|
148
|
+
|
|
149
|
+
parts = []
|
|
150
|
+
parts << "only in first: #{only_in_1.to_a.sort.join(', ')}" if only_in_1.any?
|
|
151
|
+
parts << "only in second: #{only_in_2.to_a.sort.join(', ')}" if only_in_2.any?
|
|
152
|
+
parts << "different values: #{different_values.sort.join(', ')}" if different_values.any?
|
|
153
|
+
|
|
154
|
+
if parts.empty?
|
|
155
|
+
"#{keys1.size} vs #{keys2.size} attributes (same names)"
|
|
156
|
+
else
|
|
157
|
+
parts.join("; ")
|
|
158
|
+
end
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
# Extract text content from a node
|
|
162
|
+
#
|
|
163
|
+
# @param node [Object, nil] Node to extract text from
|
|
164
|
+
# @return [String, nil] Text content or nil
|
|
165
|
+
def self.extract_text_content(node)
|
|
166
|
+
return nil if node.nil?
|
|
167
|
+
|
|
168
|
+
# For Canon::Xml::Nodes::TextNode
|
|
169
|
+
return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)
|
|
170
|
+
|
|
171
|
+
# For XML/HTML nodes with text_content method
|
|
172
|
+
return node.text_content if node.respond_to?(:text_content)
|
|
173
|
+
|
|
174
|
+
# For nodes with text method
|
|
175
|
+
return node.text if node.respond_to?(:text)
|
|
176
|
+
|
|
177
|
+
# For nodes with content method (Moxml::Text)
|
|
178
|
+
return node.content if node.respond_to?(:content)
|
|
179
|
+
|
|
180
|
+
# For nodes with value method (other types)
|
|
181
|
+
return node.value if node.respond_to?(:value)
|
|
182
|
+
|
|
183
|
+
# For simple text nodes or strings
|
|
184
|
+
return node.to_s if node.is_a?(String)
|
|
185
|
+
|
|
186
|
+
# For other node types, try to_s
|
|
187
|
+
node.to_s
|
|
188
|
+
rescue StandardError
|
|
189
|
+
nil
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
# Build a clear reason message for text content differences
|
|
193
|
+
# Shows the actual text content (truncated if too long)
|
|
194
|
+
#
|
|
195
|
+
# @param text1 [String, nil] First text content
|
|
196
|
+
# @param text2 [String, nil] Second text content
|
|
197
|
+
# @return [String] Clear explanation of the text difference
|
|
198
|
+
def self.build_text_difference_reason(text1, text2)
|
|
199
|
+
# Handle nil cases
|
|
200
|
+
return "missing vs '#{truncate(text2)}'" if text1.nil? && text2
|
|
201
|
+
return "'#{truncate(text1)}' vs missing" if text1 && text2.nil?
|
|
202
|
+
return "both missing" if text1.nil? && text2.nil?
|
|
203
|
+
|
|
204
|
+
# Both have content - show truncated versions
|
|
205
|
+
"'#{truncate(text1)}' vs '#{truncate(text2)}'"
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
# Truncate text for display in reason messages
|
|
209
|
+
#
|
|
210
|
+
# @param text [String] Text to truncate
|
|
211
|
+
# @param max_length [Integer] Maximum length
|
|
212
|
+
# @return [String] Truncated text
|
|
213
|
+
def self.truncate(text, max_length = 40)
|
|
214
|
+
return "" if text.nil?
|
|
215
|
+
|
|
216
|
+
text = text.to_s
|
|
217
|
+
return text if text.length <= max_length
|
|
218
|
+
|
|
219
|
+
"#{text[0...max_length]}..."
|
|
220
|
+
end
|
|
113
221
|
end
|
|
114
222
|
end
|
|
115
223
|
end
|
|
@@ -13,21 +13,24 @@ module Canon
|
|
|
13
13
|
#
|
|
14
14
|
# @param node [String, Object] Node to parse
|
|
15
15
|
# @param preprocessing [Symbol] Preprocessing mode (:none, :normalize, :c14n, :format)
|
|
16
|
+
# @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
|
|
16
17
|
# @return [Canon::Xml::Node] Parsed node
|
|
17
|
-
def self.parse(node, preprocessing = :none)
|
|
18
|
+
def self.parse(node, preprocessing = :none, preserve_whitespace: false)
|
|
18
19
|
# If already a Canon::Xml::Node, return as-is
|
|
19
20
|
return node if node.is_a?(Canon::Xml::Node)
|
|
20
21
|
|
|
21
22
|
# If it's a Nokogiri or Moxml node, convert to DataModel
|
|
22
23
|
unless node.is_a?(String)
|
|
23
|
-
return convert_from_node(node
|
|
24
|
+
return convert_from_node(node,
|
|
25
|
+
preserve_whitespace: preserve_whitespace)
|
|
24
26
|
end
|
|
25
27
|
|
|
26
28
|
# Apply preprocessing to XML string before parsing
|
|
27
29
|
xml_string = apply_preprocessing(node, preprocessing)
|
|
28
30
|
|
|
29
31
|
# Use Canon::Xml::DataModel for parsing to get Canon::Xml::Node instances
|
|
30
|
-
Canon::Xml::DataModel.from_xml(xml_string
|
|
32
|
+
Canon::Xml::DataModel.from_xml(xml_string,
|
|
33
|
+
preserve_whitespace: preserve_whitespace)
|
|
31
34
|
end
|
|
32
35
|
|
|
33
36
|
# Apply preprocessing transformation to XML string
|
|
@@ -55,8 +58,9 @@ module Canon
|
|
|
55
58
|
# Convert from Nokogiri/Moxml node to Canon::Xml::Node
|
|
56
59
|
#
|
|
57
60
|
# @param node [Object] Nokogiri or Moxml node
|
|
61
|
+
# @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
|
|
58
62
|
# @return [Canon::Xml::Node] Converted node
|
|
59
|
-
def self.convert_from_node(node)
|
|
63
|
+
def self.convert_from_node(node, preserve_whitespace: false)
|
|
60
64
|
# Convert to XML string then parse through DataModel
|
|
61
65
|
xml_str = if node.respond_to?(:to_xml)
|
|
62
66
|
node.to_xml
|
|
@@ -66,7 +70,8 @@ module Canon
|
|
|
66
70
|
raise Canon::Error,
|
|
67
71
|
"Unable to convert node to string: #{node.class}"
|
|
68
72
|
end
|
|
69
|
-
Canon::Xml::DataModel.from_xml(xml_str
|
|
73
|
+
Canon::Xml::DataModel.from_xml(xml_str,
|
|
74
|
+
preserve_whitespace: preserve_whitespace)
|
|
70
75
|
end
|
|
71
76
|
end
|
|
72
77
|
end
|
|
@@ -23,7 +23,8 @@ module Canon
|
|
|
23
23
|
# @param diff_children [Boolean] Whether to diff children
|
|
24
24
|
# @param differences [Array] Array to collect differences
|
|
25
25
|
# @return [Integer] Comparison result code
|
|
26
|
-
def compare(node1, node2, comparator, opts, child_opts,
|
|
26
|
+
def compare(node1, node2, comparator, opts, child_opts,
|
|
27
|
+
diff_children, differences)
|
|
27
28
|
# Dispatch based on node type
|
|
28
29
|
# Canon::Xml::Node types use .node_type method that returns symbols
|
|
29
30
|
# Nokogiri also has .node_type but returns integers, so check for Symbol
|
|
@@ -51,11 +52,14 @@ module Canon
|
|
|
51
52
|
comparator.send(:compare_element_nodes, node1, node2, opts, child_opts,
|
|
52
53
|
diff_children, differences)
|
|
53
54
|
when :text
|
|
54
|
-
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
55
|
+
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
56
|
+
differences)
|
|
55
57
|
when :comment
|
|
56
|
-
comparator.send(:compare_comment_nodes, node1, node2, opts,
|
|
58
|
+
comparator.send(:compare_comment_nodes, node1, node2, opts,
|
|
59
|
+
differences)
|
|
57
60
|
when :cdata
|
|
58
|
-
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
61
|
+
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
62
|
+
differences)
|
|
59
63
|
when :processing_instruction
|
|
60
64
|
comparator.send(:compare_processing_instruction_nodes, node1, node2, opts,
|
|
61
65
|
differences)
|
|
@@ -71,11 +75,14 @@ module Canon
|
|
|
71
75
|
comparator.send(:compare_element_nodes, node1, node2, opts, child_opts,
|
|
72
76
|
diff_children, differences)
|
|
73
77
|
elsif node1.respond_to?(:text?) && node1.text?
|
|
74
|
-
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
78
|
+
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
79
|
+
differences)
|
|
75
80
|
elsif node1.respond_to?(:comment?) && node1.comment?
|
|
76
|
-
comparator.send(:compare_comment_nodes, node1, node2, opts,
|
|
81
|
+
comparator.send(:compare_comment_nodes, node1, node2, opts,
|
|
82
|
+
differences)
|
|
77
83
|
elsif node1.respond_to?(:cdata?) && node1.cdata?
|
|
78
|
-
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
84
|
+
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
85
|
+
differences)
|
|
79
86
|
elsif node1.respond_to?(:processing_instruction?) &&
|
|
80
87
|
node1.processing_instruction?
|
|
81
88
|
comparator.send(:compare_processing_instruction_nodes, node1, node2, opts,
|