canon 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +31 -149
- data/README.adoc +9 -0
- data/docs/advanced/semantic-diff-report.adoc +31 -0
- data/docs/features/configuration-profiles.adoc +4 -2
- data/docs/features/match-options/html-policies.adoc +2 -0
- data/docs/features/match-options/index.adoc +40 -0
- data/docs/guides/choosing-configuration.adoc +12 -1
- data/docs/reference/cli-options.adoc +3 -0
- data/docs/reference/options-across-interfaces.adoc +7 -1
- data/docs/understanding/formats/html.adoc +9 -2
- data/lib/canon/cli.rb +4 -0
- data/lib/canon/commands/diff_command.rb +1 -0
- data/lib/canon/comparison/comparison_result.rb +79 -0
- data/lib/canon/comparison/html_comparator.rb +92 -11
- data/lib/canon/comparison/markup_comparator.rb +19 -0
- data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
- data/lib/canon/comparison/match_options.rb +23 -2
- data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +6 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
- data/lib/canon/comparison/xml_comparator.rb +80 -4
- data/lib/canon/comparison/xml_node_comparison.rb +29 -3
- data/lib/canon/comparison.rb +84 -22
- data/lib/canon/config/env_schema.rb +2 -1
- data/lib/canon/config/profiles/metanorma.yml +3 -0
- data/lib/canon/config.rb +51 -5
- data/lib/canon/diff/diff_classifier.rb +18 -2
- data/lib/canon/diff/diff_line_builder.rb +9 -8
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
- data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
- data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +65 -17
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +17 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
- data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
- data/lib/canon/diff_formatter.rb +57 -173
- data/lib/canon/html/data_model.rb +10 -4
- data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
- data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/c14n.rb +59 -5
- data/lib/canon/xml/element_matcher.rb +3 -0
- data/lib/canon/xml/node.rb +8 -1
- data/lib/canon/xml/nodes/comment_node.rb +4 -0
- data/lib/canon/xml/nodes/element_node.rb +4 -0
- data/lib/canon/xml/nodes/text_node.rb +4 -0
- data/lib/canon/xml/sax_builder.rb +11 -2
- data/lib/canon/xml/xpath_engine.rb +238 -0
- metadata +6 -2
|
@@ -13,6 +13,7 @@ require_relative "../diff/diff_classifier"
|
|
|
13
13
|
require_relative "strategies/match_strategy_factory"
|
|
14
14
|
require_relative "../html/data_model"
|
|
15
15
|
require_relative "xml_node_comparison"
|
|
16
|
+
require_relative "xml_comparator/diff_node_builder"
|
|
16
17
|
# Whitespace sensitivity module (single source of truth for sensitive elements)
|
|
17
18
|
require_relative "whitespace_sensitivity"
|
|
18
19
|
|
|
@@ -172,10 +173,42 @@ module Canon
|
|
|
172
173
|
# @param node2 [Object] Second node
|
|
173
174
|
# @return [Boolean] true if both are document fragments
|
|
174
175
|
def fragment_nodes?(node1, node2)
|
|
175
|
-
(node1
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
176
|
+
fragment_node?(node1) && fragment_node?(node2)
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
# Check if a single node is a recognised document fragment.
|
|
180
|
+
# All three Nokogiri fragment types (XML, HTML4, HTML5) must be
|
|
181
|
+
# accepted: dom_diff routes html/html4/html5 input through
|
|
182
|
+
# Nokogiri::HTML5.fragment per #118.
|
|
183
|
+
def fragment_node?(node)
|
|
184
|
+
node.is_a?(Nokogiri::XML::DocumentFragment) ||
|
|
185
|
+
node.is_a?(Nokogiri::HTML4::DocumentFragment) ||
|
|
186
|
+
node.is_a?(Nokogiri::HTML5::DocumentFragment)
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
# Record a DiffNode for a fragment-level child-count mismatch.
|
|
190
|
+
# Each surplus child becomes its own MISSING_NODE diff so the
|
|
191
|
+
# downstream report shows what was added or removed.
|
|
192
|
+
def record_fragment_length_mismatch(_node1, _node2, children1,
|
|
193
|
+
children2, differences)
|
|
194
|
+
longer, shorter, side = if children1.length > children2.length
|
|
195
|
+
[children1, children2, :removed]
|
|
196
|
+
else
|
|
197
|
+
[children2, children1, :added]
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
longer[shorter.length...].each do |orphan|
|
|
201
|
+
n1 = side == :removed ? orphan : nil
|
|
202
|
+
n2 = side == :removed ? nil : orphan
|
|
203
|
+
differences <<
|
|
204
|
+
Canon::Comparison::DiffNodeBuilder.build(
|
|
205
|
+
node1: n1,
|
|
206
|
+
node2: n2,
|
|
207
|
+
diff1: Comparison::MISSING_NODE,
|
|
208
|
+
diff2: Comparison::MISSING_NODE,
|
|
209
|
+
dimension: :element_structure,
|
|
210
|
+
)
|
|
211
|
+
end
|
|
179
212
|
end
|
|
180
213
|
|
|
181
214
|
# Compare children of document fragments
|
|
@@ -196,6 +229,13 @@ module Canon
|
|
|
196
229
|
children2 = XmlNodeComparison.filter_children(all_children2, opts)
|
|
197
230
|
|
|
198
231
|
if children1.length != children2.length
|
|
232
|
+
# Record the length mismatch as a DiffNode so verbose mode
|
|
233
|
+
# surfaces it. Without this, equivalent? wraps an empty
|
|
234
|
+
# differences array and incorrectly reports the inputs as
|
|
235
|
+
# equivalent.
|
|
236
|
+
record_fragment_length_mismatch(node1, node2,
|
|
237
|
+
children1, children2,
|
|
238
|
+
differences)
|
|
199
239
|
return Comparison::UNEQUAL_ELEMENTS
|
|
200
240
|
elsif children1.empty?
|
|
201
241
|
return Comparison::EQUIVALENT
|
|
@@ -291,10 +331,12 @@ module Canon
|
|
|
291
331
|
node.to_html
|
|
292
332
|
end
|
|
293
333
|
|
|
294
|
-
# Use XML fragment parser to preserve structure without auto-generated elements
|
|
295
|
-
#
|
|
296
|
-
#
|
|
297
|
-
frag = Nokogiri::XML.fragment(
|
|
334
|
+
# Use XML fragment parser to preserve structure without auto-generated elements.
|
|
335
|
+
# Decode HTML named entities (&nbsp; etc.) to UTF-8 characters since XML
|
|
336
|
+
# parser only understands the five XML entities.
|
|
337
|
+
frag = Nokogiri::XML.fragment(
|
|
338
|
+
decode_html_named_entities(html_string),
|
|
339
|
+
)
|
|
298
340
|
|
|
299
341
|
# Apply preprocessing if needed
|
|
300
342
|
if preprocessing == :rendered
|
|
@@ -448,8 +490,12 @@ module Canon
|
|
|
448
490
|
end
|
|
449
491
|
|
|
450
492
|
# Parse as Nokogiri fragment for DOM comparison
|
|
451
|
-
# Use XML fragment parser to avoid auto-inserted meta tags
|
|
452
|
-
|
|
493
|
+
# Use XML fragment parser to avoid auto-inserted meta tags.
|
|
494
|
+
# Decode HTML named entities (&nbsp; etc.) to UTF-8 characters since
|
|
495
|
+
# XML parser only understands the five XML entities.
|
|
496
|
+
frag = Nokogiri::XML.fragment(
|
|
497
|
+
decode_html_named_entities(html_string),
|
|
498
|
+
)
|
|
453
499
|
|
|
454
500
|
# Apply post-parsing filtering for :normalize, :format, and :rendered preprocessing
|
|
455
501
|
if %i[normalize format rendered].include?(preprocessing)
|
|
@@ -496,6 +542,33 @@ module Canon
|
|
|
496
542
|
|
|
497
543
|
# Detect HTML version from content
|
|
498
544
|
#
|
|
545
|
+
# Decode HTML named entities to their UTF-8 character equivalents.
|
|
546
|
+
# This is a targeted replacement that only changes entity references,
|
|
547
|
+
# preserving all tag structure. Needed because Nokogiri::XML.fragment
|
|
548
|
+
# only understands the five XML entities (&amp; &lt; &gt; &quot; &apos;).
|
|
549
|
+
#
|
|
550
|
+
# @param str [String] HTML string possibly containing named entities
|
|
551
|
+
# @return [String] String with named entities replaced by UTF-8 chars
|
|
552
|
+
def decode_html_named_entities(str)
|
|
553
|
+
return str unless str.include?("&")
|
|
554
|
+
|
|
555
|
+
str.gsub(/&nbsp;/i, "\u00A0")
|
|
556
|
+
.gsub(/&ensp;/i, "\u2002")
|
|
557
|
+
.gsub(/&emsp;/i, "\u2003")
|
|
558
|
+
.gsub(/&thinsp;/i, "\u2009")
|
|
559
|
+
.gsub(/&copy;/i, "\u00A9")
|
|
560
|
+
.gsub(/&reg;/i, "\u00AE")
|
|
561
|
+
.gsub(/&trade;/i, "\u2122")
|
|
562
|
+
.gsub(/&mdash;/i, "\u2014")
|
|
563
|
+
.gsub(/&ndash;/i, "\u2013")
|
|
564
|
+
.gsub(/&lsquo;/i, "\u2018")
|
|
565
|
+
.gsub(/&rsquo;/i, "\u2019")
|
|
566
|
+
.gsub(/&ldquo;/i, "\u201C")
|
|
567
|
+
.gsub(/&rdquo;/i, "\u201D")
|
|
568
|
+
.gsub(/&bull;/i, "\u2022")
|
|
569
|
+
.gsub(/&hellip;/i, "\u2026")
|
|
570
|
+
end
|
|
571
|
+
|
|
499
572
|
# @param content [String] HTML content
|
|
500
573
|
# @return [Symbol] :html5 or :html4
|
|
501
574
|
def detect_html_version(content)
|
|
@@ -721,8 +794,16 @@ compare_profile = nil)
|
|
|
721
794
|
parent = text_node.parent
|
|
722
795
|
next if ancestor_preserves_whitespace?(parent, preserve_whitespace)
|
|
723
796
|
|
|
797
|
+
content = text_node.content
|
|
798
|
+
|
|
799
|
+
# NBSP (U+00A0) is never insignificant — don't remove
|
|
800
|
+
next if content.include?("\u00A0")
|
|
801
|
+
|
|
802
|
+
# Whitespace between inline siblings is significant — don't remove
|
|
803
|
+
next if WhitespaceSensitivity.inline_whitespace_significant?(text_node)
|
|
804
|
+
|
|
724
805
|
# Remove if the text is only whitespace (after normalization)
|
|
725
|
-
if
|
|
806
|
+
if content.strip.empty?
|
|
726
807
|
text_node.remove
|
|
727
808
|
end
|
|
728
809
|
end
|
|
@@ -182,6 +182,25 @@ module Canon
|
|
|
182
182
|
return false unless text_node?(node) && node.parent
|
|
183
183
|
return false unless MatchOptions.normalize_text(node_text(node)).empty?
|
|
184
184
|
|
|
185
|
+
# NBSP (U+00A0) is never insignificant whitespace —
|
|
186
|
+
# it always renders as a visible non-breaking space.
|
|
187
|
+
# For HTML: always preserve NBSP nodes.
|
|
188
|
+
# For XML with whitespace_type: :strict: preserve NBSP nodes so
|
|
189
|
+
# different Unicode whitespace types remain distinguishable.
|
|
190
|
+
format = opts[:format] || match_opts[:format]
|
|
191
|
+
whitespace_type = match_opts[:whitespace_type] || :strict
|
|
192
|
+
if (%i[html html4
|
|
193
|
+
html5].include?(format) || whitespace_type == :strict) && WhitespaceSensitivity.contains_nbsp?(node_text(node))
|
|
194
|
+
return false
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
if %i[html html4
|
|
198
|
+
html5].include?(format) && WhitespaceSensitivity.inline_whitespace_significant?(node)
|
|
199
|
+
# Whitespace between inline element siblings is semantically
|
|
200
|
+
# significant (renders as a visible gap) and must not be stripped.
|
|
201
|
+
return false
|
|
202
|
+
end
|
|
203
|
+
|
|
185
204
|
return true unless WhitespaceSensitivity.whitespace_preserved?(
|
|
186
205
|
node.parent, match_opts
|
|
187
206
|
)
|
|
@@ -24,6 +24,7 @@ module Canon
|
|
|
24
24
|
attribute_values: :strict,
|
|
25
25
|
element_position: :ignore,
|
|
26
26
|
comments: :ignore,
|
|
27
|
+
whitespace_type: :strict,
|
|
27
28
|
},
|
|
28
29
|
xml: {
|
|
29
30
|
preprocessing: :none,
|
|
@@ -34,6 +35,7 @@ module Canon
|
|
|
34
35
|
attribute_values: :strict,
|
|
35
36
|
element_position: :strict,
|
|
36
37
|
comments: :strict,
|
|
38
|
+
whitespace_type: :strict,
|
|
37
39
|
},
|
|
38
40
|
}.freeze
|
|
39
41
|
|
|
@@ -51,6 +53,7 @@ module Canon
|
|
|
51
53
|
attribute_values: :strict,
|
|
52
54
|
element_position: :strict,
|
|
53
55
|
comments: :strict,
|
|
56
|
+
whitespace_type: :strict,
|
|
54
57
|
},
|
|
55
58
|
|
|
56
59
|
# Rendered: Match rendered output (HTML default)
|
|
@@ -64,6 +67,7 @@ module Canon
|
|
|
64
67
|
attribute_values: :strict,
|
|
65
68
|
element_position: :strict,
|
|
66
69
|
comments: :ignore,
|
|
70
|
+
whitespace_type: :strict,
|
|
67
71
|
},
|
|
68
72
|
|
|
69
73
|
# HTML4: Match HTML4 rendered output
|
|
@@ -77,6 +81,7 @@ module Canon
|
|
|
77
81
|
attribute_values: :normalize,
|
|
78
82
|
element_position: :ignore,
|
|
79
83
|
comments: :ignore,
|
|
84
|
+
whitespace_type: :strict,
|
|
80
85
|
},
|
|
81
86
|
|
|
82
87
|
# HTML5: Match HTML5 rendered output (same as rendered)
|
|
@@ -89,6 +94,7 @@ module Canon
|
|
|
89
94
|
attribute_values: :strict,
|
|
90
95
|
element_position: :ignore,
|
|
91
96
|
comments: :ignore,
|
|
97
|
+
whitespace_type: :strict,
|
|
92
98
|
},
|
|
93
99
|
|
|
94
100
|
# Spec-friendly: Formatting doesn't matter
|
|
@@ -102,6 +108,7 @@ module Canon
|
|
|
102
108
|
attribute_values: :normalize,
|
|
103
109
|
element_position: :ignore,
|
|
104
110
|
comments: :ignore,
|
|
111
|
+
whitespace_type: :strict,
|
|
105
112
|
},
|
|
106
113
|
|
|
107
114
|
# Content-only: Only content matters
|
|
@@ -114,6 +121,7 @@ module Canon
|
|
|
114
121
|
attribute_values: :normalize,
|
|
115
122
|
element_position: :ignore,
|
|
116
123
|
comments: :ignore,
|
|
124
|
+
whitespace_type: :strict,
|
|
117
125
|
},
|
|
118
126
|
}.freeze
|
|
119
127
|
|
|
@@ -69,13 +69,18 @@ module Canon
|
|
|
69
69
|
# @param text1 [String] First text
|
|
70
70
|
# @param text2 [String] Second text
|
|
71
71
|
# @param behavior [Symbol] Match behavior (:strict, :normalize, :ignore)
|
|
72
|
+
# @param whitespace_type [Symbol] Whitespace type handling (:strict, :normalize)
|
|
72
73
|
# @return [Boolean] true if texts match according to behavior
|
|
73
|
-
def match_text?(text1, text2, behavior)
|
|
74
|
+
def match_text?(text1, text2, behavior, whitespace_type: :strict)
|
|
74
75
|
case behavior
|
|
75
76
|
when :strict
|
|
76
77
|
text1 == text2
|
|
77
78
|
when :normalize
|
|
78
|
-
|
|
79
|
+
if whitespace_type == :normalize
|
|
80
|
+
normalize_text(text1) == normalize_text(text2)
|
|
81
|
+
else
|
|
82
|
+
normalize_text_preserving_type(text1) == normalize_text_preserving_type(text2)
|
|
83
|
+
end
|
|
79
84
|
when :ignore
|
|
80
85
|
true
|
|
81
86
|
else
|
|
@@ -101,6 +106,22 @@ module Canon
|
|
|
101
106
|
.strip # Remove leading/trailing whitespace
|
|
102
107
|
end
|
|
103
108
|
|
|
109
|
+
# Normalize text preserving Unicode whitespace type distinctions.
|
|
110
|
+
#
|
|
111
|
+
# Only ASCII whitespace (space, tab, newline, etc.) is collapsed.
|
|
112
|
+
# Unicode whitespace (NBSP, ideographic space, etc.) is preserved,
|
|
113
|
+
# so different whitespace types remain distinguishable.
|
|
114
|
+
#
|
|
115
|
+
# @param text [String] Text to normalize
|
|
116
|
+
# @return [String] Normalized text with preserved whitespace types
|
|
117
|
+
def normalize_text_preserving_type(text)
|
|
118
|
+
return "" if text.nil?
|
|
119
|
+
|
|
120
|
+
text.to_s
|
|
121
|
+
.gsub(/[ \t\r\n\f\v]+/, " ") # Collapse only ASCII whitespace
|
|
122
|
+
.strip
|
|
123
|
+
end
|
|
124
|
+
|
|
104
125
|
# Process attribute value according to match behavior
|
|
105
126
|
#
|
|
106
127
|
# @param value [String] Attribute value to process
|
|
@@ -50,6 +50,15 @@ module Canon
|
|
|
50
50
|
# HTML elements where every whitespace character is significant.
|
|
51
51
|
HTML_PRESERVE_ELEMENTS = %w[pre code textarea script style].freeze
|
|
52
52
|
|
|
53
|
+
# HTML inline elements — whitespace between these is semantically
|
|
54
|
+
# significant (renders as a visible space). Whitespace-only text
|
|
55
|
+
# nodes that sit between two inline siblings must not be stripped.
|
|
56
|
+
INLINE_ELEMENTS = %w[
|
|
57
|
+
a abbr acronym b bdo big br button cite code dfn em i img input kbd
|
|
58
|
+
label map object output q s samp select small span strong sub sup
|
|
59
|
+
time tt u var wbr
|
|
60
|
+
].freeze
|
|
61
|
+
|
|
53
62
|
class << self
|
|
54
63
|
# Classify the whitespace behaviour for an element using ancestor walk.
|
|
55
64
|
#
|
|
@@ -213,6 +222,69 @@ module Canon
|
|
|
213
222
|
.include?(element_name.to_sym)
|
|
214
223
|
end
|
|
215
224
|
|
|
225
|
+
# Check if a whitespace-only text node sits between two inline element
|
|
226
|
+
# siblings, making the whitespace semantically significant.
|
|
227
|
+
#
|
|
228
|
+
# In HTML rendering, a space between <span>A</span> <span>B</span>
|
|
229
|
+
# produces visible output. Stripping such nodes produces false
|
|
230
|
+
# equivalence.
|
|
231
|
+
#
|
|
232
|
+
# Works with any parent type (element, DocumentFragment, RootNode)
|
|
233
|
+
# since the check is about sibling context, not parent type.
|
|
234
|
+
#
|
|
235
|
+
# @param text_node [Object] Text node (Nokogiri or Canon::Xml::Node)
|
|
236
|
+
# @return [Boolean] true if whitespace is between inline siblings
|
|
237
|
+
def inline_whitespace_significant?(text_node)
|
|
238
|
+
return false unless text_node.respond_to?(:parent)
|
|
239
|
+
|
|
240
|
+
parent = text_node.parent
|
|
241
|
+
return false unless parent
|
|
242
|
+
return false unless parent.respond_to?(:children)
|
|
243
|
+
|
|
244
|
+
siblings = parent.children
|
|
245
|
+
idx = siblings.index(text_node)
|
|
246
|
+
return false unless idx
|
|
247
|
+
|
|
248
|
+
# Look at the IMMEDIATE non-whitespace-text neighbour on each
|
|
249
|
+
# side. Whitespace at a block boundary is collapsed per CSS,
|
|
250
|
+
# so both immediate neighbours must be inline for the
|
|
251
|
+
# whitespace to be significant. Walking all siblings (the
|
|
252
|
+
# earlier behaviour) misclassified whitespace at a block
|
|
253
|
+
# boundary as significant whenever any inline element existed
|
|
254
|
+
# elsewhere among the siblings.
|
|
255
|
+
prev_neighbour = nearest_non_whitespace_sibling(siblings, idx, -1)
|
|
256
|
+
next_neighbour = nearest_non_whitespace_sibling(siblings, idx, 1)
|
|
257
|
+
|
|
258
|
+
inline_element?(prev_neighbour) && inline_element?(next_neighbour)
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
# Walk outward from +idx+ in +direction+ (+1 forward, -1 back),
|
|
262
|
+
# skipping whitespace-only text nodes, and return the first
|
|
263
|
+
# non-whitespace sibling found. Returns nil if none.
|
|
264
|
+
def nearest_non_whitespace_sibling(siblings, idx, direction)
|
|
265
|
+
i = idx + direction
|
|
266
|
+
while i >= 0 && i < siblings.length
|
|
267
|
+
s = siblings[i]
|
|
268
|
+
unless s.respond_to?(:text?) && s.text? &&
|
|
269
|
+
s.respond_to?(:content) && s.content.to_s.strip.empty?
|
|
270
|
+
return s
|
|
271
|
+
end
|
|
272
|
+
|
|
273
|
+
i += direction
|
|
274
|
+
end
|
|
275
|
+
nil
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
# Check if text content contains a non-breaking space (U+00A0).
|
|
279
|
+
# NBSP is NOT collapsible whitespace in HTML — it always renders as
|
|
280
|
+
# a visible space and must never be stripped.
|
|
281
|
+
#
|
|
282
|
+
# @param text [String] Text content to check
|
|
283
|
+
# @return [Boolean] true if text contains U+00A0
|
|
284
|
+
def contains_nbsp?(text)
|
|
285
|
+
text.to_s.include?("\u00A0")
|
|
286
|
+
end
|
|
287
|
+
|
|
216
288
|
private
|
|
217
289
|
|
|
218
290
|
# Build the Set of preserve whitespace element names (strings).
|
|
@@ -336,6 +408,30 @@ module Canon
|
|
|
336
408
|
# Nokogiri compatibility
|
|
337
409
|
parent.respond_to?(:node_type) && parent.node_type == :element
|
|
338
410
|
end
|
|
411
|
+
|
|
412
|
+
# Get the parent element of a text node, or nil.
|
|
413
|
+
# Works with both Nokogiri and Canon::Xml::Node types.
|
|
414
|
+
def parent_element_of(text_node)
|
|
415
|
+
return nil unless text_node.respond_to?(:parent)
|
|
416
|
+
|
|
417
|
+
parent = text_node.parent
|
|
418
|
+
return nil unless parent
|
|
419
|
+
|
|
420
|
+
if parent.is_a?(Canon::Xml::Nodes::ElementNode)
|
|
421
|
+
parent
|
|
422
|
+
elsif parent.respond_to?(:element?) && parent.element?
|
|
423
|
+
parent
|
|
424
|
+
elsif parent.respond_to?(:node_type) && parent.node_type == :element
|
|
425
|
+
parent
|
|
426
|
+
end
|
|
427
|
+
end
|
|
428
|
+
|
|
429
|
+
# Check if a node is an HTML inline element.
|
|
430
|
+
def inline_element?(node)
|
|
431
|
+
return false unless node.respond_to?(:name)
|
|
432
|
+
|
|
433
|
+
INLINE_ELEMENTS.include?(node.name.to_s.downcase)
|
|
434
|
+
end
|
|
339
435
|
end
|
|
340
436
|
end
|
|
341
437
|
end
|
|
@@ -28,6 +28,9 @@ module Canon
|
|
|
28
28
|
# @return [Integer] Comparison result code
|
|
29
29
|
def compare(node1, node2, comparator, opts, child_opts,
|
|
30
30
|
diff_children, differences)
|
|
31
|
+
# FAST PATH: Object identity - same object means equivalent children
|
|
32
|
+
return Comparison::EQUIVALENT if node1.equal?(node2)
|
|
33
|
+
|
|
31
34
|
# Apply side-specific pretty-print heuristic when either flag is set:
|
|
32
35
|
# pretty_printed_expected → drop \n-starting whitespace nodes from node1
|
|
33
36
|
# pretty_printed_received → drop \n-starting whitespace nodes from node2
|
|
@@ -43,6 +46,9 @@ diff_children, differences)
|
|
|
43
46
|
# Quick check: if both have no children, they're equivalent
|
|
44
47
|
return Comparison::EQUIVALENT if children1.empty? && children2.empty?
|
|
45
48
|
|
|
49
|
+
# FAST PATH: Identical children arrays mean equivalent subtrees
|
|
50
|
+
return Comparison::EQUIVALENT if children1.equal?(children2)
|
|
51
|
+
|
|
46
52
|
# Check if we can use ElementMatcher (requires Canon::Xml::DataModel nodes)
|
|
47
53
|
if can_use_element_matcher?(children1, children2)
|
|
48
54
|
use_element_matcher_comparison(children1, children2, node1, comparator,
|
|
@@ -14,15 +14,18 @@ module Canon
|
|
|
14
14
|
# @param node [String, Object] Node to parse
|
|
15
15
|
# @param preprocessing [Symbol] Preprocessing mode (:none, :normalize, :c14n, :format)
|
|
16
16
|
# @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
|
|
17
|
+
# @param parser [Symbol] Parser backend (:sax or :dom, default from config)
|
|
17
18
|
# @return [Canon::Xml::Node] Parsed node
|
|
18
|
-
def self.parse(node, preprocessing = :none, preserve_whitespace: false
|
|
19
|
+
def self.parse(node, preprocessing = :none, preserve_whitespace: false,
|
|
20
|
+
parser: nil)
|
|
19
21
|
# If already a Canon::Xml::Node, return as-is
|
|
20
22
|
return node if node.is_a?(Canon::Xml::Node)
|
|
21
23
|
|
|
22
24
|
# If it's a Nokogiri or Moxml node, convert to DataModel
|
|
23
25
|
unless node.is_a?(String)
|
|
24
26
|
return convert_from_node(node,
|
|
25
|
-
preserve_whitespace: preserve_whitespace
|
|
27
|
+
preserve_whitespace: preserve_whitespace,
|
|
28
|
+
parser: parser)
|
|
26
29
|
end
|
|
27
30
|
|
|
28
31
|
# Normalize encoding before preprocessing (UTF-16 strings can't use strip, etc.)
|
|
@@ -31,9 +34,17 @@ module Canon
|
|
|
31
34
|
# Apply preprocessing to XML string before parsing
|
|
32
35
|
xml_string = apply_preprocessing(node, preprocessing).strip
|
|
33
36
|
|
|
34
|
-
#
|
|
35
|
-
|
|
37
|
+
# Select parser backend
|
|
38
|
+
resolved_parser = parser || resolve_parser_config
|
|
39
|
+
|
|
40
|
+
if resolved_parser == :sax
|
|
41
|
+
require_relative "../../xml/sax_builder"
|
|
42
|
+
Canon::Xml::SaxBuilder.parse(xml_string,
|
|
36
43
|
preserve_whitespace: preserve_whitespace)
|
|
44
|
+
else
|
|
45
|
+
Canon::Xml::DataModel.from_xml(xml_string,
|
|
46
|
+
preserve_whitespace: preserve_whitespace)
|
|
47
|
+
end
|
|
37
48
|
end
|
|
38
49
|
|
|
39
50
|
# Apply preprocessing transformation to XML string
|
|
@@ -62,9 +73,18 @@ module Canon
|
|
|
62
73
|
#
|
|
63
74
|
# @param node [Object] Nokogiri or Moxml node
|
|
64
75
|
# @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
|
|
76
|
+
# @param parser [Symbol, nil] Parser backend override
|
|
65
77
|
# @return [Canon::Xml::Node] Converted node
|
|
66
|
-
def self.convert_from_node(node, preserve_whitespace: false
|
|
67
|
-
|
|
78
|
+
def self.convert_from_node(node, preserve_whitespace: false,
|
|
79
|
+
parser: nil)
|
|
80
|
+
# FAST PATH: Convert Nokogiri/Moxml nodes directly without string round-trip
|
|
81
|
+
if defined?(Nokogiri::XML::Node) && node.is_a?(Nokogiri::XML::Node)
|
|
82
|
+
return Canon::Xml::DataModel.build_from_nokogiri(
|
|
83
|
+
node, preserve_whitespace: preserve_whitespace
|
|
84
|
+
)
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# SLOW PATH: Fallback to string serialization for unknown node types
|
|
68
88
|
xml_str = if node.respond_to?(:to_xml)
|
|
69
89
|
node.to_xml
|
|
70
90
|
elsif node.respond_to?(:to_s)
|
|
@@ -73,8 +93,26 @@ module Canon
|
|
|
73
93
|
raise Canon::Error,
|
|
74
94
|
"Unable to convert node to string: #{node.class}"
|
|
75
95
|
end
|
|
76
|
-
|
|
96
|
+
|
|
97
|
+
resolved_parser = parser || resolve_parser_config
|
|
98
|
+
|
|
99
|
+
if resolved_parser == :sax
|
|
100
|
+
require_relative "../../xml/sax_builder"
|
|
101
|
+
Canon::Xml::SaxBuilder.parse(xml_str,
|
|
77
102
|
preserve_whitespace: preserve_whitespace)
|
|
103
|
+
else
|
|
104
|
+
Canon::Xml::DataModel.from_xml(xml_str,
|
|
105
|
+
preserve_whitespace: preserve_whitespace)
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
# Resolve parser config from global config
|
|
110
|
+
#
|
|
111
|
+
# @return [Symbol] :sax or :dom
|
|
112
|
+
def self.resolve_parser_config
|
|
113
|
+
Canon::Config.instance.xml.diff.parser
|
|
114
|
+
rescue StandardError
|
|
115
|
+
:sax
|
|
78
116
|
end
|
|
79
117
|
end
|
|
80
118
|
end
|
|
@@ -63,6 +63,18 @@ module Canon
|
|
|
63
63
|
# @return [Boolean, Array] true if equivalent, or array of diffs if
|
|
64
64
|
# verbose
|
|
65
65
|
def equivalent?(n1, n2, opts = {}, child_opts = {})
|
|
66
|
+
# FAST PATH: Object identity - same object is always equivalent
|
|
67
|
+
# Skip when semantic_diff is requested (caller needs tree diff metadata)
|
|
68
|
+
if n1.equal?(n2) && !opts.dig(:match, :semantic_diff)
|
|
69
|
+
return build_trivial_equivalent_result(n1, n2, opts)
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
# FAST PATH: String content equality - identical strings are equivalent
|
|
73
|
+
# Skip in verbose mode since caller may need full metadata (e.g. tree_diff statistics)
|
|
74
|
+
if !opts[:verbose] && n1.is_a?(String) && n2.is_a?(String) && n1 == n2
|
|
75
|
+
return true
|
|
76
|
+
end
|
|
77
|
+
|
|
66
78
|
opts = DEFAULT_OPTS.merge(opts)
|
|
67
79
|
|
|
68
80
|
# Resolve match options with format-specific defaults
|
|
@@ -92,8 +104,15 @@ module Canon
|
|
|
92
104
|
# Create child_opts with resolved options
|
|
93
105
|
child_opts = opts.merge(child_opts)
|
|
94
106
|
|
|
95
|
-
# Determine if we should preserve whitespace during parsing
|
|
96
|
-
#
|
|
107
|
+
# Determine if we should preserve whitespace during parsing.
|
|
108
|
+
# Only structural_whitespace: :strict forces whitespace-only text
|
|
109
|
+
# nodes to survive parsing. whitespace_type is about distinguishing
|
|
110
|
+
# Unicode whitespace *types* in surviving text-node content, and
|
|
111
|
+
# does NOT require indent text nodes to be kept — libxml's NOBLANKS
|
|
112
|
+
# only strips pure-ASCII whitespace-only nodes, so NBSP-only nodes
|
|
113
|
+
# survive regardless. Coupling whitespace_type: :strict to
|
|
114
|
+
# parsing-time preservation made pretty-printed fixtures produce
|
|
115
|
+
# spurious element-position diffs (issue #112).
|
|
97
116
|
preserve_whitespace = match_opts_hash[:structural_whitespace] == :strict
|
|
98
117
|
|
|
99
118
|
# Parse nodes if they are strings, applying preprocessing if needed
|
|
@@ -218,8 +237,57 @@ module Canon
|
|
|
218
237
|
preserve_whitespace: preserve_whitespace)
|
|
219
238
|
end
|
|
220
239
|
|
|
240
|
+
# Build result for trivially equivalent inputs (same object or identical strings)
|
|
241
|
+
#
|
|
242
|
+
# Returns plain `true` in non-verbose mode, or a ComparisonResult in verbose mode.
|
|
243
|
+
#
|
|
244
|
+
# @param n1 [Object] First input
|
|
245
|
+
# @param n2 [Object] Second input
|
|
246
|
+
# @param opts [Hash] Raw options (before merge with DEFAULT_OPTS)
|
|
247
|
+
# @return [Boolean, ComparisonResult]
|
|
248
|
+
def build_trivial_equivalent_result(n1, n2, opts)
|
|
249
|
+
return true unless opts[:verbose]
|
|
250
|
+
|
|
251
|
+
# Parse nodes for verbose display
|
|
252
|
+
preserve_whitespace = true
|
|
253
|
+
node1 = parse_node(n1, :none,
|
|
254
|
+
preserve_whitespace: preserve_whitespace)
|
|
255
|
+
node2 = parse_node(n2, :none,
|
|
256
|
+
preserve_whitespace: preserve_whitespace)
|
|
257
|
+
preprocessed = [
|
|
258
|
+
serialize_node(node1).gsub("><", ">\n<"),
|
|
259
|
+
serialize_node(node2).gsub("><", ">\n<"),
|
|
260
|
+
]
|
|
261
|
+
original1 = if n1.is_a?(String)
|
|
262
|
+
n1
|
|
263
|
+
elsif n1.respond_to?(:to_xml)
|
|
264
|
+
n1.to_xml
|
|
265
|
+
else
|
|
266
|
+
n1.to_s
|
|
267
|
+
end
|
|
268
|
+
original2 = if n2.is_a?(String)
|
|
269
|
+
n2
|
|
270
|
+
elsif n2.respond_to?(:to_xml)
|
|
271
|
+
n2.to_xml
|
|
272
|
+
else
|
|
273
|
+
n2.to_s
|
|
274
|
+
end
|
|
275
|
+
|
|
276
|
+
ComparisonResult.new(
|
|
277
|
+
differences: [],
|
|
278
|
+
preprocessed_strings: preprocessed,
|
|
279
|
+
original_strings: [original1, original2],
|
|
280
|
+
format: :xml,
|
|
281
|
+
match_options: {},
|
|
282
|
+
algorithm: :dom,
|
|
283
|
+
)
|
|
284
|
+
end
|
|
285
|
+
|
|
221
286
|
# Main comparison dispatcher
|
|
222
287
|
def compare_nodes(n1, n2, opts, child_opts, diff_children, differences)
|
|
288
|
+
# FAST PATH: Object identity - same object is always equivalent
|
|
289
|
+
return Comparison::EQUIVALENT if n1.equal?(n2)
|
|
290
|
+
|
|
223
291
|
# Handle DocumentFragment nodes - compare their children instead
|
|
224
292
|
if n1.is_a?(Nokogiri::XML::DocumentFragment) &&
|
|
225
293
|
n2.is_a?(Nokogiri::XML::DocumentFragment)
|
|
@@ -380,8 +448,10 @@ module Canon
|
|
|
380
448
|
raw_differs = text1 != text2
|
|
381
449
|
|
|
382
450
|
# Check if matches according to behavior
|
|
451
|
+
whitespace_type = match_opts[:whitespace_type] || :strict
|
|
383
452
|
matches_per_behavior = MatchOptions.match_text?(text1, text2,
|
|
384
|
-
behavior
|
|
453
|
+
behavior,
|
|
454
|
+
whitespace_type: whitespace_type)
|
|
385
455
|
|
|
386
456
|
# Determine the correct dimension for this difference
|
|
387
457
|
# - If text_content is :strict, ALL differences use :text_content dimension
|
|
@@ -599,7 +669,13 @@ differences)
|
|
|
599
669
|
end
|
|
600
670
|
return "element '#{node.name}'#{ns_info}: #{diff1} vs #{diff2}"
|
|
601
671
|
elsif node.respond_to?(:name) && !node.respond_to?(:namespace_uri)
|
|
602
|
-
|
|
672
|
+
# TextNode and other nodes without namespace_uri
|
|
673
|
+
display = if node.respond_to?(:value) && node.node_type == :text
|
|
674
|
+
"\"#{truncate_text(node.value)}\""
|
|
675
|
+
else
|
|
676
|
+
node.name.to_s
|
|
677
|
+
end
|
|
678
|
+
return "element missing: #{display}"
|
|
603
679
|
end
|
|
604
680
|
end
|
|
605
681
|
|