canon 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +31 -149
- data/README.adoc +9 -0
- data/docs/advanced/semantic-diff-report.adoc +96 -0
- data/docs/features/configuration-profiles.adoc +4 -2
- data/docs/features/diff-formatting/index.adoc +3 -0
- data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
- data/docs/features/match-options/html-policies.adoc +2 -0
- data/docs/features/match-options/index.adoc +40 -0
- data/docs/guides/choosing-configuration.adoc +12 -1
- data/docs/reference/cli-options.adoc +3 -0
- data/docs/reference/environment-variables.adoc +3 -1
- data/docs/reference/options-across-interfaces.adoc +7 -1
- data/docs/understanding/formats/html.adoc +9 -2
- data/lib/canon/cli.rb +4 -0
- data/lib/canon/commands/diff_command.rb +1 -0
- data/lib/canon/comparison/comparison_result.rb +95 -2
- data/lib/canon/comparison/html_comparator.rb +96 -11
- data/lib/canon/comparison/markup_comparator.rb +68 -71
- data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
- data/lib/canon/comparison/match_options.rb +23 -2
- data/lib/canon/comparison/node_inspector.rb +103 -0
- data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +133 -55
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
- data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
- data/lib/canon/comparison/xml_comparator.rb +174 -7
- data/lib/canon/comparison/xml_node_comparison.rb +48 -66
- data/lib/canon/comparison.rb +143 -22
- data/lib/canon/config/env_schema.rb +2 -1
- data/lib/canon/config/profiles/metanorma.yml +3 -0
- data/lib/canon/config.rb +51 -5
- data/lib/canon/diff/diff_classifier.rb +55 -41
- data/lib/canon/diff/diff_line_builder.rb +9 -8
- data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
- data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
- data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +184 -26
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +92 -4
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
- data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
- data/lib/canon/diff_formatter.rb +128 -175
- data/lib/canon/html/data_model.rb +10 -4
- data/lib/canon/pretty_printer/html.rb +76 -14
- data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
- data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
- data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
- data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/c14n.rb +59 -5
- data/lib/canon/xml/data_model.rb +13 -1
- data/lib/canon/xml/element_matcher.rb +3 -0
- data/lib/canon/xml/node.rb +23 -1
- data/lib/canon/xml/nodes/comment_node.rb +4 -0
- data/lib/canon/xml/nodes/element_node.rb +4 -0
- data/lib/canon/xml/nodes/text_node.rb +4 -0
- data/lib/canon/xml/sax_builder.rb +29 -2
- data/lib/canon/xml/xpath_engine.rb +238 -0
- metadata +9 -2
|
@@ -63,6 +63,18 @@ module Canon
|
|
|
63
63
|
# @return [Boolean, Array] true if equivalent, or array of diffs if
|
|
64
64
|
# verbose
|
|
65
65
|
def equivalent?(n1, n2, opts = {}, child_opts = {})
|
|
66
|
+
# FAST PATH: Object identity - same object is always equivalent
|
|
67
|
+
# Skip when semantic_diff is requested (caller needs tree diff metadata)
|
|
68
|
+
if n1.equal?(n2) && !opts.dig(:match, :semantic_diff)
|
|
69
|
+
return build_trivial_equivalent_result(n1, n2, opts)
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
# FAST PATH: String content equality - identical strings are equivalent
|
|
73
|
+
# Skip in verbose mode since caller may need full metadata (e.g. tree_diff statistics)
|
|
74
|
+
if !opts[:verbose] && n1.is_a?(String) && n2.is_a?(String) && n1 == n2
|
|
75
|
+
return true
|
|
76
|
+
end
|
|
77
|
+
|
|
66
78
|
opts = DEFAULT_OPTS.merge(opts)
|
|
67
79
|
|
|
68
80
|
# Resolve match options with format-specific defaults
|
|
@@ -92,8 +104,15 @@ module Canon
|
|
|
92
104
|
# Create child_opts with resolved options
|
|
93
105
|
child_opts = opts.merge(child_opts)
|
|
94
106
|
|
|
95
|
-
# Determine if we should preserve whitespace during parsing
|
|
96
|
-
#
|
|
107
|
+
# Determine if we should preserve whitespace during parsing.
|
|
108
|
+
# Only structural_whitespace: :strict forces whitespace-only text
|
|
109
|
+
# nodes to survive parsing. whitespace_type is about distinguishing
|
|
110
|
+
# Unicode whitespace *types* in surviving text-node content, and
|
|
111
|
+
# does NOT require indent text nodes to be kept — libxml's NOBLANKS
|
|
112
|
+
# only strips pure-ASCII whitespace-only nodes, so NBSP-only nodes
|
|
113
|
+
# survive regardless. Coupling whitespace_type: :strict to
|
|
114
|
+
# parsing-time preservation made pretty-printed fixtures produce
|
|
115
|
+
# spurious element-position diffs (issue #112).
|
|
97
116
|
preserve_whitespace = match_opts_hash[:structural_whitespace] == :strict
|
|
98
117
|
|
|
99
118
|
# Parse nodes if they are strings, applying preprocessing if needed
|
|
@@ -141,6 +160,8 @@ module Canon
|
|
|
141
160
|
format: :xml,
|
|
142
161
|
match_options: match_opts_hash,
|
|
143
162
|
algorithm: :dom,
|
|
163
|
+
parse_errors_expected: Comparison.parse_errors_for(node1),
|
|
164
|
+
parse_errors_received: Comparison.parse_errors_for(node2),
|
|
144
165
|
)
|
|
145
166
|
elsif result != Comparison::EQUIVALENT && !differences.empty?
|
|
146
167
|
# Non-verbose mode: check equivalence
|
|
@@ -203,6 +224,8 @@ module Canon
|
|
|
203
224
|
format: :xml,
|
|
204
225
|
match_options: match_opts_hash.merge(strategy.metadata),
|
|
205
226
|
algorithm: :semantic,
|
|
227
|
+
parse_errors_expected: Comparison.parse_errors_for(node1),
|
|
228
|
+
parse_errors_received: Comparison.parse_errors_for(node2),
|
|
206
229
|
)
|
|
207
230
|
else
|
|
208
231
|
# Simple boolean result - equivalent if no normative differences
|
|
@@ -218,8 +241,59 @@ module Canon
|
|
|
218
241
|
preserve_whitespace: preserve_whitespace)
|
|
219
242
|
end
|
|
220
243
|
|
|
244
|
+
# Build the result for inputs that are already known to be equivalent.
#
# The caller visible in this file is the object-identity fast path of
# +equivalent?+ (i.e. +n1.equal?(n2)+).
#
# In non-verbose mode nothing is parsed and plain +true+ is returned.
# In verbose mode both inputs are parsed (preprocessing +:none+,
# whitespace preserved) so that the returned ComparisonResult carries
# displayable strings alongside an empty +differences+ list.
#
# @param n1 [Object] First input
# @param n2 [Object] Second input
# @param opts [Hash] Raw options (before merge with DEFAULT_OPTS);
#   only +:verbose+ is read here
# @return [Boolean, ComparisonResult] +true+ when not verbose, otherwise
#   a ComparisonResult with no differences
def build_trivial_equivalent_result(n1, n2, opts)
  return true unless opts[:verbose]

  inputs = [n1, n2]

  # Parse for display only; no comparison is performed on this path.
  parsed = inputs.map do |input|
    parse_node(input, :none, preserve_whitespace: true)
  end

  # Put adjacent tags on separate lines so the serialized form is
  # line-oriented.
  preprocessed = parsed.map do |node|
    serialize_node(node).gsub("><", ">\n<")
  end

  # Original text: the string itself, else +to_xml+ when available,
  # else +to_s+.
  originals = inputs.map do |input|
    if input.is_a?(String)
      input
    elsif input.respond_to?(:to_xml)
      input.to_xml
    else
      input.to_s
    end
  end

  ComparisonResult.new(
    differences: [],
    preprocessed_strings: preprocessed,
    original_strings: originals,
    format: :xml,
    match_options: {},
    algorithm: :dom,
  )
end
|
|
289
|
+
|
|
290
|
+
public
|
|
291
|
+
|
|
221
292
|
# Main comparison dispatcher
|
|
222
293
|
def compare_nodes(n1, n2, opts, child_opts, diff_children, differences)
|
|
294
|
+
# FAST PATH: Object identity - same object is always equivalent
|
|
295
|
+
return Comparison::EQUIVALENT if n1.equal?(n2)
|
|
296
|
+
|
|
223
297
|
# Handle DocumentFragment nodes - compare their children instead
|
|
224
298
|
if n1.is_a?(Nokogiri::XML::DocumentFragment) &&
|
|
225
299
|
n2.is_a?(Nokogiri::XML::DocumentFragment)
|
|
@@ -305,7 +379,6 @@ module Canon
|
|
|
305
379
|
end
|
|
306
380
|
|
|
307
381
|
# Public comparison methods - exposed for XmlNodeComparison module
|
|
308
|
-
public
|
|
309
382
|
|
|
310
383
|
# Compare two element nodes
|
|
311
384
|
def compare_element_nodes(n1, n2, opts, child_opts, diff_children,
|
|
@@ -380,8 +453,10 @@ module Canon
|
|
|
380
453
|
raw_differs = text1 != text2
|
|
381
454
|
|
|
382
455
|
# Check if matches according to behavior
|
|
456
|
+
whitespace_type = match_opts[:whitespace_type] || :strict
|
|
383
457
|
matches_per_behavior = MatchOptions.match_text?(text1, text2,
|
|
384
|
-
behavior
|
|
458
|
+
behavior,
|
|
459
|
+
whitespace_type: whitespace_type)
|
|
385
460
|
|
|
386
461
|
# Determine the correct dimension for this difference
|
|
387
462
|
# - If text_content is :strict, ALL differences use :text_content dimension
|
|
@@ -597,9 +672,16 @@ differences)
|
|
|
597
672
|
else
|
|
598
673
|
" (namespace: #{ns})"
|
|
599
674
|
end
|
|
600
|
-
|
|
675
|
+
label = Canon::Comparison.code_pair_label(diff1, diff2)
|
|
676
|
+
return "element '#{node.name}'#{ns_info}: #{label}"
|
|
601
677
|
elsif node.respond_to?(:name) && !node.respond_to?(:namespace_uri)
|
|
602
|
-
|
|
678
|
+
# TextNode and other nodes without namespace_uri
|
|
679
|
+
display = if node.respond_to?(:value) && node.node_type == :text
|
|
680
|
+
"\"#{truncate_text(node.value)}\""
|
|
681
|
+
else
|
|
682
|
+
node.name.to_s
|
|
683
|
+
end
|
|
684
|
+
return "element missing: #{display}"
|
|
603
685
|
end
|
|
604
686
|
end
|
|
605
687
|
|
|
@@ -617,6 +699,10 @@ differences)
|
|
|
617
699
|
return build_text_diff_reason(text1, text2)
|
|
618
700
|
end
|
|
619
701
|
|
|
702
|
+
if dimension == :whitespace_adjacency
|
|
703
|
+
return build_whitespace_adjacency_reason(node1, node2)
|
|
704
|
+
end
|
|
705
|
+
|
|
620
706
|
# For attribute values differences, show the actual values
|
|
621
707
|
if dimension == :attribute_values
|
|
622
708
|
attrs1 = extract_attributes(node1)
|
|
@@ -633,8 +719,17 @@ differences)
|
|
|
633
719
|
|
|
634
720
|
if diff1 == Canon::Comparison::MISSING_NODE && diff2 == Canon::Comparison::MISSING_NODE
|
|
635
721
|
"element structure mismatch (children differ)"
|
|
722
|
+
elsif dimension == :element_structure &&
|
|
723
|
+
diff1 == Canon::Comparison::UNEQUAL_ELEMENTS &&
|
|
724
|
+
diff2 == Canon::Comparison::UNEQUAL_ELEMENTS &&
|
|
725
|
+
(node1.is_a?(Canon::Xml::Node) || node1.is_a?(Nokogiri::XML::Node)) &&
|
|
726
|
+
(node2.is_a?(Canon::Xml::Node) || node2.is_a?(Nokogiri::XML::Node)) &&
|
|
727
|
+
node1.name && node2.name && node1.name != node2.name
|
|
728
|
+
# Most common case: differing element names. Surface the
|
|
729
|
+
# actual names rather than a generic "elements differ".
|
|
730
|
+
"different element name (<#{node1.name}> vs <#{node2.name}>)"
|
|
636
731
|
else
|
|
637
|
-
|
|
732
|
+
Canon::Comparison.code_pair_label(diff1, diff2)
|
|
638
733
|
end
|
|
639
734
|
end
|
|
640
735
|
|
|
@@ -748,6 +843,78 @@ differences)
|
|
|
748
843
|
"Text: \"#{vis1}\" vs \"#{vis2}\""
|
|
749
844
|
end
|
|
750
845
|
|
|
846
|
+
# Build the Reason line for a +:whitespace_adjacency+ diff (#137).
#
# Names which side carries the whitespace-only text node, where that
# node sits relative to its content-bearing siblings, and renders the
# whitespace with visible markers. When the two nodes are not a
# "whitespace-only vs. not whitespace-only" pair, the generic text
# reason from +build_text_diff_reason+ is returned instead.
#
# @param node1 [Object] Node reported as the EXPECTED side
# @param node2 [Object] Node reported as the ACTUAL side
# @return [String] Reason text
def build_whitespace_adjacency_reason(node1, node2)
  text1 = extract_text_from_node(node1)
  text2 = extract_text_from_node(node2)

  first_is_ws = NodeInspector.whitespace_only_text?(node1)
  second_is_ws = NodeInspector.whitespace_only_text?(node2)

  # Both sides whitespace-only, or neither: not an adjacency case.
  return build_text_diff_reason(text1, text2) if !first_is_ws == !second_is_ws

  ws_text, content_text, present_side, absent_side, ws_node =
    if first_is_ws
      [text1, text2, "EXPECTED", "ACTUAL", node1]
    else
      [text2, text1, "ACTUAL", "EXPECTED", node2]
    end

  position = whitespace_adjacency_position(ws_node)
  ws_vis = visualize_whitespace(ws_text)
  content_vis = if content_text
                  visualize_whitespace(truncate_text(content_text))
                else
                  "(none)"
                end

  "Whitespace #{position} \"#{content_vis}\": " \
    "present on #{present_side} (\"#{ws_vis}\"), absent on #{absent_side}"
end
|
|
883
|
+
|
|
884
|
+
# Classify where a whitespace node sits relative to its siblings.
#
# A sibling counts as "content" when +sibling_with_content?+ finds
# anything other than a whitespace-only text node in that direction.
#
# @param ws_node [Object] The whitespace-carrying node
# @return [Symbol] +:surrounding+ (content on both sides), +:following+
#   (content before only), +:preceding+ (content after only), or
#   +:isolated+ (no content neighbour, no parent, node not found among
#   its parent's children, or not a Canon/Nokogiri node)
def whitespace_adjacency_position(ws_node)
  known_node = ws_node.is_a?(Canon::Xml::Node) ||
    ws_node.is_a?(Nokogiri::XML::Node)
  return :isolated unless known_node

  parent = ws_node.parent
  return :isolated if parent.nil?

  siblings = parent.children
  idx = siblings.index(ws_node)
  return :isolated unless idx

  before = sibling_with_content?(siblings, idx, -1)
  after = sibling_with_content?(siblings, idx, 1)

  return :surrounding if before && after
  return :following if before
  return :preceding if after

  :isolated
end
|
|
904
|
+
|
|
905
|
+
# Walk from +idx+ through +siblings+ in +direction+ and report whether
# anything other than a whitespace-only text node is encountered.
#
# Index access (+[]+ / +length+) is used rather than slicing so the
# method only relies on those two methods of +siblings+.
#
# @param siblings [#[], #length] Ordered sibling collection
# @param idx [Integer] Index of the starting node (itself not examined)
# @param direction [Integer] Step per iteration; callers in this file
#   pass -1 (towards the start) or 1 (towards the end)
# @return [Boolean] true as soon as a non-whitespace-text sibling is found
# @raise [ArgumentError] if +direction+ is zero (the walk would never end)
def sibling_with_content?(siblings, idx, direction)
  raise ArgumentError, "direction must be non-zero" if direction.zero?

  cursor = idx + direction
  while cursor.between?(0, siblings.length - 1)
    candidate = siblings[cursor]
    blank_text = NodeInspector.text_node?(candidate) &&
      NodeInspector.text_content(candidate).strip.empty?
    return true unless blank_text

    cursor += direction
  end

  false
end
|
|
917
|
+
|
|
751
918
|
# Check if text is only whitespace
|
|
752
919
|
#
|
|
753
920
|
# @param text [String] Text to check
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative "node_inspector"
|
|
4
|
+
|
|
3
5
|
module Canon
|
|
4
6
|
module Comparison
|
|
5
7
|
# XML Node Comparison Utilities
|
|
@@ -180,13 +182,9 @@ differences)
|
|
|
180
182
|
# @return [Symbol] Comparison result constant
|
|
181
183
|
def self.dispatch_by_node_type(node1, node2, opts, child_opts,
|
|
182
184
|
diff_children, differences)
|
|
183
|
-
|
|
184
|
-
# Nokogiri also has .node_type but returns integers, so check for Symbol
|
|
185
|
-
if node1.respond_to?(:node_type) && node2.respond_to?(:node_type) &&
|
|
186
|
-
node1.node_type.is_a?(Symbol) && node2.node_type.is_a?(Symbol)
|
|
185
|
+
if node1.is_a?(Canon::Xml::Node) && node2.is_a?(Canon::Xml::Node)
|
|
187
186
|
dispatch_canon_node_type(node1, node2, opts, child_opts,
|
|
188
187
|
diff_children, differences)
|
|
189
|
-
# Moxml/Nokogiri types use .element?, .text?, etc. methods
|
|
190
188
|
else
|
|
191
189
|
dispatch_legacy_node_type(node1, node2, opts, child_opts,
|
|
192
190
|
diff_children, differences)
|
|
@@ -232,6 +230,17 @@ diff_children, differences)
|
|
|
232
230
|
return false unless text_node?(node) && node.parent
|
|
233
231
|
return false unless MatchOptions.normalize_text(node_text(node)).empty?
|
|
234
232
|
|
|
233
|
+
# HTML-specific: NBSP (U+00A0) is never insignificant whitespace —
|
|
234
|
+
# it always renders as a visible non-breaking space.
|
|
235
|
+
format = opts[:format] || match_opts[:format]
|
|
236
|
+
if %i[html html4 html5].include?(format)
|
|
237
|
+
return false if WhitespaceSensitivity.contains_nbsp?(node_text(node))
|
|
238
|
+
|
|
239
|
+
# Whitespace between inline element siblings is semantically
|
|
240
|
+
# significant (renders as a visible gap) and must not be stripped.
|
|
241
|
+
return false if WhitespaceSensitivity.inline_whitespace_significant?(node)
|
|
242
|
+
end
|
|
243
|
+
|
|
235
244
|
return true unless WhitespaceSensitivity.whitespace_preserved?(
|
|
236
245
|
node.parent, match_opts
|
|
237
246
|
)
|
|
@@ -275,8 +284,8 @@ diff_children, differences)
|
|
|
275
284
|
def self.same_node_type?(node1, node2)
|
|
276
285
|
return false if node1.class != node2.class
|
|
277
286
|
|
|
278
|
-
|
|
279
|
-
|
|
287
|
+
case node1
|
|
288
|
+
when Canon::Xml::Node, Nokogiri::XML::Node
|
|
280
289
|
node1.node_type == node2.node_type
|
|
281
290
|
else
|
|
282
291
|
true
|
|
@@ -294,34 +303,13 @@ diff_children, differences)
|
|
|
294
303
|
# @param check_children [Boolean] Whether to check child nodes
|
|
295
304
|
# @return [Boolean] true if node is a comment
|
|
296
305
|
def self.comment_node?(node, check_children: false)
|
|
297
|
-
|
|
298
|
-
return true if node.respond_to?(:comment?) && node.comment?
|
|
299
|
-
return true if node.respond_to?(:node_type) && node.node_type == :comment
|
|
300
|
-
|
|
301
|
-
if node.is_a?(Nokogiri::XML::Element) && !node.children.empty? && check_children
|
|
302
|
-
node.children.each do |child|
|
|
303
|
-
# Recursively check child nodes for comments
|
|
304
|
-
# limit depth to avoid infinite recursion
|
|
305
|
-
# in case of circular structures (if any)
|
|
306
|
-
if comment_node?(child, check_children: false)
|
|
307
|
-
result = true
|
|
308
|
-
break
|
|
309
|
-
end
|
|
310
|
-
end
|
|
311
|
-
end
|
|
312
|
-
return true if result
|
|
313
|
-
|
|
314
|
-
# HTML comments are parsed as TEXT nodes by Nokogiri
|
|
315
|
-
# Check if this is a text node with HTML comment content
|
|
316
|
-
if text_node?(node)
|
|
317
|
-
text = node_text(node)
|
|
318
|
-
# Strip whitespace and backslashes for comparison
|
|
319
|
-
# Nokogiri escapes HTML comments as "<\\!-- comment -->" in full documents
|
|
320
|
-
text_stripped = text.to_s.strip.gsub("\\", "")
|
|
321
|
-
return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
|
|
322
|
-
end
|
|
306
|
+
return true if NodeInspector.comment_node?(node)
|
|
323
307
|
|
|
324
|
-
|
|
308
|
+
if check_children && node.is_a?(Nokogiri::XML::Element) && !node.children.empty?
|
|
309
|
+
node.children.any? { |child| NodeInspector.comment_node?(child) }
|
|
310
|
+
else
|
|
311
|
+
false
|
|
312
|
+
end
|
|
325
313
|
end
|
|
326
314
|
|
|
327
315
|
# Check if a node is a text node
|
|
@@ -329,9 +317,7 @@ diff_children, differences)
|
|
|
329
317
|
# @param node [Object] Node to check
|
|
330
318
|
# @return [Boolean] true if node is a text node
|
|
331
319
|
def self.text_node?(node)
|
|
332
|
-
|
|
333
|
-
!node.respond_to?(:element?)) ||
|
|
334
|
-
(node.respond_to?(:node_type) && node.node_type == :text)
|
|
320
|
+
NodeInspector.text_node?(node)
|
|
335
321
|
end
|
|
336
322
|
|
|
337
323
|
# Extract text content from a node
|
|
@@ -341,15 +327,7 @@ diff_children, differences)
|
|
|
341
327
|
def self.node_text(node)
|
|
342
328
|
return "" unless node
|
|
343
329
|
|
|
344
|
-
|
|
345
|
-
node.content.to_s
|
|
346
|
-
elsif node.respond_to?(:text)
|
|
347
|
-
node.text.to_s
|
|
348
|
-
elsif node.respond_to?(:value)
|
|
349
|
-
node.value.to_s
|
|
350
|
-
else
|
|
351
|
-
""
|
|
352
|
-
end
|
|
330
|
+
NodeInspector.text_content(node)
|
|
353
331
|
end
|
|
354
332
|
|
|
355
333
|
# Dispatch by Canon::Xml::Node type
|
|
@@ -385,21 +363,26 @@ diff_children, differences)
|
|
|
385
363
|
# Import XmlComparator to use its comparison methods
|
|
386
364
|
require_relative "xml_comparator"
|
|
387
365
|
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
diff_children, differences)
|
|
391
|
-
elsif node1.respond_to?(:text?) && node1.text?
|
|
392
|
-
XmlComparator.compare_text_nodes(node1, node2, opts, differences)
|
|
393
|
-
elsif node1.respond_to?(:comment?) && node1.comment?
|
|
394
|
-
XmlComparator.compare_comment_nodes(node1, node2, opts, differences)
|
|
395
|
-
elsif node1.respond_to?(:cdata?) && node1.cdata?
|
|
396
|
-
XmlComparator.compare_text_nodes(node1, node2, opts, differences)
|
|
397
|
-
elsif node1.respond_to?(:processing_instruction?) && node1.processing_instruction?
|
|
398
|
-
XmlComparator.compare_processing_instruction_nodes(node1, node2,
|
|
399
|
-
opts, differences)
|
|
400
|
-
elsif node1.respond_to?(:root)
|
|
366
|
+
case node1
|
|
367
|
+
when Nokogiri::XML::Document
|
|
401
368
|
XmlComparator.compare_document_nodes(node1, node2, opts, child_opts,
|
|
402
369
|
diff_children, differences)
|
|
370
|
+
when Nokogiri::XML::Node
|
|
371
|
+
if node1.element?
|
|
372
|
+
XmlComparator.compare_element_nodes(node1, node2, opts, child_opts,
|
|
373
|
+
diff_children, differences)
|
|
374
|
+
elsif node1.text?
|
|
375
|
+
XmlComparator.compare_text_nodes(node1, node2, opts, differences)
|
|
376
|
+
elsif node1.comment?
|
|
377
|
+
XmlComparator.compare_comment_nodes(node1, node2, opts, differences)
|
|
378
|
+
elsif node1.cdata?
|
|
379
|
+
XmlComparator.compare_text_nodes(node1, node2, opts, differences)
|
|
380
|
+
elsif node1.processing_instruction?
|
|
381
|
+
XmlComparator.compare_processing_instruction_nodes(node1, node2,
|
|
382
|
+
opts, differences)
|
|
383
|
+
else
|
|
384
|
+
Comparison::EQUIVALENT
|
|
385
|
+
end
|
|
403
386
|
else
|
|
404
387
|
Comparison::EQUIVALENT
|
|
405
388
|
end
|
|
@@ -431,10 +414,11 @@ differences)
|
|
|
431
414
|
# @param node [Canon::Xml::Node, Object] Node to serialize
|
|
432
415
|
# @return [String] XML string representation
|
|
433
416
|
def self.serialize_node_to_xml(node)
|
|
434
|
-
|
|
417
|
+
case node
|
|
418
|
+
when Canon::Xml::Nodes::RootNode
|
|
435
419
|
# Serialize all children of root
|
|
436
420
|
node.children.map { |child| serialize_node_to_xml(child) }.join
|
|
437
|
-
|
|
421
|
+
when Canon::Xml::Nodes::ElementNode
|
|
438
422
|
# Serialize element with attributes and children
|
|
439
423
|
attrs = node.attribute_nodes.map do |a|
|
|
440
424
|
" #{a.name}=\"#{a.value}\""
|
|
@@ -448,14 +432,12 @@ differences)
|
|
|
448
432
|
else
|
|
449
433
|
"<#{node.name}#{attrs}>#{children_xml}</#{node.name}>"
|
|
450
434
|
end
|
|
451
|
-
|
|
435
|
+
when Canon::Xml::Nodes::TextNode
|
|
452
436
|
node.value
|
|
453
|
-
|
|
437
|
+
when Canon::Xml::Nodes::CommentNode
|
|
454
438
|
"<!--#{node.value}-->"
|
|
455
|
-
|
|
439
|
+
when Canon::Xml::Nodes::ProcessingInstructionNode
|
|
456
440
|
"<?#{node.target} #{node.data}?>"
|
|
457
|
-
elsif node.respond_to?(:to_xml)
|
|
458
|
-
node.to_xml
|
|
459
441
|
else
|
|
460
442
|
node.to_s
|
|
461
443
|
end
|
data/lib/canon/comparison.rb
CHANGED
|
@@ -122,6 +122,65 @@ module Canon
|
|
|
122
122
|
UNEQUAL_TYPES = 15
|
|
123
123
|
UNEQUAL_PRIMITIVES = 16
|
|
124
124
|
|
|
125
|
+
# Human-readable labels for the integer comparison-result constants
# defined above. The diff reason builders use these so that user-facing
# reason text never leaks raw numeric codes (e.g. "7 vs 7" — see
# lutaml/canon#127). String diff codes (e.g. "position 3" emitted by
# ChildComparison) are not listed here; +code_label+ returns them
# unchanged.
CODE_LABELS = {
  EQUIVALENT => "equivalent",
  MISSING_ATTRIBUTE => "missing attribute",
  MISSING_NODE => "missing",
  UNEQUAL_ATTRIBUTES => "attributes differ",
  UNEQUAL_COMMENTS => "comments differ",
  UNEQUAL_DOCUMENTS => "documents differ",
  UNEQUAL_ELEMENTS => "elements differ",
  UNEQUAL_NODES_TYPES => "node types differ",
  UNEQUAL_TEXT_CONTENTS => "text content differs",
  MISSING_HASH_KEY => "missing hash key",
  UNEQUAL_HASH_VALUES => "hash values differ",
  UNEQUAL_HASH_KEY_ORDER => "hash key order differs",
  UNEQUAL_ARRAY_LENGTHS => "array lengths differ",
  UNEQUAL_ARRAY_ELEMENTS => "array elements differ",
  UNEQUAL_TYPES => "types differ",
  UNEQUAL_PRIMITIVES => "primitives differ",
}.freeze
|
|
148
|
+
|
|
149
|
+
# Translate a comparison result code into a human-readable reason
# fragment.
#
# Strings (e.g. "position 3") are returned as-is. Any other value is
# looked up in CODE_LABELS; values without an entry fall back to
# +to_s+ as a defensive default.
#
# @param code [Integer, String] Comparison result code
# @return [String] Human-readable label
def self.code_label(code)
  return code if code.is_a?(String)

  CODE_LABELS.fetch(code) { code.to_s }
end
|
|
160
|
+
|
|
161
|
+
# Build a "diff1 [vs diff2]" reason fragment that never leaks raw
# integer constants.
#
# When both codes are equal the single label is returned (e.g.
# "elements differ") rather than "elements differ vs elements differ".
# See lutaml/canon#127.
#
# @param diff1 [Integer, String] First diff code
# @param diff2 [Integer, String] Second diff code
# @return [String] Reason fragment
def self.code_pair_label(diff1, diff2)
  first_label = code_label(diff1)
  return first_label if diff1 == diff2

  "#{first_label} vs #{code_label(diff2)}"
end
|
|
174
|
+
|
|
175
|
+
# Extract parse-time errors for a parsed node.
#
# Thin wrapper: the work is delegated to +NodeInspector.parse_errors+,
# which handles the backend-specific node types.
#
# @param node [Object, nil] Parsed node
# @return [Array<String>] Parse errors as strings
#   (NOTE(review): element type and the empty-by-default behaviour come
#   from NodeInspector — confirm there)
def self.parse_errors_for(node)
  NodeInspector.parse_errors(node)
end
|
|
183
|
+
|
|
125
184
|
class << self
|
|
126
185
|
# Auto-detect format and compare two objects
|
|
127
186
|
#
|
|
@@ -144,6 +203,35 @@ module Canon
|
|
|
144
203
|
dom_diff(obj1, obj2, opts)
|
|
145
204
|
end
|
|
146
205
|
|
|
206
|
+
# Summarize the comparison of two documents as a single string.
#
# Convenience wrapper: runs +equivalent?+ with +verbose: true+ forced
# on (the caller's +opts+ hash is not mutated) and reduces the outcome
# to text. A ComparisonResult contributes its own +summary+; a bare
# +true+ becomes "Equivalent"; anything else becomes "Not equivalent".
#
# @param obj1 [Object] First object to compare
# @param obj2 [Object] Second object to compare
# @param opts [Hash] Comparison options (same as +equivalent?+)
# @return [String] Summary string
#
# @example
#   Canon::Comparison.summarize("<p>Hello</p>", "<p>World</p>")
#   # => "Not equivalent: text content differs at /p[1] (Hello vs World)"
#
#   Canon::Comparison.summarize("<p>Hello</p>", "<p>Hello</p>")
#   # => "Equivalent"
def summarize(obj1, obj2, opts = {})
  outcome = equivalent?(obj1, obj2, opts.merge(verbose: true))

  case outcome
  when ComparisonResult then outcome.summary
  when true then "Equivalent"
  else "Not equivalent"
  end
end
|
|
234
|
+
|
|
147
235
|
# Define a custom comparison profile with DSL syntax
|
|
148
236
|
#
|
|
149
237
|
# @param name [Symbol] Profile name
|
|
@@ -602,26 +690,26 @@ module Canon
|
|
|
602
690
|
# parsers can mutate the DOM).
|
|
603
691
|
opts[:_original_str1] = obj1.dup if obj1.is_a?(String)
|
|
604
692
|
opts[:_original_str2] = obj2.dup if obj2.is_a?(String)
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
693
|
+
# Parse all HTML formats (:html, :html4, :html5) with
|
|
694
|
+
# Nokogiri::HTML5 so that html4 and html5 share HTML's
|
|
695
|
+
# whitespace-sensitivity semantics (issue #118).
|
|
696
|
+
#
|
|
697
|
+
# The previous html/html4 branch used Nokogiri::XML.fragment
|
|
698
|
+
# to dodge Nokogiri::HTML4.fragment's destructive DOM
|
|
699
|
+
# mutations. That avoided one problem but introduced a
|
|
700
|
+
# bigger one: XML whitespace rules were being applied to
|
|
701
|
+
# HTML content. HTML's content model — identical between
|
|
702
|
+
# HTML4 and HTML5 — treats whitespace-only text between
|
|
703
|
+
# block-level children as insignificant; XML treats every
|
|
704
|
+
# whitespace text node as significant. Routing html4 input
|
|
705
|
+
# through an XML parser therefore made
|
|
706
|
+
# be_html4_equivalent_to reject inputs that
|
|
707
|
+
# be_html5_equivalent_to (correctly) accepts.
|
|
708
|
+
# Nokogiri::HTML5.fragment is non-destructive (the original
|
|
709
|
+
# HTML4.fragment concern does not apply to it) and applies
|
|
710
|
+
# HTML's content model uniformly.
|
|
711
|
+
obj1 = HtmlParser.parse(obj1, :html5) if obj1.is_a?(String)
|
|
712
|
+
obj2 = HtmlParser.parse(obj2, :html5) if obj2.is_a?(String)
|
|
625
713
|
end
|
|
626
714
|
else
|
|
627
715
|
format1 = FormatDetector.detect(obj1)
|
|
@@ -662,8 +750,14 @@ module Canon
|
|
|
662
750
|
# but defined in config
|
|
663
751
|
if Canon::Config.instance.respond_to?(comparison_format)
|
|
664
752
|
format_config = Canon::Config.instance.public_send(comparison_format)
|
|
665
|
-
if opts[:
|
|
666
|
-
|
|
753
|
+
if opts[:global_profile].nil? && format_config.match.profile
|
|
754
|
+
# Config-sourced profile has *global* priority (applied before
|
|
755
|
+
# global_options), so that YAML profile_options like
|
|
756
|
+
# whitespace_type: :normalize can override the built-in profile
|
|
757
|
+
# (e.g. :spec_friendly)'s whitespace_type: :strict. Writing to
|
|
758
|
+
# :match_profile here gave the config profile per-call priority,
|
|
759
|
+
# which incorrectly overrode the YAML's own overrides.
|
|
760
|
+
opts[:global_profile] = format_config.match.profile
|
|
667
761
|
end
|
|
668
762
|
# Pass YAML profile's extra match options (e.g., preserve_whitespace_elements)
|
|
669
763
|
# that are stored in MatchConfig's resolver but not exposed via the
|
|
@@ -701,6 +795,33 @@ module Canon
|
|
|
701
795
|
str
|
|
702
796
|
end
|
|
703
797
|
|
|
798
|
+
# Resolve HTML named entities (&nbsp; etc.) so that
# Nokogiri::XML.fragment (which only understands the five XML
# entities) preserves them as text nodes instead of silently
# dropping them.
#
# Strings without an ampersand are returned untouched. Otherwise the
# string is parsed with Nokogiri's HTML4 fragment parser, which
# resolves the named entities (e.g. &nbsp; → U+00A0), and the
# fragment's +inner_html+ is returned, so no wrapping tags are added.
#
# If serialization re-encodes a resolved character as an entity, that
# is acceptable: the XML parser understands numeric references such
# as &#160;.
#
# @param str [String] HTML string potentially containing named entities
# @return [String] String with named entities resolved
def decode_html_entities(str)
  # Fast path: nothing to decode without an ampersand.
  return str unless str.include?("&")

  # inner_html is used instead of to_html; per the original author's
  # note, to_html re-encodes characters while inner_html keeps the
  # character form.
  Nokogiri::HTML4.fragment(str).inner_html
end
|
|
824
|
+
|
|
704
825
|
# Detect the format of an object (delegates to FormatDetector)
|
|
705
826
|
#
|
|
706
827
|
# @param obj [Object] Object to detect format of
|
|
@@ -14,6 +14,7 @@ module Canon
|
|
|
14
14
|
show_diffs: :symbol,
|
|
15
15
|
verbose_diff: :boolean,
|
|
16
16
|
algorithm: :symbol,
|
|
17
|
+
parser: :symbol,
|
|
17
18
|
show_raw_inputs: :boolean,
|
|
18
19
|
show_raw_expected: :boolean,
|
|
19
20
|
show_raw_received: :boolean,
|
|
@@ -66,7 +67,7 @@ module Canon
|
|
|
66
67
|
|
|
67
68
|
def all_diff_attributes
|
|
68
69
|
%i[mode use_color context_lines grouping_lines show_diffs
|
|
69
|
-
verbose_diff algorithm show_raw_inputs show_raw_expected show_raw_received
|
|
70
|
+
verbose_diff algorithm parser show_raw_inputs show_raw_expected show_raw_received
|
|
70
71
|
show_preprocessed_inputs show_preprocessed_expected show_preprocessed_received
|
|
71
72
|
show_prettyprint_inputs show_prettyprint_expected show_prettyprint_received
|
|
72
73
|
show_line_numbered_inputs character_visualization
|
|
@@ -28,6 +28,9 @@ formats:
|
|
|
28
28
|
xml:
|
|
29
29
|
match:
|
|
30
30
|
profile: spec_friendly
|
|
31
|
+
# Treat different Unicode whitespace types (space, NBSP, ideographic space, etc.)
|
|
32
|
+
# as equivalent — useful for spec comparisons where whitespace type doesn't matter
|
|
33
|
+
whitespace_type: :normalize
|
|
31
34
|
# Elements where whitespace is PRESERVED exactly (no manipulation)
|
|
32
35
|
# All whitespace characters are significant in these elements
|
|
33
36
|
preserve_whitespace_elements:
|