canon 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -1
- data/.rubocop_todo.yml +276 -7
- data/README.adoc +203 -138
- data/_config.yml +116 -0
- data/docs/ADVANCED_TOPICS.adoc +20 -0
- data/docs/BASIC_USAGE.adoc +16 -0
- data/docs/CHARACTER_VISUALIZATION.adoc +567 -0
- data/docs/CLI.adoc +493 -0
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
- data/docs/DIFF_ARCHITECTURE.adoc +435 -0
- data/docs/DIFF_FORMATTING.adoc +540 -0
- data/docs/FORMATS.adoc +447 -0
- data/docs/INDEX.adoc +222 -0
- data/docs/INPUT_VALIDATION.adoc +477 -0
- data/docs/MATCH_ARCHITECTURE.adoc +463 -0
- data/docs/MATCH_OPTIONS.adoc +719 -0
- data/docs/MODES.adoc +432 -0
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
- data/docs/OPTIONS.adoc +1387 -0
- data/docs/PREPROCESSING.adoc +491 -0
- data/docs/RSPEC.adoc +605 -0
- data/docs/RUBY_API.adoc +478 -0
- data/docs/SEMANTIC_DIFF_REPORT.adoc +528 -0
- data/docs/UNDERSTANDING_CANON.adoc +17 -0
- data/docs/VERBOSE.adoc +482 -0
- data/exe/canon +7 -0
- data/lib/canon/cli.rb +179 -0
- data/lib/canon/commands/diff_command.rb +195 -0
- data/lib/canon/commands/format_command.rb +113 -0
- data/lib/canon/comparison/base_comparator.rb +39 -0
- data/lib/canon/comparison/comparison_result.rb +79 -0
- data/lib/canon/comparison/html_comparator.rb +410 -0
- data/lib/canon/comparison/json_comparator.rb +212 -0
- data/lib/canon/comparison/match_options.rb +616 -0
- data/lib/canon/comparison/xml_comparator.rb +566 -0
- data/lib/canon/comparison/yaml_comparator.rb +93 -0
- data/lib/canon/comparison.rb +239 -0
- data/lib/canon/config.rb +172 -0
- data/lib/canon/diff/diff_block.rb +71 -0
- data/lib/canon/diff/diff_block_builder.rb +105 -0
- data/lib/canon/diff/diff_classifier.rb +46 -0
- data/lib/canon/diff/diff_context.rb +85 -0
- data/lib/canon/diff/diff_context_builder.rb +107 -0
- data/lib/canon/diff/diff_line.rb +77 -0
- data/lib/canon/diff/diff_node.rb +56 -0
- data/lib/canon/diff/diff_node_mapper.rb +148 -0
- data/lib/canon/diff/diff_report.rb +133 -0
- data/lib/canon/diff/diff_report_builder.rb +62 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +407 -0
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +672 -0
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +284 -0
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +190 -0
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +860 -0
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +292 -0
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +199 -0
- data/lib/canon/diff_formatter/by_object/json_formatter.rb +305 -0
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +248 -0
- data/lib/canon/diff_formatter/by_object/yaml_formatter.rb +17 -0
- data/lib/canon/diff_formatter/character_map.yml +197 -0
- data/lib/canon/diff_formatter/debug_output.rb +431 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +551 -0
- data/lib/canon/diff_formatter/legend.rb +141 -0
- data/lib/canon/diff_formatter.rb +520 -0
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html4_formatter.rb +17 -0
- data/lib/canon/formatters/html5_formatter.rb +17 -0
- data/lib/canon/formatters/html_formatter.rb +37 -0
- data/lib/canon/formatters/html_formatter_base.rb +163 -0
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/xml_formatter.rb +20 -55
- data/lib/canon/formatters/yaml_formatter.rb +4 -1
- data/lib/canon/pretty_printer/html.rb +57 -0
- data/lib/canon/pretty_printer/json.rb +25 -0
- data/lib/canon/pretty_printer/xml.rb +29 -0
- data/lib/canon/rspec_matchers.rb +222 -80
- data/lib/canon/validators/base_validator.rb +49 -0
- data/lib/canon/validators/html_validator.rb +138 -0
- data/lib/canon/validators/json_validator.rb +89 -0
- data/lib/canon/validators/xml_validator.rb +53 -0
- data/lib/canon/validators/yaml_validator.rb +73 -0
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/attribute_handler.rb +80 -0
- data/lib/canon/xml/c14n.rb +36 -0
- data/lib/canon/xml/character_encoder.rb +38 -0
- data/lib/canon/xml/data_model.rb +225 -0
- data/lib/canon/xml/element_matcher.rb +196 -0
- data/lib/canon/xml/line_range_mapper.rb +158 -0
- data/lib/canon/xml/namespace_handler.rb +86 -0
- data/lib/canon/xml/node.rb +32 -0
- data/lib/canon/xml/nodes/attribute_node.rb +54 -0
- data/lib/canon/xml/nodes/comment_node.rb +23 -0
- data/lib/canon/xml/nodes/element_node.rb +56 -0
- data/lib/canon/xml/nodes/namespace_node.rb +38 -0
- data/lib/canon/xml/nodes/processing_instruction_node.rb +24 -0
- data/lib/canon/xml/nodes/root_node.rb +16 -0
- data/lib/canon/xml/nodes/text_node.rb +23 -0
- data/lib/canon/xml/processor.rb +151 -0
- data/lib/canon/xml/whitespace_normalizer.rb +72 -0
- data/lib/canon/xml/xml_base_handler.rb +188 -0
- data/lib/canon.rb +14 -3
- metadata +116 -21
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
require_relative "xml_comparator"
|
|
5
|
+
require_relative "match_options"
|
|
6
|
+
require_relative "comparison_result"
|
|
7
|
+
require_relative "../diff/diff_node"
|
|
8
|
+
require_relative "../diff/diff_classifier"
|
|
9
|
+
|
|
10
|
+
module Canon
|
|
11
|
+
module Comparison
|
|
12
|
+
# HTML comparison class
|
|
13
|
+
# Handles comparison of HTML nodes with various options
|
|
14
|
+
class HtmlComparator
|
|
15
|
+
# Baseline option set for HTML comparison. Options supplied by the caller
# are merged over these defaults.
DEFAULT_OPTS = {
  # -- structural filtering --
  ignore_children: false,
  ignore_text_nodes: false,
  ignore_attr_content: [],
  ignore_attrs: [],
  ignore_attrs_by_name: [],
  ignore_nodes: [],

  # -- output --
  verbose: false,
  diff_children: false,

  # -- match system --
  match_profile: nil,
  match: nil,
  preprocessing: nil,
  global_profile: nil,
  global_options: nil,

  # -- diff display --
  diff: nil,
}.freeze
|
|
39
|
+
|
|
40
|
+
class << self
|
|
41
|
+
# Decide whether two HTML inputs are equivalent.
#
# String inputs are parsed (with the resolved preprocessing applied);
# already-parsed nodes are used as given. The node-level comparison itself
# is delegated to XmlComparator.
#
# @param html1 [String, Nokogiri::HTML::Document] First HTML
# @param html2 [String, Nokogiri::HTML::Document] Second HTML
# @param opts [Hash] Comparison options, merged over DEFAULT_OPTS
# @param child_opts [Hash] Overrides merged over the resolved opts and
#   handed to XmlComparator for child comparison
# @return [Boolean, ComparisonResult] a boolean when opts[:verbose] is
#   falsy, otherwise a ComparisonResult carrying the collected differences
def equivalent?(html1, html2, opts = {}, child_opts = {})
  opts = DEFAULT_OPTS.merge(opts)

  # Resolve the match options on top of the HTML format defaults.
  # DEFAULT_OPTS guarantees every one of these keys is present.
  resolver_keys = %i[match_profile match preprocessing global_profile
                     global_options]
  resolved = MatchOptions::Xml.resolve(format: :html,
                                       **opts.slice(*resolver_keys))

  # DiffClassifier works on the wrapped form of the resolved options
  wrapped = Canon::Comparison::ResolvedMatchOptions.new(resolved,
                                                        format: :html)

  # The raw resolved hash travels with opts into the comparison logic
  opts[:match_opts] = resolved
  child_opts = opts.merge(child_opts)

  # Parse strings (preprocessing is applied inside parse_node)
  preprocessing = resolved[:preprocessing]
  node1 = parse_node(html1, preprocessing, resolved)
  node2 = parse_node(html2, preprocessing, resolved)

  # Serialize what is actually compared so the displayed diff matches it
  display1 = serialize_for_display(node1)
  display2 = serialize_for_display(node2)

  differences = []
  diff_children = opts[:diff_children] || false

  both_fragments = [node1, node2].all? do |candidate|
    candidate.is_a?(Nokogiri::HTML4::DocumentFragment)
  end

  if both_fragments
    # Fragments are compared child by child rather than as nodes themselves
    kids1 = node1.children.to_a
    kids2 = node2.children.to_a

    if kids1.length == kids2.length
      # Stays EQUIVALENT when there are no children at all
      result = Comparison::EQUIVALENT
      kids1.each_with_index do |kid1, position|
        outcome = XmlComparator.send(:compare_nodes, kid1, kids2[position],
                                     opts, child_opts, diff_children,
                                     differences)
        next if outcome == Comparison::EQUIVALENT

        # Stop at the first non-equivalent pair
        result = outcome
        break
      end
    else
      result = Comparison::UNEQUAL_ELEMENTS
    end
  else
    result = XmlComparator.send(:compare_nodes, node1, node2, opts,
                                child_opts, diff_children, differences)
  end

  return result == Comparison::EQUIVALENT unless opts[:verbose]

  # Verbose output: classify the collected DiffNodes before reporting
  unless differences.empty?
    classifier = Canon::Diff::DiffClassifier.new(wrapped)
    classifier.classify_all(differences.grep(Canon::Diff::DiffNode))
  end

  ComparisonResult.new(
    differences: differences,
    preprocessed_strings: [display1, display2],
    format: :html,
    html_version: detect_html_version_from_node(node1),
    match_options: resolved,
  )
end
|
|
137
|
+
|
|
138
|
+
private
|
|
139
|
+
|
|
140
|
+
# Turn an input into a Nokogiri node ready for comparison.
#
# Non-string inputs are returned as-is, except that with :rendered
# preprocessing their style/script comments and text whitespace are
# normalized in place first. String inputs are preprocessed and parsed.
#
# @param node [String, Object] HTML string or an already-parsed node
# @param preprocessing [Symbol, nil] :rendered, :normalize, :c14n, :format;
#   any other value leaves the string untouched before parsing
# @param match_opts [Hash] Match options consulted by whitespace
#   normalization
# @return [Object] the parsed document/fragment, or the node passed in
# @raise [Canon::CompareFormatMismatchError] when the input looks like XML
def parse_node(node, preprocessing = :none, match_opts = {})
  unless node.is_a?(String)
    # Pre-parsed input: only pure XML documents are rejected
    if is_xml_document?(node)
      raise Canon::CompareFormatMismatchError.new(:xml, :html)
    end

    # :rendered normalizes the given node directly, without re-parsing;
    # documents and fragments are treated alike
    if preprocessing == :rendered
      normalize_html_style_script_comments(node)
      normalize_rendered_whitespace(node, match_opts)
    end

    return node
  end

  # A string with an XML declaration but no <html> tag is taken to be XML.
  # With an <html> tag it is HTML that merely carries a declaration
  # (common output from Nokogiri::HTML4#to_s).
  if node.strip.start_with?("<?xml") && !node.match?(/<html[\s>]/i)
    raise Canon::CompareFormatMismatchError.new(:xml, :html)
  end

  if preprocessing == :rendered
    # Full documents get document parsing, anything else fragment parsing
    parsed = if node.match?(/<html[\s>]/i)
               Nokogiri::HTML(node, &:noblanks)
             else
               Nokogiri::HTML4.fragment(node)
             end
    normalize_html_style_script_comments(parsed)
    normalize_rendered_whitespace(parsed, match_opts)
    remove_whitespace_only_text_nodes(parsed)
    return parsed
  end

  # Remaining modes transform the string before it is parsed
  html_string =
    case preprocessing
    when :normalize
      # Trim every line and drop the blank ones
      node.lines.map(&:strip).reject(&:empty?).join("\n")
    when :c14n
      # Canonicalize via the XML canonicalizer
      Canon::Xml::C14n.canonicalize(node, with_comments: false)
    when :format
      # Pretty-format the HTML
      Canon.format(node, :html)
    else
      # :none or unrecognized - use as-is
      node
    end

  # noblanks keeps Nokogiri from adding structural whitespace
  Nokogiri::HTML(html_string, &:noblanks).tap do |doc|
    normalize_html_style_script_comments(doc)
  end
end
|
|
229
|
+
|
|
230
|
+
# Guess the HTML version from the doctype found in raw content.
#
# @param content [String] HTML content
# @return [Symbol] :html4 only when an HTML4-style "HTML PUBLIC" doctype is
#   present and no HTML5 doctype is; :html5 in every other case
def detect_html_version(content)
  # HTML5 doctype wins (matched case-insensitively)
  return :html5 if content.match?(/<!DOCTYPE\s+html>/i)

  # HTML4 doctype pattern; without any doctype default to HTML5
  content.match?(/<!DOCTYPE\s+HTML\s+PUBLIC/i) ? :html4 : :html5
end
|
|
246
|
+
|
|
247
|
+
# Derive the HTML version from the class of a parsed node.
#
# @param node [Nokogiri::XML::Node] Nokogiri HTML node
# @return [Symbol] :html5 for HTML5 documents and fragments; :html4 for
#   HTML4 ones and, for compatibility, for any other node
def detect_html_version_from_node(node)
  case node
  when Nokogiri::HTML5::Document, Nokogiri::HTML5::DocumentFragment
    :html5
  else
    :html4
  end
end
|
|
264
|
+
|
|
265
|
+
# Render a parsed node back to a string for the diff display, so that the
# text shown corresponds to the node that was compared.
#
# @param node [Nokogiri::HTML::Document] Parsed HTML node (anything that
#   responds to #to_html)
# @return [String] the node's #to_html output
def serialize_for_display(node)
  # to_html is used as the line-oriented representation for the diff
  node.to_html
end
|
|
275
|
+
|
|
276
|
+
# Strip HTML comments from the content of every <style> and <script>
# element, in place.
#
# When nothing but whitespace is left, all children of the element are
# removed (this also drops whitespace-only CDATA children); otherwise the
# element content is replaced by the stripped text.
#
# @param doc [Object] document or fragment responding to #css
def normalize_html_style_script_comments(doc)
  doc.css("style, script").each do |element|
    # The pattern runs over the content of an already-parsed element, not
    # over raw input. The non-greedy match ends at the first "-->"; an
    # unterminated "<!--" is left in place as literal text.
    stripped = element.content.gsub(/<!--.*?-->/m, "").strip

    if stripped.empty?
      element.children.remove
      next
    end

    element.content = stripped
  end
end
|
|
296
|
+
|
|
297
|
+
# Collapse whitespace in text nodes the way HTML rendering does, in place.
#
# Every run of whitespace (spaces, tabs, newlines) becomes a single space,
# and text at the edge of its parent is additionally trimmed. Text inside
# pre, code, textarea, script or style (at any ancestor depth) is left
# untouched.
#
# @param doc [Nokogiri::HTML::Document] Document to normalize
# @param match_opts [Hash] Match options to respect during normalization
def normalize_rendered_whitespace(doc, match_opts = {})
  # :strict text matching was requested explicitly - normalize nothing
  return if match_opts[:text_content] == :strict

  # An HTML rendering rule, not a match option
  whitespace_sensitive = %w[pre code textarea script style]

  doc.xpath(".//text()").each do |text_node|
    # All ancestors are checked, not only the immediate parent
    next if ancestor_preserves_whitespace?(text_node.parent,
                                           whitespace_sensitive)

    collapsed = text_node.content.gsub(/\s+/, " ")
    collapsed = collapsed.strip if should_trim_text_node?(text_node)
    text_node.content = collapsed
  end
end
|
|
331
|
+
|
|
332
|
+
# Tell whether the given node, or any node above it, is named in
# preserve_list.
#
# @param node [Object] starting node (usually the parent of a text node)
# @param preserve_list [Array<String>] lower-case element names
# @return [Boolean]
def ancestor_preserves_whitespace?(node, preserve_list)
  cursor = node
  loop do
    # Ran off the top of the tree (e.g. nil parent)
    return false unless cursor.respond_to?(:name)
    return true if preserve_list.include?(cursor.name.downcase)
    # The document is the root - do not ask it for a parent
    return false if cursor.is_a?(Nokogiri::XML::Document)

    cursor = cursor.parent
  end
end
|
|
345
|
+
|
|
346
|
+
# Decide whether a text node's leading/trailing whitespace is to be
# trimmed: yes when it is the only child of its parent, or the first or the
# last child.
#
# @param text_node [Object] node responding to #parent
# @return [Boolean]
def should_trim_text_node?(text_node)
  siblings = text_node.parent.children

  siblings.length == 1 ||
    text_node == siblings.first ||
    text_node == siblings.last
end
|
|
359
|
+
|
|
360
|
+
# Drop every text node whose content is empty or whitespace-only. Such
# nodes are typically insignificant in HTML rendering (e.g. between block
# elements).
#
# @param doc [Object] document or fragment responding to #xpath
def remove_whitespace_only_text_nodes(doc)
  doc.xpath(".//text()").each do |text_node|
    text_node.remove if text_node.content.strip.empty?
  end
end
|
|
371
|
+
|
|
372
|
+
# Report whether a pre-parsed node is to be treated as XML rather than HTML.
#
# Nokogiri HTML4/HTML5 documents and fragments are never XML (they are
# excluded first because HTML documents also inherit from XML::Document).
# Any other object counts as XML only when one of its direct children is a
# processing instruction named "xml"; being an XML::Document alone is not
# enough.
#
# NOTE(review): this relies on the <?xml ...?> prolog showing up as a
# ProcessingInstruction child - confirm that Nokogiri exposes it that way.
#
# @param node [Object] candidate node; objects without #children give false
# @return [Boolean]
def is_xml_document?(node)
  html_types = [
    Nokogiri::HTML4::Document,
    Nokogiri::HTML5::Document,
    Nokogiri::HTML4::DocumentFragment,
    Nokogiri::HTML5::DocumentFragment,
  ]
  return false if html_types.any? { |type| node.is_a?(type) }
  return false unless node.respond_to?(:children)

  # Covers XML documents and fragments alike
  node.children.any? do |child|
    child.is_a?(Nokogiri::XML::ProcessingInstruction) && child.name == "xml"
  end
end
|
|
407
|
+
end
|
|
408
|
+
end
|
|
409
|
+
end
|
|
410
|
+
end
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require_relative "match_options"
|
|
5
|
+
require_relative "comparison_result"
|
|
6
|
+
|
|
7
|
+
module Canon
|
|
8
|
+
module Comparison
|
|
9
|
+
# JSON comparison class
|
|
10
|
+
# Handles comparison of JSON objects with various options
|
|
11
|
+
class JsonComparator
|
|
12
|
+
# Baseline option set for JSON comparison. Options supplied by the caller
# are merged over these defaults.
DEFAULT_OPTS = {
  # -- output --
  verbose: false,

  # -- match system --
  match_profile: nil,
  match: nil,
  preprocessing: nil,
  global_profile: nil,
  global_options: nil,

  # -- diff display --
  diff: nil,
}.freeze
|
|
27
|
+
|
|
28
|
+
class << self
|
|
29
|
+
# Decide whether two JSON inputs are equivalent.
#
# String inputs are parsed with JSON.parse; anything else is compared as
# the Ruby object it already is.
#
# @param json1 [String, Hash, Array] First JSON
# @param json2 [String, Hash, Array] Second JSON
# @param opts [Hash] Comparison options, merged over DEFAULT_OPTS
# @return [Boolean, ComparisonResult] a boolean when opts[:verbose] is
#   falsy, otherwise a ComparisonResult carrying the collected differences
def equivalent?(json1, json2, opts = {})
  opts = DEFAULT_OPTS.merge(opts)

  # Resolve the match options on top of the JSON format defaults.
  # DEFAULT_OPTS guarantees every one of these keys is present.
  resolver_keys = %i[match_profile match preprocessing global_profile
                     global_options]
  resolved = MatchOptions::Json.resolve(format: :json,
                                        **opts.slice(*resolver_keys))

  # Constructed for consistency with the XML/HTML comparators; the
  # instance is not used any further in this method.
  Canon::Comparison::ResolvedMatchOptions.new(resolved, format: :json)

  # The raw resolved hash travels with opts into the comparison logic
  opts[:match_opts] = resolved

  left = parse_json(json1)
  right = parse_json(json2)

  differences = []
  status = compare_ruby_objects(left, right, opts, differences, "")

  return status == Comparison::EQUIVALENT unless opts[:verbose]

  # Verbose output: pretty-print both sides for display (a value that is
  # still a String after parsing is shown as it is)
  display = [left, right].map do |obj|
    obj.is_a?(String) ? obj : JSON.pretty_generate(obj)
  end

  ComparisonResult.new(
    differences: differences,
    preprocessed_strings: display,
    format: :json,
    match_options: resolved,
  )
end
|
|
79
|
+
|
|
80
|
+
private
|
|
81
|
+
|
|
82
|
+
# Parse a JSON string; any non-String argument is handed back untouched.
#
# @param obj [String, Object] JSON text or an already-parsed value
# @return [Object] the parsed value, or obj itself
def parse_json(obj)
  obj.is_a?(String) ? JSON.parse(obj) : obj
end
|
|
88
|
+
|
|
89
|
+
# Compare two Ruby values (Hash, Array, primitives) as used for JSON/YAML,
# dispatching on the type of the first one.
#
# Both values must be of exactly the same class; otherwise a type mismatch
# is recorded and reported without looking any deeper.
#
# @param obj1 [Object] value from the first document
# @param obj2 [Object] value from the second document
# @param opts [Hash] comparison options
# @param differences [Array] collector for recorded differences
# @param path [String] path of the values within the document
# @return [Object] one of the Comparison result codes
def compare_ruby_objects(obj1, obj2, opts, differences, path)
  unless obj1.instance_of?(obj2.class)
    add_ruby_difference(path, obj1, obj2, Comparison::UNEQUAL_TYPES,
                        opts, differences)
    return Comparison::UNEQUAL_TYPES
  end

  case obj1
  when Hash
    compare_hashes(obj1, obj2, opts, differences, path)
  when Array
    compare_arrays(obj1, obj2, opts, differences, path)
  when NilClass, TrueClass, FalseClass, Numeric, String, Symbol
    compare_primitives(obj1, obj2, opts, differences, path)
  else
    # Any other type: fall back to plain equality
    return Comparison::EQUIVALENT if obj1 == obj2

    add_ruby_difference(path, obj1, obj2, Comparison::UNEQUAL_PRIMITIVES,
                        opts, differences)
    Comparison::UNEQUAL_PRIMITIVES
  end
end
|
|
117
|
+
|
|
118
|
+
# Compare two hashes key by key.
#
# Keys present on one side only are recorded as missing; values under
# shared keys are compared recursively. Unless the :key_order match option
# is :strict, keys are visited in order of their string form. Key order
# itself is never compared here - the option only affects the order in
# which keys are visited and differences are recorded.
#
# @param hash1 [Hash] hash from the first document
# @param hash2 [Hash] hash from the second document
# @param opts [Hash] comparison options; opts[:match_opts] must be set
# @param differences [Array] collector for recorded differences
# @param path [String] dotted path of the hashes within the document
# @return [Object] UNEQUAL_HASH_VALUES when any shared key differs, else
#   MISSING_HASH_KEY when keys are missing on either side, else EQUIVALENT
def compare_hashes(hash1, hash2, opts, differences, path)
  keys1 = hash1.keys
  keys2 = hash2.keys

  unless opts[:match_opts][:key_order] == :strict
    keys1 = keys1.sort_by(&:to_s)
    keys2 = keys2.sort_by(&:to_s)
  end

  # Dotted path of a key below the current path
  path_for = ->(key) { path.empty? ? key.to_s : "#{path}.#{key}" }

  only_in_first = keys1 - keys2
  only_in_second = keys2 - keys1

  only_in_first.each do |key|
    key_path = path_for.call(key)
    add_ruby_difference(key_path, hash1[key], nil,
                        Comparison::MISSING_HASH_KEY, opts, differences)
  end

  only_in_second.each do |key|
    key_path = path_for.call(key)
    add_ruby_difference(key_path, nil, hash2[key],
                        Comparison::MISSING_HASH_KEY, opts, differences)
  end

  # Every shared key is compared (no short-circuit) so that all
  # differences get recorded
  outcomes = (keys1 & keys2).map do |key|
    key_path = path_for.call(key)
    compare_ruby_objects(hash1[key], hash2[key], opts, differences,
                         key_path)
  end

  # A differing value takes precedence over a missing key
  values_match = outcomes.all? { |code| code == Comparison::EQUIVALENT }
  return Comparison::UNEQUAL_HASH_VALUES unless values_match

  if only_in_first.empty? && only_in_second.empty?
    Comparison::EQUIVALENT
  else
    Comparison::MISSING_HASH_KEY
  end
end
|
|
164
|
+
|
|
165
|
+
# Compare two arrays position by position.
#
# A length mismatch is recorded as a single difference and no elements are
# compared. Otherwise every pair of elements is compared (no
# short-circuit), so all element differences get recorded.
#
# @param arr1 [Array] array from the first document
# @param arr2 [Array] array from the second document
# @param opts [Hash] comparison options
# @param differences [Array] collector for recorded differences
# @param path [String] path of the arrays within the document
# @return [Object] UNEQUAL_ARRAY_LENGTHS, UNEQUAL_ARRAY_ELEMENTS or
#   EQUIVALENT
def compare_arrays(arr1, arr2, opts, differences, path)
  if arr1.length != arr2.length
    add_ruby_difference(path, arr1, arr2,
                        Comparison::UNEQUAL_ARRAY_LENGTHS, opts,
                        differences)
    return Comparison::UNEQUAL_ARRAY_LENGTHS
  end

  outcomes = arr1.each_with_index.map do |left, position|
    compare_ruby_objects(left, arr2[position], opts, differences,
                         "#{path}[#{position}]")
  end

  if outcomes.all? { |code| code == Comparison::EQUIVALENT }
    Comparison::EQUIVALENT
  else
    Comparison::UNEQUAL_ARRAY_ELEMENTS
  end
end
|
|
185
|
+
|
|
186
|
+
# Compare two primitive values with ==, recording a difference when they
# are not equal.
#
# @param val1 [Object] value from the first document
# @param val2 [Object] value from the second document
# @param opts [Hash] comparison options
# @param differences [Array] collector for recorded differences
# @param path [String] path of the values within the document
# @return [Object] EQUIVALENT or UNEQUAL_PRIMITIVES
def compare_primitives(val1, val2, opts, differences, path)
  return Comparison::EQUIVALENT if val1 == val2

  add_ruby_difference(path, val1, val2, Comparison::UNEQUAL_PRIMITIVES,
                      opts, differences)
  Comparison::UNEQUAL_PRIMITIVES
end
|
|
197
|
+
|
|
198
|
+
# Append one difference record to the collector - only in verbose mode;
# otherwise nothing is recorded.
#
# @param path [String] path of the differing values
# @param obj1 [Object] value from the first document
# @param obj2 [Object] value from the second document
# @param diff_code [Object] Comparison code describing the difference
# @param opts [Hash] comparison options; only :verbose is consulted
# @param differences [Array] collector the record is appended to
def add_ruby_difference(path, obj1, obj2, diff_code, opts, differences)
  if opts[:verbose]
    record = { path: path, value1: obj1, value2: obj2, diff_code: diff_code }
    differences << record
  end
end
|
|
209
|
+
end
|
|
210
|
+
end
|
|
211
|
+
end
|
|
212
|
+
end
|