canon 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +69 -92
- data/README.adoc +13 -13
- data/docs/.lycheeignore +69 -0
- data/docs/Gemfile +1 -0
- data/docs/_config.yml +90 -1
- data/docs/advanced/diff-classification.adoc +82 -2
- data/docs/advanced/extending-canon.adoc +193 -0
- data/docs/features/match-options/index.adoc +239 -1
- data/docs/internals/diffnode-enrichment.adoc +611 -0
- data/docs/internals/index.adoc +251 -0
- data/docs/lychee.toml +13 -6
- data/docs/understanding/architecture.adoc +749 -33
- data/docs/understanding/comparison-pipeline.adoc +122 -0
- data/lib/canon/cache.rb +129 -0
- data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
- data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
- data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
- data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
- data/lib/canon/comparison/dimensions/registry.rb +77 -0
- data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
- data/lib/canon/comparison/dimensions.rb +54 -0
- data/lib/canon/comparison/format_detector.rb +87 -0
- data/lib/canon/comparison/html_comparator.rb +70 -26
- data/lib/canon/comparison/html_compare_profile.rb +8 -2
- data/lib/canon/comparison/html_parser.rb +80 -0
- data/lib/canon/comparison/json_comparator.rb +12 -0
- data/lib/canon/comparison/json_parser.rb +19 -0
- data/lib/canon/comparison/markup_comparator.rb +293 -0
- data/lib/canon/comparison/match_options/base_resolver.rb +150 -0
- data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
- data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
- data/lib/canon/comparison/match_options.rb +68 -463
- data/lib/canon/comparison/profile_definition.rb +149 -0
- data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
- data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
- data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
- data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +197 -0
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
- data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +79 -0
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +102 -0
- data/lib/canon/comparison/xml_comparator.rb +97 -684
- data/lib/canon/comparison/xml_node_comparison.rb +319 -0
- data/lib/canon/comparison/xml_parser.rb +19 -0
- data/lib/canon/comparison/yaml_comparator.rb +3 -3
- data/lib/canon/comparison.rb +265 -110
- data/lib/canon/diff/diff_classifier.rb +101 -2
- data/lib/canon/diff/diff_node.rb +32 -2
- data/lib/canon/diff/formatting_detector.rb +1 -1
- data/lib/canon/diff/node_serializer.rb +191 -0
- data/lib/canon/diff/path_builder.rb +143 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
- data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
- data/lib/canon/diff_formatter.rb +1 -1
- data/lib/canon/rspec_matchers.rb +38 -9
- data/lib/canon/tree_diff/operation_converter.rb +92 -338
- data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
- data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
- data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
- data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +24 -13
- metadata +48 -2
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "nokogiri"
|
|
4
4
|
require_relative "../comparison" # Load base module with constants first
|
|
5
|
+
require_relative "markup_comparator"
|
|
5
6
|
require_relative "xml_comparator"
|
|
6
7
|
require_relative "match_options"
|
|
7
8
|
require_relative "comparison_result"
|
|
@@ -11,12 +12,17 @@ require_relative "../diff/diff_node"
|
|
|
11
12
|
require_relative "../diff/diff_classifier"
|
|
12
13
|
require_relative "strategies/match_strategy_factory"
|
|
13
14
|
require_relative "../html/data_model"
|
|
15
|
+
require_relative "xml_node_comparison"
|
|
16
|
+
# Whitespace sensitivity module (single source of truth for sensitive elements)
|
|
17
|
+
require_relative "whitespace_sensitivity"
|
|
14
18
|
|
|
15
19
|
module Canon
|
|
16
20
|
module Comparison
|
|
17
21
|
# HTML comparison class
|
|
18
22
|
# Handles comparison of HTML nodes with various options
|
|
19
|
-
|
|
23
|
+
#
|
|
24
|
+
# Inherits shared comparison functionality from MarkupComparator.
|
|
25
|
+
class HtmlComparator < MarkupComparator
|
|
20
26
|
# Default comparison options for HTML
|
|
21
27
|
DEFAULT_OPTS = {
|
|
22
28
|
# Structural filtering options
|
|
@@ -108,6 +114,9 @@ module Canon
|
|
|
108
114
|
|
|
109
115
|
# DocumentFragment nodes need special handling - compare their children
|
|
110
116
|
# instead of the fragment nodes themselves
|
|
117
|
+
# This is a SAFETY CHECK for legacy cases where Nokogiri nodes might still be used
|
|
118
|
+
# The main path (parse_node) now returns Canon::Xml::Nodes::RootNode, so this
|
|
119
|
+
# check should rarely trigger, but we keep it for robustness
|
|
111
120
|
if (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
|
|
112
121
|
node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
|
|
113
122
|
(node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
|
|
@@ -117,10 +126,8 @@ module Canon
|
|
|
117
126
|
all_children2 = node2.children.to_a
|
|
118
127
|
|
|
119
128
|
# Filter children based on match options (e.g., ignore comments)
|
|
120
|
-
children1 =
|
|
121
|
-
|
|
122
|
-
children2 = XmlComparator.send(:filter_children, all_children2,
|
|
123
|
-
opts)
|
|
129
|
+
children1 = XmlNodeComparison.filter_children(all_children1, opts)
|
|
130
|
+
children2 = XmlNodeComparison.filter_children(all_children2, opts)
|
|
124
131
|
|
|
125
132
|
if children1.length != children2.length
|
|
126
133
|
result = Comparison::UNEQUAL_ELEMENTS
|
|
@@ -130,9 +137,10 @@ module Canon
|
|
|
130
137
|
# Compare each pair of children
|
|
131
138
|
result = Comparison::EQUIVALENT
|
|
132
139
|
children1.zip(children2).each do |child1, child2|
|
|
133
|
-
child_result =
|
|
134
|
-
|
|
135
|
-
|
|
140
|
+
child_result = XmlNodeComparison.compare_nodes(child1, child2,
|
|
141
|
+
opts, child_opts,
|
|
142
|
+
diff_children,
|
|
143
|
+
differences)
|
|
136
144
|
if child_result != Comparison::EQUIVALENT
|
|
137
145
|
result = child_result
|
|
138
146
|
break
|
|
@@ -140,8 +148,9 @@ module Canon
|
|
|
140
148
|
end
|
|
141
149
|
end
|
|
142
150
|
else
|
|
143
|
-
result =
|
|
144
|
-
|
|
151
|
+
result = XmlNodeComparison.compare_nodes(node1, node2, opts,
|
|
152
|
+
child_opts, diff_children,
|
|
153
|
+
differences)
|
|
145
154
|
end
|
|
146
155
|
|
|
147
156
|
# Classify DiffNodes as normative/informative if we have verbose output
|
|
@@ -287,7 +296,16 @@ module Canon
|
|
|
287
296
|
end
|
|
288
297
|
|
|
289
298
|
# Strip DOCTYPE for consistent parsing
|
|
290
|
-
|
|
299
|
+
# Locate the DOCTYPE with a fixed-prefix match plus String#index (no backtracking pattern) to avoid ReDoS
|
|
300
|
+
# DOCTYPE declarations end with first > character
|
|
301
|
+
doctype_start = html_string =~ /<!DOCTYPE/i
|
|
302
|
+
if doctype_start
|
|
303
|
+
doctype_end = html_string.index(">", doctype_start)
|
|
304
|
+
html_string = html_string[0...doctype_start] + html_string[(doctype_end + 1)..] if doctype_end
|
|
305
|
+
html_string.strip!
|
|
306
|
+
else
|
|
307
|
+
html_string = html_string.strip
|
|
308
|
+
end
|
|
291
309
|
|
|
292
310
|
# Apply preprocessing to HTML string before parsing
|
|
293
311
|
processed_html = case preprocessing
|
|
@@ -313,8 +331,15 @@ module Canon
|
|
|
313
331
|
|
|
314
332
|
# Parse a node from string or return as-is
|
|
315
333
|
# Applies preprocessing transformation before parsing if specified
|
|
316
|
-
#
|
|
334
|
+
# Returns Nokogiri nodes for DOM comparison (preserves original behavior)
|
|
317
335
|
def parse_node(node, preprocessing = :none, match_opts = {})
|
|
336
|
+
# If already a Canon::Xml::Node, convert to Nokogiri for DOM path
|
|
337
|
+
if node.is_a?(Canon::Xml::Node)
|
|
338
|
+
# Canon nodes used in semantic diff path, convert to Nokogiri for DOM path
|
|
339
|
+
xml_str = Canon::Xml::DataModel.serialize(node)
|
|
340
|
+
node = xml_str
|
|
341
|
+
end
|
|
342
|
+
|
|
318
343
|
# If already a Nokogiri node, check for incompatible XML documents
|
|
319
344
|
unless node.is_a?(String)
|
|
320
345
|
# Detect if this is an XML document (not HTML)
|
|
@@ -357,7 +382,15 @@ module Canon
|
|
|
357
382
|
|
|
358
383
|
# Strip DOCTYPE declarations from HTML strings
|
|
359
384
|
# This normalizes parsed HTML (which includes DOCTYPE) with raw HTML strings
|
|
360
|
-
|
|
385
|
+
# Locate the DOCTYPE with a fixed-prefix match plus String#index (no backtracking pattern) to avoid ReDoS
|
|
386
|
+
doctype_start = node =~ /<!DOCTYPE/i
|
|
387
|
+
if doctype_start
|
|
388
|
+
doctype_end = node.index(">", doctype_start)
|
|
389
|
+
node = node[0...doctype_start] + node[(doctype_end + 1)..] if doctype_end
|
|
390
|
+
node.strip!
|
|
391
|
+
else
|
|
392
|
+
node = node.strip
|
|
393
|
+
end
|
|
361
394
|
|
|
362
395
|
# Apply preprocessing to HTML string before parsing
|
|
363
396
|
html_string = case preprocessing
|
|
@@ -380,10 +413,12 @@ module Canon
|
|
|
380
413
|
# Use XML fragment parser to avoid auto-inserted meta tags
|
|
381
414
|
frag = Nokogiri::XML.fragment(html_string)
|
|
382
415
|
|
|
383
|
-
# Apply :rendered preprocessing
|
|
384
|
-
if
|
|
416
|
+
# Apply post-parsing filtering for :normalize, :format, and :rendered preprocessing
|
|
417
|
+
if %i[normalize format rendered].include?(preprocessing)
|
|
385
418
|
normalize_html_style_script_comments(frag)
|
|
386
|
-
|
|
419
|
+
if preprocessing == :rendered
|
|
420
|
+
normalize_rendered_whitespace(frag, match_opts)
|
|
421
|
+
end
|
|
387
422
|
remove_whitespace_only_text_nodes(frag)
|
|
388
423
|
end
|
|
389
424
|
|
|
@@ -461,9 +496,9 @@ module Canon
|
|
|
461
496
|
# @param node [Canon::Xml::Node, Nokogiri::HTML::Document] Parsed node
|
|
462
497
|
# @return [String] Serialized HTML string
|
|
463
498
|
def serialize_for_display(node)
|
|
464
|
-
# Use
|
|
499
|
+
# Use XmlNodeComparison's serializer for Canon::Xml::Node
|
|
465
500
|
if node.is_a?(Canon::Xml::Node)
|
|
466
|
-
|
|
501
|
+
XmlNodeComparison.serialize_node_to_xml(node)
|
|
467
502
|
elsif node.respond_to?(:to_html)
|
|
468
503
|
node.to_html
|
|
469
504
|
elsif node.respond_to?(:to_xml)
|
|
@@ -509,16 +544,22 @@ compare_profile = nil)
|
|
|
509
544
|
return if match_opts[:text_content] == :strict
|
|
510
545
|
|
|
511
546
|
# Elements where whitespace is significant - don't normalize
|
|
512
|
-
#
|
|
547
|
+
# SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
|
|
548
|
+
# This ensures consistency between preprocessing and comparison logic
|
|
549
|
+
# SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
|
|
550
|
+
# This ensures consistency between preprocessing and comparison logic
|
|
513
551
|
preserve_whitespace = if compare_profile.is_a?(HtmlCompareProfile)
|
|
514
552
|
# Profile handles HTML-specific whitespace rules
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
553
|
+
# Get default list and filter by profile
|
|
554
|
+
WhitespaceSensitivity
|
|
555
|
+
.format_default_sensitive_elements(match_opts)
|
|
556
|
+
.select do |elem|
|
|
557
|
+
compare_profile.preserve_whitespace?(elem.to_s)
|
|
558
|
+
end
|
|
559
|
+
.map(&:to_s)
|
|
519
560
|
else
|
|
520
|
-
#
|
|
521
|
-
|
|
561
|
+
# Use default list from WhitespaceSensitivity (single source of truth)
|
|
562
|
+
WhitespaceSensitivity.format_default_sensitive_elements(match_opts).map(&:to_s)
|
|
522
563
|
end
|
|
523
564
|
|
|
524
565
|
# Walk all text nodes
|
|
@@ -574,9 +615,12 @@ compare_profile = nil)
|
|
|
574
615
|
#
|
|
575
616
|
# CRITICAL: Do NOT remove whitespace-only text nodes from whitespace-sensitive
|
|
576
617
|
# elements like <pre>, <code>, <textarea>, <script>, <style>
|
|
618
|
+
#
|
|
619
|
+
# SINGLE SOURCE OF TRUTH: Uses WhitespaceSensitivity.format_default_sensitive_elements
|
|
577
620
|
def remove_whitespace_only_text_nodes(doc)
|
|
578
621
|
# Elements where whitespace is significant - don't remove whitespace-only nodes
|
|
579
|
-
|
|
622
|
+
# SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
|
|
623
|
+
preserve_whitespace = WhitespaceSensitivity.format_default_sensitive_elements(format: :html).map(&:to_s)
|
|
580
624
|
|
|
581
625
|
doc.xpath(".//text()").each do |text_node|
|
|
582
626
|
# CRITICAL: Skip if this text node is inside a whitespace-preserving element
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "compare_profile"
|
|
4
|
+
# Whitespace sensitivity module (single source of truth for sensitive elements)
|
|
5
|
+
require_relative "whitespace_sensitivity"
|
|
4
6
|
|
|
5
7
|
module Canon
|
|
6
8
|
module Comparison
|
|
@@ -82,9 +84,13 @@ module Canon
|
|
|
82
84
|
private
|
|
83
85
|
|
|
84
86
|
# Elements where whitespace is semantically significant in HTML
|
|
85
|
-
#
|
|
87
|
+
#
|
|
88
|
+
# SINGLE SOURCE OF TRUTH: Delegates to WhitespaceSensitivity.format_default_sensitive_elements
|
|
89
|
+
# This ensures consistency across the codebase.
|
|
90
|
+
#
|
|
91
|
+
# @return [Array<String>] List of element names (as strings)
|
|
86
92
|
def whitespace_sensitive_elements
|
|
87
|
-
|
|
93
|
+
WhitespaceSensitivity.format_default_sensitive_elements(format: @html_version).map(&:to_s)
|
|
88
94
|
end
|
|
89
95
|
|
|
90
96
|
# Check if a dimension is explicitly set to :strict
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
5
|
+
module Canon
  module Comparison
    # HTML parsing service with version detection and fragment support
    #
    # Parses HTML strings into Nokogiri fragments, choosing the HTML4 or HTML5
    # fragment parser either explicitly (via the +format+ argument) or by
    # looking for an HTML5 DOCTYPE in the content. Anything that is not a
    # String is passed through untouched.
    #
    # @example Parse HTML string
    #   HtmlParser.parse("<div>content</div>", :html5)
    #
    # @example Auto-detect and parse
    #   HtmlParser.detect_and_parse("<!DOCTYPE html><html>...</html>")
    class HtmlParser
      # HTML5 DOCTYPE: "<!DOCTYPE", whitespace, "html", optional whitespace,
      # ">". Keyword and name are matched case-insensitively, so
      # "<!doctype html>" is recognised as well.
      HTML5_DOCTYPE = /<!DOCTYPE\s+html\s*>/i

      class << self
        # Parse HTML string into a Nokogiri fragment with the correct parser
        #
        # @param content [String, Object] Content to parse (returns as-is if not a string)
        # @param format [Symbol] HTML format (:html, :html4, :html5)
        # @return [Object] Parsed fragment for a String with a known format;
        #   otherwise +content+ itself (non-String input, unknown format, or
        #   a parser error)
        def parse(content, format)
          # Non-String input (including already-parsed documents/fragments)
          # is handed back unchanged.
          return content unless content.is_a?(String)

          begin
            case format
            when :html5 then Nokogiri::HTML5.fragment(content)
            when :html4 then Nokogiri::HTML4.fragment(content)
            when :html then detect_and_parse(content)
            else content
            end
          rescue StandardError
            # Fallback to raw string if parsing fails (maintains backward compatibility)
            content
          end
        end

        # Check if content is already a parsed HTML document/fragment
        #
        # @param content [Object] Content to check
        # @return [Boolean] true if already parsed
        def already_parsed?(content)
          # Kept as a short-circuiting chain so later constants are only
          # resolved when the earlier checks fail.
          content.is_a?(Nokogiri::HTML::Document) ||
            content.is_a?(Nokogiri::HTML5::Document) ||
            content.is_a?(Nokogiri::HTML::DocumentFragment) ||
            content.is_a?(Nokogiri::HTML5::DocumentFragment)
        end

        # Detect HTML version from content and parse with appropriate parser
        #
        # @param content [String] HTML content to parse
        # @return [Nokogiri::HTML::DocumentFragment] Parsed fragment
        def detect_and_parse(content)
          if detect_version(content) == :html5
            Nokogiri::HTML5.fragment(content)
          else
            Nokogiri::HTML4.fragment(content)
          end
        end

        # Detect HTML version from content string
        #
        # @param content [String] HTML content
        # @return [Symbol] :html5 when an HTML5 DOCTYPE is present, else :html4
        def detect_version(content)
          # Check for HTML5 DOCTYPE (case-insensitive)
          content.match?(HTML5_DOCTYPE) ? :html5 : :html4
        end
      end
    end
  end
end
|
|
@@ -125,6 +125,18 @@ module Canon
|
|
|
125
125
|
if match_opts[:key_order] != :strict
|
|
126
126
|
keys1 = keys1.sort_by(&:to_s)
|
|
127
127
|
keys2 = keys2.sort_by(&:to_s)
|
|
128
|
+
elsif keys1 != keys2
|
|
129
|
+
# Strict mode: key order matters
|
|
130
|
+
# Check if keys are in same order
|
|
131
|
+
# Keys are different or in different order
|
|
132
|
+
# First check if it's just ordering (same keys, different order)
|
|
133
|
+
if keys1.sort_by(&:to_s) == keys2.sort_by(&:to_s)
|
|
134
|
+
# Same keys, different order - this is a key_order difference
|
|
135
|
+
key_path = path.empty? ? "(key order)" : "#{path}.(key order)"
|
|
136
|
+
add_ruby_difference(key_path, keys1, keys2,
|
|
137
|
+
Comparison::UNEQUAL_HASH_KEY_ORDER, opts, differences)
|
|
138
|
+
return Comparison::UNEQUAL_HASH_KEY_ORDER
|
|
139
|
+
end
|
|
128
140
|
end
|
|
129
141
|
|
|
130
142
|
# Check for missing keys
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
  module Comparison
    # Public entry point for JSON parsing
    #
    # Lets callers reach JsonComparator's parsing without resorting to send().
    class JsonParser
      class << self
        # Turn JSON input into a Ruby object by delegating to JsonComparator
        #
        # @param obj [String, Hash, Array] Object to parse
        # @return [Hash, Array] Parsed Ruby object
        def parse_json(obj)
          # Loaded at call time rather than at file load
          # (presumably to avoid a require cycle - TODO confirm).
          require_relative "json_comparator"
          JsonComparator.parse_json(obj)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../comparison" # Load base module with constants
|
|
4
|
+
require_relative "../diff/diff_node"
|
|
5
|
+
require_relative "../diff/path_builder"
|
|
6
|
+
|
|
7
|
+
module Canon
  module Comparison
    # Base class for markup document comparison (XML, HTML)
    #
    # Provides shared comparison functionality for markup documents,
    # including node type checking, text extraction, filtering,
    # and difference creation.
    #
    # Format-specific comparators (XmlComparator, HtmlComparator)
    # inherit from this class and add format-specific behavior.
    class MarkupComparator
      class << self
        # Add a difference to the differences array
        #
        # Creates a DiffNode with enriched metadata including path,
        # serialized content, and attributes for Stage 4 rendering.
        #
        # @param node1 [Object, nil] First node
        # @param node2 [Object, nil] Second node
        # @param diff1 [Symbol] Difference type for node1
        # @param diff2 [Symbol] Difference type for node2
        # @param dimension [Symbol] The match dimension causing this difference
        # @param _opts [Hash] Options (unused but kept for interface compatibility)
        # @param differences [Array] Array to append difference to
        # @return [Array] The +differences+ array with the new DiffNode appended
        # @raise [ArgumentError] if +dimension+ is nil
        def add_difference(node1, node2, diff1, diff2, dimension, _opts,
                           differences)
          # All differences must be DiffNode objects (OO architecture)
          raise ArgumentError, "dimension required for DiffNode" if dimension.nil?

          # Build informative reason message
          reason = build_difference_reason(node1, node2, diff1, diff2,
                                           dimension)

          # Enrich with path, serialized content, and attributes for Stage 4 rendering
          metadata = enrich_diff_metadata(node1, node2)

          differences << Canon::Diff::DiffNode.new(
            node1: node1,
            node2: node2,
            dimension: dimension,
            reason: reason,
            **metadata,
          )
        end

        # Enrich DiffNode with canonical path, serialized content, and attributes
        # This extracts presentation-ready metadata from nodes for Stage 4 rendering
        #
        # @param node1 [Object, nil] First node
        # @param node2 [Object, nil] Second node
        # @return [Hash] Metadata with :path, :serialized_before,
        #   :serialized_after, :attributes_before and :attributes_after
        def enrich_diff_metadata(node1, node2)
          {
            # The path comes from node1 when present, otherwise from node2
            path: build_path_for_node(node1 || node2),
            serialized_before: serialize_node(node1),
            serialized_after: serialize_node(node2),
            attributes_before: extract_attributes(node1),
            attributes_after: extract_attributes(node2),
          }
        end

        # Build canonical path for a node
        #
        # @param node [Object] Node to build path for
        # @return [String, nil] Canonical path with ordinal indices (nil for a nil node)
        def build_path_for_node(node)
          return nil if node.nil?

          Canon::Diff::PathBuilder.build(node, format: :document)
        end

        # Serialize a node to string for display
        #
        # Canon::Xml nodes are serialized by hand; any other object falls back
        # to +to_xml+, then +to_html+, then +to_s+.
        #
        # @param node [Object, nil] Node to serialize
        # @return [String, nil] Serialized content
        def serialize_node(node)
          return nil if node.nil?

          case node
          when Canon::Xml::Nodes::RootNode
            # Serialize all children of root
            node.children.map { |child| serialize_node(child) }.join
          when Canon::Xml::Nodes::ElementNode
            serialize_element_node(node)
          when Canon::Xml::Nodes::TextNode
            node.value
          when Canon::Xml::Nodes::CommentNode
            "<!--#{node.value}-->"
          when Canon::Xml::Nodes::ProcessingInstructionNode
            # Only emit the separating space when there is data to follow it,
            # so an empty instruction renders as "<?target?>" not "<?target ?>".
            data = node.data.to_s
            data.empty? ? "<?#{node.target}?>" : "<?#{node.target} #{data}?>"
          else
            if node.respond_to?(:to_xml)
              node.to_xml
            elsif node.respond_to?(:to_html)
              node.to_html
            else
              node.to_s
            end
          end
        end

        # Extract attributes from a node
        #
        # @param node [Object, nil] Node to extract attributes from
        # @return [Hash, nil] Hash of attribute name => value pairs; nil for a
        #   nil node, empty Hash for objects that expose no attributes
        def extract_attributes(node)
          return nil if node.nil?

          # Canon::Xml::Node ElementNode
          if node.is_a?(Canon::Xml::Nodes::ElementNode)
            node.attribute_nodes.each_with_object({}) do |attr, hash|
              hash[attr.name] = attr.value
            end
          # Nokogiri nodes
          elsif node.respond_to?(:attributes)
            node.attributes.each_with_object({}) do |(_, attr), hash|
              hash[attr.name] = attr.value
            end
          else
            {}
          end
        end

        # Filter children based on options
        #
        # Removes nodes that should be excluded from comparison based on
        # options like :ignore_nodes, :ignore_comments, etc.
        #
        # @param children [Array] Array of child nodes
        # @param opts [Hash] Comparison options
        # @return [Array] Filtered array of children
        def filter_children(children, opts)
          children.reject { |child| node_excluded?(child, opts) }
        end

        # Check if node should be excluded from comparison
        #
        # @param node [Object] Node to check
        # @param opts [Hash] Comparison options (:ignore_nodes,
        #   :ignore_comments, :ignore_text_nodes, :match_opts)
        # @return [Boolean] true if node should be excluded
        def node_excluded?(node, opts)
          return false if node.nil?
          return true if opts[:ignore_nodes]&.include?(node)
          return true if opts[:ignore_comments] && comment_node?(node)
          return true if opts[:ignore_text_nodes] && text_node?(node)

          # Check structural_whitespace match option
          match_opts = opts[:match_opts]
          # Filter out whitespace-only text nodes
          if match_opts &&
              %i[ignore normalize].include?(match_opts[:structural_whitespace]) &&
              text_node?(node)
            text = node_text(node)
            return true if MatchOptions.normalize_text(text).empty?
          end

          false
        end

        # Check if two nodes are the same type
        #
        # @param node1 [Object] First node
        # @param node2 [Object] Second node
        # @return [Boolean] true if nodes are same type
        def same_node_type?(node1, node2)
          return false if node1.class != node2.class

          # For Nokogiri/Canon::Xml nodes, check node type
          if node1.respond_to?(:node_type) && node2.respond_to?(:node_type)
            node1.node_type == node2.node_type
          else
            true
          end
        end

        # Check if a node is a comment node
        #
        # @param node [Object] Node to check
        # @return [Boolean] true if node is a comment
        def comment_node?(node)
          (node.respond_to?(:comment?) && node.comment?) ||
            (node.respond_to?(:node_type) && node.node_type == :comment)
        end

        # Check if a node is a text node
        #
        # @param node [Object] Node to check
        # @return [Boolean] true if node is a text node
        def text_node?(node)
          # Grouping spelled out; it matches the precedence of && over ||.
          # NOTE(review): the first clause only matches objects that answer
          # text? and do NOT define element? at all. Node classes defining
          # both predicates are recognised solely via node_type == :text.
          # Confirm this is intended.
          (node.respond_to?(:text?) && node.text? &&
            !node.respond_to?(:element?)) ||
            (node.respond_to?(:node_type) && node.node_type == :text)
        end

        # Get text content from a node
        #
        # @param node [Object] Node to get text from
        # @return [String] Text content
        def node_text(node)
          # Canon::Xml::Node TextNode uses .value
          if node.respond_to?(:value)
            node.value.to_s
          # Nokogiri nodes use .content
          elsif node.respond_to?(:content)
            node.content.to_s
          else
            node.to_s
          end
        end

        # Check if difference between two texts is only whitespace
        #
        # @param text1 [String] First text
        # @param text2 [String] Second text
        # @return [Boolean] true if difference is only in whitespace
        def whitespace_only_difference?(text1, text2)
          # If the normalized texts are the same, the difference was only whitespace
          MatchOptions.normalize_text(text1) == MatchOptions.normalize_text(text2)
        end

        # Build a human-readable reason for a difference
        #
        # @param _node1 [Object, nil] First node (unused by the default message)
        # @param _node2 [Object, nil] Second node (unused by the default message)
        # @param diff1 [Symbol] Difference type for node1
        # @param diff2 [Symbol] Difference type for node2
        # @param dimension [Symbol] The dimension of the difference
        # @return [String] Human-readable reason
        def build_difference_reason(_node1, _node2, diff1, diff2, dimension)
          # Default reason - can be overridden in subclasses
          "Difference in #{dimension}: #{diff1} vs #{diff2}"
        end

        # Serialize an element node to string
        #
        # Attribute values and text are written as-is (no escaping); an
        # element with no serialized children is written self-closing.
        #
        # @param node [Canon::Xml::Nodes::ElementNode] Element node
        # @return [String] Serialized element
        def serialize_element_node(node)
          attrs = node.attribute_nodes.map do |a|
            " #{a.name}=\"#{a.value}\""
          end.join
          children_xml = node.children.map { |c| serialize_node(c) }.join

          if children_xml.empty?
            "<#{node.name}#{attrs}/>"
          else
            "<#{node.name}#{attrs}>#{children_xml}</#{node.name}>"
          end
        end

        # Determine the appropriate dimension for a node type
        #
        # @param node [Object] The node to check
        # @return [Symbol] :comments, :processing_instructions or
        #   :text_content (also the fallback for anything unrecognised)
        def determine_node_dimension(node)
          # Canon::Xml::Node types
          if node.respond_to?(:node_type) && node.node_type.is_a?(Symbol)
            case node.node_type
            when :comment then :comments
            when :text, :cdata then :text_content
            when :processing_instruction then :processing_instructions
            else :text_content
            end
          # Moxml/Nokogiri types
          elsif node.respond_to?(:comment?) && node.comment?
            :comments
          elsif node.respond_to?(:text?) && node.text?
            :text_content
          elsif node.respond_to?(:cdata?) && node.cdata?
            :text_content
          elsif node.respond_to?(:processing_instruction?) && node.processing_instruction?
            :processing_instructions
          else
            :text_content
          end
        end
      end
    end
  end
end
|