canon 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +25 -135
- data/README.adoc +13 -13
- data/docs/.lycheeignore +69 -0
- data/docs/advanced/extending-canon.adoc +193 -0
- data/docs/internals/diffnode-enrichment.adoc +611 -0
- data/docs/internals/index.adoc +251 -0
- data/docs/lychee.toml +13 -6
- data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +250 -0
- data/docs/understanding/architecture.adoc +749 -33
- data/docs/understanding/comparison-pipeline.adoc +122 -0
- data/false_positive_analysis.txt +0 -0
- data/file1.html +1 -0
- data/file2.html +1 -0
- data/lib/canon/cache.rb +129 -0
- data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
- data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
- data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
- data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
- data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
- data/lib/canon/comparison/dimensions/registry.rb +77 -0
- data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
- data/lib/canon/comparison/dimensions.rb +54 -0
- data/lib/canon/comparison/format_detector.rb +86 -0
- data/lib/canon/comparison/html_comparator.rb +51 -18
- data/lib/canon/comparison/html_parser.rb +80 -0
- data/lib/canon/comparison/json_comparator.rb +12 -0
- data/lib/canon/comparison/json_parser.rb +19 -0
- data/lib/canon/comparison/markup_comparator.rb +293 -0
- data/lib/canon/comparison/match_options/base_resolver.rb +143 -0
- data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
- data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
- data/lib/canon/comparison/match_options.rb +68 -463
- data/lib/canon/comparison/profile_definition.rb +149 -0
- data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
- data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
- data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +189 -0
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
- data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +74 -0
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +95 -0
- data/lib/canon/comparison/xml_comparator.rb +52 -664
- data/lib/canon/comparison/xml_node_comparison.rb +297 -0
- data/lib/canon/comparison/xml_parser.rb +19 -0
- data/lib/canon/comparison/yaml_comparator.rb +3 -3
- data/lib/canon/comparison.rb +265 -110
- data/lib/canon/diff/diff_node.rb +32 -2
- data/lib/canon/diff/node_serializer.rb +191 -0
- data/lib/canon/diff/path_builder.rb +143 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
- data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
- data/lib/canon/diff_formatter.rb +1 -1
- data/lib/canon/rspec_matchers.rb +1 -1
- data/lib/canon/tree_diff/operation_converter.rb +92 -338
- data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
- data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
- data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
- data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
- data/lib/canon/version.rb +1 -1
- data/old-docs/ADVANCED_TOPICS.adoc +20 -0
- data/old-docs/BASIC_USAGE.adoc +16 -0
- data/old-docs/CHARACTER_VISUALIZATION.adoc +567 -0
- data/old-docs/CLI.adoc +497 -0
- data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
- data/old-docs/DIFF_ARCHITECTURE.adoc +435 -0
- data/old-docs/DIFF_FORMATTING.adoc +540 -0
- data/old-docs/DIFF_PARAMETERS.adoc +261 -0
- data/old-docs/DOM_DIFF.adoc +1017 -0
- data/old-docs/ENV_CONFIG.adoc +876 -0
- data/old-docs/FORMATS.adoc +867 -0
- data/old-docs/INPUT_VALIDATION.adoc +477 -0
- data/old-docs/MATCHER_BEHAVIOR.adoc +90 -0
- data/old-docs/MATCH_ARCHITECTURE.adoc +463 -0
- data/old-docs/MATCH_OPTIONS.adoc +912 -0
- data/old-docs/MODES.adoc +432 -0
- data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
- data/old-docs/OPTIONS.adoc +1387 -0
- data/old-docs/PREPROCESSING.adoc +491 -0
- data/old-docs/README.old.adoc +2831 -0
- data/old-docs/RSPEC.adoc +814 -0
- data/old-docs/RUBY_API.adoc +485 -0
- data/old-docs/SEMANTIC_DIFF_REPORT.adoc +646 -0
- data/old-docs/SEMANTIC_TREE_DIFF.adoc +765 -0
- data/old-docs/STRING_COMPARE.adoc +345 -0
- data/old-docs/TMP.adoc +3384 -0
- data/old-docs/TREE_DIFF.adoc +1080 -0
- data/old-docs/UNDERSTANDING_CANON.adoc +17 -0
- data/old-docs/VERBOSE.adoc +482 -0
- data/old-docs/VISUALIZATION_MAP.adoc +625 -0
- data/old-docs/WHITESPACE_TREATMENT.adoc +1155 -0
- data/scripts/analyze_current_state.rb +85 -0
- data/scripts/analyze_false_positives.rb +114 -0
- data/scripts/analyze_remaining_failures.rb +105 -0
- data/scripts/compare_current_failures.rb +95 -0
- data/scripts/compare_dom_tree_diff.rb +158 -0
- data/scripts/compare_failures.rb +151 -0
- data/scripts/debug_attribute_extraction.rb +66 -0
- data/scripts/debug_blocks_839.rb +115 -0
- data/scripts/debug_meta_matching.rb +52 -0
- data/scripts/debug_p_matching.rb +192 -0
- data/scripts/debug_signature_matching.rb +118 -0
- data/scripts/debug_sourcecode_124.rb +32 -0
- data/scripts/debug_whitespace_sensitive.rb +192 -0
- data/scripts/extract_false_positives.rb +138 -0
- data/scripts/find_actual_false_positives.rb +125 -0
- data/scripts/investigate_all_false_positives.rb +161 -0
- data/scripts/investigate_batch1.rb +127 -0
- data/scripts/investigate_classification.rb +150 -0
- data/scripts/investigate_classification_detailed.rb +190 -0
- data/scripts/investigate_common_failures.rb +342 -0
- data/scripts/investigate_false_negative.rb +80 -0
- data/scripts/investigate_false_positive.rb +83 -0
- data/scripts/investigate_false_positives.rb +227 -0
- data/scripts/investigate_false_positives_batch.rb +163 -0
- data/scripts/investigate_mixed_content.rb +125 -0
- data/scripts/investigate_remaining_16.rb +214 -0
- data/scripts/run_single_test.rb +29 -0
- data/scripts/test_all_false_positives.rb +95 -0
- data/scripts/test_attribute_details.rb +61 -0
- data/scripts/test_both_algorithms.rb +49 -0
- data/scripts/test_both_simple.rb +49 -0
- data/scripts/test_enhanced_semantic_output.rb +125 -0
- data/scripts/test_readme_examples.rb +131 -0
- data/scripts/test_semantic_tree_diff.rb +99 -0
- data/scripts/test_semantic_ux_improvements.rb +135 -0
- data/scripts/test_single_false_positive.rb +119 -0
- data/scripts/test_size_limits.rb +99 -0
- data/test_html_1.html +21 -0
- data/test_html_2.html +21 -0
- data/test_nokogiri.rb +33 -0
- data/test_normalize.rb +45 -0
- metadata +123 -2
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "nokogiri"
|
|
4
4
|
require_relative "../comparison" # Load base module with constants first
|
|
5
|
+
require_relative "markup_comparator"
|
|
5
6
|
require_relative "xml_comparator"
|
|
6
7
|
require_relative "match_options"
|
|
7
8
|
require_relative "comparison_result"
|
|
@@ -11,12 +12,15 @@ require_relative "../diff/diff_node"
|
|
|
11
12
|
require_relative "../diff/diff_classifier"
|
|
12
13
|
require_relative "strategies/match_strategy_factory"
|
|
13
14
|
require_relative "../html/data_model"
|
|
15
|
+
require_relative "xml_node_comparison"
|
|
14
16
|
|
|
15
17
|
module Canon
|
|
16
18
|
module Comparison
|
|
17
19
|
# HTML comparison class
|
|
18
20
|
# Handles comparison of HTML nodes with various options
|
|
19
|
-
|
|
21
|
+
#
|
|
22
|
+
# Inherits shared comparison functionality from MarkupComparator.
|
|
23
|
+
class HtmlComparator < MarkupComparator
|
|
20
24
|
# Default comparison options for HTML
|
|
21
25
|
DEFAULT_OPTS = {
|
|
22
26
|
# Structural filtering options
|
|
@@ -108,6 +112,9 @@ module Canon
|
|
|
108
112
|
|
|
109
113
|
# DocumentFragment nodes need special handling - compare their children
|
|
110
114
|
# instead of the fragment nodes themselves
|
|
115
|
+
# This is a SAFETY CHECK for legacy cases where Nokogiri nodes might still be used
|
|
116
|
+
# The main path (parse_node) now returns Canon::Xml::Nodes::RootNode, so this
|
|
117
|
+
# check should rarely trigger, but we keep it for robustness
|
|
111
118
|
if (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
|
|
112
119
|
node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
|
|
113
120
|
(node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
|
|
@@ -117,10 +124,8 @@ module Canon
|
|
|
117
124
|
all_children2 = node2.children.to_a
|
|
118
125
|
|
|
119
126
|
# Filter children based on match options (e.g., ignore comments)
|
|
120
|
-
children1 =
|
|
121
|
-
|
|
122
|
-
children2 = XmlComparator.send(:filter_children, all_children2,
|
|
123
|
-
opts)
|
|
127
|
+
children1 = XmlNodeComparison.filter_children(all_children1, opts)
|
|
128
|
+
children2 = XmlNodeComparison.filter_children(all_children2, opts)
|
|
124
129
|
|
|
125
130
|
if children1.length != children2.length
|
|
126
131
|
result = Comparison::UNEQUAL_ELEMENTS
|
|
@@ -130,9 +135,10 @@ module Canon
|
|
|
130
135
|
# Compare each pair of children
|
|
131
136
|
result = Comparison::EQUIVALENT
|
|
132
137
|
children1.zip(children2).each do |child1, child2|
|
|
133
|
-
child_result =
|
|
134
|
-
|
|
135
|
-
|
|
138
|
+
child_result = XmlNodeComparison.compare_nodes(child1, child2,
|
|
139
|
+
opts, child_opts,
|
|
140
|
+
diff_children,
|
|
141
|
+
differences)
|
|
136
142
|
if child_result != Comparison::EQUIVALENT
|
|
137
143
|
result = child_result
|
|
138
144
|
break
|
|
@@ -140,8 +146,9 @@ module Canon
|
|
|
140
146
|
end
|
|
141
147
|
end
|
|
142
148
|
else
|
|
143
|
-
result =
|
|
144
|
-
|
|
149
|
+
result = XmlNodeComparison.compare_nodes(node1, node2, opts,
|
|
150
|
+
child_opts, diff_children,
|
|
151
|
+
differences)
|
|
145
152
|
end
|
|
146
153
|
|
|
147
154
|
# Classify DiffNodes as normative/informative if we have verbose output
|
|
@@ -287,7 +294,16 @@ module Canon
|
|
|
287
294
|
end
|
|
288
295
|
|
|
289
296
|
# Strip DOCTYPE for consistent parsing
|
|
290
|
-
|
|
297
|
+
# Use non-regex approach to avoid ReDoS vulnerability
|
|
298
|
+
# DOCTYPE declarations end with first > character
|
|
299
|
+
doctype_start = html_string =~ /<!DOCTYPE/i
|
|
300
|
+
if doctype_start
|
|
301
|
+
doctype_end = html_string.index(">", doctype_start)
|
|
302
|
+
html_string = html_string[0...doctype_start] + html_string[(doctype_end + 1)..] if doctype_end
|
|
303
|
+
html_string.strip!
|
|
304
|
+
else
|
|
305
|
+
html_string = html_string.strip
|
|
306
|
+
end
|
|
291
307
|
|
|
292
308
|
# Apply preprocessing to HTML string before parsing
|
|
293
309
|
processed_html = case preprocessing
|
|
@@ -313,8 +329,15 @@ module Canon
|
|
|
313
329
|
|
|
314
330
|
# Parse a node from string or return as-is
|
|
315
331
|
# Applies preprocessing transformation before parsing if specified
|
|
316
|
-
#
|
|
332
|
+
# Returns Nokogiri nodes for DOM comparison (preserves original behavior)
|
|
317
333
|
def parse_node(node, preprocessing = :none, match_opts = {})
|
|
334
|
+
# If already a Canon::Xml::Node, convert to Nokogiri for DOM path
|
|
335
|
+
if node.is_a?(Canon::Xml::Node)
|
|
336
|
+
# Canon nodes used in semantic diff path, convert to Nokogiri for DOM path
|
|
337
|
+
xml_str = Canon::Xml::DataModel.serialize(node)
|
|
338
|
+
node = xml_str
|
|
339
|
+
end
|
|
340
|
+
|
|
318
341
|
# If already a Nokogiri node, check for incompatible XML documents
|
|
319
342
|
unless node.is_a?(String)
|
|
320
343
|
# Detect if this is an XML document (not HTML)
|
|
@@ -357,7 +380,15 @@ module Canon
|
|
|
357
380
|
|
|
358
381
|
# Strip DOCTYPE declarations from HTML strings
|
|
359
382
|
# This normalizes parsed HTML (which includes DOCTYPE) with raw HTML strings
|
|
360
|
-
|
|
383
|
+
# Use non-regex approach to avoid ReDoS vulnerability
|
|
384
|
+
doctype_start = node =~ /<!DOCTYPE/i
|
|
385
|
+
if doctype_start
|
|
386
|
+
doctype_end = node.index(">", doctype_start)
|
|
387
|
+
node = node[0...doctype_start] + node[(doctype_end + 1)..] if doctype_end
|
|
388
|
+
node.strip!
|
|
389
|
+
else
|
|
390
|
+
node = node.strip
|
|
391
|
+
end
|
|
361
392
|
|
|
362
393
|
# Apply preprocessing to HTML string before parsing
|
|
363
394
|
html_string = case preprocessing
|
|
@@ -380,10 +411,12 @@ module Canon
|
|
|
380
411
|
# Use XML fragment parser to avoid auto-inserted meta tags
|
|
381
412
|
frag = Nokogiri::XML.fragment(html_string)
|
|
382
413
|
|
|
383
|
-
# Apply :rendered preprocessing
|
|
384
|
-
if
|
|
414
|
+
# Apply post-parsing filtering for :normalize, :format, and :rendered preprocessing
|
|
415
|
+
if %i[normalize format rendered].include?(preprocessing)
|
|
385
416
|
normalize_html_style_script_comments(frag)
|
|
386
|
-
|
|
417
|
+
if preprocessing == :rendered
|
|
418
|
+
normalize_rendered_whitespace(frag, match_opts)
|
|
419
|
+
end
|
|
387
420
|
remove_whitespace_only_text_nodes(frag)
|
|
388
421
|
end
|
|
389
422
|
|
|
@@ -461,9 +494,9 @@ module Canon
|
|
|
461
494
|
# @param node [Canon::Xml::Node, Nokogiri::HTML::Document] Parsed node
|
|
462
495
|
# @return [String] Serialized HTML string
|
|
463
496
|
def serialize_for_display(node)
|
|
464
|
-
# Use
|
|
497
|
+
# Use XmlNodeComparison's serializer for Canon::Xml::Node
|
|
465
498
|
if node.is_a?(Canon::Xml::Node)
|
|
466
|
-
|
|
499
|
+
XmlNodeComparison.serialize_node_to_xml(node)
|
|
467
500
|
elsif node.respond_to?(:to_html)
|
|
468
501
|
node.to_html
|
|
469
502
|
elsif node.respond_to?(:to_xml)
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
5
|
+
module Canon
|
|
6
|
+
module Comparison
|
|
7
|
+
# HTML parsing service with version detection and fragment support
|
|
8
|
+
#
|
|
9
|
+
# Provides HTML parsing capabilities with automatic HTML4/HTML5 version
|
|
10
|
+
# detection. Handles both full documents and fragments.
|
|
11
|
+
#
|
|
12
|
+
# @example Parse HTML string
|
|
13
|
+
# HtmlParser.parse("<div>content</div>", :html5)
|
|
14
|
+
#
|
|
15
|
+
# @example Auto-detect and parse
|
|
16
|
+
# HtmlParser.detect_and_parse("<!DOCTYPE html><html>...</html>")
|
|
17
|
+
class HtmlParser
  # Pattern for the HTML5 DOCTYPE declaration (`<!DOCTYPE html>`).
  #
  # DOCTYPE declarations are case-insensitive, so `<!doctype html>` must be
  # recognised as well. Legacy DOCTYPEs that carry a public identifier
  # (e.g. `<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">`) do not
  # match because `>` has to follow `html` (optionally after whitespace).
  HTML5_DOCTYPE = /<!DOCTYPE\s+html\s*>/i

  class << self
    # Parse an HTML string into a Nokogiri fragment with the requested parser
    #
    # Non-String input is returned untouched; that covers Nokogiri
    # documents and fragments that were parsed earlier, so no separate
    # "already parsed" check is needed after the String guard.
    #
    # @param content [String, Object] Content to parse (returned as-is if not a String)
    # @param format [Symbol] HTML format (:html, :html4, :html5); any other
    #   value returns +content+ unchanged
    # @return [Nokogiri::HTML4::DocumentFragment, Nokogiri::HTML5::DocumentFragment, Object]
    #   Parsed fragment, or +content+ itself when nothing was parsed
    def parse(content, format)
      return content unless content.is_a?(String)

      begin
        case format
        when :html5
          Nokogiri::HTML5.fragment(content)
        when :html4
          Nokogiri::HTML4.fragment(content)
        when :html
          detect_and_parse(content)
        else
          content
        end
      rescue StandardError
        # Deliberate best-effort: fall back to the raw string if parsing
        # fails (maintains backward compatibility)
        content
      end
    end

    # Check if content is already a parsed HTML document/fragment
    #
    # @param content [Object] Content to check
    # @return [Boolean] true if content is a Nokogiri HTML/HTML5 document
    #   or document fragment
    def already_parsed?(content)
      content.is_a?(Nokogiri::HTML::Document) ||
        content.is_a?(Nokogiri::HTML5::Document) ||
        content.is_a?(Nokogiri::HTML::DocumentFragment) ||
        content.is_a?(Nokogiri::HTML5::DocumentFragment)
    end

    # Detect HTML version from content and parse with the matching
    # fragment parser
    #
    # @param content [String] HTML content to parse
    # @return [Nokogiri::HTML4::DocumentFragment, Nokogiri::HTML5::DocumentFragment]
    #   Parsed fragment
    def detect_and_parse(content)
      if detect_version(content) == :html5
        Nokogiri::HTML5.fragment(content)
      else
        Nokogiri::HTML4.fragment(content)
      end
    end

    # Detect HTML version from content string
    #
    # The content counts as HTML5 when it contains the HTML5 DOCTYPE in
    # any letter case; everything else (legacy DOCTYPE or no DOCTYPE at
    # all) is treated as HTML4.
    #
    # @param content [String] HTML content
    # @return [Symbol] :html5 or :html4
    def detect_version(content)
      content.match?(HTML5_DOCTYPE) ? :html5 : :html4
    end
  end
end
|
|
79
|
+
end
|
|
80
|
+
end
|
|
@@ -125,6 +125,18 @@ module Canon
|
|
|
125
125
|
if match_opts[:key_order] != :strict
|
|
126
126
|
keys1 = keys1.sort_by(&:to_s)
|
|
127
127
|
keys2 = keys2.sort_by(&:to_s)
|
|
128
|
+
elsif keys1 != keys2
|
|
129
|
+
# Strict mode: key order matters
|
|
130
|
+
# Check if keys are in same order
|
|
131
|
+
# Keys are different or in different order
|
|
132
|
+
# First check if it's just ordering (same keys, different order)
|
|
133
|
+
if keys1.sort_by(&:to_s) == keys2.sort_by(&:to_s)
|
|
134
|
+
# Same keys, different order - this is a key_order difference
|
|
135
|
+
key_path = path.empty? ? "(key order)" : "#{path}.(key order)"
|
|
136
|
+
add_ruby_difference(key_path, keys1, keys2,
|
|
137
|
+
Comparison::UNEQUAL_HASH_KEY_ORDER, opts, differences)
|
|
138
|
+
return Comparison::UNEQUAL_HASH_KEY_ORDER
|
|
139
|
+
end
|
|
128
140
|
end
|
|
129
141
|
|
|
130
142
|
# Check for missing keys
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
|
|
4
|
+
module Comparison
|
|
5
|
+
# Public API for JSON parsing operations
|
|
6
|
+
# Provides access to parsing functionality without using send()
|
|
7
|
+
class JsonParser
  class << self
    # Convert JSON input into a Ruby object
    #
    # Thin entry point that forwards the argument unchanged to
    # JsonComparator.parse_json and returns whatever that call returns.
    #
    # @param obj [String, Hash, Array] Object to parse
    # @return [Hash, Array] Parsed Ruby object
    def parse_json(obj)
      # Loaded at call time instead of at file load; presumably this
      # avoids a circular require with json_comparator - TODO confirm
      require_relative "json_comparator"

      JsonComparator.parse_json(obj)
    end
  end
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../comparison" # Load base module with constants
|
|
4
|
+
require_relative "../diff/diff_node"
|
|
5
|
+
require_relative "../diff/path_builder"
|
|
6
|
+
|
|
7
|
+
module Canon
|
|
8
|
+
module Comparison
|
|
9
|
+
# Base class for markup document comparison (XML, HTML)
|
|
10
|
+
#
|
|
11
|
+
# Provides shared comparison functionality for markup documents,
|
|
12
|
+
# including node type checking, text extraction, filtering,
|
|
13
|
+
# and difference creation.
|
|
14
|
+
#
|
|
15
|
+
# Format-specific comparators (XmlComparator, HtmlComparator)
|
|
16
|
+
# inherit from this class and add format-specific behavior.
|
|
17
|
+
class MarkupComparator
  class << self
    # Record a difference between two nodes
    #
    # Wraps the pair in a Canon::Diff::DiffNode that carries the reason
    # text from build_difference_reason and the metadata from
    # enrich_diff_metadata, then appends it to +differences+.
    #
    # @param node1 [Object, nil] First node
    # @param node2 [Object, nil] Second node
    # @param diff1 [Symbol] Difference type for node1
    # @param diff2 [Symbol] Difference type for node2
    # @param dimension [Symbol] The match dimension causing this difference
    # @param _opts [Hash] Options (unused but kept for interface compatibility)
    # @param differences [Array] Array to append difference to
    # @raise [ArgumentError] when +dimension+ is nil
    # @return [Array] +differences+, with the new DiffNode appended
    def add_difference(node1, node2, diff1, diff2, dimension, _opts,
                       differences)
      raise ArgumentError, "dimension required for DiffNode" if dimension.nil?

      differences << Canon::Diff::DiffNode.new(
        node1: node1,
        node2: node2,
        dimension: dimension,
        reason: build_difference_reason(node1, node2, diff1, diff2,
                                        dimension),
        **enrich_diff_metadata(node1, node2),
      )
    end

    # Collect presentation metadata for a pair of nodes
    #
    # The path is taken from node1, falling back to node2 when node1 is
    # nil; serialized content and attributes are captured for both sides.
    #
    # @param node1 [Object, nil] First node
    # @param node2 [Object, nil] Second node
    # @return [Hash] Hash with :path, :serialized_before, :serialized_after,
    #   :attributes_before and :attributes_after
    def enrich_diff_metadata(node1, node2)
      path_source = node1 || node2

      {
        path: build_path_for_node(path_source),
        serialized_before: serialize_node(node1),
        serialized_after: serialize_node(node2),
        attributes_before: extract_attributes(node1),
        attributes_after: extract_attributes(node2),
      }
    end

    # Build canonical path for a node
    #
    # @param node [Object, nil] Node to build path for
    # @return [Object, nil] Result of Canon::Diff::PathBuilder.build, or nil
    #   when +node+ is nil
    def build_path_for_node(node)
      Canon::Diff::PathBuilder.build(node, format: :document) unless node.nil?
    end

    # Serialize a node to string for display
    #
    # Canon::Xml node classes are rendered here; any other object is
    # rendered with the first of to_xml, to_html, to_s it responds to.
    #
    # @param node [Object, nil] Node to serialize
    # @return [String, nil] Serialized content, nil for a nil node
    def serialize_node(node)
      return if node.nil?

      case node
      when Canon::Xml::Nodes::RootNode
        # A root has no markup of its own: concatenate its children
        node.children.map { |child| serialize_node(child) }.join
      when Canon::Xml::Nodes::ElementNode
        serialize_element_node(node)
      when Canon::Xml::Nodes::TextNode
        node.value
      when Canon::Xml::Nodes::CommentNode
        "<!--#{node.value}-->"
      when Canon::Xml::Nodes::ProcessingInstructionNode
        "<?#{node.target} #{node.data}?>"
      else
        serializer = %i[to_xml to_html].find { |msg| node.respond_to?(msg) }
        node.public_send(serializer || :to_s)
      end
    end

    # Extract attributes from a node
    #
    # @param node [Object, nil] Node to extract attributes from
    # @return [Hash, nil] Attribute name => value pairs; empty Hash for
    #   objects without attributes, nil for a nil node
    def extract_attributes(node)
      return if node.nil?

      if node.is_a?(Canon::Xml::Nodes::ElementNode)
        node.attribute_nodes.to_h do |attribute|
          [attribute.name, attribute.value]
        end
      elsif node.respond_to?(:attributes)
        # Pairs of (key, attribute object); the key is ignored and the
        # attribute's own name is used instead
        node.attributes.to_h do |_key, attribute|
          [attribute.name, attribute.value]
        end
      else
        {}
      end
    end

    # Drop children that must not take part in the comparison
    #
    # @param children [Array] Array of child nodes
    # @param opts [Hash] Comparison options
    # @return [Array] Children for which node_excluded? is false
    def filter_children(children, opts)
      children.reject { |candidate| node_excluded?(candidate, opts) }
    end

    # Check if node should be excluded from comparison
    #
    # A node is excluded when it is listed in opts[:ignore_nodes], when it
    # is a comment and opts[:ignore_comments] is set, when it is a text
    # node and opts[:ignore_text_nodes] is set, or when it is a text node
    # whose normalized text is empty and the :structural_whitespace match
    # option is :ignore or :normalize.
    #
    # @param node [Object] Node to check
    # @param opts [Hash] Comparison options
    # @return [Boolean] true if node should be excluded
    def node_excluded?(node, opts)
      return false if node.nil?
      return true if opts[:ignore_nodes]&.include?(node)
      return true if opts[:ignore_comments] && comment_node?(node)
      return true if opts[:ignore_text_nodes] && text_node?(node)

      match_opts = opts[:match_opts]
      return false unless match_opts

      whitespace_mode = match_opts[:structural_whitespace]
      return false unless %i[ignore normalize].include?(whitespace_mode)
      return false unless text_node?(node)

      # Only whitespace-only text nodes are dropped
      MatchOptions.normalize_text(node_text(node)).empty?
    end

    # Check if two nodes are the same type
    #
    # @param node1 [Object] First node
    # @param node2 [Object] Second node
    # @return [Boolean] true if both have the same class and, when both
    #   expose node_type, the same node_type
    def same_node_type?(node1, node2)
      return false unless node1.class == node2.class

      both_typed = node1.respond_to?(:node_type) &&
        node2.respond_to?(:node_type)
      return true unless both_typed

      node1.node_type == node2.node_type
    end

    # Check if a node is a comment node
    #
    # @param node [Object] Node to check
    # @return [Boolean] true if node reports comment? or has node_type :comment
    def comment_node?(node)
      (node.respond_to?(:comment?) && node.comment?) ||
        (node.respond_to?(:node_type) && node.node_type == :comment)
    end

    # Check if a node is a text node
    #
    # NOTE(review): the first alternative only holds for objects that do
    # NOT respond to element?; objects that do are recognised solely via
    # node_type == :text. Confirm this is intended for every node library
    # passed in here.
    #
    # @param node [Object] Node to check
    # @return [Boolean] true if node is a text node
    def text_node?(node)
      (node.respond_to?(:text?) && node.text? &&
        !node.respond_to?(:element?)) ||
        (node.respond_to?(:node_type) && node.node_type == :text)
    end

    # Get text content from a node
    #
    # Prefers +value+, then +content+, then +to_s+.
    #
    # @param node [Object] Node to get text from
    # @return [String] Text content
    def node_text(node)
      return node.value.to_s if node.respond_to?(:value)
      return node.content.to_s if node.respond_to?(:content)

      node.to_s
    end

    # Check if difference between two texts is only whitespace
    #
    # @param text1 [String] First text
    # @param text2 [String] Second text
    # @return [Boolean] true if both texts are equal after
    #   MatchOptions.normalize_text
    def whitespace_only_difference?(text1, text2)
      MatchOptions.normalize_text(text1) == MatchOptions.normalize_text(text2)
    end

    # Build a human-readable reason for a difference
    #
    # Default wording; the nodes themselves are not inspected here.
    #
    # @param _node1 [Object, nil] First node (unused)
    # @param _node2 [Object, nil] Second node (unused)
    # @param diff1 [Symbol] Difference type for node1
    # @param diff2 [Symbol] Difference type for node2
    # @param dimension [Symbol] The dimension of the difference
    # @return [String] Human-readable reason
    def build_difference_reason(_node1, _node2, diff1, diff2, dimension)
      "Difference in #{dimension}: #{diff1} vs #{diff2}"
    end

    # Serialize an element node to string
    #
    # Emits a self-closing tag when the serialized children are empty.
    #
    # NOTE(review): attribute values are interpolated without escaping;
    # confirm that is acceptable for every consumer of this output.
    #
    # @param node [Canon::Xml::Nodes::ElementNode] Element node
    # @return [String] Serialized element
    def serialize_element_node(node)
      attribute_markup = node.attribute_nodes.map do |attribute|
        " #{attribute.name}=\"#{attribute.value}\""
      end.join
      inner_markup = node.children.map { |child| serialize_node(child) }.join

      return "<#{node.name}#{attribute_markup}/>" if inner_markup.empty?

      "<#{node.name}#{attribute_markup}>#{inner_markup}</#{node.name}>"
    end

    # Determine the appropriate dimension for a node type
    #
    # Nodes whose node_type is a Symbol are classified by that symbol;
    # all other nodes are classified through their predicate methods.
    # Anything unrecognised maps to :text_content.
    #
    # @param node [Object] The node to check
    # @return [Symbol] :comments, :processing_instructions or :text_content
    def determine_node_dimension(node)
      if node.respond_to?(:node_type) && node.node_type.is_a?(Symbol)
        return {
          comment: :comments,
          processing_instruction: :processing_instructions,
        }.fetch(node.node_type, :text_content)
      end

      return :comments if node.respond_to?(:comment?) && node.comment?
      return :text_content if node.respond_to?(:text?) && node.text?
      return :text_content if node.respond_to?(:cdata?) && node.cdata?

      if node.respond_to?(:processing_instruction?) &&
          node.processing_instruction?
        return :processing_instructions
      end

      :text_content
    end
  end
end
|
|
292
|
+
end
|
|
293
|
+
end
|