canon 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +112 -25
- data/docs/Gemfile +1 -0
- data/docs/_config.yml +90 -1
- data/docs/advanced/diff-classification.adoc +82 -2
- data/docs/features/match-options/index.adoc +239 -1
- data/lib/canon/comparison/format_detector.rb +2 -1
- data/lib/canon/comparison/html_comparator.rb +19 -8
- data/lib/canon/comparison/html_compare_profile.rb +8 -2
- data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
- data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
- data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
- data/lib/canon/comparison/xml_comparator.rb +48 -23
- data/lib/canon/comparison/xml_node_comparison.rb +25 -3
- data/lib/canon/diff/diff_classifier.rb +101 -2
- data/lib/canon/diff/formatting_detector.rb +1 -1
- data/lib/canon/rspec_matchers.rb +37 -8
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +24 -13
- metadata +3 -78
- data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
- data/false_positive_analysis.txt +0 -0
- data/file1.html +0 -1
- data/file2.html +0 -1
- data/old-docs/ADVANCED_TOPICS.adoc +0 -20
- data/old-docs/BASIC_USAGE.adoc +0 -16
- data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
- data/old-docs/CLI.adoc +0 -497
- data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/old-docs/DIFF_FORMATTING.adoc +0 -540
- data/old-docs/DIFF_PARAMETERS.adoc +0 -261
- data/old-docs/DOM_DIFF.adoc +0 -1017
- data/old-docs/ENV_CONFIG.adoc +0 -876
- data/old-docs/FORMATS.adoc +0 -867
- data/old-docs/INPUT_VALIDATION.adoc +0 -477
- data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
- data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/old-docs/MATCH_OPTIONS.adoc +0 -912
- data/old-docs/MODES.adoc +0 -432
- data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/old-docs/OPTIONS.adoc +0 -1387
- data/old-docs/PREPROCESSING.adoc +0 -491
- data/old-docs/README.old.adoc +0 -2831
- data/old-docs/RSPEC.adoc +0 -814
- data/old-docs/RUBY_API.adoc +0 -485
- data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
- data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
- data/old-docs/STRING_COMPARE.adoc +0 -345
- data/old-docs/TMP.adoc +0 -3384
- data/old-docs/TREE_DIFF.adoc +0 -1080
- data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
- data/old-docs/VERBOSE.adoc +0 -482
- data/old-docs/VISUALIZATION_MAP.adoc +0 -625
- data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
- data/scripts/analyze_current_state.rb +0 -85
- data/scripts/analyze_false_positives.rb +0 -114
- data/scripts/analyze_remaining_failures.rb +0 -105
- data/scripts/compare_current_failures.rb +0 -95
- data/scripts/compare_dom_tree_diff.rb +0 -158
- data/scripts/compare_failures.rb +0 -151
- data/scripts/debug_attribute_extraction.rb +0 -66
- data/scripts/debug_blocks_839.rb +0 -115
- data/scripts/debug_meta_matching.rb +0 -52
- data/scripts/debug_p_matching.rb +0 -192
- data/scripts/debug_signature_matching.rb +0 -118
- data/scripts/debug_sourcecode_124.rb +0 -32
- data/scripts/debug_whitespace_sensitive.rb +0 -192
- data/scripts/extract_false_positives.rb +0 -138
- data/scripts/find_actual_false_positives.rb +0 -125
- data/scripts/investigate_all_false_positives.rb +0 -161
- data/scripts/investigate_batch1.rb +0 -127
- data/scripts/investigate_classification.rb +0 -150
- data/scripts/investigate_classification_detailed.rb +0 -190
- data/scripts/investigate_common_failures.rb +0 -342
- data/scripts/investigate_false_negative.rb +0 -80
- data/scripts/investigate_false_positive.rb +0 -83
- data/scripts/investigate_false_positives.rb +0 -227
- data/scripts/investigate_false_positives_batch.rb +0 -163
- data/scripts/investigate_mixed_content.rb +0 -125
- data/scripts/investigate_remaining_16.rb +0 -214
- data/scripts/run_single_test.rb +0 -29
- data/scripts/test_all_false_positives.rb +0 -95
- data/scripts/test_attribute_details.rb +0 -61
- data/scripts/test_both_algorithms.rb +0 -49
- data/scripts/test_both_simple.rb +0 -49
- data/scripts/test_enhanced_semantic_output.rb +0 -125
- data/scripts/test_readme_examples.rb +0 -131
- data/scripts/test_semantic_tree_diff.rb +0 -99
- data/scripts/test_semantic_ux_improvements.rb +0 -135
- data/scripts/test_single_false_positive.rb +0 -119
- data/scripts/test_size_limits.rb +0 -99
- data/test_html_1.html +0 -21
- data/test_html_2.html +0 -21
- data/test_nokogiri.rb +0 -33
- data/test_normalize.rb +0 -45
|
@@ -23,7 +23,8 @@ module Canon
|
|
|
23
23
|
# @param diff_children [Boolean] Whether to diff children
|
|
24
24
|
# @param differences [Array] Array to collect differences
|
|
25
25
|
# @return [Integer] Comparison result code
|
|
26
|
-
def compare(node1, node2, comparator, opts, child_opts, diff_children, differences)
|
|
26
|
+
def compare(node1, node2, comparator, opts, child_opts,
|
|
27
|
+
diff_children, differences)
|
|
27
28
|
# Dispatch based on node type
|
|
28
29
|
# Canon::Xml::Node types use .node_type method that returns symbols
|
|
29
30
|
# Nokogiri also has .node_type but returns integers, so check for Symbol
|
|
@@ -51,11 +52,14 @@ module Canon
|
|
|
51
52
|
comparator.send(:compare_element_nodes, node1, node2, opts, child_opts,
|
|
52
53
|
diff_children, differences)
|
|
53
54
|
when :text
|
|
54
|
-
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
55
|
+
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
56
|
+
differences)
|
|
55
57
|
when :comment
|
|
56
|
-
comparator.send(:compare_comment_nodes, node1, node2, opts,
|
|
58
|
+
comparator.send(:compare_comment_nodes, node1, node2, opts,
|
|
59
|
+
differences)
|
|
57
60
|
when :cdata
|
|
58
|
-
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
61
|
+
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
62
|
+
differences)
|
|
59
63
|
when :processing_instruction
|
|
60
64
|
comparator.send(:compare_processing_instruction_nodes, node1, node2, opts,
|
|
61
65
|
differences)
|
|
@@ -71,11 +75,14 @@ module Canon
|
|
|
71
75
|
comparator.send(:compare_element_nodes, node1, node2, opts, child_opts,
|
|
72
76
|
diff_children, differences)
|
|
73
77
|
elsif node1.respond_to?(:text?) && node1.text?
|
|
74
|
-
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
78
|
+
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
79
|
+
differences)
|
|
75
80
|
elsif node1.respond_to?(:comment?) && node1.comment?
|
|
76
|
-
comparator.send(:compare_comment_nodes, node1, node2, opts,
|
|
81
|
+
comparator.send(:compare_comment_nodes, node1, node2, opts,
|
|
82
|
+
differences)
|
|
77
83
|
elsif node1.respond_to?(:cdata?) && node1.cdata?
|
|
78
|
-
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
84
|
+
comparator.send(:compare_text_nodes, node1, node2, opts,
|
|
85
|
+
differences)
|
|
79
86
|
elsif node1.respond_to?(:processing_instruction?) &&
|
|
80
87
|
node1.processing_instruction?
|
|
81
88
|
comparator.send(:compare_processing_instruction_nodes, node1, node2, opts,
|
|
@@ -18,6 +18,8 @@ require_relative "xml_comparator/namespace_comparator"
|
|
|
18
18
|
require_relative "xml_comparator/node_type_comparator"
|
|
19
19
|
require_relative "xml_comparator/child_comparison"
|
|
20
20
|
require_relative "xml_comparator/diff_node_builder"
|
|
21
|
+
# Whitespace sensitivity module
|
|
22
|
+
require_relative "whitespace_sensitivity"
|
|
21
23
|
|
|
22
24
|
module Canon
|
|
23
25
|
module Comparison
|
|
@@ -90,9 +92,15 @@ module Canon
|
|
|
90
92
|
# Create child_opts with resolved options
|
|
91
93
|
child_opts = opts.merge(child_opts)
|
|
92
94
|
|
|
95
|
+
# Determine if we should preserve whitespace during parsing
|
|
96
|
+
# When structural_whitespace is :strict, preserve all whitespace-only text nodes
|
|
97
|
+
preserve_whitespace = match_opts_hash[:structural_whitespace] == :strict
|
|
98
|
+
|
|
93
99
|
# Parse nodes if they are strings, applying preprocessing if needed
|
|
94
|
-
node1 = parse_node(n1, match_opts_hash[:preprocessing])
|
|
95
|
-
|
|
100
|
+
node1 = parse_node(n1, match_opts_hash[:preprocessing],
|
|
101
|
+
preserve_whitespace: preserve_whitespace)
|
|
102
|
+
node2 = parse_node(n2, match_opts_hash[:preprocessing],
|
|
103
|
+
preserve_whitespace: preserve_whitespace)
|
|
96
104
|
|
|
97
105
|
# Store original strings for line diff display (before preprocessing)
|
|
98
106
|
original1 = if n1.is_a?(String)
|
|
@@ -209,8 +217,9 @@ module Canon
|
|
|
209
217
|
# Parse a node from string or return as-is
|
|
210
218
|
# Applies preprocessing transformation before parsing if specified
|
|
211
219
|
# Delegates to NodeParser module
|
|
212
|
-
def parse_node(node, preprocessing = :none)
|
|
213
|
-
XmlComparatorHelpers::NodeParser.parse(node, preprocessing)
|
|
220
|
+
def parse_node(node, preprocessing = :none, preserve_whitespace: false)
|
|
221
|
+
XmlComparatorHelpers::NodeParser.parse(node, preprocessing,
|
|
222
|
+
preserve_whitespace: preserve_whitespace)
|
|
214
223
|
end
|
|
215
224
|
|
|
216
225
|
# Main comparison dispatcher
|
|
@@ -331,7 +340,8 @@ module Canon
|
|
|
331
340
|
|
|
332
341
|
# For HTML, check if text node is inside whitespace-preserving element
|
|
333
342
|
# If so, always use strict comparison regardless of text_content setting
|
|
334
|
-
|
|
343
|
+
sensitive_element = should_preserve_whitespace_strictly?(n1, n2, opts)
|
|
344
|
+
if sensitive_element
|
|
335
345
|
behavior = :strict
|
|
336
346
|
end
|
|
337
347
|
|
|
@@ -344,15 +354,23 @@ module Canon
|
|
|
344
354
|
|
|
345
355
|
# Determine the correct dimension for this difference
|
|
346
356
|
# - If text_content is :strict, ALL differences use :text_content dimension
|
|
347
|
-
# - If text_content is :normalize, whitespace-only diffs use :structural_whitespace
|
|
357
|
+
# - If text_content is :normalize, whitespace-only diffs could use :structural_whitespace
|
|
358
|
+
# but we keep :text_content to ensure correct classification behavior
|
|
348
359
|
# - Otherwise use :text_content
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
360
|
+
# However, if element is whitespace-sensitive (like <pre> in HTML),
|
|
361
|
+
# always use :text_content dimension regardless of behavior
|
|
362
|
+
#
|
|
363
|
+
# NOTE: We keep the dimension as :text_content even for whitespace-only diffs
|
|
364
|
+
# when text_content: :normalize. This ensures that the classification uses
|
|
365
|
+
# the text_content behavior (:normalize) instead of structural_whitespace
|
|
366
|
+
# behavior (:strict for XML), which would incorrectly mark the diff as normative.
|
|
367
|
+
if sensitive_element
|
|
368
|
+
# Whitespace-sensitive element: always use :text_content dimension
|
|
369
|
+
else
|
|
370
|
+
# Always use :text_content for text differences
|
|
371
|
+
# This ensures correct classification based on text_content behavior
|
|
372
|
+
end
|
|
373
|
+
dimension = :text_content
|
|
356
374
|
|
|
357
375
|
# Create DiffNode in verbose mode when raw content differs
|
|
358
376
|
# This ensures informative diffs are created even for :ignore/:normalize
|
|
@@ -368,17 +386,23 @@ module Canon
|
|
|
368
386
|
|
|
369
387
|
# Check if whitespace should be preserved strictly for these text nodes
|
|
370
388
|
# This applies to HTML elements like pre, code, textarea, script, style
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
389
|
+
# and elements with xml:space="preserve" or in user-configured whitelist
|
|
390
|
+
def should_preserve_whitespace_strictly?(n1, n2, opts)
|
|
391
|
+
# Use WhitespaceSensitivity module to check if element is sensitive
|
|
392
|
+
# Check both n1 and n2 - if either is in a sensitive element, preserve strictly
|
|
393
|
+
if n1.respond_to?(:parent)
|
|
394
|
+
sensitivity_opts = { match_opts: opts[:match_opts] }
|
|
395
|
+
return true if WhitespaceSensitivity.element_sensitive?(n1,
|
|
396
|
+
sensitivity_opts)
|
|
397
|
+
end
|
|
375
398
|
|
|
376
|
-
|
|
377
|
-
|
|
399
|
+
if n2.respond_to?(:parent)
|
|
400
|
+
sensitivity_opts = { match_opts: opts[:match_opts] }
|
|
401
|
+
return true if WhitespaceSensitivity.element_sensitive?(n2,
|
|
402
|
+
sensitivity_opts)
|
|
403
|
+
end
|
|
378
404
|
|
|
379
|
-
|
|
380
|
-
in_preserve_element?(n1, preserve_elements) ||
|
|
381
|
-
in_preserve_element?(n2, preserve_elements)
|
|
405
|
+
false
|
|
382
406
|
end
|
|
383
407
|
|
|
384
408
|
# Check if a node is inside a whitespace-preserving element
|
|
@@ -469,7 +493,8 @@ module Canon
|
|
|
469
493
|
#
|
|
470
494
|
# Delegates to ChildComparison module which handles both ElementMatcher
|
|
471
495
|
# (semantic matching) and simple positional comparison.
|
|
472
|
-
def compare_children(n1, n2, opts, child_opts, diff_children, differences)
|
|
496
|
+
def compare_children(n1, n2, opts, child_opts, diff_children,
|
|
497
|
+
differences)
|
|
473
498
|
XmlComparatorHelpers::ChildComparison.compare(
|
|
474
499
|
n1, n2, self, opts, child_opts, diff_children, differences
|
|
475
500
|
)
|
|
@@ -139,9 +139,13 @@ diff_children, differences)
|
|
|
139
139
|
|
|
140
140
|
# Check structural_whitespace match option
|
|
141
141
|
match_opts = opts[:match_opts]
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
142
|
+
return false unless match_opts
|
|
143
|
+
|
|
144
|
+
# Filter out whitespace-only text nodes based on structural_whitespace setting
|
|
145
|
+
# - :ignore or :normalize: Filter all whitespace-only text nodes
|
|
146
|
+
# - :strict: Preserve all whitespace-only text nodes (don't filter any)
|
|
147
|
+
if text_node?(node) && %i[ignore
|
|
148
|
+
normalize].include?(match_opts[:structural_whitespace])
|
|
145
149
|
text = node_text(node)
|
|
146
150
|
return true if MatchOptions.normalize_text(text).empty?
|
|
147
151
|
end
|
|
@@ -184,6 +188,24 @@ diff_children, differences)
|
|
|
184
188
|
node.respond_to?(:node_type) && node.node_type == :text
|
|
185
189
|
end
|
|
186
190
|
|
|
191
|
+
# Extract text content from a node
|
|
192
|
+
#
|
|
193
|
+
# @param node [Object] Node to extract text from
|
|
194
|
+
# @return [String] Text content
|
|
195
|
+
def self.node_text(node)
|
|
196
|
+
return "" unless node
|
|
197
|
+
|
|
198
|
+
if node.respond_to?(:content)
|
|
199
|
+
node.content.to_s
|
|
200
|
+
elsif node.respond_to?(:text)
|
|
201
|
+
node.text.to_s
|
|
202
|
+
elsif node.respond_to?(:value)
|
|
203
|
+
node.value.to_s
|
|
204
|
+
else
|
|
205
|
+
""
|
|
206
|
+
end
|
|
207
|
+
end
|
|
208
|
+
|
|
187
209
|
# Dispatch by Canon::Xml::Node type
|
|
188
210
|
def self.dispatch_canon_node_type(node1, node2, opts, child_opts,
|
|
189
211
|
diff_children, differences)
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative "formatting_detector"
|
|
4
4
|
require_relative "../comparison/compare_profile"
|
|
5
|
+
require_relative "../comparison/whitespace_sensitivity"
|
|
5
6
|
|
|
6
7
|
module Canon
|
|
7
8
|
module Diff
|
|
@@ -28,6 +29,28 @@ module Canon
|
|
|
28
29
|
# @param diff_node [DiffNode] The diff node to classify
|
|
29
30
|
# @return [DiffNode] The same diff node with normative/formatting attributes set
|
|
30
31
|
def classify(diff_node)
|
|
32
|
+
# SPECIAL CASE: text_content with :normalize behavior
|
|
33
|
+
# When text_content is :normalize and the difference is formatting-only,
|
|
34
|
+
# it should be marked as non-normative (informative)
|
|
35
|
+
# This ensures that verbose and non-verbose modes give consistent results
|
|
36
|
+
#
|
|
37
|
+
# EXCEPTION: If the text node is inside a whitespace-sensitive element
|
|
38
|
+
# (like <pre>, <code>, <textarea> in HTML), don't apply formatting detection
|
|
39
|
+
# because whitespace should be preserved in these elements
|
|
40
|
+
#
|
|
41
|
+
# This check must come FIRST, before normative_dimension? is called,
|
|
42
|
+
# because normative_dimension? returns true for text_content: :normalize
|
|
43
|
+
# (since the dimension affects equivalence), which would prevent formatting
|
|
44
|
+
# detection from being applied.
|
|
45
|
+
if diff_node.dimension == :text_content &&
|
|
46
|
+
profile.send(:behavior_for, :text_content) == :normalize &&
|
|
47
|
+
!inside_whitespace_sensitive_element?(diff_node) &&
|
|
48
|
+
formatting_only_diff?(diff_node)
|
|
49
|
+
diff_node.formatting = true
|
|
50
|
+
diff_node.normative = false
|
|
51
|
+
return diff_node
|
|
52
|
+
end
|
|
53
|
+
|
|
31
54
|
# FIRST: Determine if this dimension is normative based on CompareProfile
|
|
32
55
|
# This respects the policy settings (strict/normalize/ignore)
|
|
33
56
|
is_normative = profile.normative_dimension?(diff_node.dimension)
|
|
@@ -45,7 +68,7 @@ module Canon
|
|
|
45
68
|
return diff_node
|
|
46
69
|
end
|
|
47
70
|
|
|
48
|
-
#
|
|
71
|
+
# THIRD: Apply the normative determination from CompareProfile
|
|
49
72
|
diff_node.formatting = false
|
|
50
73
|
diff_node.normative = is_normative
|
|
51
74
|
|
|
@@ -65,10 +88,86 @@ module Canon
|
|
|
65
88
|
# @param diff_node [DiffNode] The diff node to check
|
|
66
89
|
# @return [Boolean] true if formatting-only
|
|
67
90
|
def formatting_only_diff?(diff_node)
|
|
91
|
+
# Only apply formatting detection to actual text content differences
|
|
92
|
+
# If the nodes are not text nodes (e.g., element nodes), don't apply formatting detection
|
|
93
|
+
node1 = diff_node.node1
|
|
94
|
+
node2 = diff_node.node2
|
|
95
|
+
|
|
96
|
+
# Check if both nodes are text nodes
|
|
97
|
+
# If not, this is not a formatting-only difference
|
|
98
|
+
return false unless text_node?(node1) && text_node?(node2)
|
|
99
|
+
|
|
68
100
|
text1 = extract_text_content(diff_node.node1)
|
|
69
101
|
text2 = extract_text_content(diff_node.node2)
|
|
70
102
|
|
|
71
|
-
|
|
103
|
+
# For text_content dimension, use normalized text comparison
|
|
104
|
+
# This handles cases like "" vs " " (both normalize to "")
|
|
105
|
+
if diff_node.dimension == :text_content
|
|
106
|
+
normalized_equivalent?(text1, text2)
|
|
107
|
+
else
|
|
108
|
+
FormattingDetector.formatting_only?(text1, text2)
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
# Check if two texts are equivalent after normalization
|
|
113
|
+
# This detects formatting-only differences where normalized texts match
|
|
114
|
+
# @param text1 [String, nil] First text
|
|
115
|
+
# @param text2 [String, nil] Second text
|
|
116
|
+
# @return [Boolean] true if normalized texts are equivalent
|
|
117
|
+
def normalized_equivalent?(text1, text2)
|
|
118
|
+
return false if text1.nil? && text2.nil?
|
|
119
|
+
return false if text1.nil? || text2.nil?
|
|
120
|
+
|
|
121
|
+
# Use MatchOptions.normalize_text for consistency
|
|
122
|
+
normalized1 = Canon::Comparison::MatchOptions.normalize_text(text1)
|
|
123
|
+
normalized2 = Canon::Comparison::MatchOptions.normalize_text(text2)
|
|
124
|
+
|
|
125
|
+
# If normalized texts are equivalent but originals are different,
|
|
126
|
+
# it's a formatting-only difference
|
|
127
|
+
normalized1 == normalized2 && text1 != text2
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
# Check if a node is a text node
|
|
131
|
+
# @param node [Object] The node to check
|
|
132
|
+
# @return [Boolean] true if the node is a text node
|
|
133
|
+
def text_node?(node)
|
|
134
|
+
return false if node.nil?
|
|
135
|
+
|
|
136
|
+
# Canon::Xml::Nodes::TextNode
|
|
137
|
+
return true if node.is_a?(Canon::Xml::Nodes::TextNode)
|
|
138
|
+
|
|
139
|
+
# Nokogiri text nodes (node_type returns integer constant like 3)
|
|
140
|
+
return true if node.respond_to?(:node_type) &&
|
|
141
|
+
node.node_type.is_a?(Integer) &&
|
|
142
|
+
node.node_type == Nokogiri::XML::Node::TEXT_NODE
|
|
143
|
+
|
|
144
|
+
# Moxml text nodes (node_type returns symbol)
|
|
145
|
+
return true if node.respond_to?(:node_type) && node.node_type == :text
|
|
146
|
+
|
|
147
|
+
# String
|
|
148
|
+
return true if node.is_a?(String)
|
|
149
|
+
|
|
150
|
+
# Test doubles or objects with text node-like interface
|
|
151
|
+
# Check if it has a value method (contains text content)
|
|
152
|
+
return true if node.respond_to?(:value)
|
|
153
|
+
|
|
154
|
+
false
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
# Check if the text node is inside a whitespace-sensitive element
|
|
158
|
+
# @param diff_node [DiffNode] The diff node to check
|
|
159
|
+
# @return [Boolean] true if inside a whitespace-sensitive element
|
|
160
|
+
def inside_whitespace_sensitive_element?(diff_node)
|
|
161
|
+
# Get the text node (not the parent element)
|
|
162
|
+
node = diff_node.node1 || diff_node.node2
|
|
163
|
+
return false unless node
|
|
164
|
+
|
|
165
|
+
# WhitespaceSensitivity.element_sensitive? expects a text node
|
|
166
|
+
# and checks its parent element
|
|
167
|
+
# We need to pass the full options structure with :match_opts key
|
|
168
|
+
opts = { match_opts: @match_options.options }
|
|
169
|
+
|
|
170
|
+
Canon::Comparison::WhitespaceSensitivity.element_sensitive?(node, opts)
|
|
72
171
|
end
|
|
73
172
|
|
|
74
173
|
# Extract text content from a node for formatting comparison
|
|
@@ -11,7 +11,7 @@ module Canon
|
|
|
11
11
|
# @param line2 [String, nil] Second line to compare
|
|
12
12
|
# @return [Boolean] true if lines differ only in formatting
|
|
13
13
|
def self.formatting_only?(line1, line2)
|
|
14
|
-
# If both are nil or empty, not a formatting diff
|
|
14
|
+
# If both are nil or empty, not a formatting diff (no difference)
|
|
15
15
|
return false if blank?(line1) && blank?(line2)
|
|
16
16
|
|
|
17
17
|
# If only one is blank, it's not just formatting
|
data/lib/canon/rspec_matchers.rb
CHANGED
|
@@ -63,6 +63,15 @@ module Canon
|
|
|
63
63
|
self
|
|
64
64
|
end
|
|
65
65
|
|
|
66
|
+
# Chain method for setting match options
|
|
67
|
+
# @param match_opts [Hash] match options
|
|
68
|
+
# @return [SerializationMatcher] self for chaining
|
|
69
|
+
def with_match(**match_opts)
|
|
70
|
+
@match ||= {}
|
|
71
|
+
@match = @match.merge(match_opts)
|
|
72
|
+
self
|
|
73
|
+
end
|
|
74
|
+
|
|
66
75
|
def matches?(target)
|
|
67
76
|
@target = target
|
|
68
77
|
|
|
@@ -252,12 +261,22 @@ module Canon
|
|
|
252
261
|
diff_algorithm: diff_algorithm)
|
|
253
262
|
end
|
|
254
263
|
|
|
255
|
-
def be_yaml_equivalent_to(expected
|
|
256
|
-
|
|
264
|
+
def be_yaml_equivalent_to(expected, match_profile: nil, match: nil,
|
|
265
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
266
|
+
SerializationMatcher.new(expected, :yaml,
|
|
267
|
+
match_profile: match_profile,
|
|
268
|
+
match: match,
|
|
269
|
+
preprocessing: preprocessing,
|
|
270
|
+
diff_algorithm: diff_algorithm)
|
|
257
271
|
end
|
|
258
272
|
|
|
259
|
-
def be_json_equivalent_to(expected
|
|
260
|
-
|
|
273
|
+
def be_json_equivalent_to(expected, match_profile: nil, match: nil,
|
|
274
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
275
|
+
SerializationMatcher.new(expected, :json,
|
|
276
|
+
match_profile: match_profile,
|
|
277
|
+
match: match,
|
|
278
|
+
preprocessing: preprocessing,
|
|
279
|
+
diff_algorithm: diff_algorithm)
|
|
261
280
|
end
|
|
262
281
|
|
|
263
282
|
def be_html_equivalent_to(expected, match_profile: nil, match: nil,
|
|
@@ -287,12 +306,22 @@ module Canon
|
|
|
287
306
|
diff_algorithm: diff_algorithm)
|
|
288
307
|
end
|
|
289
308
|
|
|
290
|
-
def be_equivalent_to(expected
|
|
291
|
-
|
|
309
|
+
def be_equivalent_to(expected, match_profile: nil, match: nil,
|
|
310
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
311
|
+
SerializationMatcher.new(expected, nil,
|
|
312
|
+
match_profile: match_profile,
|
|
313
|
+
match: match,
|
|
314
|
+
preprocessing: preprocessing,
|
|
315
|
+
diff_algorithm: diff_algorithm)
|
|
292
316
|
end
|
|
293
317
|
|
|
294
|
-
def be_string_equivalent_to(expected
|
|
295
|
-
|
|
318
|
+
def be_string_equivalent_to(expected, match_profile: nil, match: nil,
|
|
319
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
320
|
+
SerializationMatcher.new(expected, :string,
|
|
321
|
+
match_profile: match_profile,
|
|
322
|
+
match: match,
|
|
323
|
+
preprocessing: preprocessing,
|
|
324
|
+
diff_algorithm: diff_algorithm)
|
|
296
325
|
end
|
|
297
326
|
|
|
298
327
|
if defined?(::RSpec) && ::RSpec.respond_to?(:configure)
|
data/lib/canon/version.rb
CHANGED
data/lib/canon/xml/data_model.rb
CHANGED
|
@@ -18,8 +18,9 @@ module Canon
|
|
|
18
18
|
# Build XPath data model from XML string
|
|
19
19
|
#
|
|
20
20
|
# @param xml_string [String] XML content to parse
|
|
21
|
+
# @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
|
|
21
22
|
# @return [Nodes::RootNode] Root of the data model tree
|
|
22
|
-
def self.from_xml(xml_string)
|
|
23
|
+
def self.from_xml(xml_string, preserve_whitespace: false)
|
|
23
24
|
# Parse with Nokogiri
|
|
24
25
|
doc = Nokogiri::XML(xml_string) do |config|
|
|
25
26
|
config.nonet # Disable network access
|
|
@@ -30,7 +31,7 @@ module Canon
|
|
|
30
31
|
check_for_relative_namespace_uris(doc)
|
|
31
32
|
|
|
32
33
|
# Convert to XPath data model
|
|
33
|
-
build_from_nokogiri(doc)
|
|
34
|
+
build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
|
|
34
35
|
end
|
|
35
36
|
|
|
36
37
|
# Alias for compatibility with base class interface
|
|
@@ -74,19 +75,21 @@ module Canon
|
|
|
74
75
|
|
|
75
76
|
# Build XPath data model from Nokogiri document or fragment
|
|
76
77
|
# rubocop:disable Metrics/MethodLength
|
|
77
|
-
def self.build_from_nokogiri(nokogiri_doc)
|
|
78
|
+
def self.build_from_nokogiri(nokogiri_doc, preserve_whitespace: false)
|
|
78
79
|
root = Nodes::RootNode.new
|
|
79
80
|
|
|
80
81
|
if nokogiri_doc.respond_to?(:root) && nokogiri_doc.root
|
|
81
82
|
# For Documents (XML, HTML4, HTML5, Moxml): process the root element
|
|
82
|
-
root.add_child(build_element_node(nokogiri_doc.root))
|
|
83
|
+
root.add_child(build_element_node(nokogiri_doc.root,
|
|
84
|
+
preserve_whitespace: preserve_whitespace))
|
|
83
85
|
|
|
84
86
|
# Process PIs and comments outside doc element
|
|
85
87
|
nokogiri_doc.children.each do |child|
|
|
86
88
|
next if child == nokogiri_doc.root
|
|
87
89
|
next if child.is_a?(Nokogiri::XML::DTD)
|
|
88
90
|
|
|
89
|
-
node = build_node_from_nokogiri(child
|
|
91
|
+
node = build_node_from_nokogiri(child,
|
|
92
|
+
preserve_whitespace: preserve_whitespace)
|
|
90
93
|
root.add_child(node) if node
|
|
91
94
|
end
|
|
92
95
|
else
|
|
@@ -95,7 +98,8 @@ module Canon
|
|
|
95
98
|
nokogiri_doc.children.each do |child|
|
|
96
99
|
next if child.is_a?(Nokogiri::XML::DTD)
|
|
97
100
|
|
|
98
|
-
node = build_node_from_nokogiri(child
|
|
101
|
+
node = build_node_from_nokogiri(child,
|
|
102
|
+
preserve_whitespace: preserve_whitespace)
|
|
99
103
|
root.add_child(node) if node
|
|
100
104
|
end
|
|
101
105
|
end
|
|
@@ -104,12 +108,15 @@ module Canon
|
|
|
104
108
|
end
|
|
105
109
|
|
|
106
110
|
# Build node from Nokogiri node
|
|
107
|
-
def self.build_node_from_nokogiri(nokogiri_node)
|
|
111
|
+
def self.build_node_from_nokogiri(nokogiri_node,
|
|
112
|
+
preserve_whitespace: false)
|
|
108
113
|
case nokogiri_node
|
|
109
114
|
when Nokogiri::XML::Element
|
|
110
|
-
build_element_node(nokogiri_node
|
|
115
|
+
build_element_node(nokogiri_node,
|
|
116
|
+
preserve_whitespace: preserve_whitespace)
|
|
111
117
|
when Nokogiri::XML::Text
|
|
112
|
-
build_text_node(nokogiri_node
|
|
118
|
+
build_text_node(nokogiri_node,
|
|
119
|
+
preserve_whitespace: preserve_whitespace)
|
|
113
120
|
when Nokogiri::XML::Comment
|
|
114
121
|
build_comment_node(nokogiri_node)
|
|
115
122
|
when Nokogiri::XML::ProcessingInstruction
|
|
@@ -119,7 +126,7 @@ module Canon
|
|
|
119
126
|
|
|
120
127
|
# Build element node from Nokogiri element
|
|
121
128
|
# rubocop:disable Metrics/MethodLength
|
|
122
|
-
def self.build_element_node(nokogiri_element)
|
|
129
|
+
def self.build_element_node(nokogiri_element, preserve_whitespace: false)
|
|
123
130
|
element = Nodes::ElementNode.new(
|
|
124
131
|
name: nokogiri_element.name,
|
|
125
132
|
namespace_uri: nokogiri_element.namespace&.href,
|
|
@@ -134,7 +141,8 @@ module Canon
|
|
|
134
141
|
|
|
135
142
|
# Build child nodes
|
|
136
143
|
nokogiri_element.children.each do |child|
|
|
137
|
-
node = build_node_from_nokogiri(child
|
|
144
|
+
node = build_node_from_nokogiri(child,
|
|
145
|
+
preserve_whitespace: preserve_whitespace)
|
|
138
146
|
element.add_child(node) if node
|
|
139
147
|
end
|
|
140
148
|
|
|
@@ -195,13 +203,16 @@ module Canon
|
|
|
195
203
|
end
|
|
196
204
|
|
|
197
205
|
# Build text node from Nokogiri text node
|
|
198
|
-
def self.build_text_node(nokogiri_text)
|
|
206
|
+
def self.build_text_node(nokogiri_text, preserve_whitespace: false)
|
|
199
207
|
# XML text nodes: preserve all content including whitespace
|
|
200
208
|
# Unlike HTML, XML treats all whitespace as significant
|
|
201
209
|
content = nokogiri_text.content
|
|
202
210
|
|
|
203
211
|
# Skip empty text nodes between elements (common formatting whitespace)
|
|
204
|
-
|
|
212
|
+
# UNLESS preserve_whitespace is true (for structural_whitespace: :strict)
|
|
213
|
+
if !preserve_whitespace && content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element)
|
|
214
|
+
return nil
|
|
215
|
+
end
|
|
205
216
|
|
|
206
217
|
# Nokogiri already handles CDATA conversion and entity resolution
|
|
207
218
|
Nodes::TextNode.new(value: content)
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: canon
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.8
|
|
4
|
+
version: 0.1.9
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01-
|
|
11
|
+
date: 2026-01-20 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: diff-lcs
|
|
@@ -174,7 +174,6 @@ files:
|
|
|
174
174
|
- docs/internals/diffnode-enrichment.adoc
|
|
175
175
|
- docs/internals/index.adoc
|
|
176
176
|
- docs/lychee.toml
|
|
177
|
-
- docs/plans/2025-01-17-html-parser-selection-fix.adoc
|
|
178
177
|
- docs/reference/cli-options.adoc
|
|
179
178
|
- docs/reference/environment-variables.adoc
|
|
180
179
|
- docs/reference/index.adoc
|
|
@@ -191,9 +190,6 @@ files:
|
|
|
191
190
|
- docs/understanding/formats/yaml.adoc
|
|
192
191
|
- docs/understanding/index.adoc
|
|
193
192
|
- exe/canon
|
|
194
|
-
- false_positive_analysis.txt
|
|
195
|
-
- file1.html
|
|
196
|
-
- file2.html
|
|
197
193
|
- lib/canon.rb
|
|
198
194
|
- lib/canon/cache.rb
|
|
199
195
|
- lib/canon/cli.rb
|
|
@@ -230,6 +226,7 @@ files:
|
|
|
230
226
|
- lib/canon/comparison/strategies/base_match_strategy.rb
|
|
231
227
|
- lib/canon/comparison/strategies/match_strategy_factory.rb
|
|
232
228
|
- lib/canon/comparison/strategies/semantic_tree_match_strategy.rb
|
|
229
|
+
- lib/canon/comparison/whitespace_sensitivity.rb
|
|
233
230
|
- lib/canon/comparison/xml_comparator.rb
|
|
234
231
|
- lib/canon/comparison/xml_comparator/attribute_comparator.rb
|
|
235
232
|
- lib/canon/comparison/xml_comparator/attribute_filter.rb
|
|
@@ -344,79 +341,7 @@ files:
|
|
|
344
341
|
- lib/canon/xml/whitespace_normalizer.rb
|
|
345
342
|
- lib/canon/xml/xml_base_handler.rb
|
|
346
343
|
- lib/xml-c14n.rb
|
|
347
|
-
- old-docs/ADVANCED_TOPICS.adoc
|
|
348
|
-
- old-docs/BASIC_USAGE.adoc
|
|
349
|
-
- old-docs/CHARACTER_VISUALIZATION.adoc
|
|
350
|
-
- old-docs/CLI.adoc
|
|
351
|
-
- old-docs/CUSTOMIZING_BEHAVIOR.adoc
|
|
352
|
-
- old-docs/DIFF_ARCHITECTURE.adoc
|
|
353
|
-
- old-docs/DIFF_FORMATTING.adoc
|
|
354
|
-
- old-docs/DIFF_PARAMETERS.adoc
|
|
355
|
-
- old-docs/DOM_DIFF.adoc
|
|
356
|
-
- old-docs/ENV_CONFIG.adoc
|
|
357
|
-
- old-docs/FORMATS.adoc
|
|
358
|
-
- old-docs/INPUT_VALIDATION.adoc
|
|
359
|
-
- old-docs/MATCHER_BEHAVIOR.adoc
|
|
360
|
-
- old-docs/MATCH_ARCHITECTURE.adoc
|
|
361
|
-
- old-docs/MATCH_OPTIONS.adoc
|
|
362
|
-
- old-docs/MODES.adoc
|
|
363
|
-
- old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc
|
|
364
|
-
- old-docs/OPTIONS.adoc
|
|
365
|
-
- old-docs/PREPROCESSING.adoc
|
|
366
|
-
- old-docs/README.old.adoc
|
|
367
|
-
- old-docs/RSPEC.adoc
|
|
368
|
-
- old-docs/RUBY_API.adoc
|
|
369
|
-
- old-docs/SEMANTIC_DIFF_REPORT.adoc
|
|
370
|
-
- old-docs/SEMANTIC_TREE_DIFF.adoc
|
|
371
|
-
- old-docs/STRING_COMPARE.adoc
|
|
372
|
-
- old-docs/TMP.adoc
|
|
373
|
-
- old-docs/TREE_DIFF.adoc
|
|
374
|
-
- old-docs/UNDERSTANDING_CANON.adoc
|
|
375
|
-
- old-docs/VERBOSE.adoc
|
|
376
|
-
- old-docs/VISUALIZATION_MAP.adoc
|
|
377
|
-
- old-docs/WHITESPACE_TREATMENT.adoc
|
|
378
|
-
- scripts/analyze_current_state.rb
|
|
379
|
-
- scripts/analyze_false_positives.rb
|
|
380
|
-
- scripts/analyze_remaining_failures.rb
|
|
381
|
-
- scripts/compare_current_failures.rb
|
|
382
|
-
- scripts/compare_dom_tree_diff.rb
|
|
383
|
-
- scripts/compare_failures.rb
|
|
384
|
-
- scripts/debug_attribute_extraction.rb
|
|
385
|
-
- scripts/debug_blocks_839.rb
|
|
386
|
-
- scripts/debug_meta_matching.rb
|
|
387
|
-
- scripts/debug_p_matching.rb
|
|
388
|
-
- scripts/debug_signature_matching.rb
|
|
389
|
-
- scripts/debug_sourcecode_124.rb
|
|
390
|
-
- scripts/debug_whitespace_sensitive.rb
|
|
391
|
-
- scripts/extract_false_positives.rb
|
|
392
|
-
- scripts/find_actual_false_positives.rb
|
|
393
|
-
- scripts/investigate_all_false_positives.rb
|
|
394
|
-
- scripts/investigate_batch1.rb
|
|
395
|
-
- scripts/investigate_classification.rb
|
|
396
|
-
- scripts/investigate_classification_detailed.rb
|
|
397
|
-
- scripts/investigate_common_failures.rb
|
|
398
|
-
- scripts/investigate_false_negative.rb
|
|
399
|
-
- scripts/investigate_false_positive.rb
|
|
400
|
-
- scripts/investigate_false_positives.rb
|
|
401
|
-
- scripts/investigate_false_positives_batch.rb
|
|
402
|
-
- scripts/investigate_mixed_content.rb
|
|
403
|
-
- scripts/investigate_remaining_16.rb
|
|
404
|
-
- scripts/run_single_test.rb
|
|
405
|
-
- scripts/test_all_false_positives.rb
|
|
406
|
-
- scripts/test_attribute_details.rb
|
|
407
|
-
- scripts/test_both_algorithms.rb
|
|
408
|
-
- scripts/test_both_simple.rb
|
|
409
|
-
- scripts/test_enhanced_semantic_output.rb
|
|
410
|
-
- scripts/test_readme_examples.rb
|
|
411
|
-
- scripts/test_semantic_tree_diff.rb
|
|
412
|
-
- scripts/test_semantic_ux_improvements.rb
|
|
413
|
-
- scripts/test_single_false_positive.rb
|
|
414
|
-
- scripts/test_size_limits.rb
|
|
415
344
|
- sig/xml/c14n.rbs
|
|
416
|
-
- test_html_1.html
|
|
417
|
-
- test_html_2.html
|
|
418
|
-
- test_nokogiri.rb
|
|
419
|
-
- test_normalize.rb
|
|
420
345
|
homepage: https://github.com/lutaml/canon
|
|
421
346
|
licenses:
|
|
422
347
|
- BSD-2-Clause
|