canon 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +83 -22
- data/docs/Gemfile +1 -0
- data/docs/_config.yml +90 -1
- data/docs/advanced/diff-classification.adoc +196 -24
- data/docs/features/match-options/index.adoc +239 -1
- data/lib/canon/comparison/format_detector.rb +2 -1
- data/lib/canon/comparison/html_comparator.rb +19 -8
- data/lib/canon/comparison/html_compare_profile.rb +8 -2
- data/lib/canon/comparison/markup_comparator.rb +109 -2
- data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
- data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
- data/lib/canon/comparison/xml_comparator.rb +240 -23
- data/lib/canon/comparison/xml_node_comparison.rb +25 -3
- data/lib/canon/diff/diff_classifier.rb +119 -5
- data/lib/canon/diff/formatting_detector.rb +1 -1
- data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
- data/lib/canon/rspec_matchers.rb +37 -8
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +24 -13
- metadata +4 -78
- data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
- data/false_positive_analysis.txt +0 -0
- data/file1.html +0 -1
- data/file2.html +0 -1
- data/old-docs/ADVANCED_TOPICS.adoc +0 -20
- data/old-docs/BASIC_USAGE.adoc +0 -16
- data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
- data/old-docs/CLI.adoc +0 -497
- data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/old-docs/DIFF_FORMATTING.adoc +0 -540
- data/old-docs/DIFF_PARAMETERS.adoc +0 -261
- data/old-docs/DOM_DIFF.adoc +0 -1017
- data/old-docs/ENV_CONFIG.adoc +0 -876
- data/old-docs/FORMATS.adoc +0 -867
- data/old-docs/INPUT_VALIDATION.adoc +0 -477
- data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
- data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/old-docs/MATCH_OPTIONS.adoc +0 -912
- data/old-docs/MODES.adoc +0 -432
- data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/old-docs/OPTIONS.adoc +0 -1387
- data/old-docs/PREPROCESSING.adoc +0 -491
- data/old-docs/README.old.adoc +0 -2831
- data/old-docs/RSPEC.adoc +0 -814
- data/old-docs/RUBY_API.adoc +0 -485
- data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
- data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
- data/old-docs/STRING_COMPARE.adoc +0 -345
- data/old-docs/TMP.adoc +0 -3384
- data/old-docs/TREE_DIFF.adoc +0 -1080
- data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
- data/old-docs/VERBOSE.adoc +0 -482
- data/old-docs/VISUALIZATION_MAP.adoc +0 -625
- data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
- data/scripts/analyze_current_state.rb +0 -85
- data/scripts/analyze_false_positives.rb +0 -114
- data/scripts/analyze_remaining_failures.rb +0 -105
- data/scripts/compare_current_failures.rb +0 -95
- data/scripts/compare_dom_tree_diff.rb +0 -158
- data/scripts/compare_failures.rb +0 -151
- data/scripts/debug_attribute_extraction.rb +0 -66
- data/scripts/debug_blocks_839.rb +0 -115
- data/scripts/debug_meta_matching.rb +0 -52
- data/scripts/debug_p_matching.rb +0 -192
- data/scripts/debug_signature_matching.rb +0 -118
- data/scripts/debug_sourcecode_124.rb +0 -32
- data/scripts/debug_whitespace_sensitive.rb +0 -192
- data/scripts/extract_false_positives.rb +0 -138
- data/scripts/find_actual_false_positives.rb +0 -125
- data/scripts/investigate_all_false_positives.rb +0 -161
- data/scripts/investigate_batch1.rb +0 -127
- data/scripts/investigate_classification.rb +0 -150
- data/scripts/investigate_classification_detailed.rb +0 -190
- data/scripts/investigate_common_failures.rb +0 -342
- data/scripts/investigate_false_negative.rb +0 -80
- data/scripts/investigate_false_positive.rb +0 -83
- data/scripts/investigate_false_positives.rb +0 -227
- data/scripts/investigate_false_positives_batch.rb +0 -163
- data/scripts/investigate_mixed_content.rb +0 -125
- data/scripts/investigate_remaining_16.rb +0 -214
- data/scripts/run_single_test.rb +0 -29
- data/scripts/test_all_false_positives.rb +0 -95
- data/scripts/test_attribute_details.rb +0 -61
- data/scripts/test_both_algorithms.rb +0 -49
- data/scripts/test_both_simple.rb +0 -49
- data/scripts/test_enhanced_semantic_output.rb +0 -125
- data/scripts/test_readme_examples.rb +0 -131
- data/scripts/test_semantic_tree_diff.rb +0 -99
- data/scripts/test_semantic_ux_improvements.rb +0 -135
- data/scripts/test_single_false_positive.rb +0 -119
- data/scripts/test_size_limits.rb +0 -99
- data/test_html_1.html +0 -21
- data/test_html_2.html +0 -21
- data/test_nokogiri.rb +0 -33
- data/test_normalize.rb +0 -45
|
@@ -45,10 +45,37 @@ Match dimensions are orthogonal aspects that can be configured independently.
|
|
|
45
45
|
|
|
46
46
|
`:strict`:: Text must match exactly, character-for-character including all whitespace
|
|
47
47
|
|
|
48
|
-
`:normalize`:: Whitespace is normalized (collapsed/trimmed) before comparison
|
|
48
|
+
`:normalize`:: Whitespace is normalized (collapsed/trimmed) before comparison.
|
|
49
|
+
Formatting-only differences (e.g., extra spaces around text) are classified as
|
|
50
|
+
*informative* rather than normative. This means documents with only whitespace
|
|
51
|
+
differences in text content are considered equivalent.
|
|
49
52
|
|
|
50
53
|
`:ignore`:: Text content is completely ignored in comparison
|
|
51
54
|
|
|
55
|
+
.Using text_content: :normalize
|
|
56
|
+
[example]
|
|
57
|
+
====
|
|
58
|
+
[source,ruby]
|
|
59
|
+
----
|
|
60
|
+
# These are equivalent with :normalize
|
|
61
|
+
# Whitespace differences are formatting-only (informative)
|
|
62
|
+
Canon.equivalent?(
|
|
63
|
+
'<p> text </p>',
|
|
64
|
+
'<p>text</p>',
|
|
65
|
+
match: { text_content: :normalize }
|
|
66
|
+
)
|
|
67
|
+
# => true
|
|
68
|
+
|
|
69
|
+
# These differ in :strict mode
|
|
70
|
+
Canon.equivalent?(
|
|
71
|
+
'<p> text </p>',
|
|
72
|
+
'<p>text</p>',
|
|
73
|
+
match: { text_content: :strict }
|
|
74
|
+
)
|
|
75
|
+
# => false
|
|
76
|
+
----
|
|
77
|
+
====
|
|
78
|
+
|
|
52
79
|
=== structural_whitespace
|
|
53
80
|
|
|
54
81
|
**Applies to**: All formats
|
|
@@ -63,6 +90,200 @@ Match dimensions are orthogonal aspects that can be configured independently.
|
|
|
63
90
|
|
|
64
91
|
`:ignore`:: Structural whitespace is completely ignored
|
|
65
92
|
|
|
93
|
+
|
|
94
|
+
=== Whitespace sensitivity at element level
|
|
95
|
+
|
|
96
|
+
==== General
|
|
97
|
+
|
|
98
|
+
In XML, whitespace sensitivity can vary by schema and element:
|
|
99
|
+
|
|
100
|
+
* Elements that apply `xml:space="preserve"` are whitespace-sensitive.
|
|
101
|
+
|
|
102
|
+
* Other elements may be defined as sensitive by schema (e.g.
|
|
103
|
+
the `xs:whiteSpace` facet set to `preserve` in XML Schema) or unannounced conventions, such as
|
|
104
|
+
for mixed content.
|
|
105
|
+
|
|
106
|
+
In HTML, elements like `<pre>` and `<code>` preserve whitespace, while others
|
|
107
|
+
like `<div>` and `<p>` do not.
|
|
108
|
+
|
|
109
|
+
In the unannounced cases, the developer must indicate which elements are
|
|
110
|
+
whitespace-sensitive.
|
|
111
|
+
|
|
112
|
+
In Canon, you can control whitespace sensitivity at the element level using
|
|
113
|
+
`structural_whitespace: :strict` or `text_content: :normalize`.
|
|
114
|
+
|
|
115
|
+
Element-level sensitivity controls both:
|
|
116
|
+
|
|
117
|
+
* `structural_whitespace`: Whether whitespace between child elements of the element is
|
|
118
|
+
preserved
|
|
119
|
+
|
|
120
|
+
* `text_content`: Whether whitespace within text nodes of the element is
|
|
121
|
+
normalized
|
|
122
|
+
|
|
123
|
+
Options for controlling element-level sensitivity include:
|
|
124
|
+
|
|
125
|
+
* **xml:space attribute** - XML standard for declaring whitespace sensitivity in documents
|
|
126
|
+
* **whitelist/blacklist options** - User-specified element lists
|
|
127
|
+
* **Format defaults** - HTML has built-in sensitive elements
|
|
128
|
+
* **respect_xml_space option** - Control whether xml:space is honored
|
|
129
|
+
|
|
130
|
+
For elements marked as sensitive, whitespace differences are always normative.
|
|
131
|
+
|
|
132
|
+
For non-sensitive elements using `text_content: :normalize`, whitespace
|
|
133
|
+
differences are classified as formatting-only (informative).
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
==== xml:space attribute support
|
|
137
|
+
|
|
138
|
+
The `xml:space` attribute is the XML standard way to declare whitespace
|
|
139
|
+
sensitivity in XML instance documents:
|
|
140
|
+
|
|
141
|
+
[source,xml]
|
|
142
|
+
----
|
|
143
|
+
<!-- Preserve whitespace in this element -->
|
|
144
|
+
<code xml:space="preserve">
|
|
145
|
+
Indentation and newlines matter here
|
|
146
|
+
</code>
|
|
147
|
+
|
|
148
|
+
<!-- Use default behavior -->
|
|
149
|
+
<text xml:space="default">
|
|
150
|
+
Whitespace handling follows configured behavior
|
|
151
|
+
</text>
|
|
152
|
+
----
|
|
153
|
+
|
|
154
|
+
==== Whitelist and blacklist options
|
|
155
|
+
|
|
156
|
+
You can explicitly specify which elements are whitespace-sensitive:
|
|
157
|
+
|
|
158
|
+
[source,ruby]
|
|
159
|
+
----
|
|
160
|
+
# Specify elements that preserve whitespace
|
|
161
|
+
Canon::Comparison.equivalent?(xml1, xml2,
|
|
162
|
+
match: {
|
|
163
|
+
structural_whitespace: :strict,
|
|
164
|
+
whitespace_sensitive_elements: [:pre, :code, :sample],
|
|
165
|
+
whitespace_insensitive_elements: [:p, :div] # Override defaults/whitelist
|
|
166
|
+
}
|
|
167
|
+
)
|
|
168
|
+
----
|
|
169
|
+
|
|
170
|
+
==== respect_xml_space option
|
|
171
|
+
|
|
172
|
+
Control whether xml:space attributes in the document are honored:
|
|
173
|
+
|
|
174
|
+
[source,ruby]
|
|
175
|
+
----
|
|
176
|
+
# Honor xml:space (default)
|
|
177
|
+
Canon::Comparison.equivalent?(xml1, xml2,
|
|
178
|
+
match: {
|
|
179
|
+
structural_whitespace: :strict,
|
|
180
|
+
respect_xml_space: true # Use xml:space attributes in document
|
|
181
|
+
}
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
# Ignore xml:space, use only user configuration
|
|
185
|
+
Canon::Comparison.equivalent?(xml1, xml2,
|
|
186
|
+
match: {
|
|
187
|
+
structural_whitespace: :strict,
|
|
188
|
+
respect_xml_space: false # Override document declarations
|
|
189
|
+
}
|
|
190
|
+
)
|
|
191
|
+
----
|
|
192
|
+
|
|
193
|
+
==== Priority order
|
|
194
|
+
|
|
195
|
+
When determining if an element is whitespace-sensitive, Canon uses this priority:
|
|
196
|
+
|
|
197
|
+
[source]
|
|
198
|
+
----
|
|
199
|
+
1. respect_xml_space: false → User config only (ignore xml:space)
|
|
200
|
+
↓
|
|
201
|
+
2. User whitelist → Use whitelist (user explicitly declared)
|
|
202
|
+
↓
|
|
203
|
+
3. Format defaults → HTML: [:pre, :code, :textarea, :script, :style], XML: []
|
|
204
|
+
↓
|
|
205
|
+
4. User blacklist → Remove from defaults/whitelist
|
|
206
|
+
↓
|
|
207
|
+
5. xml:space="preserve" → Element is sensitive
|
|
208
|
+
↓
|
|
209
|
+
6. xml:space="default" → Use steps 1-4
|
|
210
|
+
----
|
|
211
|
+
|
|
212
|
+
==== Format-specific defaults
|
|
213
|
+
|
|
214
|
+
**HTML**:: `[:pre, :code, :textarea, :script, :style]` - Whitespace in these elements is treated as significant by default
|
|
215
|
+
**XML**:: `[]` - No default whitespace-sensitive elements, purely user-controlled
|
|
216
|
+
|
|
217
|
+
==== Examples
|
|
218
|
+
|
|
219
|
+
.Using xml:space attribute
|
|
220
|
+
[source,ruby]
|
|
221
|
+
----
|
|
222
|
+
xml1 = '<root><code xml:space="preserve"> indented </code></root>'
|
|
223
|
+
xml2 = '<root><code xml:space="preserve">indented</code></root>'
|
|
224
|
+
|
|
225
|
+
# These are NOT equivalent (whitespace matters inside xml:space="preserve" elements)
|
|
226
|
+
Canon::Comparison.equivalent?(xml1, xml2,
|
|
227
|
+
match: { structural_whitespace: :strict }
|
|
228
|
+
)
|
|
229
|
+
# => false
|
|
230
|
+
----
|
|
231
|
+
|
|
232
|
+
.Using whitelist
|
|
233
|
+
[source,ruby]
|
|
234
|
+
----
|
|
235
|
+
# Make <p> elements whitespace-sensitive
|
|
236
|
+
Canon::Comparison.equivalent?(xml1, xml2,
|
|
237
|
+
match: {
|
|
238
|
+
structural_whitespace: :strict,
|
|
239
|
+
whitespace_sensitive_elements: [:p, :pre]
|
|
240
|
+
}
|
|
241
|
+
)
|
|
242
|
+
----
|
|
243
|
+
|
|
244
|
+
.Overriding HTML defaults
|
|
245
|
+
[source,ruby]
|
|
246
|
+
----
|
|
247
|
+
# Make <script> NOT whitespace-sensitive (override HTML default)
|
|
248
|
+
Canon::Comparison.equivalent?(html1, html2,
|
|
249
|
+
format: :html,
|
|
250
|
+
match: {
|
|
251
|
+
structural_whitespace: :strict,
|
|
252
|
+
whitespace_insensitive_elements: [:script]
|
|
253
|
+
}
|
|
254
|
+
)
|
|
255
|
+
----
|
|
256
|
+
|
|
257
|
+
.Using text_content: :normalize with whitespace_insensitive_elements
|
|
258
|
+
[source,ruby]
|
|
259
|
+
----
|
|
260
|
+
# HTML defaults: [:pre, :code, :textarea, :script, :style]
|
|
261
|
+
# Excluding :code means it's no longer whitespace-sensitive
|
|
262
|
+
html1 = '<root><pre> indented </pre><code> code </code></root>'
|
|
263
|
+
html2 = '<root><pre> indented </pre><code>code</code></root>'
|
|
264
|
+
|
|
265
|
+
# With :code blacklisted, whitespace in <code> is normalized (formatting-only)
|
|
266
|
+
# HTML uses text_content: :normalize by default
|
|
267
|
+
Canon::Comparison.equivalent?(html1, html2,
|
|
268
|
+
format: :html,
|
|
269
|
+
match: {
|
|
270
|
+
whitespace_insensitive_elements: [:code],
|
|
271
|
+
}
|
|
272
|
+
)
|
|
273
|
+
# => true (whitespace differences in <code> are formatting-only)
|
|
274
|
+
|
|
275
|
+
# Without blacklisting, <code> is sensitive (whitespace matters)
|
|
276
|
+
Canon::Comparison.equivalent?(html1, html2,
|
|
277
|
+
format: :html,
|
|
278
|
+
match: {
|
|
279
|
+
structural_whitespace: :strict,
|
|
280
|
+
}
|
|
281
|
+
)
|
|
282
|
+
# => false (whitespace in <code> is normative)
|
|
283
|
+
----
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
|
|
66
287
|
=== attribute_whitespace
|
|
67
288
|
|
|
68
289
|
**Applies to**: XML, HTML only
|
|
@@ -414,6 +635,23 @@ expect(actual).to be_xml_equivalent_to(expected,
|
|
|
414
635
|
element_position: :ignore,
|
|
415
636
|
element_hierarchy: :ignore
|
|
416
637
|
)
|
|
638
|
+
|
|
639
|
+
# Element-level whitespace sensitivity
|
|
640
|
+
expect(actual).to be_xml_equivalent_to(expected,
|
|
641
|
+
match: { structural_whitespace: :strict }
|
|
642
|
+
)
|
|
643
|
+
.with_options(
|
|
644
|
+
whitespace_sensitive_elements: [:pre, :code, :sample],
|
|
645
|
+
respect_xml_space: true
|
|
646
|
+
)
|
|
647
|
+
|
|
648
|
+
# Override HTML default whitespace-sensitive elements
|
|
649
|
+
expect(html).to be_html_equivalent_to(expected,
|
|
650
|
+
match: { structural_whitespace: :strict }
|
|
651
|
+
)
|
|
652
|
+
.with_options(
|
|
653
|
+
whitespace_insensitive_elements: [:script, :style]
|
|
654
|
+
)
|
|
417
655
|
====
|
|
418
656
|
|
|
419
657
|
== Comments dimension
|
|
@@ -72,7 +72,8 @@ module Canon
|
|
|
72
72
|
return :json if trimmed.start_with?("{", "[")
|
|
73
73
|
|
|
74
74
|
# HTML indicators
|
|
75
|
-
return :html if trimmed.start_with?("<!DOCTYPE html", "<html",
|
|
75
|
+
return :html if trimmed.start_with?("<!DOCTYPE html", "<html",
|
|
76
|
+
"<HTML")
|
|
76
77
|
|
|
77
78
|
# XML indicators - must start with < and end with >
|
|
78
79
|
return :xml if trimmed.start_with?("<") && trimmed.end_with?(">")
|
|
@@ -13,6 +13,8 @@ require_relative "../diff/diff_classifier"
|
|
|
13
13
|
require_relative "strategies/match_strategy_factory"
|
|
14
14
|
require_relative "../html/data_model"
|
|
15
15
|
require_relative "xml_node_comparison"
|
|
16
|
+
# Whitespace sensitivity module (single source of truth for sensitive elements)
|
|
17
|
+
require_relative "whitespace_sensitivity"
|
|
16
18
|
|
|
17
19
|
module Canon
|
|
18
20
|
module Comparison
|
|
@@ -542,16 +544,22 @@ compare_profile = nil)
|
|
|
542
544
|
return if match_opts[:text_content] == :strict
|
|
543
545
|
|
|
544
546
|
# Elements where whitespace is significant - don't normalize
|
|
545
|
-
#
|
|
547
|
+
# SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
|
|
548
|
+
# This ensures consistency between preprocessing and comparison logic
|
|
549
|
+
# SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
|
|
550
|
+
# This ensures consistency between preprocessing and comparison logic
|
|
546
551
|
preserve_whitespace = if compare_profile.is_a?(HtmlCompareProfile)
|
|
547
552
|
# Profile handles HTML-specific whitespace rules
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
553
|
+
# Get default list and filter by profile
|
|
554
|
+
WhitespaceSensitivity
|
|
555
|
+
.format_default_sensitive_elements(match_opts)
|
|
556
|
+
.select do |elem|
|
|
557
|
+
compare_profile.preserve_whitespace?(elem.to_s)
|
|
558
|
+
end
|
|
559
|
+
.map(&:to_s)
|
|
552
560
|
else
|
|
553
|
-
#
|
|
554
|
-
|
|
561
|
+
# Use default list from WhitespaceSensitivity (single source of truth)
|
|
562
|
+
WhitespaceSensitivity.format_default_sensitive_elements(match_opts).map(&:to_s)
|
|
555
563
|
end
|
|
556
564
|
|
|
557
565
|
# Walk all text nodes
|
|
@@ -607,9 +615,12 @@ compare_profile = nil)
|
|
|
607
615
|
#
|
|
608
616
|
# CRITICAL: Do NOT remove whitespace-only text nodes from whitespace-sensitive
|
|
609
617
|
# elements like <pre>, <code>, <textarea>, <script>, <style>
|
|
618
|
+
#
|
|
619
|
+
# SINGLE SOURCE OF TRUTH: Uses WhitespaceSensitivity.format_default_sensitive_elements
|
|
610
620
|
def remove_whitespace_only_text_nodes(doc)
|
|
611
621
|
# Elements where whitespace is significant - don't remove whitespace-only nodes
|
|
612
|
-
|
|
622
|
+
# SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
|
|
623
|
+
preserve_whitespace = WhitespaceSensitivity.format_default_sensitive_elements(format: :html).map(&:to_s)
|
|
613
624
|
|
|
614
625
|
doc.xpath(".//text()").each do |text_node|
|
|
615
626
|
# CRITICAL: Skip if this text node is inside a whitespace-preserving element
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "compare_profile"
|
|
4
|
+
# Whitespace sensitivity module (single source of truth for sensitive elements)
|
|
5
|
+
require_relative "whitespace_sensitivity"
|
|
4
6
|
|
|
5
7
|
module Canon
|
|
6
8
|
module Comparison
|
|
@@ -82,9 +84,13 @@ module Canon
|
|
|
82
84
|
private
|
|
83
85
|
|
|
84
86
|
# Elements where whitespace is semantically significant in HTML
|
|
85
|
-
#
|
|
87
|
+
#
|
|
88
|
+
# SINGLE SOURCE OF TRUTH: Delegates to WhitespaceSensitivity.format_default_sensitive_elements
|
|
89
|
+
# This ensures consistency across the codebase.
|
|
90
|
+
#
|
|
91
|
+
# @return [Array<String>] List of element names (as strings)
|
|
86
92
|
def whitespace_sensitive_elements
|
|
87
|
-
|
|
93
|
+
WhitespaceSensitivity.format_default_sensitive_elements(format: @html_version).map(&:to_s)
|
|
88
94
|
end
|
|
89
95
|
|
|
90
96
|
# Check if a dimension is explicitly set to :strict
|
|
@@ -239,9 +239,116 @@ module Canon
|
|
|
239
239
|
# @param diff2 [Symbol] Difference type for node2
|
|
240
240
|
# @param dimension [Symbol] The dimension of the difference
|
|
241
241
|
# @return [String] Human-readable reason
|
|
242
|
-
def build_difference_reason(
|
|
242
|
+
def build_difference_reason(node1, node2, diff1, diff2, dimension)
|
|
243
|
+
# For attribute presence differences, show what attributes differ
|
|
244
|
+
if dimension == :attribute_presence
|
|
245
|
+
attrs1 = extract_attributes(node1)
|
|
246
|
+
attrs2 = extract_attributes(node2)
|
|
247
|
+
return build_attribute_difference_reason(attrs1, attrs2)
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
# For text content differences, show the actual text (truncated if needed)
|
|
251
|
+
if dimension == :text_content
|
|
252
|
+
text1 = extract_text_content_from_node(node1)
|
|
253
|
+
text2 = extract_text_content_from_node(node2)
|
|
254
|
+
return build_text_difference_reason(text1, text2)
|
|
255
|
+
end
|
|
256
|
+
|
|
243
257
|
# Default reason - can be overridden in subclasses
|
|
244
|
-
"
|
|
258
|
+
"#{diff1} vs #{diff2}"
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
# Build a clear reason message for attribute presence differences
|
|
262
|
+
# Shows which attributes are only in node1, only in node2, or have different values
|
|
263
|
+
#
|
|
264
|
+
# @param attrs1 [Hash, nil] First node's attributes
|
|
265
|
+
# @param attrs2 [Hash, nil] Second node's attributes
|
|
266
|
+
# @return [String] Clear explanation of the attribute difference
|
|
267
|
+
def build_attribute_difference_reason(attrs1, attrs2)
|
|
268
|
+
return "#{attrs1&.keys&.size || 0} vs #{attrs2&.keys&.size || 0} attributes" unless attrs1 && attrs2
|
|
269
|
+
|
|
270
|
+
require "set"
|
|
271
|
+
keys1 = attrs1.keys.to_set
|
|
272
|
+
keys2 = attrs2.keys.to_set
|
|
273
|
+
|
|
274
|
+
only_in_1 = keys1 - keys2
|
|
275
|
+
only_in_2 = keys2 - keys1
|
|
276
|
+
common = keys1 & keys2
|
|
277
|
+
|
|
278
|
+
# Check if values differ for common keys
|
|
279
|
+
different_values = common.reject { |k| attrs1[k] == attrs2[k] }
|
|
280
|
+
|
|
281
|
+
parts = []
|
|
282
|
+
parts << "only in first: #{only_in_1.to_a.sort.join(', ')}" if only_in_1.any?
|
|
283
|
+
parts << "only in second: #{only_in_2.to_a.sort.join(', ')}" if only_in_2.any?
|
|
284
|
+
parts << "different values: #{different_values.sort.join(', ')}" if different_values.any?
|
|
285
|
+
|
|
286
|
+
if parts.empty?
|
|
287
|
+
"#{keys1.size} vs #{keys2.size} attributes (same names)"
|
|
288
|
+
else
|
|
289
|
+
parts.join("; ")
|
|
290
|
+
end
|
|
291
|
+
end
|
|
292
|
+
|
|
293
|
+
# Extract text content from a node for diff reason
|
|
294
|
+
#
|
|
295
|
+
# @param node [Object, nil] Node to extract text from
|
|
296
|
+
# @return [String, nil] Text content or nil
|
|
297
|
+
def extract_text_content_from_node(node)
|
|
298
|
+
return nil if node.nil?
|
|
299
|
+
|
|
300
|
+
# For Canon::Xml::Nodes::TextNode
|
|
301
|
+
return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)
|
|
302
|
+
|
|
303
|
+
# For XML/HTML nodes with text_content method
|
|
304
|
+
return node.text_content if node.respond_to?(:text_content)
|
|
305
|
+
|
|
306
|
+
# For nodes with text method
|
|
307
|
+
return node.text if node.respond_to?(:text)
|
|
308
|
+
|
|
309
|
+
# For nodes with content method (Moxml::Text)
|
|
310
|
+
return node.content if node.respond_to?(:content)
|
|
311
|
+
|
|
312
|
+
# For nodes with value method (other types)
|
|
313
|
+
return node.value if node.respond_to?(:value)
|
|
314
|
+
|
|
315
|
+
# For simple text nodes or strings
|
|
316
|
+
return node.to_s if node.is_a?(String)
|
|
317
|
+
|
|
318
|
+
# For other node types, try to_s
|
|
319
|
+
node.to_s
|
|
320
|
+
rescue StandardError
|
|
321
|
+
nil
|
|
322
|
+
end
|
|
323
|
+
|
|
324
|
+
# Build a clear reason message for text content differences
|
|
325
|
+
# Shows the actual text content (truncated if too long)
|
|
326
|
+
#
|
|
327
|
+
# @param text1 [String, nil] First text content
|
|
328
|
+
# @param text2 [String, nil] Second text content
|
|
329
|
+
# @return [String] Clear explanation of the text difference
|
|
330
|
+
def build_text_difference_reason(text1, text2)
|
|
331
|
+
# Handle nil cases
|
|
332
|
+
return "missing vs '#{truncate_text(text2)}'" if text1.nil? && text2
|
|
333
|
+
return "'#{truncate_text(text1)}' vs missing" if text1 && text2.nil?
|
|
334
|
+
return "both missing" if text1.nil? && text2.nil?
|
|
335
|
+
|
|
336
|
+
# Both have content - show truncated versions
|
|
337
|
+
"'#{truncate_text(text1)}' vs '#{truncate_text(text2)}'"
|
|
338
|
+
end
|
|
339
|
+
|
|
340
|
+
# Truncate text for display in reason messages
|
|
341
|
+
#
|
|
342
|
+
# @param text [String] Text to truncate
|
|
343
|
+
# @param max_length [Integer] Maximum length
|
|
344
|
+
# @return [String] Truncated text
|
|
345
|
+
def truncate_text(text, max_length = 40)
|
|
346
|
+
return "" if text.nil?
|
|
347
|
+
|
|
348
|
+
text = text.to_s
|
|
349
|
+
return text if text.length <= max_length
|
|
350
|
+
|
|
351
|
+
"#{text[0...max_length]}..."
|
|
245
352
|
end
|
|
246
353
|
|
|
247
354
|
# Serialize an element node to string
|
|
@@ -27,6 +27,9 @@ module Canon
|
|
|
27
27
|
# Start with format-specific defaults
|
|
28
28
|
options = format_defaults(format).dup
|
|
29
29
|
|
|
30
|
+
# Store format for later use (e.g., WhitespaceSensitivity needs it)
|
|
31
|
+
options[:format] = format
|
|
32
|
+
|
|
30
33
|
# Apply global profile if specified
|
|
31
34
|
if global_profile
|
|
32
35
|
profile_opts = get_profile_options(global_profile)
|
|
@@ -111,12 +114,16 @@ module Canon
|
|
|
111
114
|
def validate_match_options!(match_options)
|
|
112
115
|
# Special options that don't need validation as dimensions
|
|
113
116
|
special_options = %i[
|
|
117
|
+
format
|
|
114
118
|
preprocessing
|
|
115
119
|
semantic_diff
|
|
116
120
|
similarity_threshold
|
|
117
121
|
hash_matching
|
|
118
122
|
similarity_matching
|
|
119
123
|
propagation
|
|
124
|
+
whitespace_sensitive_elements
|
|
125
|
+
whitespace_insensitive_elements
|
|
126
|
+
respect_xml_space
|
|
120
127
|
]
|
|
121
128
|
|
|
122
129
|
match_options.each do |dimension, behavior|
|