canon 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +52 -78
- data/docs/advanced/diff-classification.adoc +118 -26
- data/lib/canon/comparison/markup_comparator.rb +109 -2
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
- data/lib/canon/comparison/xml_comparator.rb +192 -0
- data/lib/canon/diff/diff_classifier.rb +48 -33
- data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
- data/lib/canon/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 4f9d0e9c0c1bc9f213d837f480d3d9a26ce11505691ff48b63907e7a4abd530e
|
|
4
|
+
data.tar.gz: aa591a7682cede5f23a8dcb8b8eb8f7616d849bc5f9cad1aa2038463ee9c52b0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6c0af5461fff1d1cd1347ba57681bc671cda71d55d62efd328ac9424ef10b8329ec877ccf43f9ff78e83a54ca03df1026e160b259396caac7bd2704227ef01b1
|
|
7
|
+
data.tar.gz: 8803713442225ae16c0c6c9c03c9cff55dd27dc6b96f5254ee5f814a29b7ad7b5ef6eafd0cd6a58d17f070a2609154476215147d595a01a69586ca7de8608a7f
|
data/.rubocop_todo.yml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on 2026-01-
|
|
3
|
+
# on 2026-01-21 01:26:28 UTC using RuboCop version 1.81.7.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
@@ -12,70 +12,51 @@ Gemspec/RequiredRubyVersion:
|
|
|
12
12
|
Exclude:
|
|
13
13
|
- 'canon.gemspec'
|
|
14
14
|
|
|
15
|
-
# Offense count:
|
|
16
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
17
|
-
# Configuration parameters: EnforcedStyleAlignWith.
|
|
18
|
-
# SupportedStylesAlignWith: either, start_of_block, start_of_line
|
|
19
|
-
Layout/BlockAlignment:
|
|
20
|
-
Exclude:
|
|
21
|
-
- 'spec/canon/rspec_matchers_spec.rb'
|
|
22
|
-
|
|
23
|
-
# Offense count: 2
|
|
24
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
25
|
-
Layout/BlockEndNewline:
|
|
26
|
-
Exclude:
|
|
27
|
-
- 'spec/canon/rspec_matchers_spec.rb'
|
|
28
|
-
|
|
29
|
-
# Offense count: 2
|
|
30
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
31
|
-
# Configuration parameters: AllowForAlignment.
|
|
32
|
-
Layout/CommentIndentation:
|
|
33
|
-
Exclude:
|
|
34
|
-
- 'lib/canon/comparison/xml_comparator.rb'
|
|
35
|
-
|
|
36
|
-
# Offense count: 1
|
|
15
|
+
# Offense count: 16
|
|
37
16
|
# This cop supports safe autocorrection (--autocorrect).
|
|
38
|
-
|
|
17
|
+
# Configuration parameters: EnforcedStyle, IndentationWidth.
|
|
18
|
+
# SupportedStyles: with_first_argument, with_fixed_indentation
|
|
19
|
+
Layout/ArgumentAlignment:
|
|
39
20
|
Exclude:
|
|
40
21
|
- 'lib/canon/comparison/xml_comparator.rb'
|
|
22
|
+
- 'lib/canon/diff/xml_serialization_formatter.rb'
|
|
23
|
+
- 'spec/canon/diff/xml_serialization_formatter_spec.rb'
|
|
41
24
|
|
|
42
25
|
# Offense count: 1
|
|
43
26
|
# This cop supports safe autocorrection (--autocorrect).
|
|
44
|
-
# Configuration parameters:
|
|
45
|
-
#
|
|
46
|
-
|
|
27
|
+
# Configuration parameters: AllowMultipleStyles, EnforcedHashRocketStyle, EnforcedColonStyle, EnforcedLastArgumentHashStyle.
|
|
28
|
+
# SupportedHashRocketStyles: key, separator, table
|
|
29
|
+
# SupportedColonStyles: key, separator, table
|
|
30
|
+
# SupportedLastArgumentHashStyles: always_inspect, always_ignore, ignore_implicit, ignore_explicit
|
|
31
|
+
Layout/HashAlignment:
|
|
47
32
|
Exclude:
|
|
48
|
-
- '
|
|
33
|
+
- 'test_verify_equivalent.rb'
|
|
49
34
|
|
|
50
|
-
# Offense count:
|
|
51
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
52
|
-
# Configuration parameters: EnforcedStyle.
|
|
53
|
-
# SupportedStyles: normal, indented_internal_methods
|
|
54
|
-
Layout/IndentationConsistency:
|
|
55
|
-
Exclude:
|
|
56
|
-
- 'lib/canon/comparison/xml_comparator.rb'
|
|
57
|
-
|
|
58
|
-
# Offense count: 4
|
|
59
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
60
|
-
# Configuration parameters: Width, AllowedPatterns.
|
|
61
|
-
Layout/IndentationWidth:
|
|
62
|
-
Exclude:
|
|
63
|
-
- 'spec/canon/rspec_matchers_spec.rb'
|
|
64
|
-
|
|
65
|
-
# Offense count: 655
|
|
35
|
+
# Offense count: 709
|
|
66
36
|
# This cop supports safe autocorrection (--autocorrect).
|
|
67
37
|
# Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, IgnoreCopDirectives, AllowedPatterns, SplitStrings.
|
|
68
38
|
# URISchemes: http, https
|
|
69
39
|
Layout/LineLength:
|
|
70
40
|
Enabled: false
|
|
71
41
|
|
|
72
|
-
# Offense count:
|
|
42
|
+
# Offense count: 4
|
|
73
43
|
# This cop supports safe autocorrection (--autocorrect).
|
|
74
44
|
# Configuration parameters: EnforcedStyle, IndentationWidth.
|
|
75
45
|
# SupportedStyles: aligned, indented
|
|
76
46
|
Layout/MultilineOperationIndentation:
|
|
77
47
|
Exclude:
|
|
78
48
|
- 'lib/canon/diff/diff_classifier.rb'
|
|
49
|
+
- 'lib/canon/diff/xml_serialization_formatter.rb'
|
|
50
|
+
|
|
51
|
+
# Offense count: 17
|
|
52
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
53
|
+
# Configuration parameters: AllowInHeredoc.
|
|
54
|
+
Layout/TrailingWhitespace:
|
|
55
|
+
Exclude:
|
|
56
|
+
- 'lib/canon/comparison/xml_comparator.rb'
|
|
57
|
+
- 'lib/canon/diff/xml_serialization_formatter.rb'
|
|
58
|
+
- 'spec/canon/diff/xml_serialization_formatter_spec.rb'
|
|
59
|
+
- 'test_verify_equivalent.rb'
|
|
79
60
|
|
|
80
61
|
# Offense count: 48
|
|
81
62
|
# Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
|
|
@@ -117,7 +98,7 @@ Lint/UnusedMethodArgument:
|
|
|
117
98
|
- 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
|
|
118
99
|
- 'lib/canon/diff_formatter/by_object/base_formatter.rb'
|
|
119
100
|
|
|
120
|
-
# Offense count:
|
|
101
|
+
# Offense count: 207
|
|
121
102
|
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
|
|
122
103
|
Metrics/AbcSize:
|
|
123
104
|
Enabled: false
|
|
@@ -128,12 +109,12 @@ Metrics/AbcSize:
|
|
|
128
109
|
Metrics/BlockLength:
|
|
129
110
|
Max: 84
|
|
130
111
|
|
|
131
|
-
# Offense count:
|
|
112
|
+
# Offense count: 176
|
|
132
113
|
# Configuration parameters: AllowedMethods, AllowedPatterns, Max.
|
|
133
114
|
Metrics/CyclomaticComplexity:
|
|
134
115
|
Enabled: false
|
|
135
116
|
|
|
136
|
-
# Offense count:
|
|
117
|
+
# Offense count: 360
|
|
137
118
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
138
119
|
Metrics/MethodLength:
|
|
139
120
|
Max: 110
|
|
@@ -143,7 +124,7 @@ Metrics/MethodLength:
|
|
|
143
124
|
Metrics/ParameterLists:
|
|
144
125
|
Max: 9
|
|
145
126
|
|
|
146
|
-
# Offense count:
|
|
127
|
+
# Offense count: 142
|
|
147
128
|
# Configuration parameters: AllowedMethods, AllowedPatterns, Max.
|
|
148
129
|
Metrics/PerceivedComplexity:
|
|
149
130
|
Enabled: false
|
|
@@ -167,13 +148,15 @@ Naming/PredicatePrefix:
|
|
|
167
148
|
Exclude:
|
|
168
149
|
- 'lib/canon/comparison/html_comparator.rb'
|
|
169
150
|
|
|
170
|
-
# Offense count:
|
|
151
|
+
# Offense count: 6
|
|
171
152
|
# Configuration parameters: EnforcedStyle, CheckMethodNames, CheckSymbols, AllowedIdentifiers, AllowedPatterns.
|
|
172
153
|
# SupportedStyles: snake_case, normalcase, non_integer
|
|
173
154
|
# AllowedIdentifiers: TLS1_1, TLS1_2, capture3, iso8601, rfc1123_date, rfc822, rfc2822, rfc3339, x86_64
|
|
174
155
|
Naming/VariableNumber:
|
|
175
156
|
Exclude:
|
|
176
157
|
- 'lib/canon/comparison/json_comparator.rb'
|
|
158
|
+
- 'lib/canon/comparison/markup_comparator.rb'
|
|
159
|
+
- 'lib/canon/comparison/xml_comparator/diff_node_builder.rb'
|
|
177
160
|
|
|
178
161
|
# Offense count: 2
|
|
179
162
|
# Configuration parameters: MinSize.
|
|
@@ -199,7 +182,7 @@ RSpec/DescribeMethod:
|
|
|
199
182
|
- 'spec/canon/comparison/multiple_differences_spec.rb'
|
|
200
183
|
- 'spec/canon/diff_formatter/character_map_customization_spec.rb'
|
|
201
184
|
|
|
202
|
-
# Offense count:
|
|
185
|
+
# Offense count: 675
|
|
203
186
|
# Configuration parameters: CountAsOne.
|
|
204
187
|
RSpec/ExampleLength:
|
|
205
188
|
Max: 67
|
|
@@ -250,7 +233,7 @@ RSpec/MultipleDescribes:
|
|
|
250
233
|
Exclude:
|
|
251
234
|
- 'spec/canon/comparison/match_options_spec.rb'
|
|
252
235
|
|
|
253
|
-
# Offense count:
|
|
236
|
+
# Offense count: 518
|
|
254
237
|
RSpec/MultipleExpectations:
|
|
255
238
|
Max: 15
|
|
256
239
|
|
|
@@ -296,26 +279,16 @@ RSpec/SpecFilePathFormat:
|
|
|
296
279
|
- 'spec/canon/yaml/formatter_spec.rb'
|
|
297
280
|
- 'spec/xml_c14n_spec.rb'
|
|
298
281
|
|
|
299
|
-
# Offense count:
|
|
282
|
+
# Offense count: 120
|
|
300
283
|
# Configuration parameters: IgnoreNameless, IgnoreSymbolicNames.
|
|
301
284
|
RSpec/VerifiedDoubles:
|
|
302
285
|
Exclude:
|
|
303
286
|
- 'spec/canon/comparison/whitespace_sensitivity_spec.rb'
|
|
304
287
|
- 'spec/canon/diff/diff_classifier_spec.rb'
|
|
305
288
|
- 'spec/canon/diff/path_builder_spec.rb'
|
|
289
|
+
- 'spec/canon/diff/xml_serialization_formatter_spec.rb'
|
|
306
290
|
- 'spec/canon/tree_diff/operation_converter_spec.rb'
|
|
307
291
|
|
|
308
|
-
# Offense count: 3
|
|
309
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
310
|
-
# Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
|
|
311
|
-
# SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
|
|
312
|
-
# ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
|
|
313
|
-
# FunctionalMethods: let, let!, subject, watch
|
|
314
|
-
# AllowedMethods: lambda, proc, it
|
|
315
|
-
Style/BlockDelimiters:
|
|
316
|
-
Exclude:
|
|
317
|
-
- 'spec/canon/rspec_matchers_spec.rb'
|
|
318
|
-
|
|
319
292
|
# Offense count: 1
|
|
320
293
|
# This cop supports safe autocorrection (--autocorrect).
|
|
321
294
|
# Configuration parameters: EnforcedStyle, AllowComments.
|
|
@@ -331,14 +304,21 @@ Style/HashLikeCase:
|
|
|
331
304
|
- 'lib/canon/diff/diff_block_builder.rb'
|
|
332
305
|
- 'lib/canon/xml/character_encoder.rb'
|
|
333
306
|
|
|
334
|
-
# Offense count:
|
|
307
|
+
# Offense count: 4
|
|
335
308
|
# This cop supports unsafe autocorrection (--autocorrect-all).
|
|
336
309
|
Style/IdenticalConditionalBranches:
|
|
337
310
|
Exclude:
|
|
338
|
-
- 'lib/canon/comparison/xml_comparator.rb'
|
|
339
311
|
- 'lib/canon/diff_formatter/by_object/base_formatter.rb'
|
|
340
312
|
- 'lib/canon/diff_formatter/legend.rb'
|
|
341
313
|
|
|
314
|
+
# Offense count: 2
|
|
315
|
+
# This cop supports unsafe autocorrection (--autocorrect-all).
|
|
316
|
+
# Configuration parameters: InverseMethods, InverseBlocks.
|
|
317
|
+
Style/InverseMethods:
|
|
318
|
+
Exclude:
|
|
319
|
+
- 'lib/canon/comparison/markup_comparator.rb'
|
|
320
|
+
- 'lib/canon/comparison/xml_comparator/diff_node_builder.rb'
|
|
321
|
+
|
|
342
322
|
# Offense count: 1
|
|
343
323
|
# Configuration parameters: AllowedMethods.
|
|
344
324
|
# AllowedMethods: respond_to_missing?
|
|
@@ -346,26 +326,20 @@ Style/OptionalBooleanParameter:
|
|
|
346
326
|
Exclude:
|
|
347
327
|
- 'lib/canon/diff_formatter/debug_output.rb'
|
|
348
328
|
|
|
349
|
-
# Offense count:
|
|
329
|
+
# Offense count: 3
|
|
350
330
|
# This cop supports safe autocorrection (--autocorrect).
|
|
351
331
|
# Configuration parameters: EnforcedStyle, ConsistentQuotesInMultiline.
|
|
352
332
|
# SupportedStyles: single_quotes, double_quotes
|
|
353
333
|
Style/StringLiterals:
|
|
354
334
|
Exclude:
|
|
355
|
-
- '
|
|
335
|
+
- 'lib/canon/comparison/markup_comparator.rb'
|
|
336
|
+
- 'lib/canon/comparison/xml_comparator/diff_node_builder.rb'
|
|
337
|
+
- 'test_verify_equivalent.rb'
|
|
356
338
|
|
|
357
|
-
# Offense count:
|
|
339
|
+
# Offense count: 12
|
|
358
340
|
# This cop supports safe autocorrection (--autocorrect).
|
|
359
341
|
# Configuration parameters: EnforcedStyleForMultiline.
|
|
360
342
|
# SupportedStylesForMultiline: comma, consistent_comma, diff_comma, no_comma
|
|
361
343
|
Style/TrailingCommaInArguments:
|
|
362
344
|
Exclude:
|
|
363
|
-
- 'spec/canon/
|
|
364
|
-
|
|
365
|
-
# Offense count: 3
|
|
366
|
-
# This cop supports safe autocorrection (--autocorrect).
|
|
367
|
-
# Configuration parameters: EnforcedStyleForMultiline.
|
|
368
|
-
# SupportedStylesForMultiline: comma, consistent_comma, diff_comma, no_comma
|
|
369
|
-
Style/TrailingCommaInHashLiteral:
|
|
370
|
-
Exclude:
|
|
371
|
-
- 'spec/canon/rspec_matchers_spec.rb'
|
|
345
|
+
- 'spec/canon/diff/xml_serialization_formatter_spec.rb'
|
|
@@ -80,14 +80,20 @@ Classification depends on `attribute_order` setting:
|
|
|
80
80
|
│ │
|
|
81
81
|
│ DiffClassifier examines each DiffNode: │
|
|
82
82
|
│ │
|
|
83
|
-
│
|
|
84
|
-
│
|
|
83
|
+
│ 1. Serialization-level formatting (XmlSerializationFormatter) │
|
|
84
|
+
│ → XML syntax differences: <tag/> vs <tag></tag> │
|
|
85
|
+
│ → ALWAYS formatting-only (non-normative) │
|
|
85
86
|
│ │
|
|
86
|
-
│
|
|
87
|
-
│
|
|
88
|
-
│
|
|
89
|
-
│ → NORMATIVE (difference matters) │
|
|
87
|
+
│ 2. Content-level formatting (text_content: :normalize) │
|
|
88
|
+
│ → Whitespace differences in content │
|
|
89
|
+
│ → Formatting-only when normalized content matches │
|
|
90
90
|
│ │
|
|
91
|
+
│ 3. CompareProfile policy (normative vs informative) │
|
|
92
|
+
│ → behavior == :ignore → INFORMATIVE │
|
|
93
|
+
│ → behavior == :strict → NORMATIVE │
|
|
94
|
+
│ → behavior == :normalize → Check content normalization │
|
|
95
|
+
│ │
|
|
96
|
+
│ Sets diff_node.formatting = true/false │
|
|
91
97
|
│ Sets diff_node.normative = true/false │
|
|
92
98
|
└───────────────────────────────────┬───────────────────────────────┘
|
|
93
99
|
↓
|
|
@@ -102,6 +108,27 @@ Classification depends on `attribute_order` setting:
|
|
|
102
108
|
└──────────────────────────────────────────────────────────────────┘
|
|
103
109
|
----
|
|
104
110
|
|
|
111
|
+
=== Three-Level Classification System
|
|
112
|
+
|
|
113
|
+
Canon distinguishes between **three distinct kinds of differences**:
|
|
114
|
+
|
|
115
|
+
| Kind | `formatting:` | `normative:` | Meaning | Examples |
|
|
116
|
+
|------|---------------|--------------|---------|----------|
|
|
117
|
+
| **Serialization formatting** | `true` | `false` | XML syntax differences | `<tag/>` vs `<tag></tag>` |
|
|
118
|
+
| **Content formatting** | `true` | `false` | Whitespace in content | `Hello world` vs `Hello world` |
|
|
119
|
+
| **Informative** | `false` | `false` | Tracked but doesn't affect equivalence | Attribute order (when `:ignore`) |
|
|
120
|
+
| **Normative** | `false` | `true` | Affects equivalence | Different words, missing elements |
|
|
121
|
+
|
|
122
|
+
**Key distinction**:
|
|
123
|
+
|
|
124
|
+
* **Serialization-level formatting**: XML syntax differences that are ALWAYS non-normative regardless of match options, because they represent different valid serializations of the same semantic content. Detected by `XmlSerializationFormatter`.
|
|
125
|
+
|
|
126
|
+
* **Content-level formatting**: Whitespace differences in document content. These are formatting-only (non-normative) when normalized content matches (using `text_content: :normalize`).
|
|
127
|
+
|
|
128
|
+
* **Informative**: Differences tracked for reference but don't affect equivalence (when behavior is `:ignore`).
|
|
129
|
+
|
|
130
|
+
* **Normative**: Semantic content differences that affect equivalence (when behavior is `:strict` or when normalized content differs).
|
|
131
|
+
|
|
105
132
|
== CompareProfile-Based Classification
|
|
106
133
|
|
|
107
134
|
=== Overview
|
|
@@ -120,22 +147,42 @@ DiffNode → DiffClassifier → CompareProfile → normative?
|
|
|
120
147
|
|
|
121
148
|
=== Classification Hierarchy
|
|
122
149
|
|
|
123
|
-
Canon uses a
|
|
150
|
+
Canon uses a **multi-level hierarchy** for classifying differences:
|
|
124
151
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
152
|
+
[source]
|
|
153
|
+
----
|
|
154
|
+
DiffNode → DiffClassifier → XmlSerializationFormatter → serialization formatting?
|
|
155
|
+
↓
|
|
156
|
+
CompareProfile → normative dimension?
|
|
157
|
+
↓
|
|
158
|
+
FormattingDetector → formatting-only?
|
|
159
|
+
↓
|
|
160
|
+
Final classification
|
|
161
|
+
----
|
|
162
|
+
|
|
163
|
+
**Classification priority (from highest to lowest specificity)**:
|
|
164
|
+
|
|
165
|
+
1. **Serialization-level formatting** (highest priority)
|
|
166
|
+
- XML syntax differences: `<tag/>` vs `<tag></tag>`
|
|
167
|
+
- Detected by `XmlSerializationFormatter`
|
|
168
|
+
- **ALWAYS** `formatting: true, normative: false`
|
|
169
|
+
- Bypasses all other classification logic
|
|
129
170
|
|
|
130
|
-
2. **
|
|
171
|
+
2. **Content-level formatting**
|
|
172
|
+
- Whitespace differences in document content
|
|
173
|
+
- Detected by `FormattingDetector` when `text_content: :normalize`
|
|
174
|
+
- `formatting: true, normative: false` when normalized content matches
|
|
175
|
+
- Respects element-level whitespace sensitivity
|
|
176
|
+
|
|
177
|
+
3. **Informative** (based on `:ignore` behavior)
|
|
131
178
|
- Tracked but doesn't affect equivalence
|
|
132
|
-
-
|
|
133
|
-
-
|
|
179
|
+
- `formatting: false, normative: false`
|
|
180
|
+
- Example: Attribute order when `attribute_order: :ignore`
|
|
134
181
|
|
|
135
|
-
|
|
182
|
+
4. **Normative** (based on `:strict` behavior or content mismatch)
|
|
136
183
|
- Affects equivalence
|
|
137
|
-
-
|
|
138
|
-
-
|
|
184
|
+
- `formatting: false, normative: true`
|
|
185
|
+
- Example: Different words, missing elements
|
|
139
186
|
|
|
140
187
|
=== Format-Specific Policies
|
|
141
188
|
|
|
@@ -292,6 +339,34 @@ Canon::Comparison.equivalent?(html1, html2, format: :html)
|
|
|
292
339
|
----
|
|
293
340
|
====
|
|
294
341
|
|
|
342
|
+
.Self-closing vs explicit closing tags
|
|
343
|
+
====
|
|
344
|
+
Per XML standards, `<tag/>` and `<tag></tag>` are semantically equivalent (both represent empty elements). Canon classifies differences in serialisation format as **formatting-only** (non-normative):
|
|
345
|
+
|
|
346
|
+
[source,ruby]
|
|
347
|
+
----
|
|
348
|
+
# Self-closing vs explicit closing - always equivalent
|
|
349
|
+
xml1 = '<svg><rect x="10" y="10"/></svg>'
|
|
350
|
+
xml2 = '<svg><rect x="10" y="10"></rect></svg>'
|
|
351
|
+
|
|
352
|
+
Canon::Comparison.equivalent?(xml1, xml2, format: :xml)
|
|
353
|
+
# => true
|
|
354
|
+
|
|
355
|
+
# Empty/whitespace-only text nodes from serialisation are formatting-only
|
|
356
|
+
result = Canon::Comparison.equivalent?(xml1, xml2, format: :xml, verbose: true)
|
|
357
|
+
result.differences.each do |diff|
|
|
358
|
+
if diff.dimension == :text_content
|
|
359
|
+
puts "Normative: #{diff.normative?}" # => false
|
|
360
|
+
puts "Formatting: #{diff.formatting?}" # => true
|
|
361
|
+
end
|
|
362
|
+
end
|
|
363
|
+
----
|
|
364
|
+
|
|
365
|
+
This applies regardless of `text_content` behavior setting, as these differences are purely serialisation format variations (similar to attribute order).
|
|
366
|
+
|
|
367
|
+
The key insight: empty or whitespace-only text nodes created by different serialisation styles (`<tag/>` vs `<tag></tag>`) are always classified as **formatting-only**, not normative.
|
|
368
|
+
====
|
|
369
|
+
|
|
295
370
|
=== FormattingDetector Integration
|
|
296
371
|
|
|
297
372
|
For dimensions that support it (`:text_content`, `:structural_whitespace`),
|
|
@@ -319,19 +394,35 @@ With `:normalize` mode:
|
|
|
319
394
|
|
|
320
395
|
=== Implementation Details
|
|
321
396
|
|
|
322
|
-
The
|
|
397
|
+
The classification system uses three main classes:
|
|
323
398
|
|
|
324
|
-
*
|
|
325
|
-
|
|
326
|
-
|
|
399
|
+
* **`XmlSerializationFormatter`** - Detects XML serialization-level formatting differences
|
|
400
|
+
- Self-closing vs explicit closing tags: `<tag/>` vs `<tag></tag>`
|
|
401
|
+
- Always returns `formatting: true, normative: false`
|
|
402
|
+
- These differences are ALWAYS non-normative regardless of match options
|
|
327
403
|
|
|
328
|
-
|
|
404
|
+
* **`CompareProfile`** - Determines dimension behavior and policy
|
|
405
|
+
- `normative_dimension?(dimension)` - Is this dimension normative?
|
|
406
|
+
- `affects_equivalence?(dimension)` - Does this dimension affect equivalence?
|
|
407
|
+
- `supports_formatting_detection?(dimension)` - Can this dimension have formatting-only diffs?
|
|
408
|
+
|
|
409
|
+
* **`DiffClassifier`** - Orchestrates classification using the above
|
|
410
|
+
- First checks `XmlSerializationFormatter` for serialization formatting
|
|
411
|
+
- Then handles content-level formatting (text_content: :normalize)
|
|
412
|
+
- Finally applies `CompareProfile` policy for normative vs informative
|
|
329
413
|
|
|
330
414
|
[source,ruby]
|
|
331
415
|
----
|
|
332
416
|
def classify(diff_node)
|
|
333
|
-
#
|
|
334
|
-
#
|
|
417
|
+
# FIRST: Check for XML serialization-level formatting differences
|
|
418
|
+
# These are ALWAYS non-normative (formatting-only) regardless of match options
|
|
419
|
+
if XmlSerializationFormatter.serialization_formatting?(diff_node)
|
|
420
|
+
diff_node.formatting = true
|
|
421
|
+
diff_node.normative = false
|
|
422
|
+
return diff_node
|
|
423
|
+
end
|
|
424
|
+
|
|
425
|
+
# SECOND: Handle content-level formatting for text_content with :normalize
|
|
335
426
|
if diff_node.dimension == :text_content &&
|
|
336
427
|
profile.send(:behavior_for, :text_content) == :normalize &&
|
|
337
428
|
!inside_whitespace_sensitive_element?(diff_node) &&
|
|
@@ -341,10 +432,10 @@ def classify(diff_node)
|
|
|
341
432
|
return diff_node
|
|
342
433
|
end
|
|
343
434
|
|
|
344
|
-
#
|
|
435
|
+
# THIRD: Apply CompareProfile policy
|
|
345
436
|
is_normative = profile.normative_dimension?(diff_node.dimension)
|
|
346
437
|
|
|
347
|
-
#
|
|
438
|
+
# FOURTH: Check FormattingDetector for non-normative dimensions
|
|
348
439
|
if !is_normative && profile.supports_formatting_detection?(diff_node.dimension)
|
|
349
440
|
if formatting_only_diff?(diff_node)
|
|
350
441
|
diff_node.formatting = true
|
|
@@ -353,6 +444,7 @@ def classify(diff_node)
|
|
|
353
444
|
end
|
|
354
445
|
end
|
|
355
446
|
|
|
447
|
+
# FIFTH: Apply normative determination
|
|
356
448
|
diff_node.normative = is_normative
|
|
357
449
|
diff_node
|
|
358
450
|
end
|
|
@@ -239,9 +239,116 @@ module Canon
|
|
|
239
239
|
# @param diff2 [Symbol] Difference type for node2
|
|
240
240
|
# @param dimension [Symbol] The dimension of the difference
|
|
241
241
|
# @return [String] Human-readable reason
|
|
242
|
-
def build_difference_reason(
|
|
242
|
+
def build_difference_reason(node1, node2, diff1, diff2, dimension)
|
|
243
|
+
# For attribute presence differences, show what attributes differ
|
|
244
|
+
if dimension == :attribute_presence
|
|
245
|
+
attrs1 = extract_attributes(node1)
|
|
246
|
+
attrs2 = extract_attributes(node2)
|
|
247
|
+
return build_attribute_difference_reason(attrs1, attrs2)
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
# For text content differences, show the actual text (truncated if needed)
|
|
251
|
+
if dimension == :text_content
|
|
252
|
+
text1 = extract_text_content_from_node(node1)
|
|
253
|
+
text2 = extract_text_content_from_node(node2)
|
|
254
|
+
return build_text_difference_reason(text1, text2)
|
|
255
|
+
end
|
|
256
|
+
|
|
243
257
|
# Default reason - can be overridden in subclasses
|
|
244
|
-
"
|
|
258
|
+
"#{diff1} vs #{diff2}"
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
# Build a clear reason message for attribute presence differences
|
|
262
|
+
# Shows which attributes are only in node1, only in node2, or different values
|
|
263
|
+
#
|
|
264
|
+
# @param attrs1 [Hash, nil] First node's attributes
|
|
265
|
+
# @param attrs2 [Hash, nil] Second node's attributes
|
|
266
|
+
# @return [String] Clear explanation of the attribute difference
|
|
267
|
+
def build_attribute_difference_reason(attrs1, attrs2)
|
|
268
|
+
return "#{attrs1&.keys&.size || 0} vs #{attrs2&.keys&.size || 0} attributes" unless attrs1 && attrs2
|
|
269
|
+
|
|
270
|
+
require "set"
|
|
271
|
+
keys1 = attrs1.keys.to_set
|
|
272
|
+
keys2 = attrs2.keys.to_set
|
|
273
|
+
|
|
274
|
+
only_in_1 = keys1 - keys2
|
|
275
|
+
only_in_2 = keys2 - keys1
|
|
276
|
+
common = keys1 & keys2
|
|
277
|
+
|
|
278
|
+
# Check if values differ for common keys
|
|
279
|
+
different_values = common.reject { |k| attrs1[k] == attrs2[k] }
|
|
280
|
+
|
|
281
|
+
parts = []
|
|
282
|
+
parts << "only in first: #{only_in_1.to_a.sort.join(', ')}" if only_in_1.any?
|
|
283
|
+
parts << "only in second: #{only_in_2.to_a.sort.join(', ')}" if only_in_2.any?
|
|
284
|
+
parts << "different values: #{different_values.sort.join(', ')}" if different_values.any?
|
|
285
|
+
|
|
286
|
+
if parts.empty?
|
|
287
|
+
"#{keys1.size} vs #{keys2.size} attributes (same names)"
|
|
288
|
+
else
|
|
289
|
+
parts.join("; ")
|
|
290
|
+
end
|
|
291
|
+
end
|
|
292
|
+
|
|
293
|
+
# Extract text content from a node for diff reason
|
|
294
|
+
#
|
|
295
|
+
# @param node [Object, nil] Node to extract text from
|
|
296
|
+
# @return [String, nil] Text content or nil
|
|
297
|
+
def extract_text_content_from_node(node)
|
|
298
|
+
return nil if node.nil?
|
|
299
|
+
|
|
300
|
+
# For Canon::Xml::Nodes::TextNode
|
|
301
|
+
return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)
|
|
302
|
+
|
|
303
|
+
# For XML/HTML nodes with text_content method
|
|
304
|
+
return node.text_content if node.respond_to?(:text_content)
|
|
305
|
+
|
|
306
|
+
# For nodes with text method
|
|
307
|
+
return node.text if node.respond_to?(:text)
|
|
308
|
+
|
|
309
|
+
# For nodes with content method (Moxml::Text)
|
|
310
|
+
return node.content if node.respond_to?(:content)
|
|
311
|
+
|
|
312
|
+
# For nodes with value method (other types)
|
|
313
|
+
return node.value if node.respond_to?(:value)
|
|
314
|
+
|
|
315
|
+
# For simple text nodes or strings
|
|
316
|
+
return node.to_s if node.is_a?(String)
|
|
317
|
+
|
|
318
|
+
# For other node types, try to_s
|
|
319
|
+
node.to_s
|
|
320
|
+
rescue StandardError
|
|
321
|
+
nil
|
|
322
|
+
end
|
|
323
|
+
|
|
324
|
+
# Build a clear reason message for text content differences
|
|
325
|
+
# Shows the actual text content (truncated if too long)
|
|
326
|
+
#
|
|
327
|
+
# @param text1 [String, nil] First text content
|
|
328
|
+
# @param text2 [String, nil] Second text content
|
|
329
|
+
# @return [String] Clear explanation of the text difference
|
|
330
|
+
def build_text_difference_reason(text1, text2)
|
|
331
|
+
# Handle nil cases
|
|
332
|
+
return "missing vs '#{truncate_text(text2)}'" if text1.nil? && text2
|
|
333
|
+
return "'#{truncate_text(text1)}' vs missing" if text1 && text2.nil?
|
|
334
|
+
return "both missing" if text1.nil? && text2.nil?
|
|
335
|
+
|
|
336
|
+
# Both have content - show truncated versions
|
|
337
|
+
"'#{truncate_text(text1)}' vs '#{truncate_text(text2)}'"
|
|
338
|
+
end
|
|
339
|
+
|
|
340
|
+
# Truncate text for display in reason messages
|
|
341
|
+
#
|
|
342
|
+
# @param text [String] Text to truncate
|
|
343
|
+
# @param max_length [Integer] Maximum length
|
|
344
|
+
# @return [String] Truncated text
|
|
345
|
+
def truncate_text(text, max_length = 40)
|
|
346
|
+
return "" if text.nil?
|
|
347
|
+
|
|
348
|
+
text = text.to_s
|
|
349
|
+
return text if text.length <= max_length
|
|
350
|
+
|
|
351
|
+
"#{text[0...max_length]}..."
|
|
245
352
|
end
|
|
246
353
|
|
|
247
354
|
# Serialize an element node to string
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "set"
|
|
3
4
|
require_relative "../../diff/diff_node"
|
|
4
5
|
require_relative "../../diff/path_builder"
|
|
5
6
|
require_relative "../../diff/node_serializer"
|
|
@@ -62,6 +63,21 @@ module Canon
|
|
|
62
63
|
end
|
|
63
64
|
end
|
|
64
65
|
|
|
66
|
+
# For attribute presence differences, show what attributes differ
|
|
67
|
+
if dimension == :attribute_presence
|
|
68
|
+
attrs1 = extract_attributes(node1)
|
|
69
|
+
attrs2 = extract_attributes(node2)
|
|
70
|
+
return build_attribute_difference_reason(attrs1, attrs2)
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# For text content differences, show the actual text (truncated if needed)
|
|
74
|
+
if dimension == :text_content
|
|
75
|
+
text1 = extract_text_content(node1)
|
|
76
|
+
text2 = extract_text_content(node2)
|
|
77
|
+
return build_text_difference_reason(text1, text2)
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# Default reason
|
|
65
81
|
"#{diff1} vs #{diff2}"
|
|
66
82
|
end
|
|
67
83
|
|
|
@@ -110,6 +126,98 @@ module Canon
|
|
|
110
126
|
|
|
111
127
|
Canon::Diff::NodeSerializer.extract_attributes(node)
|
|
112
128
|
end
|
|
129
|
+
|
|
130
|
+
# Build a clear reason message for attribute presence differences
|
|
131
|
+
# Shows which attributes are only in node1, only in node2, or different values
|
|
132
|
+
#
|
|
133
|
+
# @param attrs1 [Hash, nil] First node's attributes
|
|
134
|
+
# @param attrs2 [Hash, nil] Second node's attributes
|
|
135
|
+
# @return [String] Clear explanation of the attribute difference
|
|
136
|
+
def self.build_attribute_difference_reason(attrs1, attrs2)
|
|
137
|
+
return "#{attrs1&.keys&.size || 0} vs #{attrs2&.keys&.size || 0} attributes" unless attrs1 && attrs2
|
|
138
|
+
|
|
139
|
+
keys1 = attrs1.keys.to_set
|
|
140
|
+
keys2 = attrs2.keys.to_set
|
|
141
|
+
|
|
142
|
+
only_in_1 = keys1 - keys2
|
|
143
|
+
only_in_2 = keys2 - keys1
|
|
144
|
+
common = keys1 & keys2
|
|
145
|
+
|
|
146
|
+
# Check if values differ for common keys
|
|
147
|
+
different_values = common.reject { |k| attrs1[k] == attrs2[k] }
|
|
148
|
+
|
|
149
|
+
parts = []
|
|
150
|
+
parts << "only in first: #{only_in_1.to_a.sort.join(', ')}" if only_in_1.any?
|
|
151
|
+
parts << "only in second: #{only_in_2.to_a.sort.join(', ')}" if only_in_2.any?
|
|
152
|
+
parts << "different values: #{different_values.sort.join(', ')}" if different_values.any?
|
|
153
|
+
|
|
154
|
+
if parts.empty?
|
|
155
|
+
"#{keys1.size} vs #{keys2.size} attributes (same names)"
|
|
156
|
+
else
|
|
157
|
+
parts.join("; ")
|
|
158
|
+
end
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
# Extract text content from a node
|
|
162
|
+
#
|
|
163
|
+
# @param node [Object, nil] Node to extract text from
|
|
164
|
+
# @return [String, nil] Text content or nil
|
|
165
|
+
def self.extract_text_content(node)
|
|
166
|
+
return nil if node.nil?
|
|
167
|
+
|
|
168
|
+
# For Canon::Xml::Nodes::TextNode
|
|
169
|
+
return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)
|
|
170
|
+
|
|
171
|
+
# For XML/HTML nodes with text_content method
|
|
172
|
+
return node.text_content if node.respond_to?(:text_content)
|
|
173
|
+
|
|
174
|
+
# For nodes with text method
|
|
175
|
+
return node.text if node.respond_to?(:text)
|
|
176
|
+
|
|
177
|
+
# For nodes with content method (Moxml::Text)
|
|
178
|
+
return node.content if node.respond_to?(:content)
|
|
179
|
+
|
|
180
|
+
# For nodes with value method (other types)
|
|
181
|
+
return node.value if node.respond_to?(:value)
|
|
182
|
+
|
|
183
|
+
# For simple text nodes or strings
|
|
184
|
+
return node.to_s if node.is_a?(String)
|
|
185
|
+
|
|
186
|
+
# For other node types, try to_s
|
|
187
|
+
node.to_s
|
|
188
|
+
rescue StandardError
|
|
189
|
+
nil
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
# Build a clear reason message for text content differences
|
|
193
|
+
# Shows the actual text content (truncated if too long)
|
|
194
|
+
#
|
|
195
|
+
# @param text1 [String, nil] First text content
|
|
196
|
+
# @param text2 [String, nil] Second text content
|
|
197
|
+
# @return [String] Clear explanation of the text difference
|
|
198
|
+
def self.build_text_difference_reason(text1, text2)
  # Absent sides are reported as "missing"; present sides are quoted and
  # shortened via truncate.
  if text1.nil?
    return "both missing" if text2.nil?
    return "missing vs '#{truncate(text2)}'" if text2
  elsif text2.nil? && text1
    return "'#{truncate(text1)}' vs missing"
  end

  # Both sides carry content: show the two (possibly shortened) texts.
  left = truncate(text1)
  right = truncate(text2)
  "'#{left}' vs '#{right}'"
end
|
|
207
|
+
|
|
208
|
+
# Truncate text for display in reason messages
|
|
209
|
+
#
|
|
210
|
+
# @param text [String] Text to truncate
|
|
211
|
+
# @param max_length [Integer] Maximum length
|
|
212
|
+
# @return [String] Truncated text
|
|
213
|
+
def self.truncate(text, max_length = 40)
  # nil becomes "" through to_s; anything else is stringified before measuring.
  string = text.to_s
  # Longer values are cut to max_length characters and suffixed with "...".
  string.length > max_length ? "#{string[0...max_length]}..." : string
end
|
|
113
221
|
end
|
|
114
222
|
end
|
|
115
223
|
end
|
|
@@ -568,9 +568,201 @@ differences)
|
|
|
568
568
|
end
|
|
569
569
|
end
|
|
570
570
|
|
|
571
|
+
# For attribute presence differences, show what attributes differ
|
|
572
|
+
if dimension == :attribute_presence
|
|
573
|
+
attrs1 = extract_attributes(node1)
|
|
574
|
+
attrs2 = extract_attributes(node2)
|
|
575
|
+
return build_attribute_diff_reason(attrs1, attrs2)
|
|
576
|
+
end
|
|
577
|
+
|
|
578
|
+
# For text content differences, show the actual text (truncated if needed)
|
|
579
|
+
if dimension == :text_content
|
|
580
|
+
text1 = extract_text_from_node(node1)
|
|
581
|
+
text2 = extract_text_from_node(node2)
|
|
582
|
+
return build_text_diff_reason(text1, text2)
|
|
583
|
+
end
|
|
584
|
+
|
|
571
585
|
"#{diff1} vs #{diff2}"
|
|
572
586
|
end
|
|
573
587
|
|
|
588
|
+
# Build a clear reason message for attribute presence differences
|
|
589
|
+
#
|
|
590
|
+
# @param attrs1 [Hash, nil] First node's attributes
|
|
591
|
+
# @param attrs2 [Hash, nil] Second node's attributes
|
|
592
|
+
# @return [String] Clear explanation of the attribute difference
|
|
593
|
+
def build_attribute_diff_reason(attrs1, attrs2)
  # When either side has no attribute hash, only the counts can be reported.
  unless attrs1 && attrs2
    count1 = attrs1&.keys&.size || 0
    count2 = attrs2&.keys&.size || 0
    return "#{count1} vs #{count2} attributes"
  end

  require "set"
  first_names = attrs1.keys.to_set
  second_names = attrs2.keys.to_set

  # Names present on both sides whose values are not equal.
  changed = (first_names & second_names).reject { |name| attrs1[name] == attrs2[name] }

  # Label => names, in the order the segments appear in the message.
  labelled = {
    "only in first" => (first_names - second_names).to_a,
    "only in second" => (second_names - first_names).to_a,
    "different values" => changed,
  }
  segments = labelled.each_with_object([]) do |(label, names), acc|
    acc << "#{label}: #{names.sort.join(', ')}" if names.any?
  end

  return "#{first_names.size} vs #{second_names.size} attributes (same names)" if segments.empty?

  segments.join("; ")
end
|
|
618
|
+
|
|
619
|
+
# Extract text from a node for diff reason
|
|
620
|
+
#
|
|
621
|
+
# @param node [Object, nil] Node to extract text from
|
|
622
|
+
# @return [String, nil] Text content or nil
|
|
623
|
+
def extract_text_from_node(node)
  # Best-effort extraction of a node's text for the diff reason message.
  # Returns nil for a nil node or when any accessor raises.
  return nil if node.nil?

  # Canon's own text node exposes its text through #value.
  return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)

  # Use the first accessor the node supports. :content is probed before :text,
  # matching XmlSerializationFormatter, whose comments note that Moxml::Text's
  # #text returns "" (the real text is in #content). With :text first, the
  # Moxml::Text case was shadowed and the reason showed empty text.
  reader = %i[text_content content text value].find { |name| node.respond_to?(name) }
  return node.public_send(reader) if reader

  # Strings and any other object fall back to their string form.
  node.to_s
rescue StandardError
  nil
end
|
|
649
|
+
|
|
650
|
+
# Build a clear reason message for text content differences
|
|
651
|
+
#
|
|
652
|
+
# @param text1 [String, nil] First text content
|
|
653
|
+
# @param text2 [String, nil] Second text content
|
|
654
|
+
# @return [String] Clear explanation of the text difference
|
|
655
|
+
def build_text_diff_reason(text1, text2)
  # Absent sides are reported as "missing". The present side is quoted and
  # shortened. The "vs missing" branch must show text1 (text2 is nil there);
  # it previously interpolated text2 and always printed an empty quote.
  return "missing vs '#{truncate_text(text2)}'" if text1.nil? && text2
  return "'#{truncate_text(text1)}' vs missing" if text1 && text2.nil?
  return "both missing" if text1.nil? && text2.nil?

  # Two whitespace-only texts are summarised by character counts rather than
  # printed, since the raw characters would be invisible.
  if whitespace_only?(text1) && whitespace_only?(text2)
    return "whitespace: #{describe_whitespace(text1)} vs #{describe_whitespace(text2)}"
  end

  # Otherwise show both texts with whitespace replaced by visible markers.
  vis1 = visualize_whitespace(text1)
  vis2 = visualize_whitespace(text2)

  "Text: \"#{vis1}\" vs \"#{vis2}\""
end
|
|
673
|
+
|
|
674
|
+
# Check if text is only whitespace
|
|
675
|
+
#
|
|
676
|
+
# @param text [String] Text to check
|
|
677
|
+
# @return [Boolean] true if whitespace-only
|
|
678
|
+
def whitespace_only?(text)
  # nil is never whitespace-only; any other value is stringified and counts
  # when nothing remains after stripping.
  !text.nil? && text.to_s.strip.empty?
end
|
|
683
|
+
|
|
684
|
+
# Make whitespace visible in text content
|
|
685
|
+
# Uses the existing character visualization map from DiffFormatter (single source of truth)
|
|
686
|
+
#
|
|
687
|
+
# @param text [String] Text to visualize
|
|
688
|
+
# @return [String] Text with visible whitespace markers
|
|
689
|
+
def visualize_whitespace(text)
  return "" if text.nil?

  # Character => visible marker, as provided by character_visualization_map.
  markers = character_visualization_map

  # Swap each character for its marker; characters without one pass through.
  visible = text.each_char.map do |character|
    markers[character] || character
  end
  visible.join
end
|
|
698
|
+
|
|
699
|
+
# Get the character visualization map (lazy-loaded to avoid circular dependency)
|
|
700
|
+
#
|
|
701
|
+
# @return [Hash] Character to visualization symbol mapping
|
|
702
|
+
def character_visualization_map
  # Built once and memoized in an instance variable.
  @character_visualization_map ||= begin
    # The YAML file is read directly (rather than going through the formatter)
    # to avoid a circular dependency; yaml is required lazily for the same reason.
    require "yaml"
    yaml_path = File.join(File.expand_path("../..", __dir__),
                          "canon/diff_formatter/character_map.yml")
    entries = YAML.load_file(yaml_path)["characters"]

    entries.each_with_object({}) do |entry, map|
      # An entry names its character either by hex code point ("unicode")
      # or literally ("character", which covers \n, \t, etc.).
      key = if entry["unicode"]
              [entry["unicode"].to_i(16)].pack("U")
            else
              entry["character"]
            end
      map[key] = entry["visualization"]
    end
  end
end
|
|
730
|
+
|
|
731
|
+
# Describe whitespace content in a readable way
|
|
732
|
+
#
|
|
733
|
+
# @param text [String] Whitespace text
|
|
734
|
+
# @return [String] Description like "4 chars (2 newlines, 2 spaces)"
|
|
735
|
+
def describe_whitespace(text)
  # Summarises whitespace as "<n> chars (<breakdown>)", e.g.
  # "4 chars (2 newlines, 2 spaces)". nil and "" give "0 chars".
  # Coerce first: the caller's whitespace check stringifies its input, so a
  # non-String may reach this method.
  text = text.to_s
  return "0 chars" if text.empty?

  counts = {
    "newlines" => text.count("\n"),
    "spaces" => text.count(" "),
    "tabs" => text.count("\t"),
  }
  # Anything else that strips as whitespace (e.g. "\r", "\f", "\v") is counted
  # as "other", so the breakdown is never an empty "()".
  counts["other"] = text.length - counts.values.sum

  parts = counts.select { |_label, count| count.positive? }
                .map { |label, count| "#{count} #{label}" }

  "#{text.length} chars (#{parts.join(', ')})"
end
|
|
751
|
+
|
|
752
|
+
# Truncate text for display in reason messages
|
|
753
|
+
#
|
|
754
|
+
# @param text [String] Text to truncate
|
|
755
|
+
# @param max_length [Integer] Maximum length
|
|
756
|
+
# @return [String] Truncated text
|
|
757
|
+
def truncate_text(text, max_length = 40)
  # nil becomes "" through to_s; anything else is stringified before measuring.
  string = text.to_s
  # Longer values are cut to max_length characters and suffixed with "...".
  string.length > max_length ? "#{string[0...max_length]}..." : string
end
|
|
765
|
+
|
|
574
766
|
# Compare namespace declarations (xmlns and xmlns:* attributes)
|
|
575
767
|
# Delegates to XmlComparatorHelpers::NamespaceComparator
|
|
576
768
|
def compare_namespace_declarations(n1, n2, opts, differences)
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "formatting_detector"
|
|
4
|
+
require_relative "xml_serialization_formatter"
|
|
4
5
|
require_relative "../comparison/compare_profile"
|
|
5
6
|
require_relative "../comparison/whitespace_sensitivity"
|
|
6
7
|
|
|
@@ -8,6 +9,11 @@ module Canon
|
|
|
8
9
|
module Diff
|
|
9
10
|
# Classifies DiffNodes as normative (affects equivalence) or informative (doesn't affect equivalence)
|
|
10
11
|
# based on the match options in effect
|
|
12
|
+
#
|
|
13
|
+
# Classification hierarchy (three distinct kinds of differences):
|
|
14
|
+
# 1. Serialization formatting: XML syntax differences (always non-normative)
|
|
15
|
+
# 2. Content formatting: Whitespace differences in content (non-normative when normalized)
|
|
16
|
+
# 3. Normative: Semantic content differences (affect equivalence)
|
|
11
17
|
class DiffClassifier
|
|
12
18
|
attr_reader :match_options, :profile
|
|
13
19
|
|
|
@@ -25,11 +31,20 @@ module Canon
|
|
|
25
31
|
|
|
26
32
|
# Classify a single DiffNode as normative or informative
|
|
27
33
|
# Hierarchy: formatting-only < informative < normative
|
|
28
|
-
# CompareProfile determines base classification,
|
|
34
|
+
# CompareProfile determines base classification, XmlSerializationFormatter handles serialization formatting
|
|
29
35
|
# @param diff_node [DiffNode] The diff node to classify
|
|
30
36
|
# @return [DiffNode] The same diff node with normative/formatting attributes set
|
|
31
37
|
def classify(diff_node)
|
|
32
|
-
#
|
|
38
|
+
# FIRST: Check for XML serialization-level formatting differences
|
|
39
|
+
# These are ALWAYS non-normative (formatting-only) regardless of match options
|
|
40
|
+
# Examples: self-closing tags (<tag/>) vs explicit closing tags (<tag></tag>)
|
|
41
|
+
if XmlSerializationFormatter.serialization_formatting?(diff_node)
|
|
42
|
+
diff_node.formatting = true
|
|
43
|
+
diff_node.normative = false
|
|
44
|
+
return diff_node
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# SECOND: Handle content-level formatting for text_content with :normalize behavior
|
|
33
48
|
# When text_content is :normalize and the difference is formatting-only,
|
|
34
49
|
# it should be marked as non-normative (informative)
|
|
35
50
|
# This ensures that verbose and non-verbose modes give consistent results
|
|
@@ -38,7 +53,7 @@ module Canon
|
|
|
38
53
|
# (like <pre>, <code>, <textarea> in HTML), don't apply formatting detection
|
|
39
54
|
# because whitespace should be preserved in these elements
|
|
40
55
|
#
|
|
41
|
-
# This check must come
|
|
56
|
+
# This check must come BEFORE normative_dimension? is called,
|
|
42
57
|
# because normative_dimension? returns true for text_content: :normalize
|
|
43
58
|
# (since the dimension affects equivalence), which would prevent formatting
|
|
44
59
|
# detection from being applied.
|
|
@@ -51,11 +66,11 @@ module Canon
|
|
|
51
66
|
return diff_node
|
|
52
67
|
end
|
|
53
68
|
|
|
54
|
-
#
|
|
69
|
+
# THIRD: Determine if this dimension is normative based on CompareProfile
|
|
55
70
|
# This respects the policy settings (strict/normalize/ignore)
|
|
56
71
|
is_normative = profile.normative_dimension?(diff_node.dimension)
|
|
57
72
|
|
|
58
|
-
#
|
|
73
|
+
# FOURTH: Check if FormattingDetector should be consulted for non-normative dimensions
|
|
59
74
|
# Only check for formatting-only when dimension is NOT normative
|
|
60
75
|
# This ensures strict mode differences remain normative
|
|
61
76
|
should_check_formatting = !is_normative &&
|
|
@@ -68,7 +83,7 @@ module Canon
|
|
|
68
83
|
return diff_node
|
|
69
84
|
end
|
|
70
85
|
|
|
71
|
-
#
|
|
86
|
+
# FIFTH: Apply the normative determination from CompareProfile
|
|
72
87
|
diff_node.formatting = false
|
|
73
88
|
diff_node.normative = is_normative
|
|
74
89
|
|
|
@@ -127,33 +142,6 @@ module Canon
|
|
|
127
142
|
normalized1 == normalized2 && text1 != text2
|
|
128
143
|
end
|
|
129
144
|
|
|
130
|
-
# Check if a node is a text node
|
|
131
|
-
# @param node [Object] The node to check
|
|
132
|
-
# @return [Boolean] true if the node is a text node
|
|
133
|
-
def text_node?(node)
|
|
134
|
-
return false if node.nil?
|
|
135
|
-
|
|
136
|
-
# Canon::Xml::Nodes::TextNode
|
|
137
|
-
return true if node.is_a?(Canon::Xml::Nodes::TextNode)
|
|
138
|
-
|
|
139
|
-
# Nokogiri text nodes (node_type returns integer constant like 3)
|
|
140
|
-
return true if node.respond_to?(:node_type) &&
|
|
141
|
-
node.node_type.is_a?(Integer) &&
|
|
142
|
-
node.node_type == Nokogiri::XML::Node::TEXT_NODE
|
|
143
|
-
|
|
144
|
-
# Moxml text nodes (node_type returns symbol)
|
|
145
|
-
return true if node.respond_to?(:node_type) && node.node_type == :text
|
|
146
|
-
|
|
147
|
-
# String
|
|
148
|
-
return true if node.is_a?(String)
|
|
149
|
-
|
|
150
|
-
# Test doubles or objects with text node-like interface
|
|
151
|
-
# Check if it has a value method (contains text content)
|
|
152
|
-
return true if node.respond_to?(:value)
|
|
153
|
-
|
|
154
|
-
false
|
|
155
|
-
end
|
|
156
|
-
|
|
157
145
|
# Check if the text node is inside a whitespace-sensitive element
|
|
158
146
|
# @param diff_node [DiffNode] The diff node to check
|
|
159
147
|
# @return [Boolean] true if inside a whitespace-sensitive element
|
|
@@ -200,6 +188,33 @@ module Canon
|
|
|
200
188
|
# If extraction fails, return nil (not formatting-only)
|
|
201
189
|
nil
|
|
202
190
|
end
|
|
191
|
+
|
|
192
|
+
# Check if a node is a text node
|
|
193
|
+
# @param node [Object] The node to check
|
|
194
|
+
# @return [Boolean] true if the node is a text node
|
|
195
|
+
def text_node?(node)
  return false if node.nil?

  # Canon's own text node class.
  return true if node.is_a?(Canon::Xml::Nodes::TextNode)

  if node.respond_to?(:node_type)
    kind = node.node_type
    # An Integer node_type is compared with Nokogiri's TEXT_NODE constant;
    # a Symbol node_type (Moxml style) is compared with :text.
    integer_text = kind.is_a?(Integer) && kind == Nokogiri::XML::Node::TEXT_NODE
    return true if integer_text || kind == :text
  end

  # Plain strings count as text, as does anything exposing #value
  # (test doubles or other text-like objects).
  node.is_a?(String) || node.respond_to?(:value)
end
|
|
203
218
|
end
|
|
204
219
|
end
|
|
205
220
|
end
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
  module Diff
    # Detects XML serialization-level formatting differences.
    #
    # These are differences in XML syntax that leave the semantic content of
    # the document unchanged: two valid ways of writing the same thing. They
    # are always reported as non-normative (formatting-only), regardless of
    # the match options in effect.
    #
    # Examples:
    # - Self-closing vs explicit closing tags: <tag/> vs <tag></tag>
    # - Attribute quote style: attr="value" vs attr='value' (parser-normalized)
    # - Whitespace within tags: <tag a="1" b="2"> vs <tag  a="1"   b="2" >
    #   (parser-normalized)
    #
    # Quote style and in-tag spacing are already normalized away by XML
    # parsers, so this class concentrates on what survives parsing and
    # comparison: self-closing vs explicit closing tags.
    class XmlSerializationFormatter
      # Accessors probed, in order, when reading text from a non-Canon node.
      # :content comes before :text because Moxml::Text.text returns "".
      TEXT_READERS = %i[text_content content text value].freeze

      # Whether a diff node is an XML serialization formatting difference.
      #
      # Only the :text_content dimension is handled at present.
      #
      # @param diff_node [DiffNode] The diff node to check
      # @return [Boolean] true if this is a serialization formatting difference
      def self.serialization_formatting?(diff_node)
        return false unless diff_node.dimension == :text_content

        empty_text_content_serialization_diff?(diff_node)
      end

      # Whether a text_content difference comes from serialization format.
      #
      # A self-closing tag (<tag/>) yields no text node (nil) while an explicit
      # closing tag (<tag></tag>) may yield an empty or whitespace-only text
      # node. The difference is formatting-only when at least one side is
      # present and every present side is a blank text node.
      #
      # A node whose text cannot be read (an accessor raised) is treated as
      # NOT formatting-only, so the difference stays visible to the later
      # classification steps instead of being silently marked non-normative.
      #
      # @param diff_node [DiffNode] The diff node to check
      # @return [Boolean] true if this is a serialization formatting difference
      def self.empty_text_content_serialization_diff?(diff_node)
        return false unless diff_node.dimension == :text_content

        present = [diff_node.node1, diff_node.node2].compact
        # Both sides nil: there is no actual difference to classify.
        return false if present.empty?

        present.all? { |node| text_node?(node) } &&
          present.all? { |node| blank_text_node?(node) }
      end

      # Whether a value is blank (nil, empty, or whitespace-only).
      #
      # @param value [String, nil] Value to check
      # @return [Boolean] true if blank
      def self.blank?(value)
        value.nil? ||
          (value.respond_to?(:empty?) && value.empty?) ||
          (value.respond_to?(:strip) && value.strip.empty?)
      end

      # Whether a node's text is blank; false when the text cannot be read.
      #
      # @param node [Object] A non-nil node
      # @return [Boolean] true if the node's text was read and is blank
      def self.blank_text_node?(node)
        blank?(read_text_content(node))
      rescue StandardError
        false
      end

      # Whether a node is a text node.
      #
      # @param node [Object] The node to check
      # @return [Boolean] true if the node is a text node
      def self.text_node?(node)
        return false if node.nil?

        # Canon::Xml::Nodes::TextNode
        return true if node.is_a?(Canon::Xml::Nodes::TextNode)

        # Moxml::Text (checked before the generic node_type checks)
        return true if node.is_a?(Moxml::Text)

        # Nokogiri text nodes (node_type returns an integer constant)
        return true if node.respond_to?(:node_type) &&
          node.node_type.is_a?(Integer) &&
          node.node_type == Nokogiri::XML::Node::TEXT_NODE

        # Moxml text nodes (node_type returns a symbol)
        return true if node.respond_to?(:node_type) && node.node_type == :text

        # Plain strings
        return true if node.is_a?(String)

        # Test doubles or other objects with a text-node-like #value
        return true if node.respond_to?(:value)

        false
      end

      # Extract text content from a node, swallowing accessor errors.
      #
      # @param node [Object] The node to extract text from
      # @return [String, nil] The text content, or nil if the node is nil or
      #   extraction fails
      def self.extract_text_content(node)
        read_text_content(node)
      rescue StandardError
        nil
      end

      # Read text content from a node; accessor errors propagate to the caller.
      #
      # @param node [Object] The node to read text from
      # @return [String, nil] The text content, or nil for a nil node
      def self.read_text_content(node)
        return nil if node.nil?

        # Canon::Xml::Nodes::TextNode keeps its text in #value.
        return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)

        reader = TEXT_READERS.find { |name| node.respond_to?(name) }
        return node.public_send(reader) if reader

        # Strings and any other object fall back to their string form.
        node.to_s
      end

      private_class_method :blank?, :blank_text_node?, :text_node?,
                           :extract_text_content, :read_text_content,
                           :empty_text_content_serialization_diff?
    end
  end
end
|
data/lib/canon/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: canon
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.9
|
|
4
|
+
version: 0.1.10
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01-
|
|
11
|
+
date: 2026-01-21 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: diff-lcs
|
|
@@ -257,6 +257,7 @@ files:
|
|
|
257
257
|
- lib/canon/diff/formatting_detector.rb
|
|
258
258
|
- lib/canon/diff/node_serializer.rb
|
|
259
259
|
- lib/canon/diff/path_builder.rb
|
|
260
|
+
- lib/canon/diff/xml_serialization_formatter.rb
|
|
260
261
|
- lib/canon/diff_formatter.rb
|
|
261
262
|
- lib/canon/diff_formatter/by_line/base_formatter.rb
|
|
262
263
|
- lib/canon/diff_formatter/by_line/html_formatter.rb
|