canon 0.1.9 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 24f79ae4b9b6817104e388a5bef96d24677f797db5d13bd6f009a26a04170137
4
- data.tar.gz: 3b8260af8e2157f2f449421b3d40649521ba64495a15f7667e64c3e343b6d3b7
3
+ metadata.gz: 4f9d0e9c0c1bc9f213d837f480d3d9a26ce11505691ff48b63907e7a4abd530e
4
+ data.tar.gz: aa591a7682cede5f23a8dcb8b8eb8f7616d849bc5f9cad1aa2038463ee9c52b0
5
5
  SHA512:
6
- metadata.gz: 971daa53fd96c5c46b5c37c2175f12875e7e36f658cc4186848f1df90ab3db9ceff06af69320e5004d1da1d6b2dc4b35d800c5aca1b1522ac05cf14c73025c21
7
- data.tar.gz: 02f5160a42bf651db2a252909966cbc7dea43239cbbae2a53d155c55c4e09709eddc34e599496bfc5cda14705931f24efbebd267b9acc8700b834fb859d8096f
6
+ metadata.gz: 6c0af5461fff1d1cd1347ba57681bc671cda71d55d62efd328ac9424ef10b8329ec877ccf43f9ff78e83a54ca03df1026e160b259396caac7bd2704227ef01b1
7
+ data.tar.gz: 8803713442225ae16c0c6c9c03c9cff55dd27dc6b96f5254ee5f814a29b7ad7b5ef6eafd0cd6a58d17f070a2609154476215147d595a01a69586ca7de8608a7f
data/.rubocop_todo.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2026-01-20 02:18:38 UTC using RuboCop version 1.81.7.
3
+ # on 2026-01-21 01:26:28 UTC using RuboCop version 1.81.7.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
@@ -12,70 +12,51 @@ Gemspec/RequiredRubyVersion:
12
12
  Exclude:
13
13
  - 'canon.gemspec'
14
14
 
15
- # Offense count: 2
16
- # This cop supports safe autocorrection (--autocorrect).
17
- # Configuration parameters: EnforcedStyleAlignWith.
18
- # SupportedStylesAlignWith: either, start_of_block, start_of_line
19
- Layout/BlockAlignment:
20
- Exclude:
21
- - 'spec/canon/rspec_matchers_spec.rb'
22
-
23
- # Offense count: 2
24
- # This cop supports safe autocorrection (--autocorrect).
25
- Layout/BlockEndNewline:
26
- Exclude:
27
- - 'spec/canon/rspec_matchers_spec.rb'
28
-
29
- # Offense count: 2
30
- # This cop supports safe autocorrection (--autocorrect).
31
- # Configuration parameters: AllowForAlignment.
32
- Layout/CommentIndentation:
33
- Exclude:
34
- - 'lib/canon/comparison/xml_comparator.rb'
35
-
36
- # Offense count: 1
15
+ # Offense count: 16
37
16
  # This cop supports safe autocorrection (--autocorrect).
38
- Layout/ElseAlignment:
17
+ # Configuration parameters: EnforcedStyle, IndentationWidth.
18
+ # SupportedStyles: with_first_argument, with_fixed_indentation
19
+ Layout/ArgumentAlignment:
39
20
  Exclude:
40
21
  - 'lib/canon/comparison/xml_comparator.rb'
22
+ - 'lib/canon/diff/xml_serialization_formatter.rb'
23
+ - 'spec/canon/diff/xml_serialization_formatter_spec.rb'
41
24
 
42
25
  # Offense count: 1
43
26
  # This cop supports safe autocorrection (--autocorrect).
44
- # Configuration parameters: EnforcedStyleAlignWith, Severity.
45
- # SupportedStylesAlignWith: keyword, variable, start_of_line
46
- Layout/EndAlignment:
27
+ # Configuration parameters: AllowMultipleStyles, EnforcedHashRocketStyle, EnforcedColonStyle, EnforcedLastArgumentHashStyle.
28
+ # SupportedHashRocketStyles: key, separator, table
29
+ # SupportedColonStyles: key, separator, table
30
+ # SupportedLastArgumentHashStyles: always_inspect, always_ignore, ignore_implicit, ignore_explicit
31
+ Layout/HashAlignment:
47
32
  Exclude:
48
- - 'lib/canon/comparison/xml_comparator.rb'
33
+ - 'test_verify_equivalent.rb'
49
34
 
50
- # Offense count: 1
51
- # This cop supports safe autocorrection (--autocorrect).
52
- # Configuration parameters: EnforcedStyle.
53
- # SupportedStyles: normal, indented_internal_methods
54
- Layout/IndentationConsistency:
55
- Exclude:
56
- - 'lib/canon/comparison/xml_comparator.rb'
57
-
58
- # Offense count: 4
59
- # This cop supports safe autocorrection (--autocorrect).
60
- # Configuration parameters: Width, AllowedPatterns.
61
- Layout/IndentationWidth:
62
- Exclude:
63
- - 'spec/canon/rspec_matchers_spec.rb'
64
-
65
- # Offense count: 655
35
+ # Offense count: 709
66
36
  # This cop supports safe autocorrection (--autocorrect).
67
37
  # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, IgnoreCopDirectives, AllowedPatterns, SplitStrings.
68
38
  # URISchemes: http, https
69
39
  Layout/LineLength:
70
40
  Enabled: false
71
41
 
72
- # Offense count: 3
42
+ # Offense count: 4
73
43
  # This cop supports safe autocorrection (--autocorrect).
74
44
  # Configuration parameters: EnforcedStyle, IndentationWidth.
75
45
  # SupportedStyles: aligned, indented
76
46
  Layout/MultilineOperationIndentation:
77
47
  Exclude:
78
48
  - 'lib/canon/diff/diff_classifier.rb'
49
+ - 'lib/canon/diff/xml_serialization_formatter.rb'
50
+
51
+ # Offense count: 17
52
+ # This cop supports safe autocorrection (--autocorrect).
53
+ # Configuration parameters: AllowInHeredoc.
54
+ Layout/TrailingWhitespace:
55
+ Exclude:
56
+ - 'lib/canon/comparison/xml_comparator.rb'
57
+ - 'lib/canon/diff/xml_serialization_formatter.rb'
58
+ - 'spec/canon/diff/xml_serialization_formatter_spec.rb'
59
+ - 'test_verify_equivalent.rb'
79
60
 
80
61
  # Offense count: 48
81
62
  # Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
@@ -117,7 +98,7 @@ Lint/UnusedMethodArgument:
117
98
  - 'lib/canon/diff_formatter/by_line/xml_formatter.rb'
118
99
  - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
119
100
 
120
- # Offense count: 194
101
+ # Offense count: 207
121
102
  # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
122
103
  Metrics/AbcSize:
123
104
  Enabled: false
@@ -128,12 +109,12 @@ Metrics/AbcSize:
128
109
  Metrics/BlockLength:
129
110
  Max: 84
130
111
 
131
- # Offense count: 164
112
+ # Offense count: 176
132
113
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
133
114
  Metrics/CyclomaticComplexity:
134
115
  Enabled: false
135
116
 
136
- # Offense count: 346
117
+ # Offense count: 360
137
118
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
138
119
  Metrics/MethodLength:
139
120
  Max: 110
@@ -143,7 +124,7 @@ Metrics/MethodLength:
143
124
  Metrics/ParameterLists:
144
125
  Max: 9
145
126
 
146
- # Offense count: 131
127
+ # Offense count: 142
147
128
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
148
129
  Metrics/PerceivedComplexity:
149
130
  Enabled: false
@@ -167,13 +148,15 @@ Naming/PredicatePrefix:
167
148
  Exclude:
168
149
  - 'lib/canon/comparison/html_comparator.rb'
169
150
 
170
- # Offense count: 2
151
+ # Offense count: 6
171
152
  # Configuration parameters: EnforcedStyle, CheckMethodNames, CheckSymbols, AllowedIdentifiers, AllowedPatterns.
172
153
  # SupportedStyles: snake_case, normalcase, non_integer
173
154
  # AllowedIdentifiers: TLS1_1, TLS1_2, capture3, iso8601, rfc1123_date, rfc822, rfc2822, rfc3339, x86_64
174
155
  Naming/VariableNumber:
175
156
  Exclude:
176
157
  - 'lib/canon/comparison/json_comparator.rb'
158
+ - 'lib/canon/comparison/markup_comparator.rb'
159
+ - 'lib/canon/comparison/xml_comparator/diff_node_builder.rb'
177
160
 
178
161
  # Offense count: 2
179
162
  # Configuration parameters: MinSize.
@@ -199,7 +182,7 @@ RSpec/DescribeMethod:
199
182
  - 'spec/canon/comparison/multiple_differences_spec.rb'
200
183
  - 'spec/canon/diff_formatter/character_map_customization_spec.rb'
201
184
 
202
- # Offense count: 663
185
+ # Offense count: 675
203
186
  # Configuration parameters: CountAsOne.
204
187
  RSpec/ExampleLength:
205
188
  Max: 67
@@ -250,7 +233,7 @@ RSpec/MultipleDescribes:
250
233
  Exclude:
251
234
  - 'spec/canon/comparison/match_options_spec.rb'
252
235
 
253
- # Offense count: 515
236
+ # Offense count: 518
254
237
  RSpec/MultipleExpectations:
255
238
  Max: 15
256
239
 
@@ -296,26 +279,16 @@ RSpec/SpecFilePathFormat:
296
279
  - 'spec/canon/yaml/formatter_spec.rb'
297
280
  - 'spec/xml_c14n_spec.rb'
298
281
 
299
- # Offense count: 95
282
+ # Offense count: 120
300
283
  # Configuration parameters: IgnoreNameless, IgnoreSymbolicNames.
301
284
  RSpec/VerifiedDoubles:
302
285
  Exclude:
303
286
  - 'spec/canon/comparison/whitespace_sensitivity_spec.rb'
304
287
  - 'spec/canon/diff/diff_classifier_spec.rb'
305
288
  - 'spec/canon/diff/path_builder_spec.rb'
289
+ - 'spec/canon/diff/xml_serialization_formatter_spec.rb'
306
290
  - 'spec/canon/tree_diff/operation_converter_spec.rb'
307
291
 
308
- # Offense count: 3
309
- # This cop supports safe autocorrection (--autocorrect).
310
- # Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
311
- # SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
312
- # ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
313
- # FunctionalMethods: let, let!, subject, watch
314
- # AllowedMethods: lambda, proc, it
315
- Style/BlockDelimiters:
316
- Exclude:
317
- - 'spec/canon/rspec_matchers_spec.rb'
318
-
319
292
  # Offense count: 1
320
293
  # This cop supports safe autocorrection (--autocorrect).
321
294
  # Configuration parameters: EnforcedStyle, AllowComments.
@@ -331,14 +304,21 @@ Style/HashLikeCase:
331
304
  - 'lib/canon/diff/diff_block_builder.rb'
332
305
  - 'lib/canon/xml/character_encoder.rb'
333
306
 
334
- # Offense count: 6
307
+ # Offense count: 4
335
308
  # This cop supports unsafe autocorrection (--autocorrect-all).
336
309
  Style/IdenticalConditionalBranches:
337
310
  Exclude:
338
- - 'lib/canon/comparison/xml_comparator.rb'
339
311
  - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
340
312
  - 'lib/canon/diff_formatter/legend.rb'
341
313
 
314
+ # Offense count: 2
315
+ # This cop supports unsafe autocorrection (--autocorrect-all).
316
+ # Configuration parameters: InverseMethods, InverseBlocks.
317
+ Style/InverseMethods:
318
+ Exclude:
319
+ - 'lib/canon/comparison/markup_comparator.rb'
320
+ - 'lib/canon/comparison/xml_comparator/diff_node_builder.rb'
321
+
342
322
  # Offense count: 1
343
323
  # Configuration parameters: AllowedMethods.
344
324
  # AllowedMethods: respond_to_missing?
@@ -346,26 +326,20 @@ Style/OptionalBooleanParameter:
346
326
  Exclude:
347
327
  - 'lib/canon/diff_formatter/debug_output.rb'
348
328
 
349
- # Offense count: 6
329
+ # Offense count: 3
350
330
  # This cop supports safe autocorrection (--autocorrect).
351
331
  # Configuration parameters: EnforcedStyle, ConsistentQuotesInMultiline.
352
332
  # SupportedStyles: single_quotes, double_quotes
353
333
  Style/StringLiterals:
354
334
  Exclude:
355
- - 'spec/canon/rspec_matchers_spec.rb'
335
+ - 'lib/canon/comparison/markup_comparator.rb'
336
+ - 'lib/canon/comparison/xml_comparator/diff_node_builder.rb'
337
+ - 'test_verify_equivalent.rb'
356
338
 
357
- # Offense count: 5
339
+ # Offense count: 12
358
340
  # This cop supports safe autocorrection (--autocorrect).
359
341
  # Configuration parameters: EnforcedStyleForMultiline.
360
342
  # SupportedStylesForMultiline: comma, consistent_comma, diff_comma, no_comma
361
343
  Style/TrailingCommaInArguments:
362
344
  Exclude:
363
- - 'spec/canon/rspec_matchers_spec.rb'
364
-
365
- # Offense count: 3
366
- # This cop supports safe autocorrection (--autocorrect).
367
- # Configuration parameters: EnforcedStyleForMultiline.
368
- # SupportedStylesForMultiline: comma, consistent_comma, diff_comma, no_comma
369
- Style/TrailingCommaInHashLiteral:
370
- Exclude:
371
- - 'spec/canon/rspec_matchers_spec.rb'
345
+ - 'spec/canon/diff/xml_serialization_formatter_spec.rb'
@@ -80,14 +80,20 @@ Classification depends on `attribute_order` setting:
80
80
  │ │
81
81
  │ DiffClassifier examines each DiffNode: │
82
82
  │ │
83
- For each dimension:
84
- behavior = match_options.behavior_for(dimension)
83
+ │ 1. Serialization-level formatting (XmlSerializationFormatter) │
84
+ │ XML syntax differences: <tag/> vs <tag></tag> │
85
+ │ → ALWAYS formatting-only (non-normative) │
85
86
  │ │
86
- if behavior == :ignore
87
- INFORMATIVE (difference doesn't matter)
88
- else # :strict or :normalize
89
- │ → NORMATIVE (difference matters) │
87
+ │ 2. Content-level formatting (text_content: :normalize) │
88
+ │ Whitespace differences in content │
89
+ │ → Formatting-only when normalized content matches │
90
90
  │ │
91
+ │ 3. CompareProfile policy (normative vs informative) │
92
+ │ → behavior == :ignore → INFORMATIVE │
93
+ │ → behavior == :strict → NORMATIVE │
94
+ │ → behavior == :normalize → Check content normalization │
95
+ │ │
96
+ │ Sets diff_node.formatting = true/false │
91
97
  │ Sets diff_node.normative = true/false │
92
98
  └───────────────────────────────────┬───────────────────────────────┘
93
99
 
@@ -102,6 +108,27 @@ Classification depends on `attribute_order` setting:
102
108
  └──────────────────────────────────────────────────────────────────┘
103
109
  ----
104
110
 
111
+ === Three-Level Classification System
112
+
113
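+ Canon distinguishes between **four kinds of differences** across three levels (formatting, informative, normative); the formatting level is split into serialization-level and content-level formatting: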
114
+
115
+ | Kind | `formatting:` | `normative:` | Meaning | Examples |
116
+ |------|---------------|--------------|---------|----------|
117
+ | **Serialization formatting** | `true` | `false` | XML syntax differences | `<tag/>` vs `<tag></tag>` |
118
+ | **Content formatting** | `true` | `false` | Whitespace in content | `Hello   world` (extra spaces) vs `Hello world` |
119
+ | **Informative** | `false` | `false` | Tracked but doesn't affect equivalence | Attribute order (when `:ignore`) |
120
+ | **Normative** | `false` | `true` | Affects equivalence | Different words, missing elements |
121
+
122
+ **Key distinction**:
123
+
124
+ * **Serialization-level formatting**: XML syntax differences that are ALWAYS non-normative regardless of match options, because they represent different valid serializations of the same semantic content. Detected by `XmlSerializationFormatter`.
125
+
126
+ * **Content-level formatting**: Whitespace differences in document content. These are formatting-only (non-normative) when normalized content matches (using `text_content: :normalize`).
127
+
128
+ * **Informative**: Differences that are tracked for reference but do not affect equivalence (when behavior is `:ignore`).
129
+
130
+ * **Normative**: Semantic content differences that affect equivalence (when behavior is `:strict` or when normalized content differs).
131
+
105
132
  == CompareProfile-Based Classification
106
133
 
107
134
  === Overview
@@ -120,22 +147,42 @@ DiffNode → DiffClassifier → CompareProfile → normative?
120
147
 
121
148
  === Classification Hierarchy
122
149
 
123
- Canon uses a three-level hierarchy for classifying differences:
150
+ Canon uses a **multi-level hierarchy** for classifying differences:
124
151
 
125
- 1. **Formatting-only** (lowest priority)
126
- - Pure whitespace/formatting differences
127
- - Normalized content is identical
128
- - Markers: `[` and `]` in diff output
152
+ [source]
153
+ ----
154
+ DiffNode → DiffClassifier → XmlSerializationFormatter → serialization formatting?
155
+
156
+ CompareProfile → normative dimension?
157
+
158
+ FormattingDetector → formatting-only?
159
+
160
+ Final classification
161
+ ----
162
+
163
+ **Classification priority (from highest to lowest specificity)**:
164
+
165
+ 1. **Serialization-level formatting** (highest priority)
166
+ - XML syntax differences: `<tag/>` vs `<tag></tag>`
167
+ - Detected by `XmlSerializationFormatter`
168
+ - **ALWAYS** `formatting: true, normative: false`
169
+ - Bypasses all other classification logic
129
170
 
130
- 2. **Informative** (medium priority)
171
+ 2. **Content-level formatting**
172
+ - Whitespace differences in document content
173
+ - Detected by `FormattingDetector` when `text_content: :normalize`
174
+ - `formatting: true, normative: false` when normalized content matches
175
+ - Respects element-level whitespace sensitivity
176
+
177
+ 3. **Informative** (based on `:ignore` behavior)
131
178
  - Tracked but doesn't affect equivalence
132
- - Based on behavior `:ignore`
133
- - Markers: `<` and `>` in diff output
179
+ - `formatting: false, normative: false`
180
+ - Example: Attribute order when `attribute_order: :ignore`
134
181
 
135
- 3. **Normative** (highest priority)
182
+ 4. **Normative** (based on `:strict` behavior or content mismatch)
136
183
  - Affects equivalence
137
- - Based on behavior `:strict`
138
- - Markers: `-` and `+` in diff output
184
+ - `formatting: false, normative: true`
185
+ - Example: Different words, missing elements
139
186
 
140
187
  === Format-Specific Policies
141
188
 
@@ -292,6 +339,34 @@ Canon::Comparison.equivalent?(html1, html2, format: :html)
292
339
  ----
293
340
  ====
294
341
 
342
+ .Self-closing vs explicit closing tags
343
+ ====
344
+ Per XML standards, `<tag/>` and `<tag></tag>` are semantically equivalent (both represent empty elements). Canon classifies differences in serialization format as **formatting-only** (non-normative):
345
+
346
+ [source,ruby]
347
+ ----
348
+ # Self-closing vs explicit closing - always equivalent
349
+ xml1 = '<svg><rect x="10" y="10"/></svg>'
350
+ xml2 = '<svg><rect x="10" y="10"></rect></svg>'
351
+
352
+ Canon::Comparison.equivalent?(xml1, xml2, format: :xml)
353
+ # => true
354
+
355
+ # Empty/whitespace-only text nodes from serialization are formatting-only
356
+ result = Canon::Comparison.equivalent?(xml1, xml2, format: :xml, verbose: true)
357
+ result.differences.each do |diff|
358
+ if diff.dimension == :text_content
359
+ puts "Normative: #{diff.normative?}" # => false
360
+ puts "Formatting: #{diff.formatting?}" # => true
361
+ end
362
+ end
363
+ ----
364
+
365
+ This applies regardless of the `text_content` behavior setting, as these differences are purely serialization format variations (similar to attribute order).
366
+
367
+ The key insight: empty or whitespace-only text nodes created by different serialization styles (`<tag/>` vs `<tag></tag>`) are always classified as **formatting-only**, not normative.
368
+ ====
369
+
295
370
  === FormattingDetector Integration
296
371
 
297
372
  For dimensions that support it (`:text_content`, `:structural_whitespace`),
@@ -319,19 +394,35 @@ With `:normalize` mode:
319
394
 
320
395
  === Implementation Details
321
396
 
322
- The [`CompareProfile`](../../lib/canon/comparison/compare_profile.rb) class provides:
397
+ The classification system uses three main classes:
323
398
 
324
- * `normative_dimension?(dimension)` - Is this dimension normative?
325
- * `affects_equivalence?(dimension)` - Does this dimension affect equivalence?
326
- * `supports_formatting_detection?(dimension)` - Can this dimension have formatting-only diffs?
399
+ * **`XmlSerializationFormatter`** - Detects XML serialization-level formatting differences
400
+ - Self-closing vs explicit closing tags: `<tag/>` vs `<tag></tag>`
401
+ - Always returns `formatting: true, normative: false`
402
+ - These differences are ALWAYS non-normative regardless of match options
327
403
 
328
- The [`DiffClassifier`](../../lib/canon/diff/diff_classifier.rb) uses CompareProfile to classify differences, with special handling for `text_content: :normalize`:
404
+ * **`CompareProfile`** - Determines dimension behavior and policy
405
+ - `normative_dimension?(dimension)` - Is this dimension normative?
406
+ - `affects_equivalence?(dimension)` - Does this dimension affect equivalence?
407
+ - `supports_formatting_detection?(dimension)` - Can this dimension have formatting-only diffs?
408
+
409
+ * **`DiffClassifier`** - Orchestrates classification using the above
410
+ - First checks `XmlSerializationFormatter` for serialization formatting
411
+ - Then handles content-level formatting (text_content: :normalize)
412
+ - Finally applies `CompareProfile` policy for normative vs informative
329
413
 
330
414
  [source,ruby]
331
415
  ----
332
416
  def classify(diff_node)
333
- # SPECIAL CASE: text_content with :normalize behavior
334
- # Formatting-only differences (whitespace-only) are marked as non-normative
417
+ # FIRST: Check for XML serialization-level formatting differences
418
+ # These are ALWAYS non-normative (formatting-only) regardless of match options
419
+ if XmlSerializationFormatter.serialization_formatting?(diff_node)
420
+ diff_node.formatting = true
421
+ diff_node.normative = false
422
+ return diff_node
423
+ end
424
+
425
+ # SECOND: Handle content-level formatting for text_content with :normalize
335
426
  if diff_node.dimension == :text_content &&
336
427
  profile.send(:behavior_for, :text_content) == :normalize &&
337
428
  !inside_whitespace_sensitive_element?(diff_node) &&
@@ -341,10 +432,10 @@ def classify(diff_node)
341
432
  return diff_node
342
433
  end
343
434
 
344
- # Standard classification flow
435
+ # THIRD: Apply CompareProfile policy
345
436
  is_normative = profile.normative_dimension?(diff_node.dimension)
346
437
 
347
- # Only check formatting for non-normative dimensions
438
+ # FOURTH: Check FormattingDetector for non-normative dimensions
348
439
  if !is_normative && profile.supports_formatting_detection?(diff_node.dimension)
349
440
  if formatting_only_diff?(diff_node)
350
441
  diff_node.formatting = true
@@ -353,6 +444,7 @@ def classify(diff_node)
353
444
  end
354
445
  end
355
446
 
447
+ # FIFTH: Apply normative determination
356
448
  diff_node.normative = is_normative
357
449
  diff_node
358
450
  end
@@ -239,9 +239,116 @@ module Canon
239
239
  # @param diff2 [Symbol] Difference type for node2
240
240
  # @param dimension [Symbol] The dimension of the difference
241
241
  # @return [String] Human-readable reason
242
- def build_difference_reason(_node1, _node2, diff1, diff2, dimension)
242
+ def build_difference_reason(node1, node2, diff1, diff2, dimension)
243
+ # For attribute presence differences, show what attributes differ
244
+ if dimension == :attribute_presence
245
+ attrs1 = extract_attributes(node1)
246
+ attrs2 = extract_attributes(node2)
247
+ return build_attribute_difference_reason(attrs1, attrs2)
248
+ end
249
+
250
+ # For text content differences, show the actual text (truncated if needed)
251
+ if dimension == :text_content
252
+ text1 = extract_text_content_from_node(node1)
253
+ text2 = extract_text_content_from_node(node2)
254
+ return build_text_difference_reason(text1, text2)
255
+ end
256
+
243
257
  # Default reason - can be overridden in subclasses
244
- "Difference in #{dimension}: #{diff1} vs #{diff2}"
258
+ "#{diff1} vs #{diff2}"
259
+ end
260
+
261
+ # Build a clear reason message for attribute presence differences
262
+ # Shows which attributes are only in node1, only in node2, or different values
263
+ #
264
+ # @param attrs1 [Hash, nil] First node's attributes
265
+ # @param attrs2 [Hash, nil] Second node's attributes
266
+ # @return [String] Clear explanation of the attribute difference
267
+ def build_attribute_difference_reason(attrs1, attrs2)
268
+ return "#{attrs1&.keys&.size || 0} vs #{attrs2&.keys&.size || 0} attributes" unless attrs1 && attrs2
269
+
270
+ require "set"
271
+ keys1 = attrs1.keys.to_set
272
+ keys2 = attrs2.keys.to_set
273
+
274
+ only_in_1 = keys1 - keys2
275
+ only_in_2 = keys2 - keys1
276
+ common = keys1 & keys2
277
+
278
+ # Check if values differ for common keys
279
+ different_values = common.reject { |k| attrs1[k] == attrs2[k] }
280
+
281
+ parts = []
282
+ parts << "only in first: #{only_in_1.to_a.sort.join(', ')}" if only_in_1.any?
283
+ parts << "only in second: #{only_in_2.to_a.sort.join(', ')}" if only_in_2.any?
284
+ parts << "different values: #{different_values.sort.join(', ')}" if different_values.any?
285
+
286
+ if parts.empty?
287
+ "#{keys1.size} vs #{keys2.size} attributes (same names)"
288
+ else
289
+ parts.join("; ")
290
+ end
291
+ end
292
+
293
+ # Extract text content from a node for diff reason
294
+ #
295
+ # @param node [Object, nil] Node to extract text from
296
+ # @return [String, nil] Text content or nil
297
+ def extract_text_content_from_node(node)
298
+ return nil if node.nil?
299
+
300
+ # For Canon::Xml::Nodes::TextNode
301
+ return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)
302
+
303
+ # For XML/HTML nodes with text_content method
304
+ return node.text_content if node.respond_to?(:text_content)
305
+
306
+ # For nodes with text method
307
+ return node.text if node.respond_to?(:text)
308
+
309
+ # For nodes with content method (Moxml::Text)
310
+ return node.content if node.respond_to?(:content)
311
+
312
+ # For nodes with value method (other types)
313
+ return node.value if node.respond_to?(:value)
314
+
315
+ # For simple text nodes or strings
316
+ return node.to_s if node.is_a?(String)
317
+
318
+ # For other node types, try to_s
319
+ node.to_s
320
+ rescue StandardError
321
+ nil
322
+ end
323
+
324
+ # Build a clear reason message for text content differences
325
+ # Shows the actual text content (truncated if too long)
326
+ #
327
+ # @param text1 [String, nil] First text content
328
+ # @param text2 [String, nil] Second text content
329
+ # @return [String] Clear explanation of the text difference
330
+ def build_text_difference_reason(text1, text2)
331
+ # Handle nil cases
332
+ return "missing vs '#{truncate_text(text2)}'" if text1.nil? && text2
333
+ return "'#{truncate_text(text1)}' vs missing" if text1 && text2.nil?
334
+ return "both missing" if text1.nil? && text2.nil?
335
+
336
+ # Both have content - show truncated versions
337
+ "'#{truncate_text(text1)}' vs '#{truncate_text(text2)}'"
338
+ end
339
+
340
+ # Truncate text for display in reason messages
341
+ #
342
+ # @param text [String] Text to truncate
343
+ # @param max_length [Integer] Maximum length
344
+ # @return [String] Truncated text
345
+ def truncate_text(text, max_length = 40)
346
+ return "" if text.nil?
347
+
348
+ text = text.to_s
349
+ return text if text.length <= max_length
350
+
351
+ "#{text[0...max_length]}..."
245
352
  end
246
353
 
247
354
  # Serialize an element node to string
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "set"
3
4
  require_relative "../../diff/diff_node"
4
5
  require_relative "../../diff/path_builder"
5
6
  require_relative "../../diff/node_serializer"
@@ -62,6 +63,21 @@ module Canon
62
63
  end
63
64
  end
64
65
 
66
+ # For attribute presence differences, show what attributes differ
67
+ if dimension == :attribute_presence
68
+ attrs1 = extract_attributes(node1)
69
+ attrs2 = extract_attributes(node2)
70
+ return build_attribute_difference_reason(attrs1, attrs2)
71
+ end
72
+
73
+ # For text content differences, show the actual text (truncated if needed)
74
+ if dimension == :text_content
75
+ text1 = extract_text_content(node1)
76
+ text2 = extract_text_content(node2)
77
+ return build_text_difference_reason(text1, text2)
78
+ end
79
+
80
+ # Default reason
65
81
  "#{diff1} vs #{diff2}"
66
82
  end
67
83
 
@@ -110,6 +126,98 @@ module Canon
110
126
 
111
127
  Canon::Diff::NodeSerializer.extract_attributes(node)
112
128
  end
129
+
130
+ # Build a clear reason message for attribute presence differences
131
+ # Shows which attributes are only in node1, only in node2, or different values
132
+ #
133
+ # @param attrs1 [Hash, nil] First node's attributes
134
+ # @param attrs2 [Hash, nil] Second node's attributes
135
+ # @return [String] Clear explanation of the attribute difference
136
+ def self.build_attribute_difference_reason(attrs1, attrs2)
137
+ return "#{attrs1&.keys&.size || 0} vs #{attrs2&.keys&.size || 0} attributes" unless attrs1 && attrs2
138
+
139
+ keys1 = attrs1.keys.to_set
140
+ keys2 = attrs2.keys.to_set
141
+
142
+ only_in_1 = keys1 - keys2
143
+ only_in_2 = keys2 - keys1
144
+ common = keys1 & keys2
145
+
146
+ # Check if values differ for common keys
147
+ different_values = common.reject { |k| attrs1[k] == attrs2[k] }
148
+
149
+ parts = []
150
+ parts << "only in first: #{only_in_1.to_a.sort.join(', ')}" if only_in_1.any?
151
+ parts << "only in second: #{only_in_2.to_a.sort.join(', ')}" if only_in_2.any?
152
+ parts << "different values: #{different_values.sort.join(', ')}" if different_values.any?
153
+
154
+ if parts.empty?
155
+ "#{keys1.size} vs #{keys2.size} attributes (same names)"
156
+ else
157
+ parts.join("; ")
158
+ end
159
+ end
160
+
161
+ # Extract text content from a node
162
+ #
163
+ # @param node [Object, nil] Node to extract text from
164
+ # @return [String, nil] Text content or nil
165
+ def self.extract_text_content(node)
166
+ return nil if node.nil?
167
+
168
+ # For Canon::Xml::Nodes::TextNode
169
+ return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)
170
+
171
+ # For XML/HTML nodes with text_content method
172
+ return node.text_content if node.respond_to?(:text_content)
173
+
174
+ # For nodes with text method
175
+ return node.text if node.respond_to?(:text)
176
+
177
+ # For nodes with content method (Moxml::Text)
178
+ return node.content if node.respond_to?(:content)
179
+
180
+ # For nodes with value method (other types)
181
+ return node.value if node.respond_to?(:value)
182
+
183
+ # For simple text nodes or strings
184
+ return node.to_s if node.is_a?(String)
185
+
186
+ # For other node types, try to_s
187
+ node.to_s
188
+ rescue StandardError
189
+ nil
190
+ end
191
+
192
+ # Build a clear reason message for text content differences
193
+ # Shows the actual text content (truncated if too long)
194
+ #
195
+ # @param text1 [String, nil] First text content
196
+ # @param text2 [String, nil] Second text content
197
+ # @return [String] Clear explanation of the text difference
198
+ def self.build_text_difference_reason(text1, text2)
199
+ # Handle nil cases
200
+ return "missing vs '#{truncate(text2)}'" if text1.nil? && text2
201
+ return "'#{truncate(text1)}' vs missing" if text1 && text2.nil?
202
+ return "both missing" if text1.nil? && text2.nil?
203
+
204
+ # Both have content - show truncated versions
205
+ "'#{truncate(text1)}' vs '#{truncate(text2)}'"
206
+ end
207
+
208
+ # Truncate text for display in reason messages
209
+ #
210
+ # @param text [String] Text to truncate
211
+ # @param max_length [Integer] Maximum length
212
+ # @return [String] Truncated text
213
+ def self.truncate(text, max_length = 40)
214
+ return "" if text.nil?
215
+
216
+ text = text.to_s
217
+ return text if text.length <= max_length
218
+
219
+ "#{text[0...max_length]}..."
220
+ end
113
221
  end
114
222
  end
115
223
  end
@@ -568,9 +568,201 @@ differences)
568
568
  end
569
569
  end
570
570
 
571
+ # For attribute presence differences, show what attributes differ
572
+ if dimension == :attribute_presence
573
+ attrs1 = extract_attributes(node1)
574
+ attrs2 = extract_attributes(node2)
575
+ return build_attribute_diff_reason(attrs1, attrs2)
576
+ end
577
+
578
+ # For text content differences, show the actual text (truncated if needed)
579
+ if dimension == :text_content
580
+ text1 = extract_text_from_node(node1)
581
+ text2 = extract_text_from_node(node2)
582
+ return build_text_diff_reason(text1, text2)
583
+ end
584
+
571
585
  "#{diff1} vs #{diff2}"
572
586
  end
573
587
 
588
# Build a clear reason message for attribute presence differences.
#
# Reports attribute names present on only one side and names whose values
# differ. Hash keys are already unique, so plain Array set operations are
# used; no Set (and no per-call `require "set"`) is needed.
#
# @param attrs1 [Hash, nil] First node's attributes (name => value)
# @param attrs2 [Hash, nil] Second node's attributes (name => value)
# @return [String] Clear explanation of the attribute difference
def build_attribute_diff_reason(attrs1, attrs2)
  # Without both attribute sets only the counts can be compared.
  unless attrs1 && attrs2
    return "#{attrs1&.keys&.size || 0} vs #{attrs2&.keys&.size || 0} attributes"
  end

  names1 = attrs1.keys
  names2 = attrs2.keys

  only_in_first = names1 - names2
  only_in_second = names2 - names1
  # Names present on both sides whose values are not equal.
  different_values = (names1 & names2).reject { |name| attrs1[name] == attrs2[name] }

  parts = []
  parts << "only in first: #{only_in_first.sort.join(', ')}" unless only_in_first.empty?
  parts << "only in second: #{only_in_second.sort.join(', ')}" unless only_in_second.empty?
  parts << "different values: #{different_values.sort.join(', ')}" unless different_values.empty?

  return "#{names1.size} vs #{names2.size} attributes (same names)" if parts.empty?

  parts.join("; ")
end
618
+
619
# Extract text from a node for use in a diff reason message.
#
# @param node [Object, nil] Node (or plain String) to extract text from
# @return [String, nil] Text content; nil when the node is nil or when
#   extraction raises
def extract_text_from_node(node)
  return nil if node.nil?

  # Canon's own text node keeps its text in #value.
  return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)

  # Probe the remaining accessors from most to least specific.
  # `content` is tried before `text` to stay consistent with
  # XmlSerializationFormatter.extract_text_content, whose comments note
  # that Moxml::Text#text returns "" (NOTE(review): confirm against the
  # Moxml version in use).
  %i[text_content content text value].each do |accessor|
    return node.public_send(accessor) if node.respond_to?(accessor)
  end

  # Strings and any other object fall back to their string form.
  node.to_s
rescue StandardError
  # Best-effort helper for building messages: any failure means "no text".
  nil
end
649
+
650
# Build a clear reason message for text content differences.
#
# @param text1 [String, nil] First text content
# @param text2 [String, nil] Second text content
# @return [String] Clear explanation of the text difference
def build_text_diff_reason(text1, text2)
  # One or both sides absent.
  return "missing vs '#{truncate_text(text2)}'" if text1.nil? && text2
  # Show the text that IS present (text1); text2 is nil in this branch.
  return "'#{truncate_text(text1)}' vs missing" if text1 && text2.nil?
  return "both missing" if text1.nil? && text2.nil?

  # Two whitespace-only texts are summarized by character counts, since
  # printing them verbatim would be unreadable.
  if whitespace_only?(text1) && whitespace_only?(text2)
    return "whitespace: #{describe_whitespace(text1)} vs #{describe_whitespace(text2)}"
  end

  # Otherwise show both texts with whitespace made visible using the
  # character visualization map.
  vis1 = visualize_whitespace(text1)
  vis2 = visualize_whitespace(text2)

  "Text: \"#{vis1}\" vs \"#{vis2}\""
end
673
+
674
# Tell whether a text consists solely of whitespace (or is empty).
#
# @param text [Object, nil] Text to check (converted with +to_s+)
# @return [Boolean] false for nil; otherwise true when nothing remains
#   after stripping
def whitespace_only?(text)
  !text.nil? && text.to_s.strip.empty?
end
683
+
684
# Render a text with its whitespace (and any other mapped characters)
# replaced by visible symbols.
# The symbols come from +character_visualization_map+; characters without
# an entry are kept unchanged.
#
# @param text [String, nil] Text to visualize
# @return [String] Text with mapped characters substituted; "" for nil
def visualize_whitespace(text)
  return "" if text.nil?

  symbols = character_visualization_map
  text.each_char.map { |ch| symbols[ch] || ch }.join
end
698
+
699
# Character-to-symbol table used to make whitespace visible.
# Built on first use from canon/diff_formatter/character_map.yml and
# memoized; the YAML file is read directly (rather than through the
# formatter classes) to avoid a circular dependency.
#
# @return [Hash] Character to visualization symbol mapping
def character_visualization_map
  @character_visualization_map ||= begin
    require "yaml"
    map_path = File.join(File.expand_path("../..", __dir__),
                         "canon/diff_formatter/character_map.yml")
    entries = YAML.load_file(map_path)["characters"]

    entries.each_with_object({}) do |entry, table|
      codepoint = entry["unicode"]
      # An entry names its character either as a hex code point ("unicode")
      # or literally ("character", used for \n, \t, etc.).
      char = codepoint ? [codepoint.to_i(16)].pack("U") : entry["character"]
      table[char] = entry["visualization"]
    end
  end
end
730
+
731
# Describe whitespace content in a readable way.
#
# Counts newlines, spaces and tabs individually. Any remaining characters
# (e.g. carriage returns or form feeds) are reported as "other", so the
# breakdown is never empty and always adds up to the total.
# Labels are always plural ("1 newlines"); this is kept as-is so messages
# for existing inputs do not change.
#
# @param text [String, nil] Whitespace text
# @return [String] Description like "4 chars (2 newlines, 2 spaces)"
def describe_whitespace(text)
  return "0 chars" if text.nil? || text.empty?

  char_count = text.length
  newline_count = text.count("\n")
  space_count = text.count(" ")
  tab_count = text.count("\t")
  other_count = char_count - newline_count - space_count - tab_count

  parts = []
  parts << "#{newline_count} newlines" if newline_count.positive?
  parts << "#{space_count} spaces" if space_count.positive?
  parts << "#{tab_count} tabs" if tab_count.positive?
  parts << "#{other_count} other" if other_count.positive?

  "#{char_count} chars (#{parts.join(', ')})"
end
751
+
752
# Shorten a value for inclusion in a reason message.
#
# @param text [Object, nil] Value to display (converted with +to_s+)
# @param max_length [Integer] Number of characters kept before the ellipsis
# @return [String] "" for nil, the string itself when it fits, otherwise
#   its first +max_length+ characters followed by "..."
def truncate_text(text, max_length = 40)
  return "" if text.nil?

  shown = text.to_s
  shown.length > max_length ? "#{shown[0...max_length]}..." : shown
end
765
+
574
766
  # Compare namespace declarations (xmlns and xmlns:* attributes)
575
767
  # Delegates to XmlComparatorHelpers::NamespaceComparator
576
768
  def compare_namespace_declarations(n1, n2, opts, differences)
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "formatting_detector"
4
+ require_relative "xml_serialization_formatter"
4
5
  require_relative "../comparison/compare_profile"
5
6
  require_relative "../comparison/whitespace_sensitivity"
6
7
 
@@ -8,6 +9,11 @@ module Canon
8
9
  module Diff
9
10
  # Classifies DiffNodes as normative (affects equivalence) or informative (doesn't affect equivalence)
10
11
  # based on the match options in effect
12
+ #
13
+ # Classification hierarchy (three distinct kinds of differences):
14
+ # 1. Serialization formatting: XML syntax differences (always non-normative)
15
+ # 2. Content formatting: Whitespace differences in content (non-normative when normalized)
16
+ # 3. Normative: Semantic content differences (affect equivalence)
11
17
  class DiffClassifier
12
18
  attr_reader :match_options, :profile
13
19
 
@@ -25,11 +31,20 @@ module Canon
25
31
 
26
32
  # Classify a single DiffNode as normative or informative
27
33
  # Hierarchy: formatting-only < informative < normative
28
- # CompareProfile determines base classification, FormattingDetector refines informative differences
34
+ # CompareProfile determines base classification, XmlSerializationFormatter handles serialization formatting
29
35
  # @param diff_node [DiffNode] The diff node to classify
30
36
  # @return [DiffNode] The same diff node with normative/formatting attributes set
31
37
  def classify(diff_node)
32
- # SPECIAL CASE: text_content with :normalize behavior
38
+ # FIRST: Check for XML serialization-level formatting differences
39
+ # These are ALWAYS non-normative (formatting-only) regardless of match options
40
+ # Examples: self-closing tags (<tag/>) vs explicit closing tags (<tag></tag>)
41
+ if XmlSerializationFormatter.serialization_formatting?(diff_node)
42
+ diff_node.formatting = true
43
+ diff_node.normative = false
44
+ return diff_node
45
+ end
46
+
47
+ # SECOND: Handle content-level formatting for text_content with :normalize behavior
33
48
  # When text_content is :normalize and the difference is formatting-only,
34
49
  # it should be marked as non-normative (informative)
35
50
  # This ensures that verbose and non-verbose modes give consistent results
@@ -38,7 +53,7 @@ module Canon
38
53
  # (like <pre>, <code>, <textarea> in HTML), don't apply formatting detection
39
54
  # because whitespace should be preserved in these elements
40
55
  #
41
- # This check must come FIRST, before normative_dimension? is called,
56
+ # This check must come BEFORE normative_dimension? is called,
42
57
  # because normative_dimension? returns true for text_content: :normalize
43
58
  # (since the dimension affects equivalence), which would prevent formatting
44
59
  # detection from being applied.
@@ -51,11 +66,11 @@ module Canon
51
66
  return diff_node
52
67
  end
53
68
 
54
- # FIRST: Determine if this dimension is normative based on CompareProfile
69
+ # THIRD: Determine if this dimension is normative based on CompareProfile
55
70
  # This respects the policy settings (strict/normalize/ignore)
56
71
  is_normative = profile.normative_dimension?(diff_node.dimension)
57
72
 
58
- # SECOND: Check if FormattingDetector should be consulted
73
+ # FOURTH: Check if FormattingDetector should be consulted for non-normative dimensions
59
74
  # Only check for formatting-only when dimension is NOT normative
60
75
  # This ensures strict mode differences remain normative
61
76
  should_check_formatting = !is_normative &&
@@ -68,7 +83,7 @@ module Canon
68
83
  return diff_node
69
84
  end
70
85
 
71
- # THIRD: Apply the normative determination from CompareProfile
86
+ # FIFTH: Apply the normative determination from CompareProfile
72
87
  diff_node.formatting = false
73
88
  diff_node.normative = is_normative
74
89
 
@@ -127,33 +142,6 @@ module Canon
127
142
  normalized1 == normalized2 && text1 != text2
128
143
  end
129
144
 
130
- # Check if a node is a text node
131
- # @param node [Object] The node to check
132
- # @return [Boolean] true if the node is a text node
133
- def text_node?(node)
134
- return false if node.nil?
135
-
136
- # Canon::Xml::Nodes::TextNode
137
- return true if node.is_a?(Canon::Xml::Nodes::TextNode)
138
-
139
- # Nokogiri text nodes (node_type returns integer constant like 3)
140
- return true if node.respond_to?(:node_type) &&
141
- node.node_type.is_a?(Integer) &&
142
- node.node_type == Nokogiri::XML::Node::TEXT_NODE
143
-
144
- # Moxml text nodes (node_type returns symbol)
145
- return true if node.respond_to?(:node_type) && node.node_type == :text
146
-
147
- # String
148
- return true if node.is_a?(String)
149
-
150
- # Test doubles or objects with text node-like interface
151
- # Check if it has a value method (contains text content)
152
- return true if node.respond_to?(:value)
153
-
154
- false
155
- end
156
-
157
145
  # Check if the text node is inside a whitespace-sensitive element
158
146
  # @param diff_node [DiffNode] The diff node to check
159
147
  # @return [Boolean] true if inside a whitespace-sensitive element
@@ -200,6 +188,33 @@ module Canon
200
188
  # If extraction fails, return nil (not formatting-only)
201
189
  nil
202
190
  end
191
+
192
# Decide whether an object should be treated as a text node.
# Recognizes Canon text nodes, nodes reporting a text +node_type+
# (Nokogiri's integer constant or the :text symbol), plain Strings, and
# any object exposing +value+ (e.g. test doubles).
#
# @param node [Object, nil] The node to check
# @return [Boolean] true if the node is a text node
def text_node?(node)
  return false if node.nil?
  return true if node.is_a?(Canon::Xml::Nodes::TextNode)

  if node.respond_to?(:node_type)
    kind = node.node_type
    # The Integer guard keeps the Nokogiri constant from being referenced
    # when the node reports a symbolic type.
    return true if kind.is_a?(Integer) && kind == Nokogiri::XML::Node::TEXT_NODE
    return true if kind == :text
  end

  # Plain strings, or anything text-node-like that exposes #value.
  node.is_a?(String) || node.respond_to?(:value)
end
+ end
203
218
  end
204
219
  end
205
220
  end
@@ -0,0 +1,153 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Canon
4
+ module Diff
5
# Recognizes diff nodes whose only difference is XML serialization syntax.
#
# A serialization-level difference is a difference in how the same content
# is written down, not in the content itself. Such differences are treated
# as ALWAYS non-normative (formatting-only), whatever the match options.
#
# Examples:
# - Self-closing vs explicit closing tags: <tag/> vs <tag></tag>
# - Attribute quote style: attr="value" vs attr='value' (parser-normalized)
# - Whitespace within tags: <tag a="1" b="2"> vs <tag  a="1"  b="2" >
#   (parser-normalized)
#
# Quote style and in-tag spacing are already normalized away by XML
# parsers, so this class only deals with differences that survive parsing
# and comparison; currently that is self-closing vs explicit closing tags.
class XmlSerializationFormatter
  # Tell whether a diff node is a serialization formatting difference.
  #
  # Only the :text_content dimension is handled at present; every other
  # dimension yields false.
  #
  # @param diff_node [DiffNode] The diff node to check
  # @return [Boolean] true if this is a serialization formatting difference
  def self.serialization_formatting?(diff_node)
    diff_node.dimension == :text_content &&
      empty_text_content_serialization_diff?(diff_node)
  end

  # Detect a text_content difference caused by <tag/> vs <tag></tag>.
  #
  # The two spellings produce different text node structures:
  # - Self-closing: no text node (nil)
  # - Explicit closing: empty or whitespace-only text node ("", " ", "\n")
  #
  # The difference counts as serialization formatting when at least one
  # side is present, every present side is a text node, and every present
  # side's text is blank.
  #
  # @param diff_node [DiffNode] The diff node to check
  # @return [Boolean] true if this is a serialization formatting difference
  def self.empty_text_content_serialization_diff?(diff_node)
    return false unless diff_node.dimension == :text_content

    present = [diff_node.node1, diff_node.node2].compact

    # Both sides nil: there is no actual difference to classify.
    return false if present.empty?
    return false unless present.all? { |node| text_node?(node) }

    texts = present.map { |node| extract_text_content(node) }
    texts.all? { |text| blank?(text) }
  end

  # Tell whether a value is blank (nil, empty, or whitespace-only).
  # @param value [String, nil] Value to check
  # @return [Boolean] true if blank
  def self.blank?(value)
    return true if value.nil?

    empty = value.respond_to?(:empty?) && value.empty?
    empty || (value.respond_to?(:strip) && value.strip.empty?)
  end

  # Decide whether an object should be treated as a text node.
  # Recognizes Canon and Moxml text nodes, nodes reporting a text
  # +node_type+ (Nokogiri's integer constant or the :text symbol), plain
  # Strings, and any object exposing +value+ (e.g. test doubles).
  # @param node [Object] The node to check
  # @return [Boolean] true if the node is a text node
  def self.text_node?(node)
    return false if node.nil?
    return true if node.is_a?(Canon::Xml::Nodes::TextNode)
    # Checked by class before the generic node_type probing below.
    return true if node.is_a?(Moxml::Text)

    if node.respond_to?(:node_type)
      kind = node.node_type
      # The Integer guard keeps the Nokogiri constant from being referenced
      # when the node reports a symbolic type.
      return true if kind.is_a?(Integer) && kind == Nokogiri::XML::Node::TEXT_NODE
      return true if kind == :text
    end

    node.is_a?(String) || node.respond_to?(:value)
  end

  # Pull the text out of a node of any supported kind.
  # @param node [Object] The node to extract text from
  # @return [String, nil] The text content; nil for a nil node or when
  #   extraction raises
  def self.extract_text_content(node)
    return nil if node.nil?

    # Canon::Xml::Nodes::TextNode keeps its text in #value.
    return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)

    # `content` is probed before `text` because Moxml::Text.text returns "".
    %i[text_content content text value].each do |accessor|
      return node.public_send(accessor) if node.respond_to?(accessor)
    end

    # Strings and any other object fall back to their string form.
    node.to_s
  rescue StandardError
    # A failed extraction is reported as nil.
    nil
  end

  private_class_method :blank?, :text_node?, :extract_text_content,
                       :empty_text_content_serialization_diff?
end
152
+ end
153
+ end
data/lib/canon/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Canon
4
- VERSION = "0.1.9"
4
+ VERSION = "0.1.10"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: canon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.1.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-20 00:00:00.000000000 Z
11
+ date: 2026-01-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: diff-lcs
@@ -257,6 +257,7 @@ files:
257
257
  - lib/canon/diff/formatting_detector.rb
258
258
  - lib/canon/diff/node_serializer.rb
259
259
  - lib/canon/diff/path_builder.rb
260
+ - lib/canon/diff/xml_serialization_formatter.rb
260
261
  - lib/canon/diff_formatter.rb
261
262
  - lib/canon/diff_formatter/by_line/base_formatter.rb
262
263
  - lib/canon/diff_formatter/by_line/html_formatter.rb