canon 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +31 -149
  3. data/README.adoc +9 -0
  4. data/docs/advanced/semantic-diff-report.adoc +31 -0
  5. data/docs/features/configuration-profiles.adoc +4 -2
  6. data/docs/features/match-options/html-policies.adoc +2 -0
  7. data/docs/features/match-options/index.adoc +40 -0
  8. data/docs/guides/choosing-configuration.adoc +12 -1
  9. data/docs/reference/cli-options.adoc +3 -0
  10. data/docs/reference/options-across-interfaces.adoc +7 -1
  11. data/docs/understanding/formats/html.adoc +9 -2
  12. data/lib/canon/cli.rb +4 -0
  13. data/lib/canon/commands/diff_command.rb +1 -0
  14. data/lib/canon/comparison/comparison_result.rb +79 -0
  15. data/lib/canon/comparison/html_comparator.rb +92 -11
  16. data/lib/canon/comparison/markup_comparator.rb +19 -0
  17. data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
  18. data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
  19. data/lib/canon/comparison/match_options.rb +23 -2
  20. data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
  21. data/lib/canon/comparison/xml_comparator/child_comparison.rb +6 -0
  22. data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
  23. data/lib/canon/comparison/xml_comparator.rb +80 -4
  24. data/lib/canon/comparison/xml_node_comparison.rb +29 -3
  25. data/lib/canon/comparison.rb +84 -22
  26. data/lib/canon/config/env_schema.rb +2 -1
  27. data/lib/canon/config/profiles/metanorma.yml +3 -0
  28. data/lib/canon/config.rb +51 -5
  29. data/lib/canon/diff/diff_classifier.rb +18 -2
  30. data/lib/canon/diff/diff_line_builder.rb +9 -8
  31. data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
  32. data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
  33. data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
  34. data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
  35. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +65 -17
  36. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +17 -0
  37. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
  38. data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
  39. data/lib/canon/diff_formatter.rb +57 -173
  40. data/lib/canon/html/data_model.rb +10 -4
  41. data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
  42. data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
  43. data/lib/canon/version.rb +1 -1
  44. data/lib/canon/xml/c14n.rb +59 -5
  45. data/lib/canon/xml/element_matcher.rb +3 -0
  46. data/lib/canon/xml/node.rb +8 -1
  47. data/lib/canon/xml/nodes/comment_node.rb +4 -0
  48. data/lib/canon/xml/nodes/element_node.rb +4 -0
  49. data/lib/canon/xml/nodes/text_node.rb +4 -0
  50. data/lib/canon/xml/sax_builder.rb +11 -2
  51. data/lib/canon/xml/xpath_engine.rb +238 -0
  52. metadata +6 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f2d050730d102cb224140f806e5b56634cdb98a16206301b0773d310dec582b9
4
- data.tar.gz: 0b3bbf793abcfc9c3dd96a8c935260f4cf3360c7ee326e65ef8f165c48421f77
3
+ metadata.gz: c24b7c541d6159a3c261d389c0d41b85f954cd4152d88d9ca4748d9a3ceb34ef
4
+ data.tar.gz: 1de985c950b90c6979432b7b0bd1ed1b469240456fb7fd985a3d7f6929448b83
5
5
  SHA512:
6
- metadata.gz: 69b187c69b2aee23b1c1763504a33b226347ccbaaaab92d228d2a70da7d1f94f7160fb84b67356001de7de4e51155cc856104344fa795da8573ccf5e27af0633
7
- data.tar.gz: 687fb81fbf5cafd49e8b1818e3dc641c44f62cf0f5d387829916ab270bdd4b9fffa56278e61d09f6bdabec52da1640ad59baf3c8d472d6248d1e0fce38209227
6
+ metadata.gz: 719eefd6be6d642503adb82e50609983fe9082ec8c7efe34c5e6cf27bfdc8065edc05b7ae75a959db8e5fe117f0ec67d71d81006d342a1c01f2846b4aa54b196
7
+ data.tar.gz: 32a1bece85afd8265f158fdea547de08759773ba8a1e574ca72e42c79f6f59b02ed881cc4ba4bb78e54d135c9f4362100c8d409d2ee08b0eaa3561b13652296c
data/.rubocop_todo.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2026-04-12 07:40:40 UTC using RuboCop version 1.86.0.
3
+ # on 2026-04-27 09:48:55 UTC using RuboCop version 1.86.0.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
@@ -11,125 +11,38 @@ Gemspec/RequiredRubyVersion:
11
11
  Exclude:
12
12
  - 'canon.gemspec'
13
13
 
14
- # Offense count: 49
14
+ # Offense count: 30
15
15
  # This cop supports safe autocorrection (--autocorrect).
16
16
  # Configuration parameters: EnforcedStyle, IndentationWidth.
17
17
  # SupportedStyles: with_first_argument, with_fixed_indentation
18
18
  Layout/ArgumentAlignment:
19
19
  Exclude:
20
- - 'lib/canon/comparison/whitespace_sensitivity.rb'
21
20
  - 'lib/canon/comparison/xml_comparator.rb'
22
- - 'lib/canon/comparison/xml_node_comparison.rb'
23
- - 'lib/canon/config.rb'
24
- - 'lib/canon/diff/diff_classifier.rb'
25
- - 'lib/canon/diff_formatter.rb'
26
- - 'lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb'
27
- - 'lib/canon/pretty_printer/xml_normalized.rb'
28
- - 'spec/canon/config/profile_loader_spec.rb'
29
- - 'spec/canon/config/profile_spec.rb'
30
- - 'spec/canon/diff_formatter/diff_detail_formatter_spec.rb'
31
- - 'spec/canon/diff_formatter/pretty_diff_spec.rb'
32
- - 'spec/canon/diff_formatter/show_diffs_filtering_spec.rb'
33
- - 'spec/canon/pretty_printer/xml_normalized_spec.rb'
21
+ - 'spec/canon/comparison/html4_html5_whitespace_parity_spec.rb'
34
22
 
35
- # Offense count: 3
36
- # This cop supports safe autocorrection (--autocorrect).
37
- # Configuration parameters: EnforcedStyle, IndentationWidth.
38
- # SupportedStyles: with_first_element, with_fixed_indentation
39
- Layout/ArrayAlignment:
40
- Exclude:
41
- - 'lib/canon/comparison/match_options/base_resolver.rb'
42
- - 'lib/canon/comparison/match_options/xml_resolver.rb'
43
- - 'spec/canon/config/profile_spec.rb'
44
-
45
- # Offense count: 16
23
+ # Offense count: 1
46
24
  # This cop supports safe autocorrection (--autocorrect).
47
25
  # Configuration parameters: EnforcedStyleAlignWith.
48
26
  # SupportedStylesAlignWith: either, start_of_block, start_of_line
49
27
  Layout/BlockAlignment:
50
28
  Exclude:
51
- - 'lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb'
52
- - 'lib/canon/pretty_printer/xml_normalized.rb'
53
- - 'spec/canon/config/profile_loader_spec.rb'
54
- - 'spec/canon/diff_formatter/display_preprocessing_spec.rb'
55
- - 'spec/canon/diff_formatter/pretty_diff_spec.rb'
56
- - 'spec/canon/diff_formatter/show_diffs_filtering_spec.rb'
57
- - 'spec/canon/pretty_printer/xml_normalized_spec.rb'
29
+ - 'lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb'
58
30
 
59
- # Offense count: 16
31
+ # Offense count: 1
60
32
  # This cop supports safe autocorrection (--autocorrect).
61
33
  Layout/BlockEndNewline:
62
34
  Exclude:
63
- - 'lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb'
64
- - 'lib/canon/pretty_printer/xml_normalized.rb'
65
- - 'spec/canon/config/profile_loader_spec.rb'
66
- - 'spec/canon/diff_formatter/display_preprocessing_spec.rb'
67
- - 'spec/canon/diff_formatter/pretty_diff_spec.rb'
68
- - 'spec/canon/diff_formatter/show_diffs_filtering_spec.rb'
69
- - 'spec/canon/pretty_printer/xml_normalized_spec.rb'
70
-
71
- # Offense count: 5
72
- # This cop supports safe autocorrection (--autocorrect).
73
- Layout/ClosingParenthesisIndentation:
74
- Exclude:
75
- - 'lib/canon/config/profile_loader.rb'
76
- - 'lib/canon/diff/diff_classifier.rb'
77
- - 'spec/canon/config/profile_loader_spec.rb'
35
+ - 'lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb'
78
36
 
79
37
  # Offense count: 2
80
38
  # This cop supports safe autocorrection (--autocorrect).
81
- Layout/ElseAlignment:
82
- Exclude:
83
- - 'lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb'
84
-
85
- # Offense count: 2
86
- # This cop supports safe autocorrection (--autocorrect).
87
- # Configuration parameters: EnforcedStyleAlignWith.
88
- # SupportedStylesAlignWith: keyword, variable, start_of_line
89
- Layout/EndAlignment:
90
- Exclude:
91
- - 'lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb'
92
-
93
- # Offense count: 5
94
- # This cop supports safe autocorrection (--autocorrect).
95
- # Configuration parameters: EnforcedStyle, IndentationWidth.
96
- # SupportedStyles: consistent, consistent_relative_to_receiver, special_for_inner_method_call, special_for_inner_method_call_in_parentheses
97
- Layout/FirstArgumentIndentation:
98
- Exclude:
99
- - 'lib/canon/config/profile_loader.rb'
100
- - 'lib/canon/diff/diff_classifier.rb'
101
- - 'spec/canon/config/profile_loader_spec.rb'
102
-
103
- # Offense count: 30
104
- # This cop supports safe autocorrection (--autocorrect).
105
- # Configuration parameters: AllowMultipleStyles, EnforcedHashRocketStyle, EnforcedColonStyle, EnforcedLastArgumentHashStyle.
106
- # SupportedHashRocketStyles: key, separator, table
107
- # SupportedColonStyles: key, separator, table
108
- # SupportedLastArgumentHashStyles: always_inspect, always_ignore, ignore_implicit, ignore_explicit
109
- Layout/HashAlignment:
110
- Exclude:
111
- - 'spec/canon/diff_formatter/diff_detail_formatter_spec.rb'
112
- - 'spec/canon/diff_formatter/display_preprocessing_spec.rb'
113
- - 'spec/canon/diff_formatter/pretty_diff_spec.rb'
114
- - 'spec/canon/diff_formatter/show_diffs_filtering_spec.rb'
115
- - 'spec/canon/pretty_printer/xml_normalized_spec.rb'
116
-
117
- # Offense count: 36
118
- # This cop supports safe autocorrection (--autocorrect).
119
39
  # Configuration parameters: Width, EnforcedStyleAlignWith, AllowedPatterns.
120
40
  # SupportedStylesAlignWith: start_of_line, relative_to_receiver
121
41
  Layout/IndentationWidth:
122
42
  Exclude:
123
- - 'lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb'
124
- - 'lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb'
125
- - 'lib/canon/pretty_printer/xml_normalized.rb'
126
- - 'spec/canon/config/profile_loader_spec.rb'
127
- - 'spec/canon/diff_formatter/display_preprocessing_spec.rb'
128
- - 'spec/canon/diff_formatter/pretty_diff_spec.rb'
129
- - 'spec/canon/diff_formatter/show_diffs_filtering_spec.rb'
130
- - 'spec/canon/pretty_printer/xml_normalized_spec.rb'
43
+ - 'lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb'
131
44
 
132
- # Offense count: 1375
45
+ # Offense count: 1347
133
46
  # This cop supports safe autocorrection (--autocorrect).
134
47
  # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
135
48
  # URISchemes: http, https
@@ -138,20 +51,12 @@ Layout/LineLength:
138
51
 
139
52
  # Offense count: 2
140
53
  # This cop supports safe autocorrection (--autocorrect).
141
- # Configuration parameters: EnforcedStyle.
142
- # SupportedStyles: symmetrical, new_line, same_line
143
- Layout/MultilineMethodCallBraceLayout:
144
- Exclude:
145
- - 'lib/canon/config/profile_loader.rb'
146
- - 'lib/canon/diff/diff_classifier.rb'
147
-
148
- # Offense count: 57
149
- # This cop supports safe autocorrection (--autocorrect).
150
54
  # Configuration parameters: AllowInHeredoc.
151
55
  Layout/TrailingWhitespace:
152
- Enabled: false
56
+ Exclude:
57
+ - 'lib/canon/comparison/xml_comparator.rb'
153
58
 
154
- # Offense count: 56
59
+ # Offense count: 58
155
60
  # Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
156
61
  Lint/DuplicateBranch:
157
62
  Enabled: false
@@ -196,7 +101,7 @@ Lint/UselessConstantScoping:
196
101
  Exclude:
197
102
  - 'lib/canon/diff_formatter/theme.rb'
198
103
 
199
- # Offense count: 309
104
+ # Offense count: 322
200
105
  # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
201
106
  Metrics/AbcSize:
202
107
  Enabled: false
@@ -207,32 +112,32 @@ Metrics/AbcSize:
207
112
  Metrics/BlockLength:
208
113
  Max: 92
209
114
 
210
- # Offense count: 3
115
+ # Offense count: 1
211
116
  # Configuration parameters: CountBlocks, CountModifierForms.
212
117
  Metrics/BlockNesting:
213
118
  Max: 4
214
119
 
215
- # Offense count: 272
120
+ # Offense count: 281
216
121
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
217
122
  Metrics/CyclomaticComplexity:
218
123
  Enabled: false
219
124
 
220
- # Offense count: 498
125
+ # Offense count: 517
221
126
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
222
127
  Metrics/MethodLength:
223
128
  Max: 146
224
129
 
225
- # Offense count: 58
130
+ # Offense count: 56
226
131
  # Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
227
132
  Metrics/ParameterLists:
228
133
  Max: 10
229
134
 
230
- # Offense count: 219
135
+ # Offense count: 225
231
136
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
232
137
  Metrics/PerceivedComplexity:
233
138
  Enabled: false
234
139
 
235
- # Offense count: 28
140
+ # Offense count: 30
236
141
  # Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
237
142
  # AllowedNames: as, at, by, cc, db, id, if, in, io, ip, of, on, os, pp, to
238
143
  Naming/MethodParameterName:
@@ -260,13 +165,13 @@ Performance/CollectionLiteralInLoop:
260
165
  - 'lib/canon/xml/xml_base_handler.rb'
261
166
  - 'spec/canon/diff/diff_node_mapper_comments_spec.rb'
262
167
 
263
- # Offense count: 82
168
+ # Offense count: 85
264
169
  # Configuration parameters: Prefixes, AllowedPatterns.
265
170
  # Prefixes: when, with, without
266
171
  RSpec/ContextWording:
267
172
  Enabled: false
268
173
 
269
- # Offense count: 37
174
+ # Offense count: 43
270
175
  # Configuration parameters: IgnoredMetadata.
271
176
  RSpec/DescribeClass:
272
177
  Enabled: false
@@ -277,7 +182,7 @@ RSpec/DescribeMethod:
277
182
  - 'spec/canon/comparison/multiple_differences_spec.rb'
278
183
  - 'spec/canon/diff_formatter/character_map_customization_spec.rb'
279
184
 
280
- # Offense count: 804
185
+ # Offense count: 847
281
186
  # Configuration parameters: CountAsOne.
282
187
  RSpec/ExampleLength:
283
188
  Max: 44
@@ -291,6 +196,12 @@ RSpec/ExpectActual:
291
196
  - 'spec/canon/rspec_matchers_spec.rb'
292
197
  - 'spec/canon/string_matcher_spec.rb'
293
198
 
199
+ # Offense count: 7
200
+ # This cop supports unsafe autocorrection (--autocorrect-all).
201
+ RSpec/IncludeExamples:
202
+ Exclude:
203
+ - 'spec/canon/comparison/html4_html5_whitespace_parity_spec.rb'
204
+
294
205
  # Offense count: 177
295
206
  # Configuration parameters: Max, AllowedIdentifiers, AllowedPatterns.
296
207
  RSpec/IndexedLet:
@@ -329,7 +240,7 @@ RSpec/MultipleDescribes:
329
240
  Exclude:
330
241
  - 'spec/canon/comparison/match_options_spec.rb'
331
242
 
332
- # Offense count: 654
243
+ # Offense count: 694
333
244
  RSpec/MultipleExpectations:
334
245
  Max: 15
335
246
 
@@ -347,7 +258,7 @@ RSpec/NamedSubject:
347
258
  - 'spec/canon/pretty_printer/json_spec.rb'
348
259
  - 'spec/canon/pretty_printer/xml_spec.rb'
349
260
 
350
- # Offense count: 50
261
+ # Offense count: 53
351
262
  # Configuration parameters: AllowedGroups.
352
263
  RSpec/NestedGroups:
353
264
  Max: 4
@@ -381,7 +292,7 @@ RSpec/SpecFilePathFormat:
381
292
  - 'spec/canon/yaml/formatter_spec.rb'
382
293
  - 'spec/xml_c14n_spec.rb'
383
294
 
384
- # Offense count: 131
295
+ # Offense count: 134
385
296
  # Configuration parameters: IgnoreNameless, IgnoreSymbolicNames.
386
297
  RSpec/VerifiedDoubles:
387
298
  Exclude:
@@ -393,23 +304,6 @@ RSpec/VerifiedDoubles:
393
304
  - 'spec/canon/diff_formatter/diff_detail_formatter_spec.rb'
394
305
  - 'spec/canon/tree_diff/operation_converter_spec.rb'
395
306
 
396
- # Offense count: 25
397
- # This cop supports safe autocorrection (--autocorrect).
398
- # Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
399
- # SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
400
- # ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
401
- # FunctionalMethods: let, let!, subject, watch
402
- # AllowedMethods: lambda, proc, it
403
- Style/BlockDelimiters:
404
- Exclude:
405
- - 'lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb'
406
- - 'lib/canon/pretty_printer/xml_normalized.rb'
407
- - 'spec/canon/config/profile_loader_spec.rb'
408
- - 'spec/canon/diff_formatter/display_preprocessing_spec.rb'
409
- - 'spec/canon/diff_formatter/pretty_diff_spec.rb'
410
- - 'spec/canon/diff_formatter/show_diffs_filtering_spec.rb'
411
- - 'spec/canon/pretty_printer/xml_normalized_spec.rb'
412
-
413
307
  # Offense count: 1
414
308
  # This cop supports safe autocorrection (--autocorrect).
415
309
  # Configuration parameters: EnforcedStyle, AllowComments.
@@ -432,18 +326,6 @@ Style/IdenticalConditionalBranches:
432
326
  - 'lib/canon/diff_formatter/by_object/base_formatter.rb'
433
327
  - 'lib/canon/diff_formatter/legend.rb'
434
328
 
435
- # Offense count: 3
436
- # This cop supports safe autocorrection (--autocorrect).
437
- Style/MultilineIfModifier:
438
- Exclude:
439
- - 'lib/canon/pretty_printer/xml_normalized.rb'
440
-
441
- # Offense count: 2
442
- # This cop supports safe autocorrection (--autocorrect).
443
- Style/MultilineTernaryOperator:
444
- Exclude:
445
- - 'lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb'
446
-
447
329
  # Offense count: 1
448
330
  # Configuration parameters: AllowedMethods.
449
331
  # AllowedMethods: respond_to_missing?
data/README.adoc CHANGED
@@ -770,6 +770,15 @@ Each dimension can have one of three behaviors:
770
770
  * **`:normalize`**: Differences are normalized; only semantic changes are normative
771
771
  * **`:ignore`**: Differences are informative only (don't affect equivalence)
772
772
 
773
+ In addition, the `whitespace_type` option controls how Unicode whitespace
774
+ characters are compared:
775
+
776
+ * **`whitespace_type: :strict`** (default): Different whitespace types (space,
777
+ NBSP, ideographic space, etc.) are detected as differences — useful for catching
778
+ accidental insertion of wrong whitespace.
779
+ * **`whitespace_type: :normalize`**: All Unicode whitespace types are treated as
780
+ equivalent.
781
+
773
782
  .Example: Whitespace handling
774
783
  [example]
775
784
  ====
data/docs/advanced/semantic-diff-report.adoc CHANGED
@@ -148,6 +148,37 @@ Location: /html/body/div/table/tbody/tr/td/pre/text
148
148
 
149
149
  The warning appears for text inside whitespace-preserving elements where Canon automatically switches to strict mode.
150
150
 
151
+ ==== Parent-context fallback for ambiguous text diffs
152
+
153
+ For a `text_content` difference, Canon normally renders the two sides as JSON-quoted strings.
154
+ When both sides would collapse to the same (or visually indistinguishable) short string -- both empty (`""`), both whitespace-only, or both equal on the text-node extraction even though the surrounding DOM differs -- that rendering conveys nothing.
155
+
156
+ In this case Canon instead serializes each side's *parent element* compactly and visualizes whitespace (`·` for space, `→` for tab, `¬` for newline, `<NBSP>` for non-breaking space) so the structural contrast is visible.
157
+
158
+ .Example: Ambiguous empty-vs-whitespace text diff
159
+ [example]
160
+ ====
161
+ [source]
162
+ ----
163
+ 🔍 DIFFERENCE #1/1 [NORMATIVE]
164
+ ──────────────────────────────────────────────────────────────────────
165
+ Dimension: text_content
166
+ Location: /#document[0]/fmt-title[0]/span[0]/span/text()[0]
167
+ Reason: Text: "¬······:¬······"
168
+ vs.: ":"
169
+
170
+ ⊖ Expected (File 1):
171
+ <span·class="fmt-caption-delim">¬······:¬······<tab/>¬···</span>
172
+ ⊕ Actual (File 2):
173
+ <span·class="fmt-caption-delim">:<tab/></span>
174
+
175
+ ✨ Changes:
176
+ Content differs: <span·class="fmt-caption-delim">¬······:¬······<tab/>¬···</span> → <span·class="fmt-caption-delim">:<tab/></span>
177
+ ----
178
+ ====
179
+
180
+ This fallback is implemented in `Canon::DiffFormatter::DiffDetailFormatterHelpers::DimensionFormatter.format_text_content_details` and only triggers when `TextUtils.ambiguous_text_pair?` returns `true` _and_ at least one side has a parent element to render.
181
+
151
182
  === Structural Whitespace
152
183
 
153
184
  Shows whitespace-only differences (usually informative).
data/docs/features/configuration-profiles.adoc CHANGED
@@ -29,8 +29,10 @@ variant can extend a base profile with only the differences.
29
29
 
30
30
  | `:metanorma`
31
31
  | Standard Metanorma spec configuration. Sets preprocessing to `:format`,
32
- match profile to `:spec_friendly`, diff algorithm to `:dom`, canonical
33
- display format, normalized pretty-print display preprocessing,
32
+ match profile to `:spec_friendly`, `whitespace_type` to `:normalize`
33
+ (so that Unicode whitespace variants like space vs NBSP are treated as
34
+ equivalent for backward compatibility), diff algorithm to `:dom`,
35
+ canonical display format, normalized pretty-print display preprocessing,
34
36
  and XML-specific whitespace element lists.
35
37
 
36
38
  | `:metanorma_debug`
data/docs/features/match-options/html-policies.adoc CHANGED
@@ -44,6 +44,8 @@ Canon automatically detects HTML version:
44
44
 
45
45
  Detection is based on DOCTYPE or parsing mode.
46
46
 
47
+ NOTE: Whitespace sensitivity does not differ between HTML4 and HTML5 — both apply HTML's content-model whitespace rules. `be_html4_equivalent_to` and `be_html5_equivalent_to` therefore agree on whether two inputs are whitespace-equivalent. Differences between the matchers are limited to genuine HTML4/HTML5 distinctions such as case sensitivity. Internally Canon parses both via `Nokogiri::HTML5` to share the content-model logic.
48
+
47
49
  === Whitespace Preservation
48
50
 
49
51
  Certain HTML elements require strict whitespace preservation regardless of the
data/docs/features/match-options/index.adoc CHANGED
@@ -92,6 +92,46 @@ Canon.equivalent?(
92
92
  `:ignore`:: Structural whitespace is completely ignored
93
93
 
94
94
 
95
+ === whitespace_type
96
+
97
+ **Applies to**: XML, HTML
98
+
99
+ **Purpose**: Controls whether different Unicode whitespace characters (space, NBSP, ideographic space, etc.) are treated as equivalent or distinct.
100
+
101
+ **Behaviors**:
102
+
103
+ `:strict`:: (default) Different Unicode whitespace types are significant.
104
+ Space (U+0020) and NBSP (U+00A0) are treated as different characters.
105
+ This is useful for catching accidental insertion of wrong whitespace types
106
+ (e.g., a pasted NBSP where a regular space was intended).
107
+
108
+ `:normalize`:: All Unicode whitespace characters are collapsed to a single space
109
+ before comparison. Space, NBSP, ideographic space (U+3000), and other Unicode
110
+ whitespace characters are treated as equivalent.
111
+
112
+ .Using whitespace_type: :strict (default)
113
+ [example]
114
+ ====
115
+ [source,ruby]
116
+ ----
117
+ # By default, space and NBSP are different
118
+ xml1 = '<root><span>ISO</span> <span>712</span></root>'
119
+ xml2 = '<root><span>ISO</span>&#xa0;<span>712</span></root>'
120
+
121
+ Canon::Comparison.equivalent?(xml1, xml2,
122
+ match_profile: :spec_friendly
123
+ )
124
+ # => false (NBSP detected as different from space)
125
+
126
+ # Opt into treating all whitespace types as equivalent
127
+ Canon::Comparison.equivalent?(xml1, xml2,
128
+ match_profile: :spec_friendly,
129
+ match: { whitespace_type: :normalize }
130
+ )
131
+ # => true
132
+ ----
133
+ ====
134
+
95
135
  === Whitespace sensitivity at element level
96
136
 
97
137
  ==== General
data/docs/guides/choosing-configuration.adoc CHANGED
@@ -210,13 +210,24 @@ Canon::Comparison.equivalent?(doc1, doc2,
210
210
  structural_whitespace: :ignore, # ignore, normalize, strict
211
211
  attribute_order: :ignore, # ignore, strict (XML/HTML)
212
212
  attribute_values: :normalize, # normalize, strict, ignore
213
- comments: :ignore # ignore, normalize, strict
213
+ comments: :ignore, # ignore, normalize, strict
214
+ whitespace_type: :strict # strict (default), normalize
214
215
  }
215
216
  )
216
217
  ----
217
218
 
218
219
  **Remember**: Match options behave differently with each algorithm! See link:../features/match-options/algorithm-specific-behavior.adoc[Algorithm-Specific Behavior].
219
220
 
221
+ ==== Whitespace Type Sensitivity
222
+
223
+ By default, Canon distinguishes between different Unicode whitespace types
224
+ (e.g. regular space U+0020 vs non-breaking space U+00A0 vs ideographic space
225
+ U+3000). This catches accidental insertion of wrong whitespace characters.
226
+
227
+ Use `whitespace_type: :normalize` when all Unicode whitespace variants should
228
+ be treated as equivalent (e.g. when output from different tools may use
229
+ different whitespace types for the same visual result).
230
+
220
231
  === Layer 4: Diff Formatting
221
232
 
222
233
  **Question**: How should differences be displayed?
data/docs/reference/cli-options.adoc CHANGED
@@ -145,6 +145,9 @@ Individual dimension control (overrides profile settings):
145
145
 
146
146
  |`--comments BEHAVIOR`
147
147
  |Comments: `strict`, `normalize`, `ignore`
148
+
149
+ |`--whitespace-type BEHAVIOR`
150
+ |Whitespace type sensitivity: `strict` (default), `normalize`
148
151
  |===
149
152
 
150
153
  See link:../features/match-options/[Match Options] for details.
data/docs/reference/options-across-interfaces.adoc CHANGED
@@ -223,9 +223,15 @@ Profile values: `strict`, `rendered`, `spec_friendly`, `content_only`
223
223
  |`match: { element_hierarchy: :strict }`
224
224
  |`config.canon.xml.match.options = { element_hierarchy: :strict }`
225
225
  |`CANON_ELEMENT_HIERARCHY=strict`
226
+
227
+ |Whitespace Type
228
+ |`--whitespace-type normalize`
229
+ |`match: { whitespace_type: :normalize }`
230
+ |`config.canon.xml.match.options = { whitespace_type: :normalize }`
231
+ |`CANON_WHITESPACE_TYPE=normalize`
226
232
  |===
227
233
 
228
- Values: `strict`, `normalize`, `ignore` (or `strict`, `ignore` for structure/position/hierarchy)
234
+ Values: `strict`, `normalize`, `ignore` (or `strict`, `ignore` for structure/position/hierarchy). `whitespace_type` values: `strict` (default), `normalize`
229
235
 
230
236
  ==== XML/HTML-Specific Match Dimensions
231
237
 
data/docs/understanding/formats/html.adoc CHANGED
@@ -19,7 +19,7 @@ Canon supports HTML 4, HTML5, and XHTML with automatic format detection.
19
19
  **Key features:**
20
20
 
21
21
  * Automatic HTML vs XHTML detection
22
- * HTML5 parser for modern HTML
22
+ * HTML5 parser for HTML input regardless of declared version (HTML4 and HTML5 share the same content model and parsing whitespace rules — see <<html4-html5-parity>>)
23
23
  * XML parser for XHTML
24
24
  * Consistent attribute ordering
25
25
  * Whitespace normalization
@@ -203,9 +203,16 @@ Automatically detects HTML5, HTML4, or XHTML based on DOCTYPE and structure.
203
203
  ----
204
204
  ====
205
205
 
206
+ [[html4-html5-parity]]
207
+ === HTML4 / HTML5 parity
208
+
209
+ `be_html4_equivalent_to` and `be_html5_equivalent_to` apply the same whitespace-sensitivity rules. Whitespace sensitivity is a property of HTML's content model and is identical across the two HTML versions, so any input that compares equivalent under one matcher must compare equivalent under the other.
210
+
211
+ Internally, both matchers parse input via `Nokogiri::HTML5.fragment`. (Earlier releases routed `:html` and `:html4` through `Nokogiri::XML.fragment`, which silently applied XML whitespace rules — meaning `be_html4_equivalent_to` could reject inputs that `be_html5_equivalent_to` correctly accepted.) See https://github.com/lutaml/canon/issues/118 for the full background.
212
+
206
213
  === Whitespace handling
207
214
 
208
- HTML whitespace is collapsed per CSS rendering rules. Empty text nodes between elements are removed.
215
+ HTML whitespace is collapsed per CSS rendering rules. Empty text nodes between elements are removed. Whitespace-only text between two adjacent inline elements (`<span>A</span> <span>B</span>`) is preserved because it renders as a visible space; whitespace at a block boundary (between an inline element and a block element, or between two block siblings) is collapsed.
209
216
 
210
217
  .Whitespace handling example
211
218
  [example]
data/lib/canon/cli.rb CHANGED
@@ -218,6 +218,10 @@ module Canon
218
218
  type: :string,
219
219
  enum: %w[strict normalize ignore],
220
220
  desc: "Comment matching: strict, normalize, or ignore"
221
+ method_option :whitespace_type,
222
+ type: :string,
223
+ enum: %w[strict normalize],
224
+ desc: "Whitespace type sensitivity: strict (default) or normalize"
221
225
  method_option :show_diffs,
222
226
  type: :string,
223
227
  enum: %w[all normative informative],
data/lib/canon/commands/diff_command.rb CHANGED
@@ -143,6 +143,7 @@ module Canon
143
143
  dimensions = %i[
144
144
  text_content structural_whitespace attribute_whitespace
145
145
  attribute_order attribute_values comments key_order
146
+ whitespace_type
146
147
  ]
147
148
 
148
149
  dimensions.each_with_object({}) do |dim, opts|
data/lib/canon/comparison/comparison_result.rb CHANGED
@@ -84,6 +84,30 @@ html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil)
84
84
  @match_options&.[](:tree_diff_operations) || []
85
85
  end
86
86
 
87
# Produce a one-line, human-readable description of the comparison outcome.
#
# Returns "Equivalent" when the documents match. Otherwise describes the
# first normative difference, falling back to the first informative
# difference and finally to the first recorded difference of any kind.
#
# @return [String] Summary string
def summary
  return "Equivalent" if equivalent?

  first_diff = normative_differences.first ||
    informative_differences.first ||
    @differences.first

  # Nothing recorded at all: there is no location or reason to report.
  return "Not equivalent" unless first_diff

  case first_diff
  when Canon::Diff::DiffNode then summarize_diff_node(first_diff)
  when Hash then summarize_legacy_hash(first_diff)
  else "Not equivalent"
  end
end
110
+
87
111
  # Generate formatted diff output
88
112
  #
89
113
  # @param use_color [Boolean] Whether to use ANSI color codes
@@ -116,6 +140,61 @@ show_diffs: :all, diff_mode: :separate, legacy_terminal: false)
116
140
  html_version: @html_version,
117
141
  )
118
142
  end
143
+
144
+ private
145
+
146
# Build the one-line summary for a single DiffNode difference.
#
# @param diff [DiffNode] The difference to summarize
# @return [String] Human-readable summary
def summarize_diff_node(diff)
  # Include the location only when the node carries a path.
  headline = diff.path ? "#{diff.reason} at #{diff.path}" : diff.reason.to_s
  segments = ["Not equivalent:", headline]

  # The before/after preview is appended only when both sides are present.
  before = diff.serialized_before
  after = before && diff.serialized_after
  if after
    segments << "(#{truncate_preview(before)} vs #{truncate_preview(after)})"
  end

  segments.join(" ")
end
169
+
170
# Build the one-line summary for a legacy Hash-style difference.
#
# @param diff [Hash] Legacy difference hash with :path, :value1, :value2
# @return [String] Human-readable summary
def summarize_legacy_hash(diff)
  details = []
  details << "#{diff[:diff_code_description]} at #{diff[:path]}" if diff[:path]

  # The value preview is appended only when both values are present.
  first_value = diff[:value1]
  second_value = first_value && diff[:value2]
  if second_value
    details << "(#{truncate_preview(first_value.to_s)} vs #{truncate_preview(second_value.to_s)})"
  end

  # Neither a location nor values to show: fall back to a generic message.
  return "Not equivalent: values differ" if details.empty?

  ["Not equivalent:", *details].join(" ")
end
184
+
185
# Collapse whitespace in a string and shorten it for preview display.
#
# Leading and trailing whitespace is stripped and every internal run of
# whitespace is collapsed to a single space before the length check, so the
# result is always a single line.
#
# @param text [String, #to_s] Text to truncate (nil is treated as "")
# @param max_len [Integer] Maximum length of the returned string
# @return [String] Collapsed text, ending in "..." when it was shortened
def truncate_preview(text, max_len = 40)
  ellipsis = "..."
  collapsed = text.to_s.strip.gsub(/\s+/, " ")
  return collapsed if collapsed.length <= max_len

  # A limit too small to hold the ellipsis gets a plain hard cut. Slicing
  # with a negative exclusive end would count from the end of the string and
  # return far more than max_len characters.
  return collapsed[0, [max_len, 0].max] if max_len < ellipsis.length

  "#{collapsed[0, max_len - ellipsis.length]}#{ellipsis}"
end
119
198
  end
120
199
  end
121
200
  end