canon 0.2.6 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6437f1a8b556bb49bffbcecf47ec0eeecabdf6541bd5baa5954ac88f98f33a2c
4
- data.tar.gz: 98eff2aa558165dc7e13c8d29da21d8d9c6589cae1d48a18d27f0420d6be7198
3
+ metadata.gz: b5218e18de7c596c5875ee1cf906331269cd58475a1f00de5c20af398bb07f08
4
+ data.tar.gz: 0dedd6f9e8ca265d37c610a183ed0e695ba68a9d8c9f0766b890f3d8db7d1f66
5
5
  SHA512:
6
- metadata.gz: 055614c143bca292b575755f5b4a1554a002e0d6f264ddee3e29049f89d6f9795a61069c1bdf7ebfa459ed11ac2a21203a779e115f8aee143a2aa3c77951a086
7
- data.tar.gz: ff64c25654c1eef41dcc80b471df2958c516e69fa625723bdecfd18c5a99716e7f347c313bf72c28f3b898547d5c97aac3ebb62bcf44e726bc49e668a73e0dd9
6
+ metadata.gz: c24944e5600684e24f4b32cd16d90d68f64ca07671da7cd30a4cc7e13e818e98f86ab849ee28f7780f40ae3514df1ba3087cb32f55c7923d5303c65819aa8d59
7
+ data.tar.gz: 13a3c944492c29a916569b86829cefd8bf7baaf247eea105ee3f608d25789ef7c79ab0b0a95a3806e66b85348dd629169f8e19bef127e3ca79685a0e13d1bca9
data/.rubocop_todo.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2026-04-27 09:48:55 UTC using RuboCop version 1.86.0.
3
+ # on 2026-05-05 13:09:45 UTC using RuboCop version 1.86.0.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
@@ -11,52 +11,58 @@ Gemspec/RequiredRubyVersion:
11
11
  Exclude:
12
12
  - 'canon.gemspec'
13
13
 
14
- # Offense count: 30
14
+ # Offense count: 5
15
15
  # This cop supports safe autocorrection (--autocorrect).
16
16
  # Configuration parameters: EnforcedStyle, IndentationWidth.
17
17
  # SupportedStyles: with_first_argument, with_fixed_indentation
18
18
  Layout/ArgumentAlignment:
19
19
  Exclude:
20
- - 'lib/canon/comparison/xml_comparator.rb'
21
- - 'spec/canon/comparison/html4_html5_whitespace_parity_spec.rb'
20
+ - 'lib/canon/comparison/child_realignment.rb'
21
+ - 'lib/canon/comparison/xml_comparator/child_comparison.rb'
22
+ - 'spec/canon/diff_formatter/diff_detail_formatter_spec.rb'
22
23
 
23
- # Offense count: 1
24
+ # Offense count: 5
24
25
  # This cop supports safe autocorrection (--autocorrect).
25
26
  # Configuration parameters: EnforcedStyleAlignWith.
26
27
  # SupportedStylesAlignWith: either, start_of_block, start_of_line
27
28
  Layout/BlockAlignment:
28
29
  Exclude:
29
- - 'lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb'
30
+ - 'spec/canon/comparison/comments_asymmetry_spec.rb'
31
+ - 'spec/canon/comparison/whitespace_adjacency_spec.rb'
30
32
 
31
- # Offense count: 1
33
+ # Offense count: 5
32
34
  # This cop supports safe autocorrection (--autocorrect).
33
35
  Layout/BlockEndNewline:
34
36
  Exclude:
35
- - 'lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb'
37
+ - 'spec/canon/comparison/comments_asymmetry_spec.rb'
38
+ - 'spec/canon/comparison/whitespace_adjacency_spec.rb'
36
39
 
37
- # Offense count: 2
40
+ # Offense count: 10
38
41
  # This cop supports safe autocorrection (--autocorrect).
39
42
  # Configuration parameters: Width, EnforcedStyleAlignWith, AllowedPatterns.
40
43
  # SupportedStylesAlignWith: start_of_line, relative_to_receiver
41
44
  Layout/IndentationWidth:
42
45
  Exclude:
43
- - 'lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb'
46
+ - 'spec/canon/comparison/comments_asymmetry_spec.rb'
47
+ - 'spec/canon/comparison/whitespace_adjacency_spec.rb'
44
48
 
45
- # Offense count: 1347
49
+ # Offense count: 1386
46
50
  # This cop supports safe autocorrection (--autocorrect).
47
51
  # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
48
52
  # URISchemes: http, https
49
53
  Layout/LineLength:
50
54
  Enabled: false
51
55
 
52
- # Offense count: 2
56
+ # Offense count: 6
53
57
  # This cop supports safe autocorrection (--autocorrect).
54
58
  # Configuration parameters: AllowInHeredoc.
55
59
  Layout/TrailingWhitespace:
56
60
  Exclude:
57
- - 'lib/canon/comparison/xml_comparator.rb'
61
+ - 'lib/canon/comparison/child_realignment.rb'
62
+ - 'lib/canon/comparison/xml_comparator/child_comparison.rb'
63
+ - 'spec/canon/diff_formatter/diff_detail_formatter_spec.rb'
58
64
 
59
- # Offense count: 58
65
+ # Offense count: 63
60
66
  # Configuration parameters: IgnoreLiteralBranches, IgnoreConstantBranches, IgnoreDuplicateElseBranch.
61
67
  Lint/DuplicateBranch:
62
68
  Enabled: false
@@ -101,7 +107,7 @@ Lint/UselessConstantScoping:
101
107
  Exclude:
102
108
  - 'lib/canon/diff_formatter/theme.rb'
103
109
 
104
- # Offense count: 322
110
+ # Offense count: 321
105
111
  # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
106
112
  Metrics/AbcSize:
107
113
  Enabled: false
@@ -117,12 +123,12 @@ Metrics/BlockLength:
117
123
  Metrics/BlockNesting:
118
124
  Max: 4
119
125
 
120
- # Offense count: 281
126
+ # Offense count: 285
121
127
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
122
128
  Metrics/CyclomaticComplexity:
123
129
  Enabled: false
124
130
 
125
- # Offense count: 517
131
+ # Offense count: 529
126
132
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
127
133
  Metrics/MethodLength:
128
134
  Max: 146
@@ -132,7 +138,7 @@ Metrics/MethodLength:
132
138
  Metrics/ParameterLists:
133
139
  Max: 10
134
140
 
135
- # Offense count: 225
141
+ # Offense count: 221
136
142
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
137
143
  Metrics/PerceivedComplexity:
138
144
  Enabled: false
@@ -165,13 +171,13 @@ Performance/CollectionLiteralInLoop:
165
171
  - 'lib/canon/xml/xml_base_handler.rb'
166
172
  - 'spec/canon/diff/diff_node_mapper_comments_spec.rb'
167
173
 
168
- # Offense count: 85
174
+ # Offense count: 107
169
175
  # Configuration parameters: Prefixes, AllowedPatterns.
170
176
  # Prefixes: when, with, without
171
177
  RSpec/ContextWording:
172
178
  Enabled: false
173
179
 
174
- # Offense count: 43
180
+ # Offense count: 46
175
181
  # Configuration parameters: IgnoredMetadata.
176
182
  RSpec/DescribeClass:
177
183
  Enabled: false
@@ -182,7 +188,7 @@ RSpec/DescribeMethod:
182
188
  - 'spec/canon/comparison/multiple_differences_spec.rb'
183
189
  - 'spec/canon/diff_formatter/character_map_customization_spec.rb'
184
190
 
185
- # Offense count: 847
191
+ # Offense count: 876
186
192
  # Configuration parameters: CountAsOne.
187
193
  RSpec/ExampleLength:
188
194
  Max: 44
@@ -196,12 +202,6 @@ RSpec/ExpectActual:
196
202
  - 'spec/canon/rspec_matchers_spec.rb'
197
203
  - 'spec/canon/string_matcher_spec.rb'
198
204
 
199
- # Offense count: 7
200
- # This cop supports unsafe autocorrection (--autocorrect-all).
201
- RSpec/IncludeExamples:
202
- Exclude:
203
- - 'spec/canon/comparison/html4_html5_whitespace_parity_spec.rb'
204
-
205
205
  # Offense count: 177
206
206
  # Configuration parameters: Max, AllowedIdentifiers, AllowedPatterns.
207
207
  RSpec/IndexedLet:
@@ -240,7 +240,7 @@ RSpec/MultipleDescribes:
240
240
  Exclude:
241
241
  - 'spec/canon/comparison/match_options_spec.rb'
242
242
 
243
- # Offense count: 694
243
+ # Offense count: 735
244
244
  RSpec/MultipleExpectations:
245
245
  Max: 15
246
246
 
@@ -249,7 +249,7 @@ RSpec/MultipleExpectations:
249
249
  RSpec/MultipleMemoizedHelpers:
250
250
  Max: 16
251
251
 
252
- # Offense count: 17
252
+ # Offense count: 29
253
253
  # Configuration parameters: EnforcedStyle, IgnoreSharedExamples.
254
254
  # SupportedStyles: always, named_only
255
255
  RSpec/NamedSubject:
@@ -258,7 +258,7 @@ RSpec/NamedSubject:
258
258
  - 'spec/canon/pretty_printer/json_spec.rb'
259
259
  - 'spec/canon/pretty_printer/xml_spec.rb'
260
260
 
261
- # Offense count: 53
261
+ # Offense count: 54
262
262
  # Configuration parameters: AllowedGroups.
263
263
  RSpec/NestedGroups:
264
264
  Max: 4
@@ -292,7 +292,7 @@ RSpec/SpecFilePathFormat:
292
292
  - 'spec/canon/yaml/formatter_spec.rb'
293
293
  - 'spec/xml_c14n_spec.rb'
294
294
 
295
- # Offense count: 134
295
+ # Offense count: 100
296
296
  # Configuration parameters: IgnoreNameless, IgnoreSymbolicNames.
297
297
  RSpec/VerifiedDoubles:
298
298
  Exclude:
@@ -304,6 +304,18 @@ RSpec/VerifiedDoubles:
304
304
  - 'spec/canon/diff_formatter/diff_detail_formatter_spec.rb'
305
305
  - 'spec/canon/tree_diff/operation_converter_spec.rb'
306
306
 
307
+ # Offense count: 8
308
+ # This cop supports safe autocorrection (--autocorrect).
309
+ # Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
310
+ # SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
311
+ # ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
312
+ # FunctionalMethods: let, let!, subject, watch
313
+ # AllowedMethods: lambda, proc, it
314
+ Style/BlockDelimiters:
315
+ Exclude:
316
+ - 'spec/canon/comparison/comments_asymmetry_spec.rb'
317
+ - 'spec/canon/comparison/whitespace_adjacency_spec.rb'
318
+
307
319
  # Offense count: 1
308
320
  # This cop supports safe autocorrection (--autocorrect).
309
321
  # Configuration parameters: EnforcedStyle, AllowComments.
data/README.adoc CHANGED
@@ -618,6 +618,11 @@ See link:docs/MODES[Diff modes] for details.
618
618
  reported as a dedicated `:whitespace_adjacency` dimension with direction
619
619
  wording (`before`/`after`/`adjacent to`) instead of cascading into
620
620
  misleading `:text_content` mismatches
621
+ * **Asymmetric comment reporting**: A `<!-- ... -->` node present on only
622
+ one side is reported as a dedicated `:comments` dimension diff anchored
623
+ at the comment node, instead of shifting children alignment and
624
+ surfacing a misleading `:element_structure` "Element removed" diff
625
+ against an unrelated trailing sibling
621
626
  * **Non-ASCII detection**: Warnings for unexpected Unicode characters
622
627
  * **Customizable**: Character maps, context lines, grouping options
623
628
 
@@ -212,6 +212,8 @@ Reason: Text: "¬······:¬······"
212
212
 
213
213
  This fallback is implemented in `Canon::DiffFormatter::DiffDetailFormatterHelpers::DimensionFormatter.format_text_content_details` and only triggers when `TextUtils.ambiguous_text_pair?` returns `true` _and_ at least one side has a parent element to render.
214
214
 
215
+ The same fallback also applies to the `whitespace_adjacency` dimension (see <<whitespace-adjacency,Whitespace adjacency>>): when the alignment partner of a stray whitespace node extracts to an empty / whitespace-only string, the Reason line reads `Whitespace inside <PARENT>` (rather than `Whitespace before ""`), and the Expected/Actual block surfaces each side's parent element compactly. See `format_whitespace_adjacency_details` and `Canon::Comparison::XmlComparator#build_whitespace_adjacency_reason`.
216
+
215
217
  ==== One-sided text diffs (added or removed text nodes)
216
218
 
217
219
  When a `text_content` difference carries a text node on one side and `nil` on the other (issue #125) -- the shape that fragment-length mismatches and child-comparison emit when a text-node child is missing -- the renderer mirrors `element_structure`: the missing side reads `(not present)`, and the present side reads the text-node content (whitespace-visualised) plus a brief parent open-tag hint for context. The full ancestor subtree is *not* dumped; only the immediate parent's opening tag is shown, so a missing whitespace text node cannot make the diff look like the entire ancestor differs.
@@ -0,0 +1,160 @@
1
+ ---
2
+ title: Comment asymmetry in diff reports
3
+ parent: Diff Formatting
4
+ nav_order: 9
5
+ ---
6
+ = Comment asymmetry in diff reports
7
+ :toc:
8
+ :toclevels: 2
9
+
10
+ == Purpose
11
+
12
+ Canon's diff reports anchor `<!-- ... -->` comment nodes that have no
13
+ counterpart on the other side to a dedicated `:comments` dimension
14
+ instead of letting the resulting children-array length mismatch cascade
15
+ into a misleading `:element_structure` "Element removed" diff against
16
+ the trailing named sibling.
17
+
18
+ This is a *report-only* shape change — equivalence verdicts are
19
+ unchanged. Whether asymmetric comments cause a non-equivalent verdict
20
+ or not depends on the `comments` match option (`:strict` /
21
+ `:ignore` / `:exact`), as before.
22
+
23
+ == The problem
24
+
25
+ Consider an HTML fragment compared with `verbose: true`:
26
+
27
+ [source,html]
28
+ ----
29
+ <!-- expected -->
30
+ <body>
31
+ <div>first</div>
32
+ <div>second</div>
33
+ <!-- a comment that exists only on side A -->
34
+ <div style="mso-element:footnote-list"></div>
35
+ </body>
36
+
37
+ <!-- actual -->
38
+ <body>
39
+ <div>first</div>
40
+ <div>second</div>
41
+ <div style="mso-element:footnote-list"></div>
42
+ </body>
43
+ ----
44
+
45
+ The `<div style="mso-element:footnote-list">` is byte-identical between
46
+ the two sides; the only real difference is the comment on the expected
47
+ side. Pre-#144, the diff report contained:
48
+
49
+ [source]
50
+ ----
51
+ DIFFERENCE #1 — element_structure: Element removed:
52
+ <div style="mso-element:footnote-list"/>
53
+ ----
54
+
55
+ That is the wrong dimension, anchored at the wrong node. The element is
56
+ present on both sides — what is missing is the comment.
57
+
58
+ The cascade comes from positional alignment in
59
+ `Canon::Comparison::HtmlComparator#compare_fragment_children` (and the
60
+ analogous walker in `XmlComparatorHelpers::ChildComparison`): in
61
+ verbose mode, comments are intentionally kept by `filter_children` so
62
+ informative differences can be recorded, but the resulting unequal
63
+ children-array lengths fell through to a name-based mismatch heuristic
64
+ that filtered out generic `#`-prefixed names (`#text`, `#comment`),
65
+ leaving the trailing named element to take the blame.
66
+
67
+ == The contract
68
+
69
+ When the children alignment encounters a comment node on one side
70
+ paired against a non-comment node on the other (or sitting past the
71
+ trailing edge of the shorter side), Canon:
72
+
73
+ 1. Treats the comment as a *single-side gap* in the alignment.
74
+ 2. Emits one `:comments` diff entry anchored at the comment node
75
+ itself (not at a mis-paired neighbouring element).
76
+ 3. Advances only the cursor that carries the comment, so the next
77
+ iteration aligns content against content.
78
+
79
+ The Reason line names the side that carries the comment and surfaces
80
+ its text:
81
+
82
+ [source]
83
+ ----
84
+ DIFFERENCE #1 — comments: Comment present on EXPECTED only:
85
+ <!-- a comment that exists only on side A -->
86
+ ----
87
+
88
+ == Combined with whitespace asymmetry
89
+
90
+ The same realignment walk handles asymmetric whitespace-only text
91
+ nodes (link:whitespace-adjacency.adoc[issue #137]) and asymmetric
92
+ comment nodes together. When a children mismatch is fully explained by
93
+ a combination of asymmetric whitespace and asymmetric comments, the
94
+ walker emits one diff per asymmetric node with the appropriate
95
+ dimension (`:whitespace_adjacency` for whitespace, `:comments` for
96
+ comments) — no `:element_structure` diff is produced.
97
+
98
+ When a real structural mismatch coexists with an asymmetric comment,
99
+ both kinds of diff are emitted — the structural one under
100
+ `:element_structure`, the comment one under `:comments`.
101
+
102
+ == Working with :comments diffs programmatically
103
+
104
+ [source,ruby]
105
+ ----
106
+ result = Canon::Comparison.equivalent?(html1, html2,
107
+ format: :html5, verbose: true)
108
+
109
+ comment_diffs = result.differences.select { |d| d.dimension == :comments }
110
+
111
+ # Whether these affect equivalence depends on the comments match option.
112
+ # Under the default :ignore profile they are informative; under :strict
113
+ # they are normative.
114
+ ----
115
+
116
+ == What this contract does NOT do
117
+
118
+ * **Does not silence asymmetric comments.** They are always reported
119
+ in verbose output; the change is the dimension label and the anchor
120
+ node.
121
+ * **Does not affect symmetric comments.** When both sides carry
122
+ parallel comment nodes, those compare normally — content-vs-content
123
+ comparison applies.
124
+ * **Does not change equivalence outcomes.** A comparison whose
125
+ equivalence verdict was driven by asymmetric comments retains the
126
+ same verdict; only the report shape changes.
127
+
128
+ == Where it runs
129
+
130
+ The noise-aware realignment is a single shared implementation:
131
+
132
+ * `Canon::Comparison::ChildRealignment` — the two-cursor walk that
133
+ detects noise nodes via `NodeInspector.noise_dimension_for`,
134
+ emits per-orphan diffs with the appropriate dimension
135
+ (`:whitespace_adjacency`, `:comments`), and advances only the
136
+ noise-side cursor so content nodes stay aligned.
137
+
138
+ Both comparison paths delegate to `ChildRealignment.walk`:
139
+
140
+ * `Canon::Comparison::HtmlComparator#compare_fragment_children` — the
141
+ HTML fragment path (passes `emit_structural_orphans: true` because it
142
+ has no separate length-mismatch step).
143
+ * `Canon::Comparison::XmlComparatorHelpers::ChildComparison` — the XML
144
+ comparator path (passes `emit_structural_orphans: false`; structural
145
+ orphans are handled by the pre-walk length-mismatch step via
146
+ `asymmetric_noise_explains_length_diff?`).
147
+
148
+ == Related
149
+
150
+ * link:whitespace-adjacency.adoc[Whitespace adjacency] — sibling
151
+ contract for asymmetric whitespace-only text nodes.
152
+ * link:../../advanced/diff-classification.adoc[Diff classification] —
153
+ Normative vs informative differences.
154
+
155
+ == History
156
+
157
+ The false-positive cascade was reported in
158
+ https://github.com/lutaml/canon/issues/144[issue #144]. The fix
159
+ mirrors the structure of the `:whitespace_adjacency` work in
160
+ https://github.com/lutaml/canon/issues/137[issue #137].
@@ -430,7 +430,7 @@ pretty-printer. This is a known future work item.
430
430
  |✓ Full
431
431
  |✓ (via XML serializer)
432
432
  |✓ Full
433
- |`:pretty_print` uses `Canon::PrettyPrinter::Html`; `:normalize_pretty_print` falls back to `XmlNormalized` pending a dedicated `HtmlNormalized`; `:c14n` uses Nokogiri HTML5 serialization
433
+ |`:pretty_print` uses `Canon::PrettyPrinter::Html` in fixture-ready mode (`FORMAT|AS_XHTML|NO_DECLARATION`); `:normalize_pretty_print` falls back to `XmlNormalized` pending a dedicated `HtmlNormalized`; `:c14n` uses Nokogiri HTML5 serialization. In fixture-ready mode, stray structural whitespace (whitespace-only text nodes between block-level siblings) is stripped before formatting so that libxml's `FORMAT` flag produces correct indentation. Whitespace inside `<pre>`, `<script>`, `<style>`, and `<textarea>` is preserved.
434
434
 
435
435
  |JSON
436
436
  |Planned
@@ -103,6 +103,15 @@ edge of a parent.
103
103
  `adjacent to`:: Degenerate fallback for a whitespace node with no
104
104
  non-whitespace siblings at all. Rarely emitted.
105
105
 
106
+ When the alignment partner extracts to an empty / whitespace-only
107
+ string (e.g. an element with no text descendants), the direction
108
+ phrasing degenerates to `Whitespace before ""` which carries no
109
+ information. In that case Canon falls back to naming the parent
110
+ element instead — `Whitespace inside <PARENT>` — and the
111
+ Expected/Actual detail block renders each side's parent element
112
+ compactly per the contract from
113
+ link:../../advanced/semantic-diff-report.adoc#parent-context-fallback-for-ambiguous-text-diffs[issue #112].
114
+
106
115
  NOTE: An earlier wording (`Whitespace surrounding "X"`) classified the
107
116
  *whitespace node's position among its own siblings* rather than its
108
117
  direction relative to the partner. That label was misleading when the
@@ -116,6 +116,9 @@ Where:
116
116
  `{Format}`:: The format module (`Xml`, `Html`, `Json`)
117
117
  `n`:: Number of spaces (default: 2) or tabs (use 1 for tabs)
118
118
  `type`:: Indentation type: `'space'` (default) or `'tab'`
119
+ `fixture_ready`:: (HTML only) When `true`, emit indented XHTML-shaped
120
+ output that strips structural whitespace before formatting. Designed for
121
+ copy-paste into RSpec heredoc fixtures. Default: `false`.
119
122
  `content`:: The input string
120
123
 
121
124
  .Pretty-print examples
@@ -151,6 +154,23 @@ Canon::Xml::PrettyPrinter.new(
151
154
  html_input = '<div><p>Hello</p></div>'
152
155
  Canon::Html::PrettyPrinter.new(indent: 2).format(html_input)
153
156
 
157
+ # HTML fixture-ready mode: produces indented XHTML-shaped output
158
+ # suitable for pasting into RSpec heredoc fixtures. Strips stray
159
+ # structural whitespace (inter-element text nodes) so libxml's FORMAT
160
+ # flag can indent block-level siblings that would otherwise be treated
161
+ # as mixed content. Whitespace inside <pre>, <script>, <style>, and
162
+ # <textarea> is preserved.
163
+ Canon::Html::PrettyPrinter.new(indent: 2, fixture_ready: true)
164
+ .format('<html><body><div>a</div> <div>b</div></body></html>')
165
+ # =>
166
+ # <html xmlns="http://www.w3.org/1999/xhtml">
167
+ # <head>...</head>
168
+ # <body>
169
+ # <div>a</div>
170
+ # <div>b</div>
171
+ # </body>
172
+ # </html>
173
+
154
174
  # JSON with 2-space indentation
155
175
  json_input = '{"z":3,"a":{"b":1}}'
156
176
  Canon::Json::PrettyPrinter.new(indent: 2).format(json_input)
@@ -235,6 +235,23 @@ HTML whitespace is collapsed per CSS rendering rules. Empty text nodes between e
235
235
  Multiple spaces within text content are collapsed to single spaces when `text_content: :normalize` is used.
236
236
  ====
237
237
 
238
+ ==== Fixture-ready pretty-print and structural whitespace
239
+
240
+ When using `Canon::PrettyPrinter::Html` with `fixture_ready: true` (the mode
241
+ used by the diff pipeline's *PRETTY-PRINTED INPUTS* section), Canon strips
242
+ stray structural whitespace before formatting. Real-world HTML5 input from
243
+ upstream pipelines often carries whitespace-only text nodes between block-level
244
+ siblings (`<body>` → `<div>`, `<br>`, `<div>`, ...). libxml's `FORMAT` flag
245
+ treats any element with a text child (even a whitespace-only one) as mixed content and
246
+ refuses to indent its children — producing a single-line blob instead of a
247
+ readable tree.
248
+
249
+ The fixture-ready mode removes whitespace-only text nodes from parents that
250
+ are purely structural (no real text content) and are not whitespace-preserving
251
+ elements (`<pre>`, `<script>`, `<style>`, `<textarea>`). Mixed-content runs
252
+ like `<p>foo <em>bar</em> baz</p>` are left untouched so that significant
253
+ inline whitespace is preserved.
254
+
238
255
  === Attribute order
239
256
 
240
257
  HTML attributes are inherently unordered per the HTML specification, so default is `:ignore`.
@@ -0,0 +1,140 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "node_inspector"
4
+
5
+ module Canon
6
+ module Comparison
7
+ # Shared two-cursor walk over child arrays with noise-aware realignment.
8
+ #
9
+ # When positional pairing would match a noise node (whitespace-only
10
+ # text or comment) against a content node, the walker treats the
11
+ # noise node as a single-side gap: emits a diff for it and advances
12
+ # only that cursor, so the next iteration aligns content against
13
+ # content.
14
+ #
15
+ # Noise classification is delegated to +NodeInspector.noise_dimension_for+,
16
+ # making the walk open for extension — new noise types only require
17
+ # adding a branch there.
18
+ #
19
+ # The walk is parameterised by a diff emitter (a callable that
20
+ # receives node1, node2, diff1, diff2, dimension) so both the HTML
21
+ # comparator (DiffNodeBuilder.build) and the XML comparator
22
+ # (comparator.add_difference) reuse the same cursor logic.
23
+ module ChildRealignment
24
+ class << self
25
+ # Walk two child arrays, emitting diffs for noise nodes and
26
+ # yielding matched content pairs.
27
+ #
28
+ # @param children1 [Array] Left-side children
29
+ # @param children2 [Array] Right-side children
30
+ # @param emitter [#call] Callable receiving
31
+ # (node1, node2, diff1, diff2, dimension)
32
+ # @param emit_structural_orphans [Boolean] When true, trailing-edge
33
+ # non-noise orphans are emitted as +:element_structure+ diffs.
34
+ # HTML fragment path sets this to true (it has no separate
35
+ # length-mismatch step); XML path sets it to false (structural
36
+ # orphans are already recorded by +use_positional_comparison+).
37
+ # @yield [child1, child2] Compare two matched content nodes.
38
+ # Must return a Comparison result constant.
39
+ # @return [Symbol] Worst comparison result encountered
40
+ def walk(children1, children2, emitter,
41
+ emit_structural_orphans: false)
42
+ worst = Comparison::EQUIVALENT
43
+ i = 0
44
+ j = 0
45
+
46
+ while i < children1.length || j < children2.length
47
+ child1 = children1[i]
48
+ child2 = children2[j]
49
+
50
+ if child1.nil?
51
+ result = emit_orphan(child2, :right, emitter,
52
+ emit_structural_orphans)
53
+ worst = result if result && result != Comparison::EQUIVALENT
54
+ j += 1
55
+ next
56
+ elsif child2.nil?
57
+ result = emit_orphan(child1, :left, emitter,
58
+ emit_structural_orphans)
59
+ worst = result if result && result != Comparison::EQUIVALENT
60
+ i += 1
61
+ next
62
+ end
63
+
64
+ dim1 = NodeInspector.noise_dimension_for(child1)
65
+ dim2 = NodeInspector.noise_dimension_for(child2)
66
+
67
+ if dim1 && !dim2
68
+ result = emit_inline_noise(child1, child2, dim1, :left, emitter)
69
+ worst = result unless result == Comparison::EQUIVALENT
70
+ i += 1
71
+ next
72
+ elsif dim2 && !dim1
73
+ result = emit_inline_noise(child1, child2, dim2, :right, emitter)
74
+ worst = result unless result == Comparison::EQUIVALENT
75
+ j += 1
76
+ next
77
+ end
78
+
79
+ if block_given?
80
+ child_result = yield(child1, child2)
81
+ worst = child_result unless child_result == Comparison::EQUIVALENT
82
+ end
83
+ i += 1
84
+ j += 1
85
+ end
86
+
87
+ worst
88
+ end
89
+
90
+ private
91
+
92
+ # Emit a diff for an inline noise node that sits opposite a
93
+ # content node. Whitespace passes both nodes for context;
94
+ # comments pass only the comment node.
95
+ def emit_inline_noise(node_left, node_right, dimension, noise_side,
96
+ emitter)
97
+ if dimension == :whitespace_adjacency
98
+ emitter.call(node_left, node_right,
99
+ Comparison::UNEQUAL_TEXT_CONTENTS,
100
+ Comparison::UNEQUAL_TEXT_CONTENTS,
101
+ dimension)
102
+ Comparison::UNEQUAL_TEXT_CONTENTS
103
+ else
104
+ n1 = noise_side == :left ? node_left : nil
105
+ n2 = noise_side == :right ? node_right : nil
106
+ emitter.call(n1, n2,
107
+ Comparison::MISSING_NODE,
108
+ Comparison::MISSING_NODE,
109
+ dimension)
110
+ Comparison::UNEQUAL_ELEMENTS
111
+ end
112
+ end
113
+
114
+ # Emit a diff for a trailing-edge orphan (one side exhausted).
115
+ # Noise orphans are always emitted; structural orphans only when
116
+ # +emit_structural+ is true.
117
+ def emit_orphan(orphan, side, emitter, emit_structural)
118
+ dim = NodeInspector.noise_dimension_for(orphan)
119
+ if dim
120
+ n1 = side == :left ? orphan : nil
121
+ n2 = side == :right ? orphan : nil
122
+ emitter.call(n1, n2,
123
+ Comparison::MISSING_NODE,
124
+ Comparison::MISSING_NODE,
125
+ dim)
126
+ Comparison::UNEQUAL_ELEMENTS
127
+ elsif emit_structural
128
+ n1 = side == :left ? orphan : nil
129
+ n2 = side == :right ? orphan : nil
130
+ emitter.call(n1, n2,
131
+ Comparison::MISSING_NODE,
132
+ Comparison::MISSING_NODE,
133
+ :element_structure)
134
+ Comparison::UNEQUAL_ELEMENTS
135
+ end
136
+ end
137
+ end
138
+ end
139
+ end
140
+ end
@@ -188,32 +188,9 @@ module Canon
188
188
  node.is_a?(Nokogiri::HTML5::DocumentFragment)
189
189
  end
190
190
 
191
- # Record a DiffNode for a fragment-level child-count mismatch.
192
- # Each surplus child becomes its own MISSING_NODE diff so the
193
- # downstream report shows what was added or removed.
194
- def record_fragment_length_mismatch(_node1, _node2, children1,
195
- children2, differences)
196
- longer, shorter, side = if children1.length > children2.length
197
- [children1, children2, :removed]
198
- else
199
- [children2, children1, :added]
200
- end
201
-
202
- longer[shorter.length...].each do |orphan|
203
- n1 = side == :removed ? orphan : nil
204
- n2 = side == :removed ? nil : orphan
205
- differences <<
206
- Canon::Comparison::DiffNodeBuilder.build(
207
- node1: n1,
208
- node2: n2,
209
- diff1: Comparison::MISSING_NODE,
210
- diff2: Comparison::MISSING_NODE,
211
- dimension: :element_structure,
212
- )
213
- end
214
- end
215
-
216
- # Compare children of document fragments
191
+ # Compare children of document fragments using the shared
192
+ # +ChildRealignment+ walk. Structural orphans are emitted here
193
+ # (the HTML fragment path has no separate length-mismatch step).
217
194
  #
218
195
  # @param node1 [Nokogiri::DocumentFragment] First fragment
219
196
  # @param node2 [Nokogiri::DocumentFragment] Second fragment
@@ -230,29 +207,24 @@ module Canon
230
207
  children1 = XmlNodeComparison.filter_children(all_children1, opts)
231
208
  children2 = XmlNodeComparison.filter_children(all_children2, opts)
232
209
 
233
- if children1.length != children2.length
234
- # Record the length mismatch as a DiffNode so verbose mode
235
- # surfaces it. Without this, equivalent? wraps an empty
236
- # differences array and incorrectly reports the inputs as
237
- # equivalent.
238
- record_fragment_length_mismatch(node1, node2,
239
- children1, children2,
240
- differences)
241
- return Comparison::UNEQUAL_ELEMENTS
242
- elsif children1.empty?
243
- return Comparison::EQUIVALENT
244
- end
210
+ return Comparison::EQUIVALENT if children1.empty? && children2.empty?
245
211
 
246
- # Compare each pair of children
247
- children1.zip(children2).each do |child1, child2|
248
- child_result = XmlNodeComparison.compare_nodes(child1, child2,
249
- opts, child_opts,
250
- diff_children,
251
- differences)
252
- return child_result if child_result != Comparison::EQUIVALENT
212
+ emitter = html_diff_emitter(differences)
213
+ ChildRealignment.walk(children1, children2, emitter,
214
+ emit_structural_orphans: true) do |c1, c2|
215
+ XmlNodeComparison.compare_nodes(c1, c2, opts, child_opts,
216
+ diff_children, differences)
253
217
  end
218
+ end
254
219
 
255
- Comparison::EQUIVALENT
220
+ # Build a diff emitter for the HTML comparator path that
221
+ # creates DiffNode objects via DiffNodeBuilder.
222
+ def html_diff_emitter(differences)
223
+ proc do |n1, n2, d1, d2, dim|
224
+ differences << Canon::Comparison::DiffNodeBuilder.build(
225
+ node1: n1, node2: n2, diff1: d1, diff2: d2, dimension: dim,
226
+ )
227
+ end
256
228
  end
257
229
 
258
230
  # Perform semantic tree diff using SemanticTreeMatchStrategy
@@ -83,6 +83,32 @@ module Canon
83
83
  end
84
84
  end
85
85
 
86
+ # Classify +node+ as a noise node and return the diff dimension
87
+ # it should be reported under, or +nil+ if it is structural content.
88
+ #
89
+ # Noise nodes (whitespace-only text, comments) are realigned past
90
+ # during child comparison so that content nodes line up correctly
91
+ # across sides.
92
+ #
93
+ # @param node [Object] DOM node to classify
94
+ # @return [Symbol, nil] +:whitespace_adjacency+, +:comments+, or +nil+
95
+ def self.noise_dimension_for(node)
96
+ if whitespace_only_text?(node)
97
+ :whitespace_adjacency
98
+ elsif comment_node?(node)
99
+ :comments
100
+ end
101
+ end
102
+
103
+ # True when +node+ is a noise node (whitespace-only text or comment).
104
+ # Convenience wrapper around +noise_dimension_for+.
105
+ #
106
+ # @param node [Object] DOM node to check
107
+ # @return [Boolean]
108
+ def self.noise_node?(node)
109
+ !noise_dimension_for(node).nil?
110
+ end
111
+
86
112
  # Extract parse-time errors carried on a node or its owning document.
87
113
  # Returns an Array of Strings.
88
114
  def self.parse_errors(node)
@@ -98,6 +124,15 @@ module Canon
98
124
  []
99
125
  end
100
126
  end
127
+
128
+ # Return the parent node of +node+, or nil when +node+ is not a
129
+ # recognised DOM backend type or has no parent.
130
+ def self.parent_of(node)
131
+ case node
132
+ when Canon::Xml::Node, Nokogiri::XML::Node
133
+ node.parent
134
+ end
135
+ end
101
136
  end
102
137
  end
103
138
  end
@@ -98,7 +98,7 @@ module Canon
98
98
  end
99
99
 
100
100
  # If no matches and children exist, they're all different
101
- if matches.empty? && (!children1.empty? || !children2.empty?)
101
+ if matches.empty? && (!children1.empty? || !children2.empty?)
102
102
  comparator.add_difference(parent_node, parent_node,
103
103
  Comparison::MISSING_NODE, Comparison::MISSING_NODE,
104
104
  :text_content, opts, differences)
@@ -156,13 +156,12 @@ module Canon
156
156
  end
157
157
 
158
158
  # Use simple positional comparison for children, with
159
- # whitespace-asymmetry-aware re-alignment. When positional
160
- # +zip()+ would pair a whitespace-only text node on one side
161
- # against a content node on the other, treat the whitespace
162
- # node as a single-side gap: emit one +:whitespace_adjacency+
163
- # diff anchored at the whitespace node and advance only the
164
- # cursor carrying the whitespace, so the next iteration aligns
165
- # content against content. See lutaml/canon#137.
159
+ # noise-aware re-alignment via ChildRealignment. When the
160
+ # children arrays differ in length, a pre-walk step records
161
+ # structural orphans (or suppresses them when the length
162
+ # difference is fully explained by noise nodes). The shared
163
+ # walk then handles noise realignment and content comparison.
164
+ # See lutaml/canon#137 (whitespace) and #144 (comments).
166
165
  def use_positional_comparison(
167
166
  children1, children2, parent_node, comparator,
168
167
  opts, child_opts, diff_children, differences
@@ -173,11 +172,11 @@ module Canon
173
172
  unless children1.length == children2.length
174
173
  has_mismatch = true
175
174
 
176
- ws_asymmetric = asymmetric_whitespace_explains_length_diff?(
175
+ noise_asymmetric = asymmetric_noise_explains_length_diff?(
177
176
  children1, children2
178
177
  )
179
178
 
180
- if ws_asymmetric
179
+ if noise_asymmetric
181
180
  dimension = nil
182
181
  mismatched_children = []
183
182
  else
@@ -191,7 +190,7 @@ module Canon
191
190
  end
192
191
 
193
192
  if mismatched_children.empty?
194
- unless ws_asymmetric
193
+ unless noise_asymmetric
195
194
  comparator.add_difference(parent_node, parent_node,
196
195
  Comparison::MISSING_NODE, Comparison::MISSING_NODE,
197
196
  dimension, opts, differences)
@@ -215,75 +214,31 @@ module Canon
215
214
  end
216
215
 
217
216
  result = has_mismatch ? Comparison::UNEQUAL_ELEMENTS : Comparison::EQUIVALENT
218
- walk_result = walk_children_with_realignment(
219
- children1, children2, comparator,
220
- child_opts, diff_children, opts, differences
221
- )
217
+
218
+ emitter = xml_diff_emitter(comparator, opts, differences)
219
+ walk_result = ChildRealignment.walk(children1, children2,
220
+ emitter) do |c1, c2|
221
+ comparator.compare_nodes(c1, c2, child_opts, child_opts,
222
+ diff_children, differences)
223
+ end
222
224
  result = walk_result unless walk_result == Comparison::EQUIVALENT
223
225
  result
224
226
  end
225
227
 
226
- # Two-cursor walk over paired children that re-aligns past
227
- # asymmetric whitespace-only text nodes. Returns the worst
228
- # child result encountered.
229
- def walk_children_with_realignment(
230
- children1, children2, comparator,
231
- child_opts, diff_children, opts, differences
232
- )
233
- result = Comparison::EQUIVALENT
234
- i = 0
235
- j = 0
236
-
237
- while i < children1.length || j < children2.length
238
- c1 = children1[i]
239
- c2 = children2[j]
240
-
241
- if c1.nil?
242
- j += 1
243
- next
244
- elsif c2.nil?
245
- i += 1
246
- next
247
- end
248
-
249
- ws1 = NodeInspector.whitespace_only_text?(c1)
250
- ws2 = NodeInspector.whitespace_only_text?(c2)
251
-
252
- if ws1 && !ws2
253
- comparator.add_difference(c1, c2,
254
- Comparison::UNEQUAL_TEXT_CONTENTS,
255
- Comparison::UNEQUAL_TEXT_CONTENTS,
256
- :whitespace_adjacency, opts, differences)
257
- result = Comparison::UNEQUAL_TEXT_CONTENTS
258
- i += 1
259
- next
260
- elsif ws2 && !ws1
261
- comparator.add_difference(c1, c2,
262
- Comparison::UNEQUAL_TEXT_CONTENTS,
263
- Comparison::UNEQUAL_TEXT_CONTENTS,
264
- :whitespace_adjacency, opts, differences)
265
- result = Comparison::UNEQUAL_TEXT_CONTENTS
266
- j += 1
267
- next
268
- end
269
-
270
- child_result = comparator.compare_nodes(c1, c2,
271
- child_opts, child_opts,
272
- diff_children, differences)
273
- result = child_result unless child_result == Comparison::EQUIVALENT
274
- i += 1
275
- j += 1
228
+ # Build a diff emitter for the XML comparator path that
229
+ # delegates to comparator.add_difference.
230
+ def xml_diff_emitter(comparator, opts, differences)
231
+ proc do |n1, n2, d1, d2, dim|
232
+ comparator.add_difference(n1, n2, d1, d2, dim, opts, differences)
276
233
  end
277
-
278
- result
279
234
  end
280
235
 
281
- # True when the length difference between the two child arrays
282
- # is fully explained by asymmetric whitespace-only text nodes.
283
- def asymmetric_whitespace_explains_length_diff?(children1, children2)
284
- non_ws1 = children1.reject { |c| NodeInspector.whitespace_only_text?(c) }
285
- non_ws2 = children2.reject { |c| NodeInspector.whitespace_only_text?(c) }
286
- non_ws1.length == non_ws2.length
236
+ # True when the length difference is fully explained by
237
+ # asymmetric noise nodes (whitespace-only text and/or comments).
238
+ def asymmetric_noise_explains_length_diff?(children1, children2)
239
+ signal1 = children1.reject { |c| NodeInspector.noise_node?(c) }
240
+ signal2 = children2.reject { |c| NodeInspector.noise_node?(c) }
241
+ signal1.length == signal2.length
287
242
  end
288
243
 
289
244
  # Determine dimension for length mismatch
@@ -86,6 +86,14 @@ module Canon
86
86
  return "Attribute order changed: [#{attrs1.join(', ')}] → [#{attrs2.join(', ')}]"
87
87
  end
88
88
 
89
+ # For asymmetric comment nodes (#144), name the side that carries
90
+ # the comment and surface the comment text rather than reusing
91
+ # the generic "element structure mismatch" wording.
92
+ if dimension == :comments
93
+ comment_reason = build_comment_difference_reason(node1, node2)
94
+ return comment_reason if comment_reason
95
+ end
96
+
89
97
  # Default reason
90
98
  if diff1 == Canon::Comparison::MISSING_NODE && diff2 == Canon::Comparison::MISSING_NODE
91
99
  "element structure mismatch (children differ)"
@@ -217,6 +225,31 @@ module Canon
217
225
  "'#{truncate(text1)}' vs '#{truncate(text2)}'"
218
226
  end
219
227
 
228
+ # Build a Reason line for a +:comments+ diff. Returns +nil+ when
229
+ # neither side carries a comment (caller falls back to default).
230
+ def self.build_comment_difference_reason(node1, node2)
231
+ cm1 = node1 && Canon::Comparison::NodeInspector.comment_node?(node1)
232
+ cm2 = node2 && Canon::Comparison::NodeInspector.comment_node?(node2)
233
+
234
+ return nil unless cm1 || cm2
235
+
236
+ if cm1 && !cm2
237
+ "Comment present on EXPECTED only: " \
238
+ "<!--#{truncate(comment_text(node1))}-->"
239
+ elsif cm2 && !cm1
240
+ "Comment present on ACTUAL only: " \
241
+ "<!--#{truncate(comment_text(node2))}-->"
242
+ else
243
+ t1 = truncate(comment_text(node1))
244
+ t2 = truncate(comment_text(node2))
245
+ "Comment text differs: <!--#{t1}--> vs <!--#{t2}-->"
246
+ end
247
+ end
248
+
249
+ def self.comment_text(node)
250
+ Canon::Comparison::NodeInspector.text_content(node).to_s
251
+ end
252
+
220
253
  # Truncate text for display in reason messages
221
254
  #
222
255
  # @param text [String] Text to truncate
@@ -703,6 +703,10 @@ differences)
703
703
  return build_whitespace_adjacency_reason(node1, node2)
704
704
  end
705
705
 
706
+ if dimension == :comments
707
+ return build_comments_reason(node1, node2)
708
+ end
709
+
706
710
  # For attribute values differences, show the actual values
707
711
  if dimension == :attribute_values
708
712
  attrs1 = extract_attributes(node1)
@@ -873,12 +877,31 @@ differences)
873
877
  return build_text_diff_reason(text1, text2)
874
878
  end
875
879
 
876
- direction = whitespace_partner_direction(ws_node)
877
880
  ws_vis = visualize_whitespace(ws_text)
878
- content_vis = content_text ? visualize_whitespace(truncate_text(content_text)) : "(none)"
879
881
 
880
- "Whitespace #{direction} \"#{content_vis}\": " \
881
- "present on #{present_side} (\"#{ws_vis}\"), absent on #{absent_side}"
882
+ if content_text.nil? || content_text.strip.empty?
883
+ # Partner content extracts to "" / whitespace-only — naming it
884
+ # in the Reason ("Whitespace before \"\"") gives the reader
885
+ # nothing. Fall back to the parent element name so the
886
+ # diff carries structural context (issue #112's contract,
887
+ # extended from :text_content to :whitespace_adjacency).
888
+ parent_label = whitespace_adjacency_parent_label(ws_node)
889
+ "Whitespace inside #{parent_label}: " \
890
+ "present on #{present_side} (\"#{ws_vis}\"), absent on #{absent_side}"
891
+ else
892
+ direction = whitespace_partner_direction(ws_node)
893
+ content_vis = visualize_whitespace(truncate_text(content_text))
894
+ "Whitespace #{direction} \"#{content_vis}\": " \
895
+ "present on #{present_side} (\"#{ws_vis}\"), absent on #{absent_side}"
896
+ end
897
+ end
898
+
899
+ def whitespace_adjacency_parent_label(ws_node)
900
+ parent = NodeInspector.parent_of(ws_node)
901
+ return "(unknown parent)" unless parent
902
+
903
+ name = parent.name
904
+ name && !name.empty? ? "<#{name}>" : "(unknown parent)"
882
905
  end
883
906
 
884
907
  # Direction of the partner content relative to the whitespace node,
@@ -889,11 +912,8 @@ differences)
889
912
  # "adjacent to" as a degenerate fallback when neither neighbour
890
913
  # exists.
891
914
  def whitespace_partner_direction(ws_node)
892
- return "adjacent to" unless ws_node.is_a?(Canon::Xml::Node) ||
893
- ws_node.is_a?(Nokogiri::XML::Node)
894
-
895
- parent = ws_node.parent
896
- return "adjacent to" if parent.nil?
915
+ parent = NodeInspector.parent_of(ws_node)
916
+ return "adjacent to" unless parent
897
917
 
898
918
  siblings = parent.children
899
919
  idx = siblings.index(ws_node)
@@ -918,6 +938,30 @@ differences)
918
938
  false
919
939
  end
920
940
 
941
+ # Build a Reason line for a +:comments+ diff (#144).
942
+ # Names the side that carries the comment and surfaces the
943
+ # comment text.
944
+ def build_comments_reason(node1, node2)
945
+ cm1 = node1 && NodeInspector.comment_node?(node1)
946
+ cm2 = node2 && NodeInspector.comment_node?(node2)
947
+
948
+ if cm1 && !cm2
949
+ "Comment present on EXPECTED only: <!--#{truncate_text(comment_text(node1))}-->"
950
+ elsif cm2 && !cm1
951
+ "Comment present on ACTUAL only: <!--#{truncate_text(comment_text(node2))}-->"
952
+ elsif cm1 && cm2
953
+ t1 = truncate_text(comment_text(node1))
954
+ t2 = truncate_text(comment_text(node2))
955
+ "Comment text differs: <!--#{t1}--> vs <!--#{t2}-->"
956
+ else
957
+ "element structure mismatch (children differ)"
958
+ end
959
+ end
960
+
961
+ def comment_text(node)
962
+ NodeInspector.text_content(node).to_s
963
+ end
964
+
921
965
  # Check if text is only whitespace
922
966
  #
923
967
  # @param text [String] Text to check
@@ -104,6 +104,8 @@ module Canon
104
104
  # - diff_code: Type of difference
105
105
  #
106
106
  module Comparison
107
+ autoload :ChildRealignment, "canon/comparison/child_realignment"
108
+
107
109
  # Comparison result constants
108
110
  EQUIVALENT = 1
109
111
  MISSING_ATTRIBUTE = 2
@@ -74,7 +74,7 @@ module Canon
74
74
  end
75
75
 
76
76
  # :whitespace_adjacency is a report-only re-label of an
77
- # asymmetric whitespace mismatch emitted by ChildComparison's
77
+ # asymmetric whitespace mismatch emitted by ChildRealignment's
78
78
  # two-cursor walk. Equivalence behaviour is unchanged — the
79
79
  # underlying mismatch is normative regardless of match options.
80
80
  if diff_node.dimension == :whitespace_adjacency
@@ -83,6 +83,14 @@ module Canon
83
83
  return diff_node
84
84
  end
85
85
 
86
+ # :comments diffs from asymmetric comment nodes intentionally
87
+ # fall through to profile.normative_dimension? below. Unlike
88
+ # :whitespace_adjacency (always normative), the classification
89
+ # of comment diffs respects the :comments match option:
90
+ # :strict → normative, :ignore → informative. This is by
91
+ # design — callers can control whether asymmetric comments
92
+ # affect equivalence via the match profile.
93
+
86
94
  # THIRD: Determine if this dimension is normative based on CompareProfile
87
95
  # This respects the policy settings (strict/normalize/ignore)
88
96
  is_normative = profile.normative_dimension?(diff_node.dimension)
@@ -525,14 +525,34 @@ expand_difference: false)
525
525
  text1 = NodeUtils.get_node_text(node1).to_s
526
526
  text2 = NodeUtils.get_node_text(node2).to_s
527
527
 
528
- detail1 = ColorHelper.colorize(
529
- "\"#{TextUtils.visualize_whitespace(text1)}\"",
530
- :red, use_color
531
- )
532
- detail2 = ColorHelper.colorize(
533
- "\"#{TextUtils.visualize_whitespace(text2)}\"",
534
- :green, use_color
535
- )
528
+ if TextUtils.ambiguous_text_pair?(text1, text2) &&
529
+ (NodeUtils.parent_of(node1) || NodeUtils.parent_of(node2))
530
+ # Both sides extract to empty / whitespace-only strings —
531
+ # `""` / `""` tells the reader nothing. Fall back to a
532
+ # brief parent open-tag hint per #112's contract, but
533
+ # without dumping the full ancestor subtree (#125).
534
+ hint1 = NodeUtils.serialize_open_tag(NodeUtils.parent_of(node1))
535
+ hint2 = NodeUtils.serialize_open_tag(NodeUtils.parent_of(node2))
536
+ ws1 = TextUtils.visualize_whitespace(text1)
537
+ ws2 = TextUtils.visualize_whitespace(text2)
538
+ detail1 = ColorHelper.colorize(
539
+ "\"#{ws1}\" in #{hint1}",
540
+ :red, use_color
541
+ )
542
+ detail2 = ColorHelper.colorize(
543
+ "\"#{ws2}\" in #{hint2}",
544
+ :green, use_color
545
+ )
546
+ else
547
+ detail1 = ColorHelper.colorize(
548
+ "\"#{TextUtils.visualize_whitespace(text1)}\"",
549
+ :red, use_color
550
+ )
551
+ detail2 = ColorHelper.colorize(
552
+ "\"#{TextUtils.visualize_whitespace(text2)}\"",
553
+ :green, use_color
554
+ )
555
+ end
536
556
 
537
557
  reason = if diff.is_a?(Canon::Diff::DiffNode)
538
558
  diff.reason
@@ -29,6 +29,8 @@ module Canon
29
29
  #
30
30
  # See lutaml/canon#133, lutaml/canon#135.
31
31
  class Html
32
+ WHITESPACE_PRESERVING_ELEMENTS = %w[pre textarea script style].freeze
33
+
32
34
  def initialize(indent: 2, indent_type: "space", fixture_ready: false)
33
35
  @indent = indent.to_i
34
36
  @indent_type = indent_type
@@ -83,6 +85,7 @@ module Canon
83
85
  # suppresses the +<?xml ...?>+ prefix.
84
86
  def format_fixture_ready(html_string)
85
87
  doc = Nokogiri::HTML5(html_string)
88
+ strip_structural_whitespace!(doc)
86
89
  io = StringIO.new
87
90
  if @indent_type == "tab"
88
91
  doc.write_to(io, save_with: fixture_ready_save_options,
@@ -94,6 +97,37 @@ module Canon
94
97
  io.string
95
98
  end
96
99
 
100
+ # libxml's +FORMAT+ save flag does not insert indentation around
101
+ # the children of any element it sees as mixed content (any
102
+ # non-whitespace-only text node child). +Nokogiri::HTML5+ does
103
+ # not accept the +noblanks+ option that the XML parser uses to
104
+ # strip these inter-sibling text nodes pre-serialisation, so we
105
+ # do it manually here: drop whitespace-only text nodes whose
106
+ # parent is structural (no real text content) and not a
107
+ # whitespace-preserving element. Mixed-content runs like
108
+ # +<p>foo <em>bar</em> baz</p>+ are left alone.
109
+ def strip_structural_whitespace!(doc)
110
+ to_remove = []
111
+ doc.traverse do |node|
112
+ next unless node.text?
113
+ next unless node.content.strip.empty?
114
+
115
+ parent = node.parent
116
+ next if parent.nil?
117
+ next if WHITESPACE_PRESERVING_ELEMENTS.include?(parent.name)
118
+ next if parent_has_real_text?(parent)
119
+
120
+ to_remove << node
121
+ end
122
+ to_remove.each(&:remove)
123
+ end
124
+
125
+ def parent_has_real_text?(parent)
126
+ parent.children.any? do |c|
127
+ c.text? && !c.content.strip.empty?
128
+ end
129
+ end
130
+
97
131
  def fixture_ready_save_options
98
132
  Nokogiri::XML::Node::SaveOptions::FORMAT |
99
133
  Nokogiri::XML::Node::SaveOptions::AS_XHTML |
data/lib/canon/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Canon
4
- VERSION = "0.2.6"
4
+ VERSION = "0.2.8"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: canon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-03 00:00:00.000000000 Z
11
+ date: 2026-05-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: diff-lcs
@@ -167,6 +167,7 @@ files:
167
167
  - docs/features/diff-formatting/algorithm-specific-output.adoc
168
168
  - docs/features/diff-formatting/character-visualization.adoc
169
169
  - docs/features/diff-formatting/colors-and-symbols.adoc
170
+ - docs/features/diff-formatting/comment-asymmetry.adoc
170
171
  - docs/features/diff-formatting/context-and-grouping.adoc
171
172
  - docs/features/diff-formatting/display-filtering.adoc
172
173
  - docs/features/diff-formatting/display-preprocessing.adoc
@@ -221,6 +222,7 @@ files:
221
222
  - lib/canon/commands/format_command.rb
222
223
  - lib/canon/comparison.rb
223
224
  - lib/canon/comparison/base_comparator.rb
225
+ - lib/canon/comparison/child_realignment.rb
224
226
  - lib/canon/comparison/compare_profile.rb
225
227
  - lib/canon/comparison/comparison_result.rb
226
228
  - lib/canon/comparison/dimensions.rb