canon 0.1.23 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +155 -30
  3. data/docs/INDEX.adoc +4 -0
  4. data/docs/advanced/diff-classification.adoc +3 -2
  5. data/docs/advanced/verbose-mode-architecture.adoc +23 -0
  6. data/docs/features/configuration-profiles.adoc +288 -0
  7. data/docs/features/diff-formatting/character-visualization.adoc +153 -454
  8. data/docs/features/diff-formatting/display-filtering.adoc +44 -0
  9. data/docs/features/diff-formatting/display-preprocessing.adoc +656 -0
  10. data/docs/features/diff-formatting/index.adoc +47 -0
  11. data/docs/features/diff-formatting/pretty-diff-mode.adoc +154 -0
  12. data/docs/features/environment-configuration/override-system.adoc +10 -3
  13. data/docs/features/index.adoc +9 -0
  14. data/docs/features/match-options/html-policies.adoc +3 -0
  15. data/docs/features/match-options/index.adoc +32 -42
  16. data/docs/features/match-options/pretty-printed-fixtures.adoc +270 -0
  17. data/docs/guides/choosing-configuration.adoc +22 -0
  18. data/docs/reference/environment-variables.adoc +121 -1
  19. data/docs/reference/options-across-interfaces.adoc +182 -2
  20. data/lib/canon/cli.rb +20 -0
  21. data/lib/canon/commands/diff_command.rb +7 -2
  22. data/lib/canon/commands/format_command.rb +1 -1
  23. data/lib/canon/comparison/html_comparator.rb +29 -19
  24. data/lib/canon/comparison/html_compare_profile.rb +4 -4
  25. data/lib/canon/comparison/markup_comparator.rb +12 -3
  26. data/lib/canon/comparison/match_options/base_resolver.rb +29 -7
  27. data/lib/canon/comparison/match_options/json_resolver.rb +9 -0
  28. data/lib/canon/comparison/match_options/xml_resolver.rb +16 -2
  29. data/lib/canon/comparison/match_options/yaml_resolver.rb +10 -0
  30. data/lib/canon/comparison/match_options.rb +4 -1
  31. data/lib/canon/comparison/whitespace_sensitivity.rb +189 -137
  32. data/lib/canon/comparison/xml_comparator/child_comparison.rb +21 -4
  33. data/lib/canon/comparison/xml_comparator.rb +14 -12
  34. data/lib/canon/comparison/xml_node_comparison.rb +51 -6
  35. data/lib/canon/comparison.rb +52 -9
  36. data/lib/canon/config/env_schema.rb +32 -4
  37. data/lib/canon/config/override_resolver.rb +16 -3
  38. data/lib/canon/config/profile_loader.rb +135 -0
  39. data/lib/canon/config/profiles/metanorma.yml +74 -0
  40. data/lib/canon/config/profiles/metanorma_debug.yml +8 -0
  41. data/lib/canon/config/type_converter.rb +8 -0
  42. data/lib/canon/config.rb +469 -5
  43. data/lib/canon/diff/diff_classifier.rb +41 -11
  44. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +48 -17
  45. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +58 -0
  46. data/lib/canon/diff_formatter/diff_detail_formatter.rb +73 -17
  47. data/lib/canon/diff_formatter.rb +493 -36
  48. data/lib/canon/pretty_printer/xml_normalized.rb +395 -0
  49. data/lib/canon/rspec_matchers.rb +36 -0
  50. data/lib/canon/version.rb +1 -1
  51. data/lib/canon/xml/nodes/namespace_node.rb +4 -0
  52. data/lib/canon/xml/nodes/processing_instruction_node.rb +4 -0
  53. data/lib/canon/xml/nodes/root_node.rb +4 -0
  54. data/lib/canon/xml/nodes/text_node.rb +4 -0
  55. data/lib/tasks/performance_helpers.rb +2 -2
  56. metadata +24 -2
@@ -4,30 +4,74 @@ module Canon
4
4
  module Comparison
5
5
  # Whitespace sensitivity utilities for element-level control
6
6
  #
7
- # This module provides logic to determine whether whitespace should be
8
- # preserved during comparison based on:
9
- # - Format-specific defaults (HTML has built-in sensitive elements)
10
- # - User-configured whitelist (elements that care about whitespace)
11
- # - User-configured blacklist (elements that don't care about whitespace)
12
- # - xml:space attribute in the document itself
13
- # - respect_xml_space flag (whether to honor or override xml:space)
7
+ # This module provides three-way classification of whitespace behaviour
8
+ # at the element level:
9
+ #
10
+ # * **:preserve** — every whitespace character is significant. `" "` ≠ `"\n"`.
11
+ # Configured via +preserve_whitespace_elements+ (HTML default: pre, code,
12
+ # textarea, script, style; XML default: none).
13
+ #
14
+ # * **:collapse** — presence ≠ absence, but all whitespace forms are
15
+ # equivalent: `" "` == `"\n "`. Configured via +collapse_whitespace_elements+
16
+ # (HTML default: p, li, dt, dd, td, th, h1-h6, caption, figcaption, label,
17
+ # legend, summary, blockquote, address, button; XML default: none).
18
+ #
19
+ # * **:strip** — all whitespace is structural formatting noise and is
20
+ # dropped. Default for XML; HTML elements not in the above lists.
21
+ #
22
+ # Classification is **ancestor-based**: the closest matching ancestor
23
+ # determines the class. The strip blacklist (+strip_whitespace_elements+)
24
+ # overrides any sensitive ancestor.
14
25
  #
15
26
  # == Priority Order
16
27
  #
17
28
  # 1. respect_xml_space: false → User config only (ignore xml:space)
18
- # 2. User whitelist → Use whitelist (user explicitly declared)
19
- # 3. Format defaults → HTML: [:pre, :textarea, :script, :style], XML: []
20
- # 4. User blacklist → Remove from defaults/whitelist
21
- # 5. xml:space="preserve" → Element is sensitive
22
- # 6. xml:space="default" → Use steps 1-4
29
+ # 2. Ancestor walk (strip blacklist wins; then preserve; then collapse)
30
+ # 3. xml:space="preserve" → preserve
31
+ # 4. xml:space="default" → use configured behaviour
32
+ # 5. Format defaults (HTML: preserve/collapse for the listed elements, strip otherwise; XML: strip)
23
33
  #
24
34
  # == Usage
25
35
  #
36
+ # WhitespaceSensitivity.classify_element(element, match_opts)
37
+ # => :preserve, :collapse, or :strip
38
+ #
26
39
  # WhitespaceSensitivity.element_sensitive?(node, opts)
27
- # => true if whitespace should be preserved for this element
40
+ # => true if whitespace should be preserved (preserve or collapse)
28
41
  module WhitespaceSensitivity
42
+ # HTML mixed-content "leaf block" elements where whitespace presence
43
+ # matters but all forms are equivalent (CSS block whitespace collapsing).
44
+ HTML_COLLAPSE_ELEMENTS = %w[
45
+ p li dt dd td th caption figcaption label legend summary
46
+ h1 h2 h3 h4 h5 h6
47
+ blockquote address button
48
+ ].freeze
49
+
50
+ # HTML elements where every whitespace character is significant.
51
+ HTML_PRESERVE_ELEMENTS = %w[pre code textarea script style].freeze
52
+
29
53
  class << self
30
- # Check if an element is whitespace-sensitive based on configuration
54
+ # Classify the whitespace behaviour for an element using ancestor walk.
55
+ #
56
+ # @param element [Object] The element node to classify
57
+ # @param match_opts [Hash] Resolved match options
58
+ # @return [Symbol] :preserve, :collapse, or :strip
59
+ def classify_element(element, match_opts)
60
+ return :strip unless element
61
+ return :strip unless element.respond_to?(:name)
62
+
63
+ preserve_set = resolved_preserve_elements_set(match_opts)
64
+ collapse_set = resolved_collapse_elements_set(match_opts)
65
+ strip_set = resolved_strip_elements_set(match_opts)
66
+
67
+ # Ancestor walk: start at the element itself, walk up.
68
+ # Strip blacklist wins over any sensitive ancestor.
69
+ walk_ancestor_classification(element, preserve_set, collapse_set,
70
+ strip_set, match_opts)
71
+ end
72
+
73
+ # Check if an element is whitespace-sensitive based on configuration.
74
+ # Returns true for :preserve or :collapse classification.
31
75
  #
32
76
  # @param node [Object] The element node to check
33
77
  # @param opts [Hash] Comparison options containing match_opts
@@ -40,7 +84,7 @@ module Canon
40
84
  parent = node.parent
41
85
 
42
86
  # 1. Check if we should ignore xml:space (user override)
43
- if !respect_xml_space?(match_opts)
87
+ unless respect_xml_space?(match_opts)
44
88
  return user_config_sensitive?(parent, match_opts)
45
89
  end
46
90
 
@@ -50,8 +94,9 @@ module Canon
50
94
  # 3. Check xml:space="default" (use configured behavior)
51
95
  return false if xml_space_default?(parent)
52
96
 
53
- # 4. Use user configuration + format defaults
54
- configured_sensitive?(parent, match_opts)
97
+ # 4. Three-way classification (ancestor-based)
98
+ classification = classify_element(parent, match_opts)
99
+ %i[preserve collapse].include?(classification)
55
100
  end
56
101
 
57
102
  # Check if whitespace-only text node should be filtered
@@ -66,105 +111,93 @@ module Canon
66
111
  element_sensitive?(node, opts)
67
112
  end
68
113
 
69
- # Check if structural whitespace is preserved (not stripped) for an element.
114
+ # Return the whitespace class for a text node used during comparison.
115
+ #
116
+ # :preserve → preserve all whitespace character-by-character
117
+ # :collapse → preserve presence (normalize to single space)
118
+ # :strip → drop whitespace-only text nodes
70
119
  #
71
- # Uses sensitive_elements (whitelist) and insensitive_elements (blacklist)
72
- # from match_opts. Blacklist takes precedence over whitelist.
73
- # Format defaults apply when neither is configured.
120
+ # @param node [Object] Text node to classify
121
+ # @param opts [Hash] Comparison options containing match_opts
122
+ # @return [Symbol] :preserve, :collapse, or :strip
123
+ def classify_text_node(node, opts)
124
+ match_opts = opts[:match_opts]
125
+ return :strip unless match_opts
126
+ return :strip unless text_node_parent?(node)
127
+
128
+ parent = node.parent
129
+
130
+ unless respect_xml_space?(match_opts)
131
+ return user_config_sensitive?(parent,
132
+ match_opts) ? :preserve : :strip
133
+ end
134
+
135
+ return :preserve if xml_space_preserve?(parent)
136
+ return :strip if xml_space_default?(parent)
137
+
138
+ classify_element(parent, match_opts)
139
+ end
140
+
141
+ # Check if structural whitespace is preserved (not stripped) for an element.
74
142
  #
75
- # No inheritance from ancestors — checks only the immediate parent element name.
143
+ # Uses the same priority chain as element_sensitive? / classify_text_node:
144
+ # 1. xml:space="preserve" → always preserved
145
+ # 2. xml:space="default" → use configured behaviour
146
+ # 3. ancestor-walk classification (strip = dropped)
76
147
  #
77
148
  # @param element [Object] Element node to check
78
149
  # @param match_opts [Hash] Resolved match options
79
150
  # @return [Boolean] true if whitespace is preserved (not stripped)
80
151
  def whitespace_preserved?(element, match_opts)
81
- return false unless element
82
- return false unless element.respond_to?(:name)
83
-
84
- elem_name = element.name.to_s
85
-
86
- # Blacklist: always strip (highest priority)
87
- insensitive_raw = match_opts[:insensitive_elements]
88
- insensitive_raw ||= match_opts[:whitespace_insensitive_elements]
89
- insensitive = (insensitive_raw || []).map(&:to_s)
90
- return false if insensitive.include?(elem_name)
91
-
92
- # Check if we should ignore xml:space (user override)
93
152
  if respect_xml_space?(match_opts)
94
- # Check xml:space="preserve" (document declaration)
95
- return true if xml_space_preserve?(element)
96
-
97
- # Check xml:space="default" (use configured behavior)
153
+ return true if xml_space_preserve?(element)
98
154
  return false if xml_space_default?(element)
99
155
  end
100
156
 
101
- # Whitelist: preserve whitespace
102
- sensitive = resolved_sensitive_elements(match_opts)
103
- return true if sensitive.include?(elem_name)
104
-
105
- # Default: preserve for HTML, strip for XML
106
- format = match_opts[:format] || :xml
107
- case format
108
- when :html, :html4, :html5
109
- true
110
- else
111
- false
112
- end
157
+ classification = classify_element(element, match_opts)
158
+ %i[preserve collapse].include?(classification)
113
159
  end
114
160
 
115
- # Get resolved list of whitespace-sensitive element names (strings).
161
+ # Get resolved list of preserve whitespace element names (strings).
116
162
  #
117
- # Combines format defaults + user whitelist, minus user blacklist.
118
- # Supports both short names (sensitive_elements) and long names
119
- # (whitespace_sensitive_elements) for backward compatibility.
163
+ # @param match_opts [Hash] Resolved match options
164
+ # @return [Array<String>] Preserve element names
165
+ def resolved_preserve_elements(match_opts)
166
+ resolved_preserve_elements_set(match_opts).to_a
167
+ end
168
+
169
+ # Get resolved list of collapse whitespace element names (strings).
120
170
  #
121
171
  # @param match_opts [Hash] Resolved match options
122
- # @return [Array<String>] Sensitive element names
123
- def resolved_sensitive_elements(match_opts)
124
- sensitive = []
172
+ # @return [Array<String>] Collapse element names
173
+ def resolved_collapse_elements(match_opts)
174
+ resolved_collapse_elements_set(match_opts).to_a
175
+ end
125
176
 
126
- # 1. Format defaults
177
+ # Get format-specific default preserve (exact-whitespace) elements.
178
+ # This is the SINGLE SOURCE OF TRUTH for default preserve-whitespace elements.
179
+ #
180
+ # @param match_opts [Hash] Resolved match options
181
+ # @return [Array<Symbol>] Default preserve element names
182
+ def format_default_preserve_elements(match_opts)
127
183
  format = match_opts[:format] || :xml
128
184
  case format
129
185
  when :html, :html4, :html5
130
- sensitive += %w[pre code textarea script style]
131
- end
132
-
133
- # 2. User whitelist (additive to format defaults)
134
- whitelist = match_opts[:sensitive_elements]
135
- whitelist ||= match_opts[:whitespace_sensitive_elements]
136
- if whitelist
137
- sensitive += whitelist.map(&:to_s)
138
- end
139
-
140
- # 3. User blacklist removes from combined set
141
- blacklist_raw = match_opts[:insensitive_elements]
142
- blacklist_raw ||= match_opts[:whitespace_insensitive_elements]
143
- if blacklist_raw
144
- blacklist = blacklist_raw.to_set(&:to_s)
145
- sensitive.reject! { |e| blacklist.include?(e) }
186
+ HTML_PRESERVE_ELEMENTS.map(&:to_sym).freeze
187
+ else
188
+ [].freeze
146
189
  end
147
-
148
- sensitive.uniq
149
190
  end
150
191
 
151
- # Get format-specific default sensitive elements
152
- #
153
- # This is the SINGLE SOURCE OF TRUTH for default whitespace-sensitive
154
- # elements. All other code should use this method to get the list.
192
+ # Get format-specific default collapse elements.
155
193
  #
156
194
  # @param match_opts [Hash] Resolved match options
157
- # @return [Array<Symbol>] Default sensitive element names
158
- def format_default_sensitive_elements(match_opts)
195
+ # @return [Array<Symbol>] Default collapse element names
196
+ def format_default_collapse_elements(match_opts)
159
197
  format = match_opts[:format] || :xml
160
-
161
198
  case format
162
199
  when :html, :html4, :html5
163
- # HTML specification: these elements preserve whitespace
164
- %i[pre code textarea script style].freeze
165
- when :xml
166
- # XML has no default sensitive elements - purely user-controlled
167
- [].freeze
200
+ HTML_COLLAPSE_ELEMENTS.map(&:to_sym).freeze
168
201
  else
169
202
  [].freeze
170
203
  end
@@ -172,23 +205,80 @@ module Canon
172
205
 
173
206
  # Check if an element is in the default sensitive list for its format
174
207
  #
175
- # Convenience method for checking element sensitivity without building
176
- # the full list first.
177
- #
178
208
  # @param element_name [String, Symbol] The element name to check
179
209
  # @param match_opts [Hash] Resolved match options
180
210
  # @return [Boolean] true if element is in default sensitive list
181
211
  def default_sensitive_element?(element_name, match_opts)
182
- format_default_sensitive_elements(match_opts)
212
+ format_default_preserve_elements(match_opts)
183
213
  .include?(element_name.to_sym)
184
214
  end
185
215
 
186
216
  private
187
217
 
218
+ # Build the Set of preserve whitespace element names (strings).
219
+ def resolved_preserve_elements_set(match_opts)
220
+ set = Set.new(format_default_preserve_elements(match_opts).map(&:to_s))
221
+
222
+ if match_opts[:preserve_whitespace_elements]
223
+ set |= match_opts[:preserve_whitespace_elements].map(&:to_s)
224
+ end
225
+
226
+ # Remove blacklisted elements
227
+ strip_set = resolved_strip_elements_set(match_opts)
228
+ set.reject { |e| strip_set.include?(e) }.to_set
229
+ end
230
+
231
+ # Build the Set of collapse whitespace element names (strings).
232
+ def resolved_collapse_elements_set(match_opts)
233
+ set = Set.new(format_default_collapse_elements(match_opts).map(&:to_s))
234
+
235
+ if match_opts[:collapse_whitespace_elements]
236
+ set |= match_opts[:collapse_whitespace_elements].map(&:to_s)
237
+ end
238
+
239
+ # Remove blacklisted elements
240
+ strip_set = resolved_strip_elements_set(match_opts)
241
+ set.reject { |e| strip_set.include?(e) }.to_set
242
+ end
243
+
244
+ # Build the Set of strip (blacklist) element names (strings).
245
+ def resolved_strip_elements_set(match_opts)
246
+ raw = match_opts[:strip_whitespace_elements]
247
+ Set.new((raw || []).map(&:to_s))
248
+ end
249
+
250
+ # Perform the ancestor walk classification.
251
+ # The element itself is checked first, then its ancestors.
252
+ # Strip blacklist wins over any sensitive ancestor.
253
+ def walk_ancestor_classification(element, preserve_set, collapse_set,
254
+ strip_set, _match_opts)
255
+ current = element
256
+ while current.respond_to?(:name)
257
+ name = current.name.to_s
258
+
259
+ return :strip if strip_set.include?(name)
260
+ return :preserve if preserve_set.include?(name)
261
+ return :collapse if collapse_set.include?(name)
262
+
263
+ # Walk up
264
+ break unless current.respond_to?(:parent)
265
+
266
+ parent = current.parent
267
+ break if parent.nil?
268
+ break unless parent.respond_to?(:name)
269
+ break if parent == current # guard infinite loop
270
+
271
+ current = parent
272
+ end
273
+
274
+ # No matching ancestor — whitespace sensitivity is always opt-in.
275
+ # Elements not in any list are strip regardless of format.
276
+ # (HTML_COLLAPSE_ELEMENTS are already merged into the collapse_set
277
+ # by resolved_collapse_elements_set, so they are found during the walk.)
278
+ :strip
279
+ end
280
+
188
281
  # Check if we should respect xml:space attribute
189
- #
190
- # @param match_opts [Hash] Resolved match options
191
- # @return [Boolean] true if xml:space should be respected
192
282
  def respect_xml_space?(match_opts)
193
283
  if match_opts.key?(:respect_xml_space)
194
284
  match_opts[:respect_xml_space]
@@ -198,13 +288,8 @@ module Canon
198
288
  end
199
289
 
200
290
  # Check if xml:space="preserve" is set
201
- #
202
- # @param element [Object] The element to check
203
- # @return [Boolean] true if xml:space="preserve"
204
291
  def xml_space_preserve?(element)
205
292
  if element.is_a?(Canon::Xml::Nodes::ElementNode)
206
- # Check attribute_nodes for xml:space attribute
207
- # xml:space is stored with name="space" and namespace_uri="http://www.w3.org/XML/1998/namespace"
208
293
  element.attribute_nodes.any? do |attr|
209
294
  attr.name == "space" &&
210
295
  attr.namespace_uri == "http://www.w3.org/XML/1998/namespace" &&
@@ -218,13 +303,8 @@ module Canon
218
303
  end
219
304
 
220
305
  # Check if xml:space="default" is set
221
- #
222
- # @param element [Object] The element to check
223
- # @return [Boolean] true if xml:space="default"
224
306
  def xml_space_default?(element)
225
307
  if element.is_a?(Canon::Xml::Nodes::ElementNode)
226
- # Check attribute_nodes for xml:space attribute
227
- # xml:space is stored with name="space" and namespace_uri="http://www.w3.org/XML/1998/namespace"
228
308
  element.attribute_nodes.any? do |attr|
229
309
  attr.name == "space" &&
230
310
  attr.namespace_uri == "http://www.w3.org/XML/1998/namespace" &&
@@ -237,43 +317,15 @@ module Canon
237
317
  end
238
318
  end
239
319
 
240
- # Check sensitivity based on user configuration
241
- #
242
- # @param element [Object] The element to check
243
- # @param match_opts [Hash] Resolved match options
244
- # @return [Boolean] true if element is in whitelist
320
+ # Check sensitivity based on user configuration (binary, no ancestor)
245
321
  def user_config_sensitive?(element, match_opts)
246
- return false unless match_opts[:whitespace_sensitive_elements]
322
+ list = match_opts[:preserve_whitespace_elements]
323
+ return false unless list
247
324
 
248
- match_opts[:whitespace_sensitive_elements].include?(element.name.to_sym)
249
- end
250
-
251
- # Check sensitivity based on user config + format defaults
252
- #
253
- # @param element [Object] The element to check
254
- # @param match_opts [Hash] Resolved match options
255
- # @return [Boolean] true if element should be sensitive
256
- def configured_sensitive?(element, match_opts)
257
- # Start with format defaults
258
- sensitive = format_default_sensitive_elements(match_opts).to_set
259
-
260
- # Apply whitelist (adds to defaults)
261
- if match_opts[:whitespace_sensitive_elements]
262
- sensitive |= match_opts[:whitespace_sensitive_elements]
263
- end
264
-
265
- # Apply blacklist (removes from everything)
266
- if match_opts[:whitespace_insensitive_elements]
267
- sensitive -= match_opts[:whitespace_insensitive_elements]
268
- end
269
-
270
- sensitive.include?(element.name.to_sym)
325
+ list.map(&:to_s).include?(element.name.to_s)
271
326
  end
272
327
 
273
328
  # Check if node has a parent that's an element (not document root)
274
- #
275
- # @param node [Object] The node to check
276
- # @return [Boolean] true if node has an element parent
277
329
  def text_node_parent?(node)
278
330
  return false unless node.respond_to?(:parent)
279
331
  return false unless node.parent
@@ -28,8 +28,17 @@ module Canon
28
28
  # @return [Integer] Comparison result code
29
29
  def compare(node1, node2, comparator, opts, child_opts,
30
30
  diff_children, differences)
31
- children1 = comparator.send(:filter_children, node1.children, opts)
32
- children2 = comparator.send(:filter_children, node2.children, opts)
31
+ # Apply side-specific pretty-print heuristic when either flag is set:
32
+ # pretty_printed_expected → drop \n-starting whitespace nodes from node1
33
+ # pretty_printed_received → drop \n-starting whitespace nodes from node2
34
+ # The ephemeral _pretty_print_side_active flag is consumed by node_excluded?
35
+ # and must NOT be forwarded into recursive compare_nodes calls.
36
+ require_relative "../xml_node_comparison"
37
+ opts1 = XmlNodeComparison.opts_for_side(opts, :expected)
38
+ opts2 = XmlNodeComparison.opts_for_side(opts, :received)
39
+
40
+ children1 = comparator.send(:filter_children, node1.children, opts1)
41
+ children2 = comparator.send(:filter_children, node2.children, opts2)
33
42
 
34
43
  # Quick check: if both have no children, they're equivalent
35
44
  return Comparison::EQUIVALENT if children1.empty? && children2.empty?
@@ -241,7 +250,12 @@ diff_children, differences)
241
250
  end
242
251
 
243
252
  smaller_set_names = smaller_set.filter_map do |c|
244
- c.respond_to?(:name) ? c.name : nil
253
+ next nil unless c.respond_to?(:name)
254
+ # Exclude generic node-type names (e.g. "#text") that are
255
+ # shared by all text nodes and cannot be used for matching.
256
+ next nil if c.name.start_with?("#")
257
+
258
+ c.name
245
259
  end
246
260
 
247
261
  new_larger_set = []
@@ -252,9 +266,12 @@ diff_children, differences)
252
266
  # consider it a mismatch
253
267
  mismatch_children << larger_set[i]
254
268
  elsif larger_set[i].respond_to?(:name) &&
269
+ !larger_set[i].name.start_with?("#") &&
255
270
  !smaller_set_names.include?(larger_set[i].name)
256
271
  # If the name of the node is not found in the smaller set,
257
- # consider it a mismatch
272
+ # consider it a mismatch. Skip nodes with generic names
273
+ # starting with "#" (e.g. "#text") since those names are
274
+ # shared by all nodes of that type and not useful for matching.
258
275
  mismatch_children << larger_set[i]
259
276
  else
260
277
  new_larger_set << larger_set[i]
@@ -417,20 +417,22 @@ module Canon
417
417
 
418
418
  # Check if whitespace should be preserved strictly for these text nodes
419
419
  # This applies to HTML elements like pre, code, textarea, script, style
420
- # and elements with xml:space="preserve" or in user-configured whitelist
420
+ # and elements with xml:space="preserve" or in user-configured preserve list.
421
+ #
422
+ # IMPORTANT: This returns true ONLY for :preserve classification.
423
+ # For :collapse classification, whitespace differences ARE acceptable
424
+ # (they are detected as formatting-only by DiffClassifier).
421
425
  def should_preserve_whitespace_strictly?(n1, n2, opts)
422
- # Use WhitespaceSensitivity module to check if element is sensitive
423
- # Check both n1 and n2 - if either is in a sensitive element, preserve strictly
424
- if n1.respond_to?(:parent)
425
- sensitivity_opts = { match_opts: opts[:match_opts] }
426
- return true if WhitespaceSensitivity.element_sensitive?(n1,
427
- sensitivity_opts)
428
- end
426
+ # Check both n1 and n2 - if either is in a preserve whitespace element, preserve strictly
427
+ [n1, n2].each do |node|
428
+ next unless node.respond_to?(:parent)
429
+
430
+ parent = node.parent
431
+ next unless parent
429
432
 
430
- if n2.respond_to?(:parent)
431
- sensitivity_opts = { match_opts: opts[:match_opts] }
432
- return true if WhitespaceSensitivity.element_sensitive?(n2,
433
- sensitivity_opts)
433
+ classification = WhitespaceSensitivity.classify_element(parent,
434
+ opts[:match_opts])
435
+ return true if classification == :preserve
434
436
  end
435
437
 
436
438
  false
@@ -90,6 +90,39 @@ differences)
90
90
  end
91
91
  end
92
92
 
93
+ # Build a side-specific opts copy that activates the pretty-print
94
+ # structural-whitespace heuristic for the given side.
95
+ #
96
+ # When +pretty_printed_expected+ (side :expected) or
97
+ # +pretty_printed_received+ (side :received) is truthy in match_opts,
98
+ # returns a shallow copy of +opts+ with an ephemeral
99
+ # +_pretty_print_side_active: true+ flag merged into +:match_opts+.
100
+ # Otherwise returns +opts+ unchanged (no allocation overhead).
101
+ #
102
+ # The flag is consumed by +node_excluded?+ to drop whitespace-only text
103
+ # nodes that start with "\n" in +:normalize+ whitespace elements.
104
+ # It is intentionally NOT propagated to recursive +compare_nodes+ calls —
105
+ # each level of +ChildComparison.compare+ re-evaluates it from the
106
+ # original +pretty_printed_*+ flags.
107
+ #
108
+ # @param opts [Hash] Full comparison options hash
109
+ # @param side [Symbol] :expected or :received
110
+ # @return [Hash] opts copy with ephemeral flag, or opts itself
111
+ def self.opts_for_side(opts, side)
112
+ match_opts = opts[:match_opts]
113
+ return opts unless match_opts
114
+
115
+ active = case side
116
+ when :expected then match_opts[:pretty_printed_expected]
117
+ when :received then match_opts[:pretty_printed_received]
118
+ else false
119
+ end
120
+
121
+ return opts unless active
122
+
123
+ opts.merge(match_opts: match_opts.merge(_pretty_print_side_active: true))
124
+ end
125
+
93
126
  # Compare document fragments by comparing their children
94
127
  #
95
128
  # @param node1 [Nokogiri::XML::DocumentFragment] First fragment
@@ -104,9 +137,12 @@ differences)
104
137
  childrenode1 = node1.children.to_a
105
138
  childrenode2 = node2.children.to_a
106
139
 
107
- # Filter children before comparison to handle ignored nodes (like comments with :ignore)
108
- children1 = filter_children(childrenode1, opts)
109
- children2 = filter_children(childrenode2, opts)
140
+ # Filter children before comparison to handle ignored nodes (like comments with :ignore).
141
+ # Apply side-specific pretty-print heuristic when the relevant flag is active.
142
+ children1 = filter_children(childrenode1,
143
+ opts_for_side(opts, :expected))
144
+ children2 = filter_children(childrenode2,
145
+ opts_for_side(opts, :received))
110
146
 
111
147
  if children1.length != children2.length
112
148
  add_difference(node1, node2, Comparison::UNEQUAL_ELEMENTS,
@@ -191,8 +227,8 @@ diff_children, differences)
191
227
  end
192
228
 
193
229
  # Strip whitespace-only text nodes based on parent element configuration.
194
- # Use sensitive_elements / insensitive_elements to control.
195
- # Blacklist (insensitive) > whitelist (sensitive) > format defaults.
230
+ # Use preserve_whitespace_elements / strip_whitespace_elements to control.
231
+ # Blacklist (strip) > preserve > collapse > format defaults.
196
232
  return false unless text_node?(node) && node.parent
197
233
  return false unless MatchOptions.normalize_text(node_text(node)).empty?
198
234
 
@@ -200,7 +236,16 @@ diff_children, differences)
200
236
  node.parent, match_opts
201
237
  )
202
238
 
203
- false
239
+ # When the pretty-print-side flag is active (set by opts_for_side in
240
+ # ChildComparison.compare), drop whitespace-only text nodes that start
241
+ # with "\n" inside :collapse elements — they are structural indentation
242
+ # from the pretty-printer, not content. Space-only nodes (no "\n") are
243
+ # real inline content and are kept for normalised comparison.
244
+ # :preserve elements are always left unchanged.
245
+ if match_opts[:_pretty_print_side_active]
246
+ ws_class = WhitespaceSensitivity.classify_text_node(node, opts)
247
+ return true if ws_class == :collapse && node_text(node).start_with?("\n")
248
+ end
204
249
 
205
250
  false
206
251
  end