canon 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +31 -149
  3. data/README.adoc +9 -0
  4. data/docs/advanced/semantic-diff-report.adoc +96 -0
  5. data/docs/features/configuration-profiles.adoc +4 -2
  6. data/docs/features/diff-formatting/index.adoc +3 -0
  7. data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
  8. data/docs/features/match-options/html-policies.adoc +2 -0
  9. data/docs/features/match-options/index.adoc +40 -0
  10. data/docs/guides/choosing-configuration.adoc +12 -1
  11. data/docs/reference/cli-options.adoc +3 -0
  12. data/docs/reference/environment-variables.adoc +3 -1
  13. data/docs/reference/options-across-interfaces.adoc +7 -1
  14. data/docs/understanding/formats/html.adoc +9 -2
  15. data/lib/canon/cli.rb +4 -0
  16. data/lib/canon/commands/diff_command.rb +1 -0
  17. data/lib/canon/comparison/comparison_result.rb +95 -2
  18. data/lib/canon/comparison/html_comparator.rb +96 -11
  19. data/lib/canon/comparison/markup_comparator.rb +68 -71
  20. data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
  21. data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
  22. data/lib/canon/comparison/match_options.rb +23 -2
  23. data/lib/canon/comparison/node_inspector.rb +103 -0
  24. data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
  25. data/lib/canon/comparison/xml_comparator/child_comparison.rb +133 -55
  26. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
  27. data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
  28. data/lib/canon/comparison/xml_comparator.rb +174 -7
  29. data/lib/canon/comparison/xml_node_comparison.rb +48 -66
  30. data/lib/canon/comparison.rb +143 -22
  31. data/lib/canon/config/env_schema.rb +2 -1
  32. data/lib/canon/config/profiles/metanorma.yml +3 -0
  33. data/lib/canon/config.rb +51 -5
  34. data/lib/canon/diff/diff_classifier.rb +55 -41
  35. data/lib/canon/diff/diff_line_builder.rb +9 -8
  36. data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
  37. data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
  38. data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
  39. data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
  40. data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
  41. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +184 -26
  42. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +92 -4
  43. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
  44. data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
  45. data/lib/canon/diff_formatter.rb +128 -175
  46. data/lib/canon/html/data_model.rb +10 -4
  47. data/lib/canon/pretty_printer/html.rb +76 -14
  48. data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
  49. data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
  50. data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
  51. data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
  52. data/lib/canon/version.rb +1 -1
  53. data/lib/canon/xml/c14n.rb +59 -5
  54. data/lib/canon/xml/data_model.rb +13 -1
  55. data/lib/canon/xml/element_matcher.rb +3 -0
  56. data/lib/canon/xml/node.rb +23 -1
  57. data/lib/canon/xml/nodes/comment_node.rb +4 -0
  58. data/lib/canon/xml/nodes/element_node.rb +4 -0
  59. data/lib/canon/xml/nodes/text_node.rb +4 -0
  60. data/lib/canon/xml/sax_builder.rb +29 -2
  61. data/lib/canon/xml/xpath_engine.rb +238 -0
  62. metadata +9 -2
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "../comparison" # Load base module with constants
4
+ require_relative "node_inspector"
4
5
  require_relative "../diff/diff_node"
5
6
  require_relative "../diff/path_builder"
6
7
 
@@ -87,23 +88,20 @@ module Canon
87
88
  return nil if node.nil?
88
89
 
89
90
  # Canon::Xml::Node types
90
- if node.is_a?(Canon::Xml::Nodes::RootNode)
91
+ case node
92
+ when Canon::Xml::Nodes::RootNode
91
93
  # Serialize all children of root
92
94
  node.children.map { |child| serialize_node(child) }.join
93
- elsif node.is_a?(Canon::Xml::Nodes::ElementNode)
95
+ when Canon::Xml::Nodes::ElementNode
94
96
  serialize_element_node(node)
95
- elsif node.is_a?(Canon::Xml::Nodes::TextNode)
97
+ when Canon::Xml::Nodes::TextNode
96
98
  # Use original text (with entity references) if available,
97
99
  # otherwise fall back to value (decoded text)
98
100
  node.original || node.value
99
- elsif node.is_a?(Canon::Xml::Nodes::CommentNode)
101
+ when Canon::Xml::Nodes::CommentNode
100
102
  "<!--#{node.value}-->"
101
- elsif node.is_a?(Canon::Xml::Nodes::ProcessingInstructionNode)
103
+ when Canon::Xml::Nodes::ProcessingInstructionNode
102
104
  "<?#{node.target} #{node.data}?>"
103
- elsif node.respond_to?(:to_xml)
104
- node.to_xml
105
- elsif node.respond_to?(:to_html)
106
- node.to_html
107
105
  else
108
106
  node.to_s
109
107
  end
@@ -121,8 +119,8 @@ module Canon
121
119
  node.attribute_nodes.to_h do |attr|
122
120
  [attr.name, attr.value]
123
121
  end
124
- # Nokogiri nodes
125
- elsif node.respond_to?(:attributes)
122
+ # Nokogiri elements
123
+ elsif node.is_a?(Nokogiri::XML::Element)
126
124
  node.attributes.to_h do |_, attr|
127
125
  [attr.name, attr.value]
128
126
  end
@@ -182,6 +180,25 @@ module Canon
182
180
  return false unless text_node?(node) && node.parent
183
181
  return false unless MatchOptions.normalize_text(node_text(node)).empty?
184
182
 
183
+ # NBSP (U+00A0) is never insignificant whitespace —
184
+ # it always renders as a visible non-breaking space.
185
+ # For HTML: always preserve NBSP nodes.
186
+ # For XML with whitespace_type: :strict: preserve NBSP nodes so
187
+ # different Unicode whitespace types remain distinguishable.
188
+ format = opts[:format] || match_opts[:format]
189
+ whitespace_type = match_opts[:whitespace_type] || :strict
190
+ if (%i[html html4
191
+ html5].include?(format) || whitespace_type == :strict) && WhitespaceSensitivity.contains_nbsp?(node_text(node))
192
+ return false
193
+ end
194
+
195
+ if %i[html html4
196
+ html5].include?(format) && WhitespaceSensitivity.inline_whitespace_significant?(node)
197
+ # Whitespace between inline element siblings is semantically
198
+ # significant (renders as a visible gap) and must not be stripped.
199
+ return false
200
+ end
201
+
185
202
  return true unless WhitespaceSensitivity.whitespace_preserved?(
186
203
  node.parent, match_opts
187
204
  )
@@ -208,8 +225,8 @@ module Canon
208
225
  def same_node_type?(node1, node2)
209
226
  return false if node1.class != node2.class
210
227
 
211
- # For Nokogiri/Canon::Xml nodes, check node type
212
- if node1.respond_to?(:node_type) && node2.respond_to?(:node_type)
228
+ case node1
229
+ when Canon::Xml::Node, Nokogiri::XML::Node
213
230
  node1.node_type == node2.node_type
214
231
  else
215
232
  true
@@ -226,20 +243,7 @@ module Canon
226
243
  # @param node [Object] Node to check
227
244
  # @return [Boolean] true if node is a comment
228
245
  def comment_node?(node)
229
- return true if node.respond_to?(:comment?) && node.comment?
230
- return true if node.respond_to?(:node_type) && node.node_type == :comment
231
-
232
- # HTML comments are parsed as TEXT nodes by Nokogiri
233
- # Check if this is a text node with HTML comment content
234
- if text_node?(node)
235
- text = node_text(node)
236
- # Strip whitespace and backslashes for comparison
237
- # Nokogiri escapes HTML comments as "<\\!-- comment -->" in full documents
238
- text_stripped = text.to_s.strip.gsub("\\", "")
239
- return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
240
- end
241
-
242
- false
246
+ NodeInspector.comment_node?(node)
243
247
  end
244
248
 
245
249
  # Check if a node is a text node
@@ -247,9 +251,7 @@ module Canon
247
251
  # @param node [Object] Node to check
248
252
  # @return [Boolean] true if node is a text node
249
253
  def text_node?(node)
250
- (node.respond_to?(:text?) && node.text? &&
251
- !node.respond_to?(:element?)) ||
252
- (node.respond_to?(:node_type) && node.node_type == :text)
254
+ NodeInspector.text_node?(node)
253
255
  end
254
256
 
255
257
  # Get text content from a node
@@ -257,15 +259,7 @@ module Canon
257
259
  # @param node [Object] Node to get text from
258
260
  # @return [String] Text content
259
261
  def node_text(node)
260
- # Canon::Xml::Node TextNode uses .value
261
- if node.respond_to?(:value)
262
- node.value.to_s
263
- # Nokogiri nodes use .content
264
- elsif node.respond_to?(:content)
265
- node.content.to_s
266
- else
267
- node.to_s
268
- end
262
+ NodeInspector.text_content(node)
269
263
  end
270
264
 
271
265
  # Check if difference between two texts is only whitespace
@@ -309,7 +303,7 @@ module Canon
309
303
  if diff1 == Canon::Comparison::MISSING_NODE && diff2 == Canon::Comparison::MISSING_NODE
310
304
  "element structure mismatch (children differ)"
311
305
  else
312
- "#{diff1} vs #{diff2}"
306
+ Canon::Comparison.code_pair_label(diff1, diff2)
313
307
  end
314
308
  end
315
309
 
@@ -352,26 +346,18 @@ module Canon
352
346
  def extract_text_content_from_node(node)
353
347
  return nil if node.nil?
354
348
 
355
- # For Canon::Xml::Nodes::TextNode
356
- return node.value if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)
357
-
358
- # For XML/HTML nodes with text_content method
359
- return node.text_content if node.respond_to?(:text_content)
360
-
361
- # For nodes with text method
362
- return node.text if node.respond_to?(:text)
363
-
364
- # For nodes with content method (Moxml::Text)
365
- return node.content if node.respond_to?(:content)
366
-
367
- # For nodes with value method (other types)
368
- return node.value if node.respond_to?(:value)
369
-
370
- # For simple text nodes or strings
371
- return node.to_s if node.is_a?(String)
372
-
373
- # For other node types, try to_s
374
- node.to_s
349
+ case node
350
+ when Canon::Xml::Nodes::TextNode
351
+ node.value
352
+ when Canon::Xml::Node
353
+ node.text_content
354
+ when Nokogiri::XML::Node
355
+ node.content.to_s
356
+ when String
357
+ node
358
+ else
359
+ node.to_s
360
+ end
375
361
  rescue StandardError
376
362
  nil
377
363
  end
@@ -425,26 +411,37 @@ module Canon
425
411
 
426
412
  # Determine the appropriate dimension for a node type
427
413
  #
414
+ # Used by ChildComparison to tag per-child orphan diffs with a
415
+ # dimension that matches what the node *is*, so the formatter
416
+ # renders correctly. An element orphan tagged :text_content
417
+ # would otherwise route through PR #126's one-sided text
418
+ # formatter and render as +text ""+ instead of as the actual
419
+ # element (see lutaml/canon#125 follow-up).
420
+ #
428
421
  # @param node [Object] The node to check
429
422
  # @return [Symbol] The dimension symbol
430
423
  def determine_node_dimension(node)
431
- # Canon::Xml::Node types
432
- if node.respond_to?(:node_type) && node.node_type.is_a?(Symbol)
424
+ case node
425
+ when Canon::Xml::Node
433
426
  case node.node_type
427
+ when :element then :element_structure
434
428
  when :comment then :comments
435
429
  when :text, :cdata then :text_content
436
430
  when :processing_instruction then :processing_instructions
437
431
  else :text_content
438
432
  end
439
- # Moxml/Nokogiri types
440
- elsif node.respond_to?(:comment?) && node.comment?
441
- :comments
442
- elsif node.respond_to?(:text?) && node.text?
443
- :text_content
444
- elsif node.respond_to?(:cdata?) && node.cdata?
445
- :text_content
446
- elsif node.respond_to?(:processing_instruction?) && node.processing_instruction?
447
- :processing_instructions
433
+ when Nokogiri::XML::Node
434
+ if node.comment?
435
+ :comments
436
+ elsif node.text? || node.cdata?
437
+ :text_content
438
+ elsif node.processing_instruction?
439
+ :processing_instructions
440
+ elsif node.element?
441
+ :element_structure
442
+ else
443
+ :text_content
444
+ end
448
445
  else
449
446
  :text_content
450
447
  end
@@ -146,6 +146,7 @@ module Canon
146
146
  respect_xml_space
147
147
  pretty_printed_expected
148
148
  pretty_printed_received
149
+ whitespace_type
149
150
  ]
150
151
 
151
152
  match_options.each do |dimension, behavior|
@@ -24,6 +24,7 @@ module Canon
24
24
  attribute_values: :strict,
25
25
  element_position: :ignore,
26
26
  comments: :ignore,
27
+ whitespace_type: :strict,
27
28
  },
28
29
  xml: {
29
30
  preprocessing: :none,
@@ -34,6 +35,7 @@ module Canon
34
35
  attribute_values: :strict,
35
36
  element_position: :strict,
36
37
  comments: :strict,
38
+ whitespace_type: :strict,
37
39
  },
38
40
  }.freeze
39
41
 
@@ -51,6 +53,7 @@ module Canon
51
53
  attribute_values: :strict,
52
54
  element_position: :strict,
53
55
  comments: :strict,
56
+ whitespace_type: :strict,
54
57
  },
55
58
 
56
59
  # Rendered: Match rendered output (HTML default)
@@ -64,6 +67,7 @@ module Canon
64
67
  attribute_values: :strict,
65
68
  element_position: :strict,
66
69
  comments: :ignore,
70
+ whitespace_type: :strict,
67
71
  },
68
72
 
69
73
  # HTML4: Match HTML4 rendered output
@@ -77,6 +81,7 @@ module Canon
77
81
  attribute_values: :normalize,
78
82
  element_position: :ignore,
79
83
  comments: :ignore,
84
+ whitespace_type: :strict,
80
85
  },
81
86
 
82
87
  # HTML5: Match HTML5 rendered output (same as rendered)
@@ -89,6 +94,7 @@ module Canon
89
94
  attribute_values: :strict,
90
95
  element_position: :ignore,
91
96
  comments: :ignore,
97
+ whitespace_type: :strict,
92
98
  },
93
99
 
94
100
  # Spec-friendly: Formatting doesn't matter
@@ -102,6 +108,7 @@ module Canon
102
108
  attribute_values: :normalize,
103
109
  element_position: :ignore,
104
110
  comments: :ignore,
111
+ whitespace_type: :strict,
105
112
  },
106
113
 
107
114
  # Content-only: Only content matters
@@ -114,6 +121,7 @@ module Canon
114
121
  attribute_values: :normalize,
115
122
  element_position: :ignore,
116
123
  comments: :ignore,
124
+ whitespace_type: :strict,
117
125
  },
118
126
  }.freeze
119
127
 
@@ -69,13 +69,18 @@ module Canon
69
69
  # @param text1 [String] First text
70
70
  # @param text2 [String] Second text
71
71
  # @param behavior [Symbol] Match behavior (:strict, :normalize, :ignore)
72
+ # @param whitespace_type [Symbol] Whitespace type handling (:strict, :normalize)
72
73
  # @return [Boolean] true if texts match according to behavior
73
- def match_text?(text1, text2, behavior)
74
+ def match_text?(text1, text2, behavior, whitespace_type: :strict)
74
75
  case behavior
75
76
  when :strict
76
77
  text1 == text2
77
78
  when :normalize
78
- normalize_text(text1) == normalize_text(text2)
79
+ if whitespace_type == :normalize
80
+ normalize_text(text1) == normalize_text(text2)
81
+ else
82
+ normalize_text_preserving_type(text1) == normalize_text_preserving_type(text2)
83
+ end
79
84
  when :ignore
80
85
  true
81
86
  else
@@ -101,6 +106,22 @@ module Canon
101
106
  .strip # Remove leading/trailing whitespace
102
107
  end
103
108
 
109
+ # Normalize text preserving Unicode whitespace type distinctions.
110
+ #
111
+ # Only ASCII whitespace (space, tab, newline, etc.) is collapsed.
112
+ # Unicode whitespace (NBSP, ideographic space, etc.) is preserved,
113
+ # so different whitespace types remain distinguishable.
114
+ #
115
+ # @param text [String] Text to normalize
116
+ # @return [String] Normalized text with preserved whitespace types
117
+ def normalize_text_preserving_type(text)
118
+ return "" if text.nil?
119
+
120
+ text.to_s
121
+ .gsub(/[ \t\r\n\f\v]+/, " ") # Collapse only ASCII whitespace
122
+ .strip
123
+ end
124
+
104
125
  # Process attribute value according to match behavior
105
126
  #
106
127
  # @param value [String] Attribute value to process
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Canon
4
+ module Comparison
5
+ # Single source of truth for cross-backend node type operations.
6
+ #
7
+ # The comparison pipeline handles nodes from two backends:
8
+ # * Canon::Xml::Node (+ RootNode, ElementNode, TextNode, etc.) —
9
+ # custom DOM built by SAX builder and DataModel.
10
+ # * Nokogiri::XML::Node (+ subclasses) — native Nokogiri nodes used
11
+ # by the HTML comparator and some legacy paths.
12
+ #
13
+ # Every method here dispatches on type via +case/when+ (+is_a?+).
14
+ # No +respond_to?+ — the types are known at every call site.
15
+ module NodeInspector
16
+ CANON_TEXT_TYPE = :text
17
+ NOKOGIRI_TEXT_TYPE = defined?(Nokogiri::XML::Node::TEXT_NODE) ? Nokogiri::XML::Node::TEXT_NODE : 3
18
+
19
+ # True when +node+ is a text node (whitespace, content, etc.).
20
+ def self.text_node?(node)
21
+ case node
22
+ when Canon::Xml::Node
23
+ node.node_type == CANON_TEXT_TYPE
24
+ when Nokogiri::XML::Node
25
+ node.node_type == NOKOGIRI_TEXT_TYPE
26
+ else
27
+ false
28
+ end
29
+ end
30
+
31
+ # Extract the text content of +node+ as a String.
32
+ def self.text_content(node)
33
+ case node
34
+ when Canon::Xml::Node
35
+ node.value.to_s
36
+ when Nokogiri::XML::Node
37
+ node.content.to_s
38
+ else
39
+ node.to_s
40
+ end
41
+ end
42
+
43
+ # True when +node+ is a text node whose content is whitespace-only.
44
+ # Empty-string text nodes return false — those represent genuine
45
+ # empty-vs-content asymmetry, not pretty-print indentation.
46
+ def self.whitespace_only_text?(node)
47
+ return false unless text_node?(node)
48
+
49
+ text = text_content(node)
50
+ !text.empty? && text.strip.empty?
51
+ end
52
+
53
+ # True when +node+ is a comment node.
54
+ # For HTML, also detects comments that Nokogiri parses as TEXT nodes
55
+ # (content like "<!-- comment -->" or escaped "<\\!-- comment -->").
56
+ def self.comment_node?(node)
57
+ case node
58
+ when Canon::Xml::Node
59
+ node.node_type == :comment
60
+ when Nokogiri::XML::Node
61
+ return true if node.comment?
62
+
63
+ # HTML comments are parsed as TEXT nodes by Nokogiri
64
+ if node.text?
65
+ text_stripped = text_content(node).to_s.strip.gsub("\\", "")
66
+ return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
67
+ end
68
+ false
69
+ else
70
+ false
71
+ end
72
+ end
73
+
74
+ # True when +node+ is an element node.
75
+ def self.element_node?(node)
76
+ case node
77
+ when Canon::Xml::Node
78
+ node.node_type == :element
79
+ when Nokogiri::XML::Node
80
+ node.element?
81
+ else
82
+ false
83
+ end
84
+ end
85
+
86
+ # Extract parse-time errors carried on a node or its owning document.
87
+ # Returns an Array of Strings.
88
+ def self.parse_errors(node)
89
+ case node
90
+ when nil
91
+ []
92
+ when Canon::Xml::Node
93
+ errors = node.parse_errors
94
+ Array(errors).map(&:to_s)
95
+ when Nokogiri::XML::Document, Nokogiri::HTML5::Document
96
+ Array(node.errors).map(&:to_s)
97
+ else
98
+ []
99
+ end
100
+ end
101
+ end
102
+ end
103
+ end
@@ -50,6 +50,15 @@ module Canon
50
50
  # HTML elements where every whitespace character is significant.
51
51
  HTML_PRESERVE_ELEMENTS = %w[pre code textarea script style].freeze
52
52
 
53
+ # HTML inline elements — whitespace between these is semantically
54
+ # significant (renders as a visible space). Whitespace-only text
55
+ # nodes that sit between two inline siblings must not be stripped.
56
+ INLINE_ELEMENTS = %w[
57
+ a abbr acronym b bdo big br button cite code dfn em i img input kbd
58
+ label map object output q s samp select small span strong sub sup
59
+ time tt u var wbr
60
+ ].freeze
61
+
53
62
  class << self
54
63
  # Classify the whitespace behaviour for an element using ancestor walk.
55
64
  #
@@ -213,6 +222,69 @@ module Canon
213
222
  .include?(element_name.to_sym)
214
223
  end
215
224
 
225
+ # Check if a whitespace-only text node sits between two inline element
226
+ # siblings, making the whitespace semantically significant.
227
+ #
228
+ # In HTML rendering, a space between <span>A</span> <span>B</span>
229
+ # produces visible output. Stripping such nodes produces false
230
+ # equivalence.
231
+ #
232
+ # Works with any parent type (element, DocumentFragment, RootNode)
233
+ # since the check is about sibling context, not parent type.
234
+ #
235
+ # @param text_node [Object] Text node (Nokogiri or Canon::Xml::Node)
236
+ # @return [Boolean] true if whitespace is between inline siblings
237
+ def inline_whitespace_significant?(text_node)
238
+ return false unless text_node.respond_to?(:parent)
239
+
240
+ parent = text_node.parent
241
+ return false unless parent
242
+ return false unless parent.respond_to?(:children)
243
+
244
+ siblings = parent.children
245
+ idx = siblings.index(text_node)
246
+ return false unless idx
247
+
248
+ # Look at the IMMEDIATE non-whitespace-text neighbour on each
249
+ # side. Whitespace at a block boundary is collapsed per CSS,
250
+ # so both immediate neighbours must be inline for the
251
+ # whitespace to be significant. Walking all siblings (the
252
+ # earlier behaviour) misclassified whitespace at a block
253
+ # boundary as significant whenever any inline element existed
254
+ # elsewhere among the siblings.
255
+ prev_neighbour = nearest_non_whitespace_sibling(siblings, idx, -1)
256
+ next_neighbour = nearest_non_whitespace_sibling(siblings, idx, 1)
257
+
258
+ inline_element?(prev_neighbour) && inline_element?(next_neighbour)
259
+ end
260
+
261
+ # Walk outward from +idx+ in +direction+ (+1 forward, -1 back),
262
+ # skipping whitespace-only text nodes, and return the first
263
+ # non-whitespace sibling found. Returns nil if none.
264
+ def nearest_non_whitespace_sibling(siblings, idx, direction)
265
+ i = idx + direction
266
+ while i >= 0 && i < siblings.length
267
+ s = siblings[i]
268
+ unless s.respond_to?(:text?) && s.text? &&
269
+ s.respond_to?(:content) && s.content.to_s.strip.empty?
270
+ return s
271
+ end
272
+
273
+ i += direction
274
+ end
275
+ nil
276
+ end
277
+
278
+ # Check if text content contains a non-breaking space (U+00A0).
279
+ # NBSP is NOT collapsible whitespace in HTML — it always renders as
280
+ # a visible space and must never be stripped.
281
+ #
282
+ # @param text [String] Text content to check
283
+ # @return [Boolean] true if text contains U+00A0
284
+ def contains_nbsp?(text)
285
+ text.to_s.include?("\u00A0")
286
+ end
287
+
216
288
  private
217
289
 
218
290
  # Build the Set of preserve whitespace element names (strings).
@@ -336,6 +408,30 @@ module Canon
336
408
  # Nokogiri compatibility
337
409
  parent.respond_to?(:node_type) && parent.node_type == :element
338
410
  end
411
+
412
+ # Get the parent element of a text node, or nil.
413
+ # Works with both Nokogiri and Canon::Xml::Node types.
414
+ def parent_element_of(text_node)
415
+ return nil unless text_node.respond_to?(:parent)
416
+
417
+ parent = text_node.parent
418
+ return nil unless parent
419
+
420
+ if parent.is_a?(Canon::Xml::Nodes::ElementNode)
421
+ parent
422
+ elsif parent.respond_to?(:element?) && parent.element?
423
+ parent
424
+ elsif parent.respond_to?(:node_type) && parent.node_type == :element
425
+ parent
426
+ end
427
+ end
428
+
429
+ # Check if a node is an HTML inline element.
430
+ def inline_element?(node)
431
+ return false unless node.respond_to?(:name)
432
+
433
+ INLINE_ELEMENTS.include?(node.name.to_s.downcase)
434
+ end
339
435
  end
340
436
  end
341
437
  end