canon 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +31 -149
  3. data/README.adoc +9 -0
  4. data/docs/advanced/semantic-diff-report.adoc +96 -0
  5. data/docs/features/configuration-profiles.adoc +4 -2
  6. data/docs/features/diff-formatting/index.adoc +3 -0
  7. data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
  8. data/docs/features/match-options/html-policies.adoc +2 -0
  9. data/docs/features/match-options/index.adoc +40 -0
  10. data/docs/guides/choosing-configuration.adoc +12 -1
  11. data/docs/reference/cli-options.adoc +3 -0
  12. data/docs/reference/environment-variables.adoc +3 -1
  13. data/docs/reference/options-across-interfaces.adoc +7 -1
  14. data/docs/understanding/formats/html.adoc +9 -2
  15. data/lib/canon/cli.rb +4 -0
  16. data/lib/canon/commands/diff_command.rb +1 -0
  17. data/lib/canon/comparison/comparison_result.rb +95 -2
  18. data/lib/canon/comparison/html_comparator.rb +96 -11
  19. data/lib/canon/comparison/markup_comparator.rb +68 -71
  20. data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
  21. data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
  22. data/lib/canon/comparison/match_options.rb +23 -2
  23. data/lib/canon/comparison/node_inspector.rb +103 -0
  24. data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
  25. data/lib/canon/comparison/xml_comparator/child_comparison.rb +133 -55
  26. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
  27. data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
  28. data/lib/canon/comparison/xml_comparator.rb +174 -7
  29. data/lib/canon/comparison/xml_node_comparison.rb +48 -66
  30. data/lib/canon/comparison.rb +143 -22
  31. data/lib/canon/config/env_schema.rb +2 -1
  32. data/lib/canon/config/profiles/metanorma.yml +3 -0
  33. data/lib/canon/config.rb +51 -5
  34. data/lib/canon/diff/diff_classifier.rb +55 -41
  35. data/lib/canon/diff/diff_line_builder.rb +9 -8
  36. data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
  37. data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
  38. data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
  39. data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
  40. data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
  41. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +184 -26
  42. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +92 -4
  43. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
  44. data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
  45. data/lib/canon/diff_formatter.rb +128 -175
  46. data/lib/canon/html/data_model.rb +10 -4
  47. data/lib/canon/pretty_printer/html.rb +76 -14
  48. data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
  49. data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
  50. data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
  51. data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
  52. data/lib/canon/version.rb +1 -1
  53. data/lib/canon/xml/c14n.rb +59 -5
  54. data/lib/canon/xml/data_model.rb +13 -1
  55. data/lib/canon/xml/element_matcher.rb +3 -0
  56. data/lib/canon/xml/node.rb +23 -1
  57. data/lib/canon/xml/nodes/comment_node.rb +4 -0
  58. data/lib/canon/xml/nodes/element_node.rb +4 -0
  59. data/lib/canon/xml/nodes/text_node.rb +4 -0
  60. data/lib/canon/xml/sax_builder.rb +29 -2
  61. data/lib/canon/xml/xpath_engine.rb +238 -0
  62. metadata +9 -2
@@ -92,6 +92,46 @@ Canon.equivalent?(
92
92
  `:ignore`:: Structural whitespace is completely ignored
93
93
 
94
94
 
95
+ === whitespace_type
96
+
97
+ **Applies to**: XML, HTML
98
+
99
+ **Purpose**: Controls whether different Unicode whitespace characters (space, NBSP, ideographic space, etc.) are treated as equivalent or distinct.
100
+
101
+ **Behaviors**:
102
+
103
+ `:strict`:: (default) Different Unicode whitespace types are significant.
104
+ Space (U+0020) and NBSP (U+00A0) are treated as different characters.
105
+ This is useful for catching accidental insertion of wrong whitespace types
106
+ (e.g., a pasted NBSP where a regular space was intended).
107
+
108
+ `:normalize`:: All Unicode whitespace characters are collapsed to a single space
109
+ before comparison. Space, NBSP, ideographic space (U+3000), and other Unicode
110
+ whitespace characters are treated as equivalent.
111
+
112
+ .Using whitespace_type: :strict (default)
113
+ [example]
114
+ ====
115
+ [source,ruby]
116
+ ----
117
+ # By default, space and NBSP are different
118
+ xml1 = '<root><span>ISO</span> <span>712</span></root>'
119
+ xml2 = '<root><span>ISO</span>&#xa0;<span>712</span></root>'
120
+
121
+ Canon::Comparison.equivalent?(xml1, xml2,
122
+ match_profile: :spec_friendly
123
+ )
124
+ # => false (NBSP detected as different from space)
125
+
126
+ # Opt into treating all whitespace types as equivalent
127
+ Canon::Comparison.equivalent?(xml1, xml2,
128
+ match_profile: :spec_friendly,
129
+ match: { whitespace_type: :normalize }
130
+ )
131
+ # => true
132
+ ----
133
+ ====
134
+
95
135
  === Whitespace sensitivity at element level
96
136
 
97
137
  ==== General
@@ -210,13 +210,24 @@ Canon::Comparison.equivalent?(doc1, doc2,
210
210
  structural_whitespace: :ignore, # ignore, normalize, strict
211
211
  attribute_order: :ignore, # ignore, strict (XML/HTML)
212
212
  attribute_values: :normalize, # normalize, strict, ignore
213
- comments: :ignore # ignore, normalize, strict
213
+ comments: :ignore, # ignore, normalize, strict
214
+ whitespace_type: :strict # strict (default), normalize
214
215
  }
215
216
  )
216
217
  ----
217
218
 
218
219
  **Remember**: Match options behave differently with each algorithm! See link:../features/match-options/algorithm-specific-behavior.adoc[Algorithm-Specific Behavior].
219
220
 
221
+ ==== Whitespace Type Sensitivity
222
+
223
+ By default, Canon distinguishes between different Unicode whitespace types
224
+ (e.g. regular space U+0020 vs non-breaking space U+00A0 vs ideographic space
225
+ U+3000). This catches accidental insertion of wrong whitespace characters.
226
+
227
+ Use `whitespace_type: :normalize` when all Unicode whitespace variants should
228
+ be treated as equivalent (e.g. when output from different tools may use
229
+ different whitespace types for the same visual result).
230
+
220
231
  === Layer 4: Diff Formatting
221
232
 
222
233
  **Question**: How should differences be displayed?
@@ -145,6 +145,9 @@ Individual dimension control (overrides profile settings):
145
145
 
146
146
  |`--comments BEHAVIOR`
147
147
  |Comments: `strict`, `normalize`, `ignore`
148
+
149
+ |`--whitespace-type BEHAVIOR`
150
+ |Whitespace type sensitivity: `strict` (default), `normalize`
148
151
  |===
149
152
 
150
153
  See link:../features/match-options/[Match Options] for details.
@@ -194,7 +194,9 @@ export CANON_JSON_FORMAT_PREPROCESSING=normalize
194
194
  |`CANON_SHOW_PRETTYPRINT_RECEIVED`
195
195
  |boolean
196
196
  |`false`
197
- |Show only the RECEIVED (actual) block in the fixture-ready pretty-printed section. This is the most common fixture-update workflow: enable this option to get a copy-pasteable pretty-printed form of the generated output that can replace the old fixture heredoc. Format-specific: `CANON_{FORMAT}_DIFF_SHOW_PRETTYPRINT_RECEIVED`
197
+ |Show only the RECEIVED (actual) block in the fixture-ready pretty-printed section. This is the most common fixture-update workflow: enable this option to get a copy-pasteable pretty-printed form of the generated output that can replace the old fixture heredoc. Format-specific: `CANON_{FORMAT}_DIFF_SHOW_PRETTYPRINT_RECEIVED`.
198
+
199
+ For HTML / HTML4 / HTML5 inputs, the pretty-printed output is XHTML-shaped: void elements are self-closed (`<br/>`, `<meta/>`), non-void elements are paired (`<a></a>`), and Nokogiri may add `xmlns="http://www.w3.org/1999/xhtml"` on `<html>` and an `xml:lang` mirror of `lang`. This is a display-only serialisation chosen because libxml's `FORMAT` save flag (the only path that actually indents HTML5 input) requires the XHTML save mode -- `Nokogiri::HTML5#to_html` silently ignores its `indent:` keyword. See lutaml/canon#133.
198
200
  |All formats (display only)
199
201
 
200
202
  |`CANON_COMPACT_SEMANTIC_REPORT`
@@ -223,9 +223,15 @@ Profile values: `strict`, `rendered`, `spec_friendly`, `content_only`
223
223
  |`match: { element_hierarchy: :strict }`
224
224
  |`config.canon.xml.match.options = { element_hierarchy: :strict }`
225
225
  |`CANON_ELEMENT_HIERARCHY=strict`
226
+
227
+ |Whitespace Type
228
+ |`--whitespace-type normalize`
229
+ |`match: { whitespace_type: :normalize }`
230
+ |`config.canon.xml.match.options = { whitespace_type: :normalize }`
231
+ |`CANON_WHITESPACE_TYPE=normalize`
226
232
  |===
227
233
 
228
- Values: `strict`, `normalize`, `ignore` (or `strict`, `ignore` for structure/position/hierarchy)
234
+ Values: `strict`, `normalize`, `ignore` (or `strict`, `ignore` for structure/position/hierarchy). `whitespace_type` values: `strict` (default), `normalize`
229
235
 
230
236
  ==== XML/HTML-Specific Match Dimensions
231
237
 
@@ -19,7 +19,7 @@ Canon supports HTML 4, HTML5, and XHTML with automatic format detection.
19
19
  **Key features:**
20
20
 
21
21
  * Automatic HTML vs XHTML detection
22
- * HTML5 parser for modern HTML
22
+ * HTML5 parser for HTML input regardless of declared version (HTML4 and HTML5 share the same content model and parsing whitespace rules — see <<html4-html5-parity>>)
23
23
  * XML parser for XHTML
24
24
  * Consistent attribute ordering
25
25
  * Whitespace normalization
@@ -203,9 +203,16 @@ Automatically detects HTML5, HTML4, or XHTML based on DOCTYPE and structure.
203
203
  ----
204
204
  ====
205
205
 
206
+ [[html4-html5-parity]]
207
+ === HTML4 / HTML5 parity
208
+
209
+ `be_html4_equivalent_to` and `be_html5_equivalent_to` apply the same whitespace-sensitivity rules. Whitespace sensitivity is a property of HTML's content model and is identical across the two HTML versions, so any input that compares equivalent under one matcher must compare equivalent under the other.
210
+
211
+ Internally, both matchers parse input via `Nokogiri::HTML5.fragment`. (Earlier releases routed `:html` and `:html4` through `Nokogiri::XML.fragment`, which silently applied XML whitespace rules — meaning `be_html4_equivalent_to` could reject inputs that `be_html5_equivalent_to` correctly accepted.) See https://github.com/lutaml/canon/issues/118 for the full background.
212
+
206
213
  === Whitespace handling
207
214
 
208
- HTML whitespace is collapsed per CSS rendering rules. Empty text nodes between elements are removed.
215
+ HTML whitespace is collapsed per CSS rendering rules. Empty text nodes between elements are removed, with one exception: whitespace-only text between two adjacent inline elements (`<span>A</span> <span>B</span>`) is preserved because it renders as a visible space. Whitespace at a block boundary (between an inline element and a block element, or between two block siblings) is collapsed.
209
216
 
210
217
  .Whitespace handling example
211
218
  [example]
data/lib/canon/cli.rb CHANGED
@@ -218,6 +218,10 @@ module Canon
218
218
  type: :string,
219
219
  enum: %w[strict normalize ignore],
220
220
  desc: "Comment matching: strict, normalize, or ignore"
221
+ method_option :whitespace_type,
222
+ type: :string,
223
+ enum: %w[strict normalize],
224
+ desc: "Whitespace type sensitivity: strict (default) or normalize"
221
225
  method_option :show_diffs,
222
226
  type: :string,
223
227
  enum: %w[all normative informative],
@@ -143,6 +143,7 @@ module Canon
143
143
  dimensions = %i[
144
144
  text_content structural_whitespace attribute_whitespace
145
145
  attribute_order attribute_values comments key_order
146
+ whitespace_type
146
147
  ]
147
148
 
148
149
  dimensions.each_with_object({}) do |dim, opts|
@@ -6,7 +6,8 @@ module Canon
6
6
  # Provides methods to query equivalence based on normative diffs
7
7
  class ComparisonResult
8
8
  attr_reader :differences, :preprocessed_strings, :format, :html_version,
9
- :match_options, :algorithm, :original_strings
9
+ :match_options, :algorithm, :original_strings,
10
+ :parse_errors_expected, :parse_errors_received
10
11
 
11
12
  # @param differences [Array<DiffNode>] Array of difference nodes
12
13
  # @param preprocessed_strings [Array<String, String>] Pre-processed content for display
@@ -15,8 +16,11 @@ module Canon
15
16
  # @param match_options [Hash, nil] Resolved match options used for comparison
16
17
  # @param algorithm [Symbol] Diff algorithm used (:dom or :semantic)
17
18
  # @param original_strings [Array<String, String>, nil] Original unprocessed content for line diff
19
+ # @param parse_errors_expected [Array<String>, nil] Parser errors from the expected side
20
+ # @param parse_errors_received [Array<String>, nil] Parser errors from the received side
18
21
  def initialize(differences:, preprocessed_strings:, format:,
19
- html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil)
22
+ html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil,
23
+ parse_errors_expected: nil, parse_errors_received: nil)
20
24
  @differences = differences
21
25
  @preprocessed_strings = preprocessed_strings
22
26
  @original_strings = original_strings || preprocessed_strings
@@ -24,6 +28,16 @@ html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil)
24
28
  @html_version = html_version
25
29
  @match_options = match_options
26
30
  @algorithm = algorithm
31
+ @parse_errors_expected = Array(parse_errors_expected)
32
+ @parse_errors_received = Array(parse_errors_received)
33
+ end
34
+
35
+ # Whether either side reported parse errors. Used by the diff
36
+ # formatter to decide whether to render the parse-error banner.
37
+ #
38
+ # @return [Boolean]
39
+ def parse_errors?
40
+ @parse_errors_expected.any? || @parse_errors_received.any?
27
41
  end
28
42
 
29
43
  # Check if documents are semantically equivalent (no normative diffs)
@@ -84,6 +98,30 @@ html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil)
84
98
  @match_options&.[](:tree_diff_operations) || []
85
99
  end
86
100
 
101
+ # Generate a human-readable summary of the first difference.
102
+ #
103
+ # When documents are equivalent, returns "Equivalent".
104
+ # When they differ, returns a single-line string with the first normative
105
+ # (or first informative) difference location and reason; falls back to a bare "Not equivalent" when no describable difference is recorded.
106
+ #
107
+ # @return [String] Summary string
108
+ def summary
109
+ return "Equivalent" if equivalent?
110
+
111
+ diff = normative_differences.first || informative_differences.first ||
112
+ @differences.first # rubocop:disable Layout/MultilineOperationIndentation
113
+
114
+ return "Not equivalent" unless diff
115
+
116
+ if diff.is_a?(Canon::Diff::DiffNode)
117
+ summarize_diff_node(diff)
118
+ elsif diff.is_a?(Hash)
119
+ summarize_legacy_hash(diff)
120
+ else
121
+ "Not equivalent"
122
+ end
123
+ end
124
+
87
125
  # Generate formatted diff output
88
126
  #
89
127
  # @param use_color [Boolean] Whether to use ANSI color codes
@@ -116,6 +154,61 @@ show_diffs: :all, diff_mode: :separate, legacy_terminal: false)
116
154
  html_version: @html_version,
117
155
  )
118
156
  end
157
+
158
+ private
159
+
160
+ # Format a single DiffNode into a summary string.
161
+ #
162
+ # @param diff [DiffNode] The difference to summarize
163
+ # @return [String] Human-readable summary
164
+ def summarize_diff_node(diff)
165
+ parts = ["Not equivalent:"]
166
+
167
+ # rubocop:disable Layout/SpaceBeforeInterpolation,Style/ConditionalAssignment
168
+ if diff.path
169
+ parts << "#{diff.reason} at #{diff.path}"
170
+ else
171
+ parts << diff.reason.to_s
172
+ end
173
+ # rubocop:enable Layout/SpaceBeforeInterpolation,Style/ConditionalAssignment
174
+
175
+ if diff.serialized_before && diff.serialized_after
176
+ before_preview = truncate_preview(diff.serialized_before)
177
+ after_preview = truncate_preview(diff.serialized_after)
178
+ parts << "(#{before_preview} vs #{after_preview})"
179
+ end
180
+
181
+ parts.join(" ")
182
+ end
183
+
184
+ # Format a legacy Hash difference into a summary string.
185
+ #
186
+ # @param diff [Hash] Legacy difference hash with :path, :diff_code_description, :value1, :value2
187
+ # @return [String] Human-readable summary
188
+ def summarize_legacy_hash(diff)
189
+ parts = ["Not equivalent:"]
190
+ parts << "#{diff[:diff_code_description]} at #{diff[:path]}" if diff[:path]
191
+
192
+ if diff[:value1] && diff[:value2]
193
+ parts << "(#{truncate_preview(diff[:value1].to_s)} vs #{truncate_preview(diff[:value2].to_s)})"
194
+ end
195
+
196
+ parts.size > 1 ? parts.join(" ") : "Not equivalent: values differ"
197
+ end
198
+
199
+ # Truncate a string for preview display.
200
+ #
201
+ # @param text [String] Text to truncate
202
+ # @param max_len [Integer] Maximum length
203
+ # @return [String] Truncated text with ellipsis if needed
204
+ def truncate_preview(text, max_len = 40)
205
+ stripped = text.strip.gsub(/\s+/, " ")
206
+ if stripped.length > max_len
207
+ "#{stripped[0...(max_len - 3)]}..."
208
+ else
209
+ stripped
210
+ end
211
+ end
119
212
  end
120
213
  end
121
214
  end
@@ -13,6 +13,7 @@ require_relative "../diff/diff_classifier"
13
13
  require_relative "strategies/match_strategy_factory"
14
14
  require_relative "../html/data_model"
15
15
  require_relative "xml_node_comparison"
16
+ require_relative "xml_comparator/diff_node_builder"
16
17
  # Whitespace sensitivity module (single source of truth for sensitive elements)
17
18
  require_relative "whitespace_sensitivity"
18
19
 
@@ -150,6 +151,8 @@ module Canon
150
151
  html_version: detect_html_version_from_node(node1),
151
152
  match_options: match_opts_hash,
152
153
  algorithm: :dom,
154
+ parse_errors_expected: Comparison.parse_errors_for(node1),
155
+ parse_errors_received: Comparison.parse_errors_for(node2),
153
156
  )
154
157
  elsif result != Comparison::EQUIVALENT && !differences.empty?
155
158
  # Non-verbose mode: check equivalence
@@ -172,10 +175,42 @@ module Canon
172
175
  # @param node2 [Object] Second node
173
176
  # @return [Boolean] true if both are document fragments
174
177
  def fragment_nodes?(node1, node2)
175
- (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
176
- node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
177
- (node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
178
- node2.is_a?(Nokogiri::XML::DocumentFragment))
178
+ fragment_node?(node1) && fragment_node?(node2)
179
+ end
180
+
181
+ # Check if a single node is a recognised document fragment.
182
+ # All three Nokogiri fragment types (XML, HTML4, HTML5) must be
183
+ # accepted: dom_diff routes html/html4/html5 input through
184
+ # Nokogiri::HTML5.fragment per #118.
185
+ def fragment_node?(node)
186
+ node.is_a?(Nokogiri::XML::DocumentFragment) ||
187
+ node.is_a?(Nokogiri::HTML4::DocumentFragment) ||
188
+ node.is_a?(Nokogiri::HTML5::DocumentFragment)
189
+ end
190
+
191
+ # Record a DiffNode for a fragment-level child-count mismatch.
192
+ # Each surplus child becomes its own MISSING_NODE diff so the
193
+ # downstream report shows what was added or removed.
194
+ def record_fragment_length_mismatch(_node1, _node2, children1,
195
+ children2, differences)
196
+ longer, shorter, side = if children1.length > children2.length
197
+ [children1, children2, :removed]
198
+ else
199
+ [children2, children1, :added]
200
+ end
201
+
202
+ longer[shorter.length...].each do |orphan|
203
+ n1 = side == :removed ? orphan : nil
204
+ n2 = side == :removed ? nil : orphan
205
+ differences <<
206
+ Canon::Comparison::DiffNodeBuilder.build(
207
+ node1: n1,
208
+ node2: n2,
209
+ diff1: Comparison::MISSING_NODE,
210
+ diff2: Comparison::MISSING_NODE,
211
+ dimension: :element_structure,
212
+ )
213
+ end
179
214
  end
180
215
 
181
216
  # Compare children of document fragments
@@ -196,6 +231,13 @@ module Canon
196
231
  children2 = XmlNodeComparison.filter_children(all_children2, opts)
197
232
 
198
233
  if children1.length != children2.length
234
+ # Record the length mismatch as a DiffNode so verbose mode
235
+ # surfaces it. Without this, equivalent? wraps an empty
236
+ # differences array and incorrectly reports the inputs as
237
+ # equivalent.
238
+ record_fragment_length_mismatch(node1, node2,
239
+ children1, children2,
240
+ differences)
199
241
  return Comparison::UNEQUAL_ELEMENTS
200
242
  elsif children1.empty?
201
243
  return Comparison::EQUIVALENT
@@ -260,6 +302,8 @@ module Canon
260
302
  html_version: html_version,
261
303
  match_options: match_opts_hash.merge(strategy.metadata),
262
304
  algorithm: :semantic,
305
+ parse_errors_expected: Comparison.parse_errors_for(node1),
306
+ parse_errors_received: Comparison.parse_errors_for(node2),
263
307
  )
264
308
  else
265
309
  # Simple boolean result - equivalent if no normative differences
@@ -291,10 +335,12 @@ module Canon
291
335
  node.to_html
292
336
  end
293
337
 
294
- # Use XML fragment parser to preserve structure without auto-generated elements
295
- # This avoids both HTML4's meta tag insertion and HTML5's tag stripping
296
- # See: https://stackoverflow.com/questions/25998824/stop-nokogiri-from-adding-doctype-and-meta-tags
297
- frag = Nokogiri::XML.fragment(html_string)
338
+ # Use XML fragment parser to preserve structure without auto-generated elements.
339
+ # Decode HTML named entities (&nbsp; etc.) to UTF-8 characters since the XML
340
+ # parser only understands the five XML entities.
341
+ frag = Nokogiri::XML.fragment(
342
+ decode_html_named_entities(html_string),
343
+ )
298
344
 
299
345
  # Apply preprocessing if needed
300
346
  if preprocessing == :rendered
@@ -448,8 +494,12 @@ module Canon
448
494
  end
449
495
 
450
496
  # Parse as Nokogiri fragment for DOM comparison
451
- # Use XML fragment parser to avoid auto-inserted meta tags
452
- frag = Nokogiri::XML.fragment(html_string)
497
+ # Use XML fragment parser to avoid auto-inserted meta tags.
498
+ # Decode HTML named entities (&nbsp; etc.) to UTF-8 characters since
499
+ # the XML parser only understands the five XML entities.
500
+ frag = Nokogiri::XML.fragment(
501
+ decode_html_named_entities(html_string),
502
+ )
453
503
 
454
504
  # Apply post-parsing filtering for :normalize, :format, and :rendered preprocessing
455
505
  if %i[normalize format rendered].include?(preprocessing)
@@ -496,6 +546,33 @@ module Canon
496
546
 
497
547
  # Detect HTML version from content
498
548
  #
549
+ # Decode a fixed set of common HTML named entities (&nbsp;, &mdash;, &copy;, etc.) to their UTF-8 character equivalents; any other named entity is left untouched.
550
+ # This is a targeted replacement that only changes entity references,
551
+ # preserving all tag structure. Needed because Nokogiri::XML.fragment
552
+ # only understands the five XML entities (&amp; &lt; &gt; &quot; &apos;).
553
+ #
554
+ # @param str [String] HTML string possibly containing named entities
555
+ # @return [String] String with named entities replaced by UTF-8 chars
556
+ def decode_html_named_entities(str)
557
+ return str unless str.include?("&")
558
+
559
+ str.gsub(/&nbsp;/i, "\u00A0")
560
+ .gsub(/&ensp;/i, "\u2002")
561
+ .gsub(/&emsp;/i, "\u2003")
562
+ .gsub(/&thinsp;/i, "\u2009")
563
+ .gsub(/&copy;/i, "\u00A9")
564
+ .gsub(/&reg;/i, "\u00AE")
565
+ .gsub(/&trade;/i, "\u2122")
566
+ .gsub(/&mdash;/i, "\u2014")
567
+ .gsub(/&ndash;/i, "\u2013")
568
+ .gsub(/&lsquo;/i, "\u2018")
569
+ .gsub(/&rsquo;/i, "\u2019")
570
+ .gsub(/&ldquo;/i, "\u201C")
571
+ .gsub(/&rdquo;/i, "\u201D")
572
+ .gsub(/&bull;/i, "\u2022")
573
+ .gsub(/&hellip;/i, "\u2026")
574
+ end
575
+
499
576
  # @param content [String] HTML content
500
577
  # @return [Symbol] :html5 or :html4
501
578
  def detect_html_version(content)
@@ -721,8 +798,16 @@ compare_profile = nil)
721
798
  parent = text_node.parent
722
799
  next if ancestor_preserves_whitespace?(parent, preserve_whitespace)
723
800
 
801
+ content = text_node.content
802
+
803
+ # NBSP (U+00A0) is always significant — don't remove
804
+ next if content.include?("\u00A0")
805
+
806
+ # Whitespace between inline siblings is significant — don't remove
807
+ next if WhitespaceSensitivity.inline_whitespace_significant?(text_node)
808
+
724
809
  # Remove if the text is only whitespace (after normalization)
725
- if text_node.content.strip.empty?
810
+ if content.strip.empty?
726
811
  text_node.remove
727
812
  end
728
813
  end