canon 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "nokogiri"
3
4
  require_relative "../../xml/namespace_helper"
4
5
 
5
6
  module Canon
@@ -260,12 +261,15 @@ module Canon
260
261
  end
261
262
  end
262
263
 
263
- # Serialize a Canon Xml node tree as compact XML for display.
264
+ # Serialize a node tree as compact XML for display.
264
265
  #
265
266
  # Produces a human-readable inline XML string without namespace
266
267
  # declarations and without indentation — suitable for use in Semantic
267
- # Diff Report entries. Only handles Canon::Xml::Nodes types; for any
268
- # other node (Nokogiri, etc.) falls back to +get_node_text+.
268
+ # Diff Report entries. Handles both +Canon::Xml::Nodes+ types and
269
+ # Nokogiri XML/HTML nodes (the html DOM comparison path uses
270
+ # Nokogiri nodes, so element-structure diffs originating there must
271
+ # be rendered structurally too — see issue #120). For any other
272
+ # node type, falls back to +get_node_text+.
269
273
  #
270
274
  # @param node [Object] Node to serialize
271
275
  # @return [String] Compact XML string
@@ -294,12 +298,79 @@ module Canon
294
298
  when Canon::Xml::Nodes::CommentNode
295
299
  text = node.respond_to?(:value) ? node.value.to_s : ""
296
300
  "<!--#{CGI.escapeHTML(text)}-->"
301
+ when Nokogiri::XML::Text, Nokogiri::XML::CDATA
302
+ CGI.escapeHTML(node.content.to_s)
303
+ when Nokogiri::XML::Comment
304
+ "<!--#{CGI.escapeHTML(node.content.to_s)}-->"
305
+ when Nokogiri::XML::Element
306
+ tag = node.name.to_s
307
+ attrs = node.attribute_nodes.map do |a|
308
+ " #{a.name}=\"#{CGI.escapeHTML(a.value.to_s)}\""
309
+ end.join
310
+ children_xml = node.children.map do |c|
311
+ serialize_node_compact(c)
312
+ end.join
313
+ if children_xml.empty?
314
+ "<#{tag}#{attrs}/>"
315
+ else
316
+ "<#{tag}#{attrs}>#{children_xml}</#{tag}>"
317
+ end
297
318
  else
298
- # Nokogiri nodes or other unknown types — fall back to text extraction
319
+ # Unknown node types — fall back to text extraction
299
320
  get_node_text(node)
300
321
  end
301
322
  end
302
323
 
324
# Render just the opening tag of an element: its name followed by its
# attributes, with no children and no closing tag (e.g. +<div id="A">+).
# Attribute values are HTML-escaped. Canon element nodes and Nokogiri
# elements are rendered the same way; anything else yields an empty
# string.
#
# @param node [Object] Element node to render (may be nil)
# @return [String] Open-tag string, or "" for nil / non-element input
def self.serialize_open_tag(node)
  require "cgi"
  return "" unless node

  case node
  when Canon::Xml::Nodes::ElementNode, Nokogiri::XML::Element
    # Both node families expose #name and #attribute_nodes, so a single
    # rendering path serves them.
    pieces = node.attribute_nodes.map do |attribute|
      %( #{attribute.name}="#{CGI.escapeHTML(attribute.value.to_s)}")
    end
    "<#{node.name}#{pieces.join}>"
  else
    ""
  end
end
353
+
354
# Fetch a node's text payload verbatim, with no whitespace stripping,
# so whitespace-only content survives for display.
#
# Canon nodes are read through +value+, Nokogiri nodes through
# +content+; nil and any other object produce an empty string.
#
# @param node [Object] Text node (may be nil)
# @return [String] Unmodified text content, or "" when unavailable
def self.raw_text_value(node)
  return "" unless node
  return node.value.to_s if node.is_a?(Canon::Xml::Node)
  return node.content.to_s if node.is_a?(Nokogiri::XML::Node)

  ""
end
373
+
303
374
  # Return the best display string for a node.
304
375
  #
305
376
  # When +compact: true+ and the node is a Canon ElementNode, returns a
@@ -366,8 +366,13 @@ module Canon
366
366
  # @param actual [Object] Actual value
367
367
  # @return [String] Formatted diff output
368
368
  def format_comparison_result(comparison_result, expected, actual)
369
- # Detect format from expected content
370
- format = Canon::Comparison::FormatDetector.detect(expected)
369
+ # Prefer the matcher-supplied format (e.g. :html4 from
370
+ # be_html4_equivalent_to). Auto-detection from the expected string
371
+ # cannot distinguish HTML from XML for fragments like
372
+ # `<div class="x"></div>` and would mis-route HTML fixtures
373
+ # through the XML pretty-printer (issue #135).
374
+ format = (comparison_result.is_a?(Canon::Comparison::ComparisonResult) && comparison_result.format) ||
375
+ Canon::Comparison::FormatDetector.detect(expected)
371
376
 
372
377
  formatter_options = {
373
378
  use_color: @use_color,
@@ -392,6 +397,18 @@ module Canon
392
397
  output << "" # Blank line for spacing
393
398
  end
394
399
 
400
+ # Parse-error banner. When libxml flagged any errors during
401
+ # parsing, surface them at the top of the report so the user
402
+ # is not left chasing diffs that describe a partial tree.
403
+ # See lutaml/canon#130.
404
+ if comparison_result.is_a?(Canon::Comparison::ComparisonResult) &&
405
+ comparison_result.parse_errors?
406
+ output << format_parse_error_banner(
407
+ comparison_result.parse_errors_expected,
408
+ comparison_result.parse_errors_received,
409
+ )
410
+ end
411
+
395
412
  # 1. CANON VERBOSE tables (ONLY if CANON_VERBOSE=1)
396
413
  verbose_tables = DebugOutput.verbose_tables_only(
397
414
  comparison_result,
@@ -507,6 +524,53 @@ module Canon
507
524
 
508
525
  private
509
526
 
527
# Render the parse-error banner shown at the top of the diff report.
# It names the side(s) that carried parse errors and warns that the
# diff below describes the parsed tree rather than the raw input.
# See lutaml/canon#130.
#
# Either argument may be nil (or a single message); it is coerced with
# +Array()+ so a side without an error list is treated as having no
# errors instead of raising.
#
# @param errors_expected [Array<String>, nil] Errors from the expected side
# @param errors_received [Array<String>, nil] Errors from the received side
# @return [String] Multi-line banner, terminated by a newline
def format_parse_error_banner(errors_expected, errors_received)
  rule = "=" * 70
  lines = [
    colorize(rule, :yellow, :bold),
    colorize(" ⚠️ PARSE ERRORS", :yellow, :bold),
    colorize(rule, :yellow, :bold),
  ]

  # Both sides are rendered identically: a heading followed by one line
  # per error. A side with no errors is omitted entirely.
  [
    [" Expected side:", errors_expected],
    [" Received side:", errors_received],
  ].each do |heading, errors|
    side_errors = Array(errors)
    next unless side_errors.any?

    lines << colorize(heading, :yellow, :bold)
    side_errors.each { |err| lines << " #{colorize(err, :red)}" }
  end

  lines << ""
  [
    " ⚠️ The diff below describes the parsed tree, not the input.",
    " Content that the parser could not represent has been",
    " dropped and may appear as \"missing\" in the report.",
  ].each { |notice| lines << colorize(notice, :yellow) }
  lines << colorize(rule, :yellow, :bold)
  # The trailing empty entry makes the joined banner end with a newline.
  lines << ""
  lines.join("\n")
end
573
+
510
574
  # Normalize content for display in diffs
511
575
  #
512
576
  # @param expected [Object] Expected value
@@ -850,6 +914,7 @@ module Canon
850
914
  collapse_whitespace_elements: @collapse_whitespace_elements,
851
915
  strip_whitespace_elements: @strip_whitespace_elements,
852
916
  sort_attributes: @pretty_printer_sort_attributes,
917
+ html_mode: %i[html html4 html5].include?(format),
853
918
  }
854
919
 
855
920
  printer_expected = Canon::PrettyPrinter::XmlNormalized.new(
@@ -931,9 +996,13 @@ module Canon
931
996
 
932
997
  if %i[html html4 html5].include?(format)
933
998
  require "canon/pretty_printer/html"
999
+ # Fixture-ready mode actually indents (libxml FORMAT save flag
1000
+ # via AS_XHTML). The default mode is structurally faithful but
1001
+ # does not indent on HTML5 input -- see lutaml/canon#133.
934
1002
  printer = Canon::PrettyPrinter::Html.new(
935
1003
  indent: @pretty_printer_indent,
936
1004
  indent_type: indent_type_str,
1005
+ fixture_ready: true,
937
1006
  )
938
1007
  elsif format == :xml
939
1008
  require "canon/pretty_printer/xml"
@@ -1,19 +1,43 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "nokogiri"
4
+ require "stringio"
5
+ require_relative "html_void_elements"
4
6
 
5
7
  module Canon
6
8
  module PrettyPrinter
7
- # Pretty printer for HTML with consistent indentation
9
+ # Pretty printer for HTML with consistent indentation.
10
+ #
11
+ # Two modes:
12
+ #
13
+ # 1. Default mode (+fixture_ready: false+): retains the existing
14
+ # behaviour for callers that use the pretty-printer as a
15
+ # structural normaliser (the canon round-trip tests, the
16
+ # diff-pipeline +apply_pretty_print+ stage, etc). These callers
17
+ # do not require actual indentation; they require structural
18
+ # equivalence to the input.
19
+ #
20
+ # 2. Fixture-ready mode (+fixture_ready: true+): emits
21
+ # actually-indented XHTML-shaped output via libxml's +FORMAT+
22
+ # save flag. Used by +DiffFormatter#prettyprint_for_display+
23
+ # (the +CANON_<FORMAT>_DIFF_SHOW_PRETTYPRINT_RECEIVED+ surface)
24
+ # so the user can read or paste the formatted output directly
25
+ # into a fixture heredoc. Output is XHTML-shaped (void
26
+ # elements self-closed, non-void paired) via the +AS_XHTML+
27
+ # save flag; the +NO_DECLARATION+ flag suppresses the
28
+ # +<?xml ...?>+ prefix.
29
+ #
30
+ # See lutaml/canon#133, lutaml/canon#135.
8
31
  class Html
9
- def initialize(indent: 2, indent_type: "space")
32
+ def initialize(indent: 2, indent_type: "space", fixture_ready: false)
10
33
  @indent = indent.to_i
11
34
  @indent_type = indent_type
35
+ @fixture_ready = fixture_ready
12
36
  end
13
37
 
14
- # Pretty print HTML with consistent indentation
15
38
  def format(html_string)
16
- # Detect if this is XHTML or HTML
39
+ return format_fixture_ready(html_string) if @fixture_ready
40
+
17
41
  if xhtml?(html_string)
18
42
  format_as_xhtml(html_string)
19
43
  else
@@ -24,34 +48,72 @@ module Canon
24
48
  private
25
49
 
26
50
  def xhtml?(html_string)
27
- # Check for XHTML DOCTYPE or xmlns attribute
28
51
  html_string.include?("XHTML") ||
29
52
  html_string.include?('xmlns="http://www.w3.org/1999/xhtml"')
30
53
  end
31
54
 
32
55
  def format_as_xhtml(html_string)
33
- # Parse as XML for XHTML
34
56
  doc = Nokogiri::XML(html_string, &:noblanks)
35
57
 
36
- # Use Nokogiri's built-in pretty printing
37
- if @indent_type == "tab"
38
- doc.to_xml(indent: 1, indent_text: "\t", encoding: "UTF-8")
39
- else
40
- doc.to_xml(indent: @indent, encoding: "UTF-8")
41
- end
58
+ out = if @indent_type == "tab"
59
+ doc.to_xml(indent: 1, indent_text: "\t", encoding: "UTF-8")
60
+ else
61
+ doc.to_xml(indent: @indent, encoding: "UTF-8")
62
+ end
63
+
64
+ expand_non_void_self_closing(out)
42
65
  end
43
66
 
44
67
  def format_as_html(html_string)
45
- # Parse as HTML5
46
68
  doc = Nokogiri::HTML5(html_string)
47
69
 
48
- # Use Nokogiri's built-in pretty printing
49
70
  if @indent_type == "tab"
50
71
  doc.to_html(indent: 1, indent_text: "\t", encoding: "UTF-8")
51
72
  else
52
73
  doc.to_html(indent: @indent, encoding: "UTF-8")
53
74
  end
54
75
  end
76
+
77
# Fixture-ready serialisation. The input is parsed with
# Nokogiri::HTML5 and written into an in-memory buffer using the save
# flags from +fixture_ready_save_options+. With tab indentation the
# writer is given one tab per level; otherwise it is given the
# configured indent width.
#
# @param html_string [String] HTML source to format
# @return [String] Serialised document
def format_fixture_ready(html_string)
  parsed = Nokogiri::HTML5(html_string)

  writer_options = { save_with: fixture_ready_save_options }
  if @indent_type == "tab"
    writer_options[:indent] = 1
    writer_options[:indent_text] = "\t"
  else
    writer_options[:indent] = @indent
  end

  buffer = StringIO.new
  parsed.write_to(buffer, writer_options)
  buffer.string
end
96
+
97
# Combined Nokogiri save flags used by fixture-ready output:
# FORMAT, AS_XHTML and NO_DECLARATION OR-ed together.
#
# @return [Integer] Bitmask suitable for the writer's +save_with+ option
def fixture_ready_save_options
  flags = Nokogiri::XML::Node::SaveOptions
  [flags::FORMAT, flags::AS_XHTML, flags::NO_DECLARATION].reduce(:|)
end
102
+
103
# Expand self-closing tags of non-void elements into an explicit
# open/close pair: +<a …/>+ becomes +<a …></a>+. Tags whose name
# +HtmlVoidElements.void?+ accepts (e.g. +<br/>+) are left exactly as
# they were matched.
#
# @param html [String] Serialised markup
# @return [String] Markup with non-void self-closing tags expanded
def expand_non_void_self_closing(html)
  self_closing_tag = %r{<([A-Za-z][A-Za-z0-9:_-]*)((?:\s+[^<>"]*(?:"[^"]*"[^<>"]*)*)?)/>}

  html.gsub(self_closing_tag) do |matched|
    tag_name, attribute_text = ::Regexp.last_match.captures
    # The match is exactly "<name attrs/>", so a void tag is returned
    # untouched.
    next matched if HtmlVoidElements.void?(tag_name)

    "<#{tag_name}#{attribute_text}></#{tag_name}>"
  end
end
55
117
  end
56
118
  end
57
119
  end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+
5
module Canon
  module PrettyPrinter
    # Lookup of element names that are treated as void, i.e. written as
    # a lone start tag with no content and no end tag. Any childless
    # element whose name is not listed here is expected to be written
    # as +<tag></tag>+ by the printers that consult this module.
    module HtmlVoidElements
      # Frozen set of lower-case void element names.
      VOID = %w[
        area base br col embed hr img input link meta param
        source track wbr
      ].to_set.freeze

      class << self
        # Case-insensitive membership test.
        #
        # @param name [#to_s] Element name (String, Symbol, nil, ...)
        # @return [Boolean] true when the lower-cased name is in VOID
        def void?(name)
          VOID.member?(name.to_s.downcase)
        end
      end
    end
  end
end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "nokogiri"
4
+ require_relative "html_void_elements"
4
5
 
5
6
  module Canon
6
7
  module PrettyPrinter
@@ -133,12 +134,14 @@ module Canon
133
134
  collapse_whitespace_elements: [],
134
135
  strip_whitespace_elements: [],
135
136
  pretty_printed: false,
136
- sort_attributes: false)
137
+ sort_attributes: false,
138
+ html_mode: false)
137
139
  @indent = indent.to_i
138
140
  @indent_char = indent_type == "tab" ? "\t" : " "
139
141
  @vis_map = visualization_map || default_vis_map
140
142
  @pretty_printed = pretty_printed
141
143
  @sort_attributes = sort_attributes
144
+ @html_mode = html_mode
142
145
 
143
146
  @strict_ws = Set.new((preserve_whitespace_elements || []).map(&:to_s))
144
147
  @norm_ws = Set.new((collapse_whitespace_elements || []).map(&:to_s))
@@ -151,10 +154,10 @@ module Canon
151
154
  # @return [String] Serialized XML, one node per line, with content
152
155
  # whitespace visualized at line boundaries
153
156
  def format(xml_string)
154
- doc = Nokogiri::XML(xml_string)
157
+ doc = @html_mode ? Nokogiri::HTML5(xml_string) : Nokogiri::XML(xml_string)
155
158
  lines = []
156
159
 
157
- if doc.version
160
+ if !@html_mode && doc.version
158
161
  enc = doc.encoding ? " encoding=\"#{doc.encoding}\"" : ""
159
162
  lines << "<?xml version=\"#{doc.version}\"#{enc}?>"
160
163
  end
@@ -198,6 +201,10 @@ module Canon
198
201
  children = node.children.reject { |c| c.text? && c.content.empty? }
199
202
 
200
203
  if children.empty?
204
+ if @html_mode && !HtmlVoidElements.void?(node.name)
205
+ return "#{ind(depth)}#{open_tag(node)}</#{node.name}>"
206
+ end
207
+
201
208
  return "#{ind(depth)}#{open_tag(node,
202
209
  self_close: true)}"
203
210
  end
data/lib/canon/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Canon
4
- VERSION = "0.2.4"
4
+ VERSION = "0.2.5"
5
5
  end
@@ -31,7 +31,19 @@ module Canon
31
31
  check_for_relative_namespace_uris(doc)
32
32
 
33
33
  # Convert to XPath data model
34
- build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
34
+ result = build_from_nokogiri(doc,
35
+ preserve_whitespace: preserve_whitespace)
36
+
37
+ # Carry libxml's parse errors on the resulting tree so the diff
38
+ # report can surface them (see lutaml/canon#130). libxml's
39
+ # FATAL conditions (e.g. duplicate attributes) silently drop
40
+ # content from the parse tree; without surfacing the error
41
+ # list, downstream diffs describe the partial tree, not the
42
+ # input.
43
+ errors = Array(doc.errors).map(&:to_s)
44
+ result.parse_errors = errors if errors.any?
45
+
46
+ result
35
47
  end
36
48
 
37
49
  # Normalize XML string encoding to UTF-8
@@ -24,6 +24,21 @@ module Canon
24
24
  @in_node_set = value
25
25
  end
26
26
 
27
# Parse-time errors attached to this node, recorded so the diff report
# can surface parser problems that would otherwise go unnoticed and
# leave the user reading diffs against a partially-loaded tree.
# See lutaml/canon#130.
#
# @return [Array<String>] Stored errors, or a new empty array when
#   none have been assigned
def parse_errors
  return [] unless @parse_errors

  @parse_errors
end
37
+
38
# Store parse errors on this node. The argument is coerced with
# +Array()+, so nil becomes an empty list and a single message becomes
# a one-element list.
#
# @param value [Array<String>, String, nil] Errors to store
def parse_errors=(value)
  coerced = Array(value)
  @parse_errors = coerced
end
41
+
27
42
  # Return the text content of this node and all descendants.
28
43
  # ElementNode concatenates children's text_content; other nodes
29
44
  # (TextNode, CommentNode, etc.) return their value.
@@ -93,6 +93,23 @@ strip_doctype: false)
93
93
  # Track in-scope namespaces at each level
94
94
  # Each entry is a hash of prefix => uri
95
95
  @namespace_stack = [build_initial_namespaces]
96
+ # Captured libxml errors during SAX parsing. Surfaced on the
97
+ # resulting RootNode so the diff report can warn the user
98
+ # when a FATAL parse error has caused content loss
99
+ # (see lutaml/canon#130).
100
+ @parse_errors = []
101
+ end
102
+
103
# SAX callback for libxml errors (the sibling +warning+ callback does
# the same for warnings). The message is converted to a string,
# stripped of surrounding whitespace and appended to @parse_errors so
# it is not lost.
#
# @param string [#to_s] Message reported by the parser
def error(string)
  @parse_errors.push(string.to_s.strip)
end
110
+
111
# SAX callback for parser warnings. Warnings are recorded in
# @parse_errors the same way errors are: converted to a string and
# stripped of surrounding whitespace.
#
# @param string [#to_s] Message reported by the parser
def warning(string)
  message = string.to_s.strip
  @parse_errors << message
end
97
114
 
98
115
  # Called when an element starts
@@ -229,6 +246,7 @@ strip_doctype: false)
229
246
  # followed by PIs and comments outside the document element
230
247
  # (C14N requires this ordering)
231
248
  reorder_children(@root)
249
+ @root.parse_errors = @parse_errors if @parse_errors.any?
232
250
  @root
233
251
  end
234
252
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: canon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-27 00:00:00.000000000 Z
11
+ date: 2026-05-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: diff-lcs
@@ -173,6 +173,7 @@ files:
173
173
  - docs/features/diff-formatting/index.adoc
174
174
  - docs/features/diff-formatting/pretty-diff-mode.adoc
175
175
  - docs/features/diff-formatting/themes.adoc
176
+ - docs/features/diff-formatting/whitespace-adjacency.adoc
176
177
  - docs/features/environment-configuration/index.adoc
177
178
  - docs/features/environment-configuration/override-system.adoc
178
179
  - docs/features/environment-configuration/size-limits.adoc
@@ -244,6 +245,7 @@ files:
244
245
  - lib/canon/comparison/match_options/json_resolver.rb
245
246
  - lib/canon/comparison/match_options/xml_resolver.rb
246
247
  - lib/canon/comparison/match_options/yaml_resolver.rb
248
+ - lib/canon/comparison/node_inspector.rb
247
249
  - lib/canon/comparison/profile_definition.rb
248
250
  - lib/canon/comparison/ruby_object_comparator.rb
249
251
  - lib/canon/comparison/strategies/base_match_strategy.rb
@@ -326,6 +328,7 @@ files:
326
328
  - lib/canon/options/cli_generator.rb
327
329
  - lib/canon/options/registry.rb
328
330
  - lib/canon/pretty_printer/html.rb
331
+ - lib/canon/pretty_printer/html_void_elements.rb
329
332
  - lib/canon/pretty_printer/json.rb
330
333
  - lib/canon/pretty_printer/xml.rb
331
334
  - lib/canon/pretty_printer/xml_normalized.rb