canon 0.2.4 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +4 -0
- data/docs/advanced/diff-classification.adoc +16 -0
- data/docs/advanced/semantic-diff-report.adoc +65 -0
- data/docs/features/diff-formatting/index.adoc +5 -0
- data/docs/features/diff-formatting/whitespace-adjacency.adoc +218 -0
- data/docs/reference/environment-variables.adoc +3 -1
- data/lib/canon/comparison/comparison_result.rb +16 -2
- data/lib/canon/comparison/html_comparator.rb +4 -0
- data/lib/canon/comparison/markup_comparator.rb +49 -71
- data/lib/canon/comparison/node_inspector.rb +103 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +127 -55
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
- data/lib/canon/comparison/xml_comparator.rb +97 -3
- data/lib/canon/comparison/xml_node_comparison.rb +37 -81
- data/lib/canon/comparison.rb +59 -0
- data/lib/canon/diff/diff_classifier.rb +37 -39
- data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +119 -9
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +75 -4
- data/lib/canon/diff_formatter.rb +71 -2
- data/lib/canon/pretty_printer/html.rb +76 -14
- data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
- data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +13 -1
- data/lib/canon/xml/node.rb +15 -0
- data/lib/canon/xml/sax_builder.rb +18 -0
- metadata +5 -2
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "nokogiri"
|
|
3
4
|
require_relative "../../xml/namespace_helper"
|
|
4
5
|
|
|
5
6
|
module Canon
|
|
@@ -260,12 +261,15 @@ module Canon
|
|
|
260
261
|
end
|
|
261
262
|
end
|
|
262
263
|
|
|
263
|
-
# Serialize a
|
|
264
|
+
# Serialize a node tree as compact XML for display.
|
|
264
265
|
#
|
|
265
266
|
# Produces a human-readable inline XML string without namespace
|
|
266
267
|
# declarations and without indentation — suitable for use in Semantic
|
|
267
|
-
# Diff Report entries.
|
|
268
|
-
#
|
|
268
|
+
# Diff Report entries. Handles both +Canon::Xml::Nodes+ types and
|
|
269
|
+
# Nokogiri XML/HTML nodes (the html DOM comparison path uses
|
|
270
|
+
# Nokogiri nodes, so element-structure diffs originating there must
|
|
271
|
+
# be rendered structurally too — see issue #120). For any other
|
|
272
|
+
# node type, falls back to +get_node_text+.
|
|
269
273
|
#
|
|
270
274
|
# @param node [Object] Node to serialize
|
|
271
275
|
# @return [String] Compact XML string
|
|
@@ -294,12 +298,79 @@ module Canon
|
|
|
294
298
|
when Canon::Xml::Nodes::CommentNode
|
|
295
299
|
text = node.respond_to?(:value) ? node.value.to_s : ""
|
|
296
300
|
"<!--#{CGI.escapeHTML(text)}-->"
|
|
301
|
+
when Nokogiri::XML::Text, Nokogiri::XML::CDATA
|
|
302
|
+
CGI.escapeHTML(node.content.to_s)
|
|
303
|
+
when Nokogiri::XML::Comment
|
|
304
|
+
"<!--#{CGI.escapeHTML(node.content.to_s)}-->"
|
|
305
|
+
when Nokogiri::XML::Element
|
|
306
|
+
tag = node.name.to_s
|
|
307
|
+
attrs = node.attribute_nodes.map do |a|
|
|
308
|
+
" #{a.name}=\"#{CGI.escapeHTML(a.value.to_s)}\""
|
|
309
|
+
end.join
|
|
310
|
+
children_xml = node.children.map do |c|
|
|
311
|
+
serialize_node_compact(c)
|
|
312
|
+
end.join
|
|
313
|
+
if children_xml.empty?
|
|
314
|
+
"<#{tag}#{attrs}/>"
|
|
315
|
+
else
|
|
316
|
+
"<#{tag}#{attrs}>#{children_xml}</#{tag}>"
|
|
317
|
+
end
|
|
297
318
|
else
|
|
298
|
-
#
|
|
319
|
+
# Unknown node types — fall back to text extraction
|
|
299
320
|
get_node_text(node)
|
|
300
321
|
end
|
|
301
322
|
end
|
|
302
323
|
|
|
324
|
+
# Serialize a node's open tag only — name + attributes, no children,
|
|
325
|
+
# no closing tag. Used by +format_text_content_one_sided+ to render
|
|
326
|
+
# a brief parent-element context hint (e.g. +<div id="A">+) for a
|
|
327
|
+
# one-sided text diff, instead of the full ancestor subtree that
|
|
328
|
+
# +serialize_node_compact+ would produce. See lutaml/canon#125.
|
|
329
|
+
#
|
|
330
|
+
# @param node [Object] Element node to serialize
|
|
331
|
+
# @return [String] Open-tag string, or "" for non-elements / nil
|
|
332
|
+
def self.serialize_open_tag(node)
|
|
333
|
+
require "cgi"
|
|
334
|
+
return "" unless node
|
|
335
|
+
|
|
336
|
+
case node
|
|
337
|
+
when Canon::Xml::Nodes::ElementNode
|
|
338
|
+
tag = node.name.to_s
|
|
339
|
+
attrs = node.attribute_nodes.map do |attr|
|
|
340
|
+
" #{attr.name}=\"#{CGI.escapeHTML(attr.value.to_s)}\""
|
|
341
|
+
end.join
|
|
342
|
+
"<#{tag}#{attrs}>"
|
|
343
|
+
when Nokogiri::XML::Element
|
|
344
|
+
tag = node.name.to_s
|
|
345
|
+
attrs = node.attribute_nodes.map do |a|
|
|
346
|
+
" #{a.name}=\"#{CGI.escapeHTML(a.value.to_s)}\""
|
|
347
|
+
end.join
|
|
348
|
+
"<#{tag}#{attrs}>"
|
|
349
|
+
else
|
|
350
|
+
""
|
|
351
|
+
end
|
|
352
|
+
end
|
|
353
|
+
|
|
354
|
+
# Return the raw text content of a text node without stripping
|
|
355
|
+
# whitespace. +get_node_text+ strips ASCII whitespace, which
|
|
356
|
+
# destroys whitespace-only payloads that callers (e.g. one-sided
|
|
357
|
+
# text-content diff rendering) need to display verbatim.
|
|
358
|
+
#
|
|
359
|
+
# @param node [Object] Text node
|
|
360
|
+
# @return [String] Raw text content, or "" if not a text-bearing node
|
|
361
|
+
def self.raw_text_value(node)
|
|
362
|
+
return "" unless node
|
|
363
|
+
|
|
364
|
+
case node
|
|
365
|
+
when Canon::Xml::Node
|
|
366
|
+
node.value.to_s
|
|
367
|
+
when Nokogiri::XML::Node
|
|
368
|
+
node.content.to_s
|
|
369
|
+
else
|
|
370
|
+
""
|
|
371
|
+
end
|
|
372
|
+
end
|
|
373
|
+
|
|
303
374
|
# Return the best display string for a node.
|
|
304
375
|
#
|
|
305
376
|
# When +compact: true+ and the node is a Canon ElementNode, returns a
|
data/lib/canon/diff_formatter.rb
CHANGED
|
@@ -366,8 +366,13 @@ module Canon
|
|
|
366
366
|
# @param actual [Object] Actual value
|
|
367
367
|
# @return [String] Formatted diff output
|
|
368
368
|
def format_comparison_result(comparison_result, expected, actual)
|
|
369
|
-
#
|
|
370
|
-
|
|
369
|
+
# Prefer the matcher-supplied format (e.g. :html4 from
|
|
370
|
+
# be_html4_equivalent_to). Auto-detection from the expected string
|
|
371
|
+
# cannot distinguish HTML from XML for fragments like
|
|
372
|
+
# `<div class="x"></div>` and would mis-route HTML fixtures
|
|
373
|
+
# through the XML pretty-printer (issue #135).
|
|
374
|
+
format = (comparison_result.is_a?(Canon::Comparison::ComparisonResult) && comparison_result.format) ||
|
|
375
|
+
Canon::Comparison::FormatDetector.detect(expected)
|
|
371
376
|
|
|
372
377
|
formatter_options = {
|
|
373
378
|
use_color: @use_color,
|
|
@@ -392,6 +397,18 @@ module Canon
|
|
|
392
397
|
output << "" # Blank line for spacing
|
|
393
398
|
end
|
|
394
399
|
|
|
400
|
+
# Parse-error banner. When libxml flagged any errors during
|
|
401
|
+
# parsing, surface them at the top of the report so the user
|
|
402
|
+
# is not left chasing diffs that describe a partial tree.
|
|
403
|
+
# See lutaml/canon#130.
|
|
404
|
+
if comparison_result.is_a?(Canon::Comparison::ComparisonResult) &&
|
|
405
|
+
comparison_result.parse_errors?
|
|
406
|
+
output << format_parse_error_banner(
|
|
407
|
+
comparison_result.parse_errors_expected,
|
|
408
|
+
comparison_result.parse_errors_received,
|
|
409
|
+
)
|
|
410
|
+
end
|
|
411
|
+
|
|
395
412
|
# 1. CANON VERBOSE tables (ONLY if CANON_VERBOSE=1)
|
|
396
413
|
verbose_tables = DebugOutput.verbose_tables_only(
|
|
397
414
|
comparison_result,
|
|
@@ -507,6 +524,53 @@ module Canon
|
|
|
507
524
|
|
|
508
525
|
private
|
|
509
526
|
|
|
527
|
+
# Render the parse-error banner that appears at the top of the
|
|
528
|
+
# diff report when libxml flagged any errors during parsing.
|
|
529
|
+
# Names the offending side(s) and warns that the diff below
|
|
530
|
+
# describes the parsed tree, not the input. See lutaml/canon#130.
|
|
531
|
+
#
|
|
532
|
+
# @param errors_expected [Array<String>] Errors from the expected side
|
|
533
|
+
# @param errors_received [Array<String>] Errors from the received side
|
|
534
|
+
# @return [String] Multi-line banner
|
|
535
|
+
def format_parse_error_banner(errors_expected, errors_received)
|
|
536
|
+
lines = []
|
|
537
|
+
rule = "=" * 70
|
|
538
|
+
lines << colorize(rule, :yellow, :bold)
|
|
539
|
+
lines << colorize(" ⚠️ PARSE ERRORS", :yellow, :bold)
|
|
540
|
+
lines << colorize(rule, :yellow, :bold)
|
|
541
|
+
|
|
542
|
+
if errors_expected.any?
|
|
543
|
+
lines << colorize(" Expected side:", :yellow, :bold)
|
|
544
|
+
errors_expected.each do |err|
|
|
545
|
+
lines << " #{colorize(err, :red)}"
|
|
546
|
+
end
|
|
547
|
+
end
|
|
548
|
+
|
|
549
|
+
if errors_received.any?
|
|
550
|
+
lines << colorize(" Received side:", :yellow, :bold)
|
|
551
|
+
errors_received.each do |err|
|
|
552
|
+
lines << " #{colorize(err, :red)}"
|
|
553
|
+
end
|
|
554
|
+
end
|
|
555
|
+
|
|
556
|
+
lines << ""
|
|
557
|
+
lines << colorize(
|
|
558
|
+
" ⚠️ The diff below describes the parsed tree, not the input.",
|
|
559
|
+
:yellow,
|
|
560
|
+
)
|
|
561
|
+
lines << colorize(
|
|
562
|
+
" Content that the parser could not represent has been",
|
|
563
|
+
:yellow,
|
|
564
|
+
)
|
|
565
|
+
lines << colorize(
|
|
566
|
+
" dropped and may appear as \"missing\" in the report.",
|
|
567
|
+
:yellow,
|
|
568
|
+
)
|
|
569
|
+
lines << colorize(rule, :yellow, :bold)
|
|
570
|
+
lines << ""
|
|
571
|
+
lines.join("\n")
|
|
572
|
+
end
|
|
573
|
+
|
|
510
574
|
# Normalize content for display in diffs
|
|
511
575
|
#
|
|
512
576
|
# @param expected [Object] Expected value
|
|
@@ -850,6 +914,7 @@ module Canon
|
|
|
850
914
|
collapse_whitespace_elements: @collapse_whitespace_elements,
|
|
851
915
|
strip_whitespace_elements: @strip_whitespace_elements,
|
|
852
916
|
sort_attributes: @pretty_printer_sort_attributes,
|
|
917
|
+
html_mode: %i[html html4 html5].include?(format),
|
|
853
918
|
}
|
|
854
919
|
|
|
855
920
|
printer_expected = Canon::PrettyPrinter::XmlNormalized.new(
|
|
@@ -931,9 +996,13 @@ module Canon
|
|
|
931
996
|
|
|
932
997
|
if %i[html html4 html5].include?(format)
|
|
933
998
|
require "canon/pretty_printer/html"
|
|
999
|
+
# Fixture-ready mode actually indents (libxml FORMAT save flag
|
|
1000
|
+
# via AS_XHTML). The default mode is structurally faithful but
|
|
1001
|
+
# does not indent on HTML5 input -- see lutaml/canon#133.
|
|
934
1002
|
printer = Canon::PrettyPrinter::Html.new(
|
|
935
1003
|
indent: @pretty_printer_indent,
|
|
936
1004
|
indent_type: indent_type_str,
|
|
1005
|
+
fixture_ready: true,
|
|
937
1006
|
)
|
|
938
1007
|
elsif format == :xml
|
|
939
1008
|
require "canon/pretty_printer/xml"
|
|
@@ -1,19 +1,43 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "nokogiri"
|
|
4
|
+
require "stringio"
|
|
5
|
+
require_relative "html_void_elements"
|
|
4
6
|
|
|
5
7
|
module Canon
|
|
6
8
|
module PrettyPrinter
|
|
7
|
-
# Pretty printer for HTML with consistent indentation
|
|
9
|
+
# Pretty printer for HTML with consistent indentation.
|
|
10
|
+
#
|
|
11
|
+
# Two modes:
|
|
12
|
+
#
|
|
13
|
+
# 1. Default mode (+fixture_ready: false+): retains the existing
|
|
14
|
+
# behaviour for callers that use the pretty-printer as a
|
|
15
|
+
# structural normaliser (the canon round-trip tests, the
|
|
16
|
+
# diff-pipeline +apply_pretty_print+ stage, etc). These callers
|
|
17
|
+
# do not require actual indentation; they require structural
|
|
18
|
+
# equivalence to the input.
|
|
19
|
+
#
|
|
20
|
+
# 2. Fixture-ready mode (+fixture_ready: true+): emits
|
|
21
|
+
# actually-indented XHTML-shaped output via libxml's +FORMAT+
|
|
22
|
+
# save flag. Used by +DiffFormatter#prettyprint_for_display+
|
|
23
|
+
# (the +CANON_<FORMAT>_DIFF_SHOW_PRETTYPRINT_RECEIVED+ surface)
|
|
24
|
+
# so the user can read or paste the formatted output directly
|
|
25
|
+
# into a fixture heredoc. Output is XHTML-shaped (void
|
|
26
|
+
# elements self-closed, non-void paired) via the +AS_XHTML+
|
|
27
|
+
# save flag; the +NO_DECLARATION+ flag suppresses the
|
|
28
|
+
# +<?xml ...?>+ prefix.
|
|
29
|
+
#
|
|
30
|
+
# See lutaml/canon#133, lutaml/canon#135.
|
|
8
31
|
class Html
|
|
9
|
-
def initialize(indent: 2, indent_type: "space")
|
|
32
|
+
def initialize(indent: 2, indent_type: "space", fixture_ready: false)
|
|
10
33
|
@indent = indent.to_i
|
|
11
34
|
@indent_type = indent_type
|
|
35
|
+
@fixture_ready = fixture_ready
|
|
12
36
|
end
|
|
13
37
|
|
|
14
|
-
# Pretty print HTML with consistent indentation
|
|
15
38
|
def format(html_string)
|
|
16
|
-
|
|
39
|
+
return format_fixture_ready(html_string) if @fixture_ready
|
|
40
|
+
|
|
17
41
|
if xhtml?(html_string)
|
|
18
42
|
format_as_xhtml(html_string)
|
|
19
43
|
else
|
|
@@ -24,34 +48,72 @@ module Canon
|
|
|
24
48
|
private
|
|
25
49
|
|
|
26
50
|
def xhtml?(html_string)
|
|
27
|
-
# Check for XHTML DOCTYPE or xmlns attribute
|
|
28
51
|
html_string.include?("XHTML") ||
|
|
29
52
|
html_string.include?('xmlns="http://www.w3.org/1999/xhtml"')
|
|
30
53
|
end
|
|
31
54
|
|
|
32
55
|
def format_as_xhtml(html_string)
|
|
33
|
-
# Parse as XML for XHTML
|
|
34
56
|
doc = Nokogiri::XML(html_string, &:noblanks)
|
|
35
57
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
58
|
+
out = if @indent_type == "tab"
|
|
59
|
+
doc.to_xml(indent: 1, indent_text: "\t", encoding: "UTF-8")
|
|
60
|
+
else
|
|
61
|
+
doc.to_xml(indent: @indent, encoding: "UTF-8")
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
expand_non_void_self_closing(out)
|
|
42
65
|
end
|
|
43
66
|
|
|
44
67
|
def format_as_html(html_string)
|
|
45
|
-
# Parse as HTML5
|
|
46
68
|
doc = Nokogiri::HTML5(html_string)
|
|
47
69
|
|
|
48
|
-
# Use Nokogiri's built-in pretty printing
|
|
49
70
|
if @indent_type == "tab"
|
|
50
71
|
doc.to_html(indent: 1, indent_text: "\t", encoding: "UTF-8")
|
|
51
72
|
else
|
|
52
73
|
doc.to_html(indent: @indent, encoding: "UTF-8")
|
|
53
74
|
end
|
|
54
75
|
end
|
|
76
|
+
|
|
77
|
+
# Fixture-ready serialisation: parse with Nokogiri::HTML5 (so we
|
|
78
|
+
# get permissive recovery on real-world Word / XHTML5 / HTML5
|
|
79
|
+
# input shapes), then write through libxml's XML writer with
|
|
80
|
+
# +FORMAT+ + +AS_XHTML+ + +NO_DECLARATION+. +FORMAT+ inserts
|
|
81
|
+
# indentation; +AS_XHTML+ produces well-shaped output (void
|
|
82
|
+
# elements self-closed, non-void paired); +NO_DECLARATION+
|
|
83
|
+
# suppresses the +<?xml ...?>+ prefix.
|
|
84
|
+
def format_fixture_ready(html_string)
|
|
85
|
+
doc = Nokogiri::HTML5(html_string)
|
|
86
|
+
io = StringIO.new
|
|
87
|
+
if @indent_type == "tab"
|
|
88
|
+
doc.write_to(io, save_with: fixture_ready_save_options,
|
|
89
|
+
indent: 1, indent_text: "\t")
|
|
90
|
+
else
|
|
91
|
+
doc.write_to(io, save_with: fixture_ready_save_options,
|
|
92
|
+
indent: @indent)
|
|
93
|
+
end
|
|
94
|
+
io.string
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def fixture_ready_save_options
|
|
98
|
+
Nokogiri::XML::Node::SaveOptions::FORMAT |
|
|
99
|
+
Nokogiri::XML::Node::SaveOptions::AS_XHTML |
|
|
100
|
+
Nokogiri::XML::Node::SaveOptions::NO_DECLARATION
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
# Rewrite +<tag …/>+ into +<tag …></tag>+ for every element name
|
|
104
|
+
# that is not an HTML5 void element. +<a/>+ is illegal HTML;
|
|
105
|
+
# void tags like +<br/>+ and +<img …/>+ pass through unchanged.
|
|
106
|
+
def expand_non_void_self_closing(html)
|
|
107
|
+
html.gsub(%r{<([A-Za-z][A-Za-z0-9:_-]*)((?:\s+[^<>"]*(?:"[^"]*"[^<>"]*)*)?)/>}) do
|
|
108
|
+
name = ::Regexp.last_match(1)
|
|
109
|
+
attrs = ::Regexp.last_match(2)
|
|
110
|
+
if HtmlVoidElements.void?(name)
|
|
111
|
+
"<#{name}#{attrs}/>"
|
|
112
|
+
else
|
|
113
|
+
"<#{name}#{attrs}></#{name}>"
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
55
117
|
end
|
|
56
118
|
end
|
|
57
119
|
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set"
|
|
4
|
+
|
|
5
|
+
module Canon
|
|
6
|
+
module PrettyPrinter
|
|
7
|
+
# The 14 HTML5 void elements — those whose start tag may stand alone
|
|
8
|
+
# (with no end tag) and which cannot have any content. Every other
|
|
9
|
+
# element with no children must be written as +<tag></tag>+ in HTML;
|
|
10
|
+
# writing +<a/>+ is illegal HTML and is parsed as +<a>+ (start tag only).
|
|
11
|
+
module HtmlVoidElements
|
|
12
|
+
VOID = Set.new(%w[area base br col embed hr img input link meta param
|
|
13
|
+
source track wbr]).freeze
|
|
14
|
+
|
|
15
|
+
def self.void?(name)
|
|
16
|
+
VOID.include?(name.to_s.downcase)
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "nokogiri"
|
|
4
|
+
require_relative "html_void_elements"
|
|
4
5
|
|
|
5
6
|
module Canon
|
|
6
7
|
module PrettyPrinter
|
|
@@ -133,12 +134,14 @@ module Canon
|
|
|
133
134
|
collapse_whitespace_elements: [],
|
|
134
135
|
strip_whitespace_elements: [],
|
|
135
136
|
pretty_printed: false,
|
|
136
|
-
sort_attributes: false)
|
|
137
|
+
sort_attributes: false,
|
|
138
|
+
html_mode: false)
|
|
137
139
|
@indent = indent.to_i
|
|
138
140
|
@indent_char = indent_type == "tab" ? "\t" : " "
|
|
139
141
|
@vis_map = visualization_map || default_vis_map
|
|
140
142
|
@pretty_printed = pretty_printed
|
|
141
143
|
@sort_attributes = sort_attributes
|
|
144
|
+
@html_mode = html_mode
|
|
142
145
|
|
|
143
146
|
@strict_ws = Set.new((preserve_whitespace_elements || []).map(&:to_s))
|
|
144
147
|
@norm_ws = Set.new((collapse_whitespace_elements || []).map(&:to_s))
|
|
@@ -151,10 +154,10 @@ module Canon
|
|
|
151
154
|
# @return [String] Serialized XML, one node per line, with content
|
|
152
155
|
# whitespace visualized at line boundaries
|
|
153
156
|
def format(xml_string)
|
|
154
|
-
doc = Nokogiri::XML(xml_string)
|
|
157
|
+
doc = @html_mode ? Nokogiri::HTML5(xml_string) : Nokogiri::XML(xml_string)
|
|
155
158
|
lines = []
|
|
156
159
|
|
|
157
|
-
if doc.version
|
|
160
|
+
if !@html_mode && doc.version
|
|
158
161
|
enc = doc.encoding ? " encoding=\"#{doc.encoding}\"" : ""
|
|
159
162
|
lines << "<?xml version=\"#{doc.version}\"#{enc}?>"
|
|
160
163
|
end
|
|
@@ -198,6 +201,10 @@ module Canon
|
|
|
198
201
|
children = node.children.reject { |c| c.text? && c.content.empty? }
|
|
199
202
|
|
|
200
203
|
if children.empty?
|
|
204
|
+
if @html_mode && !HtmlVoidElements.void?(node.name)
|
|
205
|
+
return "#{ind(depth)}#{open_tag(node)}</#{node.name}>"
|
|
206
|
+
end
|
|
207
|
+
|
|
201
208
|
return "#{ind(depth)}#{open_tag(node,
|
|
202
209
|
self_close: true)}"
|
|
203
210
|
end
|
data/lib/canon/version.rb
CHANGED
data/lib/canon/xml/data_model.rb
CHANGED
|
@@ -31,7 +31,19 @@ module Canon
|
|
|
31
31
|
check_for_relative_namespace_uris(doc)
|
|
32
32
|
|
|
33
33
|
# Convert to XPath data model
|
|
34
|
-
build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
|
|
34
|
+
result = build_from_nokogiri(doc,
|
|
35
|
+
preserve_whitespace: preserve_whitespace)
|
|
36
|
+
|
|
37
|
+
# Carry libxml's parse errors on the resulting tree so the diff
|
|
38
|
+
# report can surface them (see lutaml/canon#130). libxml's
|
|
39
|
+
# FATAL conditions (e.g. duplicate attributes) silently drop
|
|
40
|
+
# content from the parse tree; without surfacing the error
|
|
41
|
+
# list, downstream diffs describe the partial tree, not the
|
|
42
|
+
# input.
|
|
43
|
+
errors = Array(doc.errors).map(&:to_s)
|
|
44
|
+
result.parse_errors = errors if errors.any?
|
|
45
|
+
|
|
46
|
+
result
|
|
35
47
|
end
|
|
36
48
|
|
|
37
49
|
# Normalize XML string encoding to UTF-8
|
data/lib/canon/xml/node.rb
CHANGED
|
@@ -24,6 +24,21 @@ module Canon
|
|
|
24
24
|
@in_node_set = value
|
|
25
25
|
end
|
|
26
26
|
|
|
27
|
+
# Parse-time errors carried alongside the node tree, captured at
|
|
28
|
+
# parse boundaries (Canon::Xml::DataModel.from_xml, etc.) so the
|
|
29
|
+
# diff report can surface libxml-level FATAL conditions that
|
|
30
|
+
# would otherwise be silently swallowed and produce misleading
|
|
31
|
+
# diffs against a partially-loaded tree. See lutaml/canon#130.
|
|
32
|
+
#
|
|
33
|
+
# @return [Array<String>] Parse errors as strings (empty by default)
|
|
34
|
+
def parse_errors
|
|
35
|
+
@parse_errors || []
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def parse_errors=(value)
|
|
39
|
+
@parse_errors = Array(value)
|
|
40
|
+
end
|
|
41
|
+
|
|
27
42
|
# Return the text content of this node and all descendants.
|
|
28
43
|
# ElementNode concatenates children's text_content; other nodes
|
|
29
44
|
# (TextNode, CommentNode, etc.) return their value.
|
|
@@ -93,6 +93,23 @@ strip_doctype: false)
|
|
|
93
93
|
# Track in-scope namespaces at each level
|
|
94
94
|
# Each entry is a hash of prefix => uri
|
|
95
95
|
@namespace_stack = [build_initial_namespaces]
|
|
96
|
+
# Captured libxml errors during SAX parsing. Surfaced on the
|
|
97
|
+
# resulting RootNode so the diff report can warn the user
|
|
98
|
+
# when a FATAL parse error has caused content loss
|
|
99
|
+
# (see lutaml/canon#130).
|
|
100
|
+
@parse_errors = []
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
# SAX callbacks for libxml errors and warnings. Without these
|
|
104
|
+
# overrides the default handlers swallow the events; with them,
|
|
105
|
+
# libxml's "Attribute xml:lang redefined" and similar messages
|
|
106
|
+
# land in @parse_errors and ride through to ComparisonResult.
|
|
107
|
+
def error(string)
|
|
108
|
+
@parse_errors << string.to_s.strip
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
def warning(string)
|
|
112
|
+
@parse_errors << string.to_s.strip
|
|
96
113
|
end
|
|
97
114
|
|
|
98
115
|
# Called when an element starts
|
|
@@ -229,6 +246,7 @@ strip_doctype: false)
|
|
|
229
246
|
# followed by PIs and comments outside the document element
|
|
230
247
|
# (C14N requires this ordering)
|
|
231
248
|
reorder_children(@root)
|
|
249
|
+
@root.parse_errors = @parse_errors if @parse_errors.any?
|
|
232
250
|
@root
|
|
233
251
|
end
|
|
234
252
|
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: canon
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.4
|
|
4
|
+
version: 0.2.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-05-03 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: diff-lcs
|
|
@@ -173,6 +173,7 @@ files:
|
|
|
173
173
|
- docs/features/diff-formatting/index.adoc
|
|
174
174
|
- docs/features/diff-formatting/pretty-diff-mode.adoc
|
|
175
175
|
- docs/features/diff-formatting/themes.adoc
|
|
176
|
+
- docs/features/diff-formatting/whitespace-adjacency.adoc
|
|
176
177
|
- docs/features/environment-configuration/index.adoc
|
|
177
178
|
- docs/features/environment-configuration/override-system.adoc
|
|
178
179
|
- docs/features/environment-configuration/size-limits.adoc
|
|
@@ -244,6 +245,7 @@ files:
|
|
|
244
245
|
- lib/canon/comparison/match_options/json_resolver.rb
|
|
245
246
|
- lib/canon/comparison/match_options/xml_resolver.rb
|
|
246
247
|
- lib/canon/comparison/match_options/yaml_resolver.rb
|
|
248
|
+
- lib/canon/comparison/node_inspector.rb
|
|
247
249
|
- lib/canon/comparison/profile_definition.rb
|
|
248
250
|
- lib/canon/comparison/ruby_object_comparator.rb
|
|
249
251
|
- lib/canon/comparison/strategies/base_match_strategy.rb
|
|
@@ -326,6 +328,7 @@ files:
|
|
|
326
328
|
- lib/canon/options/cli_generator.rb
|
|
327
329
|
- lib/canon/options/registry.rb
|
|
328
330
|
- lib/canon/pretty_printer/html.rb
|
|
331
|
+
- lib/canon/pretty_printer/html_void_elements.rb
|
|
329
332
|
- lib/canon/pretty_printer/json.rb
|
|
330
333
|
- lib/canon/pretty_printer/xml.rb
|
|
331
334
|
- lib/canon/pretty_printer/xml_normalized.rb
|