canon 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/docs/advanced/semantic-diff-report.adoc +65 -0
- data/docs/features/diff-formatting/index.adoc +3 -0
- data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
- data/docs/reference/environment-variables.adoc +3 -1
- data/lib/canon/comparison/comparison_result.rb +16 -2
- data/lib/canon/comparison/html_comparator.rb +4 -0
- data/lib/canon/comparison/markup_comparator.rb +49 -71
- data/lib/canon/comparison/node_inspector.rb +103 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +127 -55
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
- data/lib/canon/comparison/xml_comparator.rb +94 -3
- data/lib/canon/comparison/xml_node_comparison.rb +37 -81
- data/lib/canon/comparison.rb +59 -0
- data/lib/canon/diff/diff_classifier.rb +37 -39
- data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +119 -9
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +75 -4
- data/lib/canon/diff_formatter.rb +71 -2
- data/lib/canon/pretty_printer/html.rb +76 -14
- data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
- data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +13 -1
- data/lib/canon/xml/node.rb +15 -0
- data/lib/canon/xml/sax_builder.rb +18 -0
- metadata +5 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 615e3154c89a9850e86c39852201e5573b461ac62d52cc423523e444ace301f7
|
|
4
|
+
data.tar.gz: 37ee00969f0682dde670168fbd7888294edda612220bfbebb7c950efbcb76aa2
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bce4239ab6a471edd896fd3b54def4e57e21714078cb3631b55363b50646349a6923eed1e208e5706c3319d3e7a2ae75f2db698ffe853c0e03a754d76c856679
|
|
7
|
+
data.tar.gz: 1441bd5412658d9d2b975e3889fc95bfd080dec2b89b731f71e191f5ca7bbc7e0a8aa63e787916781bd5e653732c16d5c03b0d3fc3b967a3b653a2a735e62636
|
|
@@ -14,6 +14,39 @@ The Semantic Diff Report provides dimension-specific, actionable details for eac
|
|
|
14
14
|
|
|
15
15
|
The report is automatically shown in verbose mode when differences exist, appearing before the detailed diff output.
|
|
16
16
|
|
|
17
|
+
== Parse errors
|
|
18
|
+
|
|
19
|
+
When Canon's underlying parser (libxml for XML, Nokogiri's HTML5 parser for HTML) reports errors during input parsing, Canon surfaces them at the top of the diff report in a banner section before any per-difference output. The banner names the offending side and warns that the diff below describes the parsed tree, not the input — content the parser could not represent has been silently dropped from the comparison tree.
|
|
20
|
+
|
|
21
|
+
This is purely a transparency feature: Canon does not modify the parse to "fix" invalid input. The user is responsible for deciding whether the parse failure was expected (e.g. testing legacy fixtures during a migration) or symptomatic of an upstream bug.
|
|
22
|
+
|
|
23
|
+
.Example: Banner for a duplicate-attribute FATAL on the received side
|
|
24
|
+
[example]
|
|
25
|
+
====
|
|
26
|
+
[source]
|
|
27
|
+
----
|
|
28
|
+
======================================================================
|
|
29
|
+
⚠️ PARSE ERRORS
|
|
30
|
+
======================================================================
|
|
31
|
+
Received side:
|
|
32
|
+
Attribute xml:lang redefined
|
|
33
|
+
|
|
34
|
+
⚠️ The diff below describes the parsed tree, not the input.
|
|
35
|
+
Content that the parser could not represent has been
|
|
36
|
+
dropped and may appear as "missing" in the report.
|
|
37
|
+
======================================================================
|
|
38
|
+
----
|
|
39
|
+
====
|
|
40
|
+
|
|
41
|
+
Common triggers in HTML / XHTML round-trips:
|
|
42
|
+
|
|
43
|
+
* Duplicate attributes (XML strict; HTML5 permissive — only XML mode triggers a banner)
|
|
44
|
+
* Stray processing instructions in fragment context
|
|
45
|
+
* Malformed namespace declarations
|
|
46
|
+
* DOCTYPE in unexpected positions
|
|
47
|
+
|
|
48
|
+
The banner is rendered when `Canon::Comparison::ComparisonResult#parse_errors?` is true. Programmatic callers can read `parse_errors_expected` and `parse_errors_received` directly off the result.
|
|
49
|
+
|
|
17
50
|
== Key Features
|
|
18
51
|
|
|
19
52
|
* XPath locations for XML/HTML elements
|
|
@@ -179,6 +212,38 @@ Reason: Text: "¬······:¬······"
|
|
|
179
212
|
|
|
180
213
|
This fallback is implemented in `Canon::DiffFormatter::DiffDetailFormatterHelpers::DimensionFormatter.format_text_content_details` and only triggers when `TextUtils.ambiguous_text_pair?` returns `true` _and_ at least one side has a parent element to render.
|
|
181
214
|
|
|
215
|
+
==== One-sided text diffs (added or removed text nodes)
|
|
216
|
+
|
|
217
|
+
When a `text_content` difference carries a text node on one side and `nil` on the other (issue #125) -- the shape that fragment-length mismatches and child-comparison emit when a text-node child is missing -- the renderer mirrors `element_structure`: the missing side reads `(not present)`, and the present side reads the text-node content (whitespace-visualised) plus a brief parent open-tag hint for context. The full ancestor subtree is *not* dumped; only the immediate parent's opening tag is shown, so a missing whitespace text node cannot make the diff look like the entire ancestor differs.
|
|
218
|
+
|
|
219
|
+
.Example: Whitespace text node missing on the received side
|
|
220
|
+
[example]
|
|
221
|
+
====
|
|
222
|
+
[source]
|
|
223
|
+
----
|
|
224
|
+
🔍 DIFFERENCE #1/1 [NORMATIVE]
|
|
225
|
+
──────────────────────────────────────────────────────────────────────
|
|
226
|
+
Dimension: text_content
|
|
227
|
+
Reason: element missing: text
|
|
228
|
+
|
|
229
|
+
⊖ Expected (File 1):
|
|
230
|
+
text "¬············" in <div id="A">
|
|
231
|
+
⊕ Actual (File 2):
|
|
232
|
+
(not present)
|
|
233
|
+
|
|
234
|
+
✨ Changes:
|
|
235
|
+
Text removed: text "¬············" in <div id="A">
|
|
236
|
+
----
|
|
237
|
+
====
|
|
238
|
+
|
|
239
|
+
The `Changes:` line uses `Text removed:` or `Text added:` to mirror the `Element removed:` / `Element added:` phrasing of `element_structure`.
|
|
240
|
+
|
|
241
|
+
==== Element-shaped diffs misclassified as text_content
|
|
242
|
+
|
|
243
|
+
In rare cases an upstream comparator may emit an *element*-shaped one-sided diff under `dimension: :text_content`. Without a guard, the one-sided text formatter would call `raw_text_value` on the element (which returns `""` for an empty element such as `<br/>`) and render `text "" in <parent>` -- meaningless when an element is what's actually missing.
|
|
244
|
+
|
|
245
|
+
The formatter detects element-shaped present-side nodes (Canon `ElementNode` or Nokogiri `Element`) and delegates to `format_element_structure_details`, so the rendered output reads `<br/>` and `Element removed:` rather than `text ""` and `Text removed:`. This is defence in depth -- the construction-side fix in `XmlComparatorHelpers::ChildComparison` ensures element orphans are now tagged `:element_structure` at source -- but a misclassified diff still renders meaningfully if any path slips through.
|
|
246
|
+
|
|
182
247
|
=== Structural Whitespace
|
|
183
248
|
|
|
184
249
|
Shows whitespace-only differences (usually informative).
|
|
@@ -28,6 +28,9 @@ Canon's diff formatting includes:
|
|
|
28
28
|
* **Context and grouping**: Control how much surrounding context to show
|
|
29
29
|
* **Algorithm-specific output**: Different output styles for different diff
|
|
30
30
|
algorithms
|
|
31
|
+
* **Whitespace adjacency**: Stray whitespace-only text nodes are anchored at
|
|
32
|
+
themselves instead of cascading into mismatches against neighbouring
|
|
33
|
+
content (link:./whitespace-adjacency.adoc[details])
|
|
31
34
|
|
|
32
35
|
== Available formatting options
|
|
33
36
|
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Whitespace adjacency in diff reports
|
|
3
|
+
parent: Diff Formatting
|
|
4
|
+
nav_order: 8
|
|
5
|
+
---
|
|
6
|
+
= Whitespace adjacency in diff reports
|
|
7
|
+
:toc:
|
|
8
|
+
:toclevels: 2
|
|
9
|
+
|
|
10
|
+
== Purpose
|
|
11
|
+
|
|
12
|
+
Canon's diff reports anchor whitespace-only text nodes that have no
|
|
13
|
+
counterpart on the other side to a dedicated `:whitespace_adjacency`
|
|
14
|
+
dimension instead of letting them cascade into 3-4 misaligned
|
|
15
|
+
`:text_content` mismatches against neighbouring content nodes.
|
|
16
|
+
|
|
17
|
+
This is a *report-only* contract — equivalence verdicts are unchanged.
|
|
18
|
+
Inputs that were non-equivalent before this feature remain non-equivalent;
|
|
19
|
+
only the *shape* of the diff report changes.
|
|
20
|
+
|
|
21
|
+
== The problem
|
|
22
|
+
|
|
23
|
+
Consider an HTML fragment compared as `be_html_equivalent_to`:
|
|
24
|
+
|
|
25
|
+
[source,html]
|
|
26
|
+
----
|
|
27
|
+
<!-- expected -->
|
|
28
|
+
<p>
|
|
29
|
+
<span>ISO </span>
|
|
30
|
+
<span>20483</span>
|
|
31
|
+
,
|
|
32
|
+
<i>Cereals and pulses</i>
|
|
33
|
+
</p>
|
|
34
|
+
|
|
35
|
+
<!-- actual -->
|
|
36
|
+
<p><span>ISO </span><span>20483</span>, <i>Cereals and pulses</i></p>
|
|
37
|
+
----
|
|
38
|
+
|
|
39
|
+
Both render identically in a browser — the indentation is structural HTML
|
|
40
|
+
formatting, not content. Before this feature, the diff report contained
|
|
41
|
+
four entries:
|
|
42
|
+
|
|
43
|
+
[source]
|
|
44
|
+
----
|
|
45
|
+
DIFFERENCE #1 — element_structure: parent <p> "missing children"
|
|
46
|
+
DIFFERENCE #2 — text_content: "" vs "20483" (visualised: ↵░░░░)
|
|
47
|
+
DIFFERENCE #3 — text_content: "20483" vs ","
|
|
48
|
+
DIFFERENCE #4 — text_content: "," vs "Cereals and pulses"
|
|
49
|
+
----
|
|
50
|
+
|
|
51
|
+
The cascade comes from positional `zip()` alignment in
|
|
52
|
+
`Canon::Comparison::XmlComparatorHelpers::ChildComparison`: with the
|
|
53
|
+
expected side carrying extra whitespace-only text nodes and the actual
|
|
54
|
+
side carrying none, every child slides by one slot and gets paired
|
|
55
|
+
against the wrong neighbour.
|
|
56
|
+
|
|
57
|
+
== The contract
|
|
58
|
+
|
|
59
|
+
When `ChildComparison` aligns child sequences and encounters a
|
|
60
|
+
whitespace-only text node on one side paired against a non-whitespace
|
|
61
|
+
node on the other, it:
|
|
62
|
+
|
|
63
|
+
1. Treats the whitespace node as a *single-side gap* in the alignment.
|
|
64
|
+
2. Emits one `:whitespace_adjacency` diff entry anchored at the
|
|
65
|
+
whitespace node itself (not at its mis-paired neighbour).
|
|
66
|
+
3. Advances only the cursor that carries the whitespace, so the next
|
|
67
|
+
iteration aligns content against content.
|
|
68
|
+
|
|
69
|
+
The asymmetric whitespace still produces a non-equivalent verdict — the
|
|
70
|
+
`:whitespace_adjacency` dimension is classified as normative
|
|
71
|
+
unconditionally — so any test that previously failed on whitespace
|
|
72
|
+
asymmetry continues to fail.
|
|
73
|
+
|
|
74
|
+
Under the new contract, the cascade above collapses to:
|
|
75
|
+
|
|
76
|
+
[source]
|
|
77
|
+
----
|
|
78
|
+
DIFFERENCE #1 — whitespace_adjacency: Whitespace surrounding "20483":
|
|
79
|
+
present on EXPECTED ("↵░░"), absent on ACTUAL
|
|
80
|
+
DIFFERENCE #2 — whitespace_adjacency: Whitespace surrounding ",":
|
|
81
|
+
present on EXPECTED ("↵░░"), absent on ACTUAL
|
|
82
|
+
DIFFERENCE #3 — text_content: "↵░░,↵░░" vs ", "
|
|
83
|
+
----
|
|
84
|
+
|
|
85
|
+
== Adjacency positions
|
|
86
|
+
|
|
87
|
+
The Reason line names the adjacency position of the whitespace node
|
|
88
|
+
relative to its non-whitespace siblings:
|
|
89
|
+
|
|
90
|
+
`:preceding`:: Whitespace at the start of its parent (no non-whitespace
|
|
91
|
+
sibling before it, has one after it).
|
|
92
|
+
|
|
93
|
+
`:following`:: Whitespace at the end of its parent (has a non-whitespace
|
|
94
|
+
sibling before it, none after).
|
|
95
|
+
|
|
96
|
+
`:surrounding`:: Sandwiched between two non-whitespace siblings.
|
|
97
|
+
|
|
98
|
+
`:isolated`:: No non-whitespace siblings at all (degenerate; rarely
|
|
99
|
+
emitted).
|
|
100
|
+
|
|
101
|
+
== What this contract does NOT do
|
|
102
|
+
|
|
103
|
+
* **Does not change equivalence outcomes.** A non-equivalent comparison
|
|
104
|
+
before the fix for issue #137 remains non-equivalent after it — only the diff-report shape
|
|
105
|
+
changes.
|
|
106
|
+
* **Does not silently filter whitespace.** The asymmetric whitespace is
|
|
107
|
+
always reported; it is just labelled `:whitespace_adjacency` and
|
|
108
|
+
anchored at the whitespace node, instead of cascading as
|
|
109
|
+
`:text_content` against unrelated content nodes.
|
|
110
|
+
* **Does not affect symmetric whitespace.** When both sides carry
|
|
111
|
+
parallel whitespace-only nodes, those compare normally
|
|
112
|
+
(no `:whitespace_adjacency` entry, no cascade).
|
|
113
|
+
|
|
114
|
+
== Where it runs
|
|
115
|
+
|
|
116
|
+
The contract is implemented as a re-alignment walk inside
|
|
117
|
+
`Canon::Comparison::XmlComparatorHelpers::ChildComparison.use_positional_comparison`.
|
|
118
|
+
It activates whenever the existing positional `zip()` alignment would
|
|
119
|
+
pair a whitespace-only text node against a content node — that is, in
|
|
120
|
+
every whitespace context where the upstream filter has not already
|
|
121
|
+
dropped the whitespace nodes.
|
|
122
|
+
|
|
123
|
+
For elements where whitespace is preserved by configuration
|
|
124
|
+
(`preserve_whitespace_elements`) the upstream filter does not drop
|
|
125
|
+
indentation, and the re-alignment walk surfaces every asymmetric
|
|
126
|
+
whitespace node as a single normative `:whitespace_adjacency` diff.
|
|
127
|
+
|
|
128
|
+
== Related
|
|
129
|
+
|
|
130
|
+
* link:../../advanced/diff-classification.adoc[Diff classification] —
|
|
131
|
+
Normative vs informative differences.
|
|
132
|
+
* link:../match-options/index.adoc[Match options] — Configuring
|
|
133
|
+
`preserve_whitespace_elements`, `collapse_whitespace_elements`, and
|
|
134
|
+
`strip_whitespace_elements`.
|
|
135
|
+
|
|
136
|
+
== History
|
|
137
|
+
|
|
138
|
+
The cascade behaviour was reported in
|
|
139
|
+
https://github.com/lutaml/canon/issues/137[issue #137]. The fix landed
|
|
140
|
+
as a report-only re-alignment in PR #138.
|
|
@@ -194,7 +194,9 @@ export CANON_JSON_FORMAT_PREPROCESSING=normalize
|
|
|
194
194
|
|`CANON_SHOW_PRETTYPRINT_RECEIVED`
|
|
195
195
|
|boolean
|
|
196
196
|
|`false`
|
|
197
|
-
|Show only the RECEIVED (actual) block in the fixture-ready pretty-printed section. This is the most common fixture-update workflow: enable this option to get a copy-pasteable pretty-printed form of the generated output that can replace the old fixture heredoc. Format-specific: `CANON_{FORMAT}_DIFF_SHOW_PRETTYPRINT_RECEIVED
|
|
197
|
+
|Show only the RECEIVED (actual) block in the fixture-ready pretty-printed section. This is the most common fixture-update workflow: enable this option to get a copy-pasteable pretty-printed form of the generated output that can replace the old fixture heredoc. Format-specific: `CANON_{FORMAT}_DIFF_SHOW_PRETTYPRINT_RECEIVED`.
|
|
198
|
+
|
|
199
|
+
For HTML / HTML4 / HTML5 inputs, the pretty-printed output is XHTML-shaped: void elements are self-closed (`<br/>`, `<meta/>`), non-void elements are paired (`<a></a>`), and Nokogiri may add `xmlns="http://www.w3.org/1999/xhtml"` on `<html>` and an `xml:lang` mirror of `lang`. This is a display-only serialisation chosen because libxml's `FORMAT` save flag (the only path that actually indents HTML5 input) requires the XHTML save mode -- `Nokogiri::HTML5#to_html` silently ignores its `indent:` keyword. See lutaml/canon#133.
|
|
198
200
|
|All formats (display only)
|
|
199
201
|
|
|
200
202
|
|`CANON_COMPACT_SEMANTIC_REPORT`
|
|
@@ -6,7 +6,8 @@ module Canon
|
|
|
6
6
|
# Provides methods to query equivalence based on normative diffs
|
|
7
7
|
class ComparisonResult
|
|
8
8
|
attr_reader :differences, :preprocessed_strings, :format, :html_version,
|
|
9
|
-
:match_options, :algorithm, :original_strings
|
|
9
|
+
:match_options, :algorithm, :original_strings,
|
|
10
|
+
:parse_errors_expected, :parse_errors_received
|
|
10
11
|
|
|
11
12
|
# @param differences [Array<DiffNode>] Array of difference nodes
|
|
12
13
|
# @param preprocessed_strings [Array<String, String>] Pre-processed content for display
|
|
@@ -15,8 +16,11 @@ module Canon
|
|
|
15
16
|
# @param match_options [Hash, nil] Resolved match options used for comparison
|
|
16
17
|
# @param algorithm [Symbol] Diff algorithm used (:dom or :semantic)
|
|
17
18
|
# @param original_strings [Array<String, String>, nil] Original unprocessed content for line diff
|
|
19
|
+
# @param parse_errors_expected [Array<String>, nil] Parser errors from the expected side
|
|
20
|
+
# @param parse_errors_received [Array<String>, nil] Parser errors from the received side
|
|
18
21
|
def initialize(differences:, preprocessed_strings:, format:,
|
|
19
|
-
html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil
|
|
22
|
+
html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil,
|
|
23
|
+
parse_errors_expected: nil, parse_errors_received: nil)
|
|
20
24
|
@differences = differences
|
|
21
25
|
@preprocessed_strings = preprocessed_strings
|
|
22
26
|
@original_strings = original_strings || preprocessed_strings
|
|
@@ -24,6 +28,16 @@ html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil)
|
|
|
24
28
|
@html_version = html_version
|
|
25
29
|
@match_options = match_options
|
|
26
30
|
@algorithm = algorithm
|
|
31
|
+
@parse_errors_expected = Array(parse_errors_expected)
|
|
32
|
+
@parse_errors_received = Array(parse_errors_received)
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Whether either side reported parse errors. Used by the diff
|
|
36
|
+
# formatter to decide whether to render the parse-error banner.
|
|
37
|
+
#
|
|
38
|
+
# @return [Boolean]
|
|
39
|
+
def parse_errors?
|
|
40
|
+
@parse_errors_expected.any? || @parse_errors_received.any?
|
|
27
41
|
end
|
|
28
42
|
|
|
29
43
|
# Check if documents are semantically equivalent (no normative diffs)
|
|
@@ -151,6 +151,8 @@ module Canon
|
|
|
151
151
|
html_version: detect_html_version_from_node(node1),
|
|
152
152
|
match_options: match_opts_hash,
|
|
153
153
|
algorithm: :dom,
|
|
154
|
+
parse_errors_expected: Comparison.parse_errors_for(node1),
|
|
155
|
+
parse_errors_received: Comparison.parse_errors_for(node2),
|
|
154
156
|
)
|
|
155
157
|
elsif result != Comparison::EQUIVALENT && !differences.empty?
|
|
156
158
|
# Non-verbose mode: check equivalence
|
|
@@ -300,6 +302,8 @@ module Canon
|
|
|
300
302
|
html_version: html_version,
|
|
301
303
|
match_options: match_opts_hash.merge(strategy.metadata),
|
|
302
304
|
algorithm: :semantic,
|
|
305
|
+
parse_errors_expected: Comparison.parse_errors_for(node1),
|
|
306
|
+
parse_errors_received: Comparison.parse_errors_for(node2),
|
|
303
307
|
)
|
|
304
308
|
else
|
|
305
309
|
# Simple boolean result - equivalent if no normative differences
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "../comparison" # Load base module with constants
|
|
4
|
+
require_relative "node_inspector"
|
|
4
5
|
require_relative "../diff/diff_node"
|
|
5
6
|
require_relative "../diff/path_builder"
|
|
6
7
|
|
|
@@ -87,23 +88,20 @@ module Canon
|
|
|
87
88
|
return nil if node.nil?
|
|
88
89
|
|
|
89
90
|
# Canon::Xml::Node types
|
|
90
|
-
|
|
91
|
+
case node
|
|
92
|
+
when Canon::Xml::Nodes::RootNode
|
|
91
93
|
# Serialize all children of root
|
|
92
94
|
node.children.map { |child| serialize_node(child) }.join
|
|
93
|
-
|
|
95
|
+
when Canon::Xml::Nodes::ElementNode
|
|
94
96
|
serialize_element_node(node)
|
|
95
|
-
|
|
97
|
+
when Canon::Xml::Nodes::TextNode
|
|
96
98
|
# Use original text (with entity references) if available,
|
|
97
99
|
# otherwise fall back to value (decoded text)
|
|
98
100
|
node.original || node.value
|
|
99
|
-
|
|
101
|
+
when Canon::Xml::Nodes::CommentNode
|
|
100
102
|
"<!--#{node.value}-->"
|
|
101
|
-
|
|
103
|
+
when Canon::Xml::Nodes::ProcessingInstructionNode
|
|
102
104
|
"<?#{node.target} #{node.data}?>"
|
|
103
|
-
elsif node.respond_to?(:to_xml)
|
|
104
|
-
node.to_xml
|
|
105
|
-
elsif node.respond_to?(:to_html)
|
|
106
|
-
node.to_html
|
|
107
105
|
else
|
|
108
106
|
node.to_s
|
|
109
107
|
end
|
|
@@ -121,8 +119,8 @@ module Canon
|
|
|
121
119
|
node.attribute_nodes.to_h do |attr|
|
|
122
120
|
[attr.name, attr.value]
|
|
123
121
|
end
|
|
124
|
-
# Nokogiri
|
|
125
|
-
elsif node.
|
|
122
|
+
# Nokogiri elements
|
|
123
|
+
elsif node.is_a?(Nokogiri::XML::Element)
|
|
126
124
|
node.attributes.to_h do |_, attr|
|
|
127
125
|
[attr.name, attr.value]
|
|
128
126
|
end
|
|
@@ -227,8 +225,8 @@ module Canon
|
|
|
227
225
|
def same_node_type?(node1, node2)
|
|
228
226
|
return false if node1.class != node2.class
|
|
229
227
|
|
|
230
|
-
|
|
231
|
-
|
|
228
|
+
case node1
|
|
229
|
+
when Canon::Xml::Node, Nokogiri::XML::Node
|
|
232
230
|
node1.node_type == node2.node_type
|
|
233
231
|
else
|
|
234
232
|
true
|
|
@@ -245,20 +243,7 @@ module Canon
|
|
|
245
243
|
# @param node [Object] Node to check
|
|
246
244
|
# @return [Boolean] true if node is a comment
|
|
247
245
|
def comment_node?(node)
|
|
248
|
-
|
|
249
|
-
return true if node.respond_to?(:node_type) && node.node_type == :comment
|
|
250
|
-
|
|
251
|
-
# HTML comments are parsed as TEXT nodes by Nokogiri
|
|
252
|
-
# Check if this is a text node with HTML comment content
|
|
253
|
-
if text_node?(node)
|
|
254
|
-
text = node_text(node)
|
|
255
|
-
# Strip whitespace and backslashes for comparison
|
|
256
|
-
# Nokogiri escapes HTML comments as "<\\!-- comment -->" in full documents
|
|
257
|
-
text_stripped = text.to_s.strip.gsub("\\", "")
|
|
258
|
-
return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
|
|
259
|
-
end
|
|
260
|
-
|
|
261
|
-
false
|
|
246
|
+
NodeInspector.comment_node?(node)
|
|
262
247
|
end
|
|
263
248
|
|
|
264
249
|
# Check if a node is a text node
|
|
@@ -266,9 +251,7 @@ module Canon
|
|
|
266
251
|
# @param node [Object] Node to check
|
|
267
252
|
# @return [Boolean] true if node is a text node
|
|
268
253
|
def text_node?(node)
|
|
269
|
-
|
|
270
|
-
!node.respond_to?(:element?)) ||
|
|
271
|
-
(node.respond_to?(:node_type) && node.node_type == :text)
|
|
254
|
+
NodeInspector.text_node?(node)
|
|
272
255
|
end
|
|
273
256
|
|
|
274
257
|
# Get text content from a node
|
|
@@ -276,15 +259,7 @@ module Canon
|
|
|
276
259
|
# @param node [Object] Node to get text from
|
|
277
260
|
# @return [String] Text content
|
|
278
261
|
def node_text(node)
|
|
279
|
-
|
|
280
|
-
if node.respond_to?(:value)
|
|
281
|
-
node.value.to_s
|
|
282
|
-
# Nokogiri nodes use .content
|
|
283
|
-
elsif node.respond_to?(:content)
|
|
284
|
-
node.content.to_s
|
|
285
|
-
else
|
|
286
|
-
node.to_s
|
|
287
|
-
end
|
|
262
|
+
NodeInspector.text_content(node)
|
|
288
263
|
end
|
|
289
264
|
|
|
290
265
|
# Check if difference between two texts is only whitespace
|
|
@@ -328,7 +303,7 @@ module Canon
|
|
|
328
303
|
if diff1 == Canon::Comparison::MISSING_NODE && diff2 == Canon::Comparison::MISSING_NODE
|
|
329
304
|
"element structure mismatch (children differ)"
|
|
330
305
|
else
|
|
331
|
-
|
|
306
|
+
Canon::Comparison.code_pair_label(diff1, diff2)
|
|
332
307
|
end
|
|
333
308
|
end
|
|
334
309
|
|
|
@@ -371,26 +346,18 @@ module Canon
|
|
|
371
346
|
def extract_text_content_from_node(node)
|
|
372
347
|
return nil if node.nil?
|
|
373
348
|
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
# For nodes with value method (other types)
|
|
387
|
-
return node.value if node.respond_to?(:value)
|
|
388
|
-
|
|
389
|
-
# For simple text nodes or strings
|
|
390
|
-
return node.to_s if node.is_a?(String)
|
|
391
|
-
|
|
392
|
-
# For other node types, try to_s
|
|
393
|
-
node.to_s
|
|
349
|
+
case node
|
|
350
|
+
when Canon::Xml::Nodes::TextNode
|
|
351
|
+
node.value
|
|
352
|
+
when Canon::Xml::Node
|
|
353
|
+
node.text_content
|
|
354
|
+
when Nokogiri::XML::Node
|
|
355
|
+
node.content.to_s
|
|
356
|
+
when String
|
|
357
|
+
node
|
|
358
|
+
else
|
|
359
|
+
node.to_s
|
|
360
|
+
end
|
|
394
361
|
rescue StandardError
|
|
395
362
|
nil
|
|
396
363
|
end
|
|
@@ -444,26 +411,37 @@ module Canon
|
|
|
444
411
|
|
|
445
412
|
# Determine the appropriate dimension for a node type
|
|
446
413
|
#
|
|
414
|
+
# Used by ChildComparison to tag per-child orphan diffs with a
|
|
415
|
+
# dimension that matches what the node *is*, so the formatter
|
|
416
|
+
# renders correctly. An element orphan tagged :text_content
|
|
417
|
+
# would otherwise route through PR #126's one-sided text
|
|
418
|
+
# formatter and render as +text ""+ instead of as the actual
|
|
419
|
+
# element (see lutaml/canon#125 follow-up).
|
|
420
|
+
#
|
|
447
421
|
# @param node [Object] The node to check
|
|
448
422
|
# @return [Symbol] The dimension symbol
|
|
449
423
|
def determine_node_dimension(node)
|
|
450
|
-
|
|
451
|
-
|
|
424
|
+
case node
|
|
425
|
+
when Canon::Xml::Node
|
|
452
426
|
case node.node_type
|
|
427
|
+
when :element then :element_structure
|
|
453
428
|
when :comment then :comments
|
|
454
429
|
when :text, :cdata then :text_content
|
|
455
430
|
when :processing_instruction then :processing_instructions
|
|
456
431
|
else :text_content
|
|
457
432
|
end
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
433
|
+
when Nokogiri::XML::Node
|
|
434
|
+
if node.comment?
|
|
435
|
+
:comments
|
|
436
|
+
elsif node.text? || node.cdata?
|
|
437
|
+
:text_content
|
|
438
|
+
elsif node.processing_instruction?
|
|
439
|
+
:processing_instructions
|
|
440
|
+
elsif node.element?
|
|
441
|
+
:element_structure
|
|
442
|
+
else
|
|
443
|
+
:text_content
|
|
444
|
+
end
|
|
467
445
|
else
|
|
468
446
|
:text_content
|
|
469
447
|
end
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
|
|
4
|
+
module Comparison
|
|
5
|
+
# Single source of truth for cross-backend node type operations.
|
|
6
|
+
#
|
|
7
|
+
# The comparison pipeline handles nodes from two backends:
|
|
8
|
+
# * Canon::Xml::Node (+ RootNode, ElementNode, TextNode, etc.) —
|
|
9
|
+
# custom DOM built by SAX builder and DataModel.
|
|
10
|
+
# * Nokogiri::XML::Node (+ subclasses) — native Nokogiri nodes used
|
|
11
|
+
# by the HTML comparator and some legacy paths.
|
|
12
|
+
#
|
|
13
|
+
# Every method here dispatches on type via +case/when+ (+is_a?+).
|
|
14
|
+
# No +respond_to?+ — the types are known at every call site.
|
|
15
|
+
module NodeInspector
|
|
16
|
+
CANON_TEXT_TYPE = :text
|
|
17
|
+
NOKOGIRI_TEXT_TYPE = defined?(Nokogiri::XML::Node::TEXT_NODE) ? Nokogiri::XML::Node::TEXT_NODE : 3
|
|
18
|
+
|
|
19
|
+
# True when +node+ is a text node (whitespace, content, etc.).
|
|
20
|
+
def self.text_node?(node)
|
|
21
|
+
case node
|
|
22
|
+
when Canon::Xml::Node
|
|
23
|
+
node.node_type == CANON_TEXT_TYPE
|
|
24
|
+
when Nokogiri::XML::Node
|
|
25
|
+
node.node_type == NOKOGIRI_TEXT_TYPE
|
|
26
|
+
else
|
|
27
|
+
false
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
# Extract the text content of +node+ as a String.
|
|
32
|
+
def self.text_content(node)
|
|
33
|
+
case node
|
|
34
|
+
when Canon::Xml::Node
|
|
35
|
+
node.value.to_s
|
|
36
|
+
when Nokogiri::XML::Node
|
|
37
|
+
node.content.to_s
|
|
38
|
+
else
|
|
39
|
+
node.to_s
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# True when +node+ is a text node whose content is whitespace-only.
|
|
44
|
+
# Empty-string text nodes return false — those represent genuine
|
|
45
|
+
# empty-vs-content asymmetry, not pretty-print indentation.
|
|
46
|
+
def self.whitespace_only_text?(node)
|
|
47
|
+
return false unless text_node?(node)
|
|
48
|
+
|
|
49
|
+
text = text_content(node)
|
|
50
|
+
!text.empty? && text.strip.empty?
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# True when +node+ is a comment node.
|
|
54
|
+
# For HTML, also detects comments that Nokogiri parses as TEXT nodes
|
|
55
|
+
# (content like "<!-- comment -->" or escaped "<\\!-- comment -->").
|
|
56
|
+
def self.comment_node?(node)
|
|
57
|
+
case node
|
|
58
|
+
when Canon::Xml::Node
|
|
59
|
+
node.node_type == :comment
|
|
60
|
+
when Nokogiri::XML::Node
|
|
61
|
+
return true if node.comment?
|
|
62
|
+
|
|
63
|
+
# HTML comments are parsed as TEXT nodes by Nokogiri
|
|
64
|
+
if node.text?
|
|
65
|
+
text_stripped = text_content(node).to_s.strip.gsub("\\", "")
|
|
66
|
+
return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
|
|
67
|
+
end
|
|
68
|
+
false
|
|
69
|
+
else
|
|
70
|
+
false
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
# True when +node+ is an element node.
|
|
75
|
+
def self.element_node?(node)
|
|
76
|
+
case node
|
|
77
|
+
when Canon::Xml::Node
|
|
78
|
+
node.node_type == :element
|
|
79
|
+
when Nokogiri::XML::Node
|
|
80
|
+
node.element?
|
|
81
|
+
else
|
|
82
|
+
false
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
# Extract parse-time errors carried on a node or its owning document.
|
|
87
|
+
# Returns an Array of Strings.
|
|
88
|
+
def self.parse_errors(node)
|
|
89
|
+
case node
|
|
90
|
+
when nil
|
|
91
|
+
[]
|
|
92
|
+
when Canon::Xml::Node
|
|
93
|
+
errors = node.parse_errors
|
|
94
|
+
Array(errors).map(&:to_s)
|
|
95
|
+
when Nokogiri::XML::Document, Nokogiri::HTML5::Document
|
|
96
|
+
Array(node.errors).map(&:to_s)
|
|
97
|
+
else
|
|
98
|
+
[]
|
|
99
|
+
end
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
end
|