canon 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +31 -149
- data/README.adoc +9 -0
- data/docs/advanced/semantic-diff-report.adoc +96 -0
- data/docs/features/configuration-profiles.adoc +4 -2
- data/docs/features/diff-formatting/index.adoc +3 -0
- data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
- data/docs/features/match-options/html-policies.adoc +2 -0
- data/docs/features/match-options/index.adoc +40 -0
- data/docs/guides/choosing-configuration.adoc +12 -1
- data/docs/reference/cli-options.adoc +3 -0
- data/docs/reference/environment-variables.adoc +3 -1
- data/docs/reference/options-across-interfaces.adoc +7 -1
- data/docs/understanding/formats/html.adoc +9 -2
- data/lib/canon/cli.rb +4 -0
- data/lib/canon/commands/diff_command.rb +1 -0
- data/lib/canon/comparison/comparison_result.rb +95 -2
- data/lib/canon/comparison/html_comparator.rb +96 -11
- data/lib/canon/comparison/markup_comparator.rb +68 -71
- data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
- data/lib/canon/comparison/match_options.rb +23 -2
- data/lib/canon/comparison/node_inspector.rb +103 -0
- data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +133 -55
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
- data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
- data/lib/canon/comparison/xml_comparator.rb +174 -7
- data/lib/canon/comparison/xml_node_comparison.rb +48 -66
- data/lib/canon/comparison.rb +143 -22
- data/lib/canon/config/env_schema.rb +2 -1
- data/lib/canon/config/profiles/metanorma.yml +3 -0
- data/lib/canon/config.rb +51 -5
- data/lib/canon/diff/diff_classifier.rb +55 -41
- data/lib/canon/diff/diff_line_builder.rb +9 -8
- data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
- data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
- data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +184 -26
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +92 -4
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
- data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
- data/lib/canon/diff_formatter.rb +128 -175
- data/lib/canon/html/data_model.rb +10 -4
- data/lib/canon/pretty_printer/html.rb +76 -14
- data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
- data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
- data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
- data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/c14n.rb +59 -5
- data/lib/canon/xml/data_model.rb +13 -1
- data/lib/canon/xml/element_matcher.rb +3 -0
- data/lib/canon/xml/node.rb +23 -1
- data/lib/canon/xml/nodes/comment_node.rb +4 -0
- data/lib/canon/xml/nodes/element_node.rb +4 -0
- data/lib/canon/xml/nodes/text_node.rb +4 -0
- data/lib/canon/xml/sax_builder.rb +29 -2
- data/lib/canon/xml/xpath_engine.rb +238 -0
- metadata +9 -2
|
@@ -92,6 +92,46 @@ Canon.equivalent?(
|
|
|
92
92
|
`:ignore`:: Structural whitespace is completely ignored
|
|
93
93
|
|
|
94
94
|
|
|
95
|
+
=== whitespace_type
|
|
96
|
+
|
|
97
|
+
**Applies to**: XML, HTML
|
|
98
|
+
|
|
99
|
+
**Purpose**: Controls whether different Unicode whitespace characters (space, NBSP, ideographic space, etc.) are treated as equivalent or distinct.
|
|
100
|
+
|
|
101
|
+
**Behaviors**:
|
|
102
|
+
|
|
103
|
+
`:strict`:: (default) Different Unicode whitespace types are significant.
|
|
104
|
+
Space (U+0020) and NBSP (U+00A0) are treated as different characters.
|
|
105
|
+
This is useful for catching accidental insertion of wrong whitespace types
|
|
106
|
+
(e.g., a pasted NBSP where a regular space was intended).
|
|
107
|
+
|
|
108
|
+
`:normalize`:: All Unicode whitespace characters are collapsed to a single space
|
|
109
|
+
before comparison. Space, NBSP, ideographic space (U+3000), and other Unicode
|
|
110
|
+
whitespace characters are treated as equivalent.
|
|
111
|
+
|
|
112
|
+
.Using whitespace_type: :strict (default)
|
|
113
|
+
[example]
|
|
114
|
+
====
|
|
115
|
+
[source,ruby]
|
|
116
|
+
----
|
|
117
|
+
# By default, space and NBSP are different
|
|
118
|
+
xml1 = '<root><span>ISO</span> <span>712</span></root>'
|
|
119
|
+
xml2 = "<root><span>ISO</span>\u00A0<span>712</span></root>"
|
|
120
|
+
|
|
121
|
+
Canon::Comparison.equivalent?(xml1, xml2,
|
|
122
|
+
match_profile: :spec_friendly
|
|
123
|
+
)
|
|
124
|
+
# => false (NBSP detected as different from space)
|
|
125
|
+
|
|
126
|
+
# Opt into treating all whitespace types as equivalent
|
|
127
|
+
Canon::Comparison.equivalent?(xml1, xml2,
|
|
128
|
+
match_profile: :spec_friendly,
|
|
129
|
+
match: { whitespace_type: :normalize }
|
|
130
|
+
)
|
|
131
|
+
# => true
|
|
132
|
+
----
|
|
133
|
+
====
|
|
134
|
+
|
|
95
135
|
=== Whitespace sensitivity at element level
|
|
96
136
|
|
|
97
137
|
==== General
|
|
@@ -210,13 +210,24 @@ Canon::Comparison.equivalent?(doc1, doc2,
|
|
|
210
210
|
structural_whitespace: :ignore, # ignore, normalize, strict
|
|
211
211
|
attribute_order: :ignore, # ignore, strict (XML/HTML)
|
|
212
212
|
attribute_values: :normalize, # normalize, strict, ignore
|
|
213
|
-
comments: :ignore
|
|
213
|
+
comments: :ignore, # ignore, normalize, strict
|
|
214
|
+
whitespace_type: :strict # strict (default), normalize
|
|
214
215
|
}
|
|
215
216
|
)
|
|
216
217
|
----
|
|
217
218
|
|
|
218
219
|
**Remember**: Match options behave differently with each algorithm! See link:../features/match-options/algorithm-specific-behavior.adoc[Algorithm-Specific Behavior].
|
|
219
220
|
|
|
221
|
+
==== Whitespace Type Sensitivity
|
|
222
|
+
|
|
223
|
+
By default, Canon distinguishes between different Unicode whitespace types
|
|
224
|
+
(e.g. regular space U+0020 vs non-breaking space U+00A0 vs ideographic space
|
|
225
|
+
U+3000). This catches accidental insertion of wrong whitespace characters.
|
|
226
|
+
|
|
227
|
+
Use `whitespace_type: :normalize` when all Unicode whitespace variants should
|
|
228
|
+
be treated as equivalent (e.g. when output from different tools may use
|
|
229
|
+
different whitespace types for the same visual result).
|
|
230
|
+
|
|
220
231
|
=== Layer 4: Diff Formatting
|
|
221
232
|
|
|
222
233
|
**Question**: How should differences be displayed?
|
|
@@ -145,6 +145,9 @@ Individual dimension control (overrides profile settings):
|
|
|
145
145
|
|
|
146
146
|
|`--comments BEHAVIOR`
|
|
147
147
|
|Comments: `strict`, `normalize`, `ignore`
|
|
148
|
+
|
|
149
|
+
|`--whitespace-type BEHAVIOR`
|
|
150
|
+
|Whitespace type sensitivity: `strict` (default), `normalize`
|
|
148
151
|
|===
|
|
149
152
|
|
|
150
153
|
See link:../features/match-options/[Match Options] for details.
|
|
@@ -194,7 +194,9 @@ export CANON_JSON_FORMAT_PREPROCESSING=normalize
|
|
|
194
194
|
|`CANON_SHOW_PRETTYPRINT_RECEIVED`
|
|
195
195
|
|boolean
|
|
196
196
|
|`false`
|
|
197
|
-
|Show only the RECEIVED (actual) block in the fixture-ready pretty-printed section. This is the most common fixture-update workflow: enable this option to get a copy-pasteable pretty-printed form of the generated output that can replace the old fixture heredoc. Format-specific: `CANON_{FORMAT}_DIFF_SHOW_PRETTYPRINT_RECEIVED
|
|
197
|
+
|Show only the RECEIVED (actual) block in the fixture-ready pretty-printed section. This is the most common fixture-update workflow: enable this option to get a copy-pasteable pretty-printed form of the generated output that can replace the old fixture heredoc. Format-specific: `CANON_{FORMAT}_DIFF_SHOW_PRETTYPRINT_RECEIVED`.
|
|
198
|
+
|
|
199
|
+
For HTML / HTML4 / HTML5 inputs, the pretty-printed output is XHTML-shaped: void elements are self-closed (`<br/>`, `<meta/>`), non-void elements are paired (`<a></a>`), and Nokogiri may add `xmlns="http://www.w3.org/1999/xhtml"` on `<html>` and an `xml:lang` mirror of `lang`. This is a display-only serialisation chosen because libxml's `FORMAT` save flag (the only path that actually indents HTML5 input) requires the XHTML save mode -- `Nokogiri::HTML5#to_html` silently ignores its `indent:` keyword. See lutaml/canon#133.
|
|
198
200
|
|All formats (display only)
|
|
199
201
|
|
|
200
202
|
|`CANON_COMPACT_SEMANTIC_REPORT`
|
|
@@ -223,9 +223,15 @@ Profile values: `strict`, `rendered`, `spec_friendly`, `content_only`
|
|
|
223
223
|
|`match: { element_hierarchy: :strict }`
|
|
224
224
|
|`config.canon.xml.match.options = { element_hierarchy: :strict }`
|
|
225
225
|
|`CANON_ELEMENT_HIERARCHY=strict`
|
|
226
|
+
|
|
227
|
+
|Whitespace Type
|
|
228
|
+
|`--whitespace-type normalize`
|
|
229
|
+
|`match: { whitespace_type: :normalize }`
|
|
230
|
+
|`config.canon.xml.match.options = { whitespace_type: :normalize }`
|
|
231
|
+
|`CANON_WHITESPACE_TYPE=normalize`
|
|
226
232
|
|===
|
|
227
233
|
|
|
228
|
-
Values: `strict`, `normalize`, `ignore` (or `strict`, `ignore` for structure/position/hierarchy)
|
|
234
|
+
Values: `strict`, `normalize`, `ignore` (or `strict`, `ignore` for structure/position/hierarchy). `whitespace_type` values: `strict` (default), `normalize`
|
|
229
235
|
|
|
230
236
|
==== XML/HTML-Specific Match Dimensions
|
|
231
237
|
|
|
@@ -19,7 +19,7 @@ Canon supports HTML 4, HTML5, and XHTML with automatic format detection.
|
|
|
19
19
|
**Key features:**
|
|
20
20
|
|
|
21
21
|
* Automatic HTML vs XHTML detection
|
|
22
|
-
* HTML5 parser for
|
|
22
|
+
* HTML5 parser for HTML input regardless of declared version (HTML4 and HTML5 share the same content model and parsing whitespace rules — see <<html4-html5-parity>>)
|
|
23
23
|
* XML parser for XHTML
|
|
24
24
|
* Consistent attribute ordering
|
|
25
25
|
* Whitespace normalization
|
|
@@ -203,9 +203,16 @@ Automatically detects HTML5, HTML4, or XHTML based on DOCTYPE and structure.
|
|
|
203
203
|
----
|
|
204
204
|
====
|
|
205
205
|
|
|
206
|
+
[[html4-html5-parity]]
|
|
207
|
+
=== HTML4 / HTML5 parity
|
|
208
|
+
|
|
209
|
+
`be_html4_equivalent_to` and `be_html5_equivalent_to` apply the same whitespace-sensitivity rules. Whitespace sensitivity is a property of HTML's content model and is identical across the two HTML versions, so any input that compares equivalent under one matcher must compare equivalent under the other.
|
|
210
|
+
|
|
211
|
+
Internally, both matchers parse input via `Nokogiri::HTML5.fragment`. (Earlier releases routed `:html` and `:html4` through `Nokogiri::XML.fragment`, which silently applied XML whitespace rules — meaning `be_html4_equivalent_to` could reject inputs that `be_html5_equivalent_to` correctly accepted.) See https://github.com/lutaml/canon/issues/118 for the full background.
|
|
212
|
+
|
|
206
213
|
=== Whitespace handling
|
|
207
214
|
|
|
208
|
-
HTML whitespace is collapsed per CSS rendering rules. Empty text nodes between elements are removed.
|
|
215
|
+
HTML whitespace is collapsed per CSS rendering rules. Empty text nodes between elements are removed. Whitespace-only text between two adjacent inline elements (`<span>A</span> <span>B</span>`) is preserved because it renders as a visible space; whitespace at a block boundary (between an inline element and a block element, or between two block siblings) is collapsed.
|
|
209
216
|
|
|
210
217
|
.Whitespace handling example
|
|
211
218
|
[example]
|
data/lib/canon/cli.rb
CHANGED
|
@@ -218,6 +218,10 @@ module Canon
|
|
|
218
218
|
type: :string,
|
|
219
219
|
enum: %w[strict normalize ignore],
|
|
220
220
|
desc: "Comment matching: strict, normalize, or ignore"
|
|
221
|
+
method_option :whitespace_type,
|
|
222
|
+
type: :string,
|
|
223
|
+
enum: %w[strict normalize],
|
|
224
|
+
desc: "Whitespace type sensitivity: strict (default) or normalize"
|
|
221
225
|
method_option :show_diffs,
|
|
222
226
|
type: :string,
|
|
223
227
|
enum: %w[all normative informative],
|
|
@@ -6,7 +6,8 @@ module Canon
|
|
|
6
6
|
# Provides methods to query equivalence based on normative diffs
|
|
7
7
|
class ComparisonResult
|
|
8
8
|
attr_reader :differences, :preprocessed_strings, :format, :html_version,
|
|
9
|
-
:match_options, :algorithm, :original_strings
|
|
9
|
+
:match_options, :algorithm, :original_strings,
|
|
10
|
+
:parse_errors_expected, :parse_errors_received
|
|
10
11
|
|
|
11
12
|
# @param differences [Array<DiffNode>] Array of difference nodes
|
|
12
13
|
# @param preprocessed_strings [Array<String, String>] Pre-processed content for display
|
|
@@ -15,8 +16,11 @@ module Canon
|
|
|
15
16
|
# @param match_options [Hash, nil] Resolved match options used for comparison
|
|
16
17
|
# @param algorithm [Symbol] Diff algorithm used (:dom or :semantic)
|
|
17
18
|
# @param original_strings [Array<String, String>, nil] Original unprocessed content for line diff
|
|
19
|
+
# @param parse_errors_expected [Array<String>, nil] Parser errors from the expected side
|
|
20
|
+
# @param parse_errors_received [Array<String>, nil] Parser errors from the received side
|
|
18
21
|
def initialize(differences:, preprocessed_strings:, format:,
|
|
19
|
-
html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil)
|
|
22
|
+
html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil,
|
|
23
|
+
parse_errors_expected: nil, parse_errors_received: nil)
|
|
20
24
|
@differences = differences
|
|
21
25
|
@preprocessed_strings = preprocessed_strings
|
|
22
26
|
@original_strings = original_strings || preprocessed_strings
|
|
@@ -24,6 +28,16 @@ html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil)
|
|
|
24
28
|
@html_version = html_version
|
|
25
29
|
@match_options = match_options
|
|
26
30
|
@algorithm = algorithm
|
|
31
|
+
@parse_errors_expected = Array(parse_errors_expected)
|
|
32
|
+
@parse_errors_received = Array(parse_errors_received)
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Whether either side reported parse errors. Used by the diff
|
|
36
|
+
# formatter to decide whether to render the parse-error banner.
|
|
37
|
+
#
|
|
38
|
+
# @return [Boolean]
|
|
39
|
+
def parse_errors?
|
|
40
|
+
@parse_errors_expected.any? || @parse_errors_received.any?
|
|
27
41
|
end
|
|
28
42
|
|
|
29
43
|
# Check if documents are semantically equivalent (no normative diffs)
|
|
@@ -84,6 +98,30 @@ html_version: nil, match_options: nil, algorithm: :dom, original_strings: nil)
|
|
|
84
98
|
@match_options&.[](:tree_diff_operations) || []
|
|
85
99
|
end
|
|
86
100
|
|
|
101
|
+
# Generate a human-readable summary of the first difference.
|
|
102
|
+
#
|
|
103
|
+
# When documents are equivalent, returns "Equivalent".
|
|
104
|
+
# When they differ, returns a single-line string with the first normative
|
|
105
|
+
# (or first informative) difference location and reason.
|
|
106
|
+
#
|
|
107
|
+
# @return [String] Summary string
|
|
108
|
+
def summary
|
|
109
|
+
return "Equivalent" if equivalent?
|
|
110
|
+
|
|
111
|
+
diff = normative_differences.first || informative_differences.first ||
|
|
112
|
+
@differences.first # rubocop:disable Layout/MultilineOperationIndentation
|
|
113
|
+
|
|
114
|
+
return "Not equivalent" unless diff
|
|
115
|
+
|
|
116
|
+
if diff.is_a?(Canon::Diff::DiffNode)
|
|
117
|
+
summarize_diff_node(diff)
|
|
118
|
+
elsif diff.is_a?(Hash)
|
|
119
|
+
summarize_legacy_hash(diff)
|
|
120
|
+
else
|
|
121
|
+
"Not equivalent"
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
|
|
87
125
|
# Generate formatted diff output
|
|
88
126
|
#
|
|
89
127
|
# @param use_color [Boolean] Whether to use ANSI color codes
|
|
@@ -116,6 +154,61 @@ show_diffs: :all, diff_mode: :separate, legacy_terminal: false)
|
|
|
116
154
|
html_version: @html_version,
|
|
117
155
|
)
|
|
118
156
|
end
|
|
157
|
+
|
|
158
|
+
private
|
|
159
|
+
|
|
160
|
+
# Format a single DiffNode into a summary string.
|
|
161
|
+
#
|
|
162
|
+
# @param diff [DiffNode] The difference to summarize
|
|
163
|
+
# @return [String] Human-readable summary
|
|
164
|
+
def summarize_diff_node(diff)
|
|
165
|
+
parts = ["Not equivalent:"]
|
|
166
|
+
|
|
167
|
+
# rubocop:disable Layout/SpaceBeforeInterpolation,Style/ConditionalAssignment
|
|
168
|
+
if diff.path
|
|
169
|
+
parts << "#{diff.reason} at #{diff.path}"
|
|
170
|
+
else
|
|
171
|
+
parts << diff.reason.to_s
|
|
172
|
+
end
|
|
173
|
+
# rubocop:enable Layout/SpaceBeforeInterpolation,Style/ConditionalAssignment
|
|
174
|
+
|
|
175
|
+
if diff.serialized_before && diff.serialized_after
|
|
176
|
+
before_preview = truncate_preview(diff.serialized_before)
|
|
177
|
+
after_preview = truncate_preview(diff.serialized_after)
|
|
178
|
+
parts << "(#{before_preview} vs #{after_preview})"
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
parts.join(" ")
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
# Format a legacy Hash difference into a summary string.
|
|
185
|
+
#
|
|
186
|
+
# @param diff [Hash] Legacy difference hash with :path, :value1, :value2
|
|
187
|
+
# @return [String] Human-readable summary
|
|
188
|
+
def summarize_legacy_hash(diff)
|
|
189
|
+
parts = ["Not equivalent:"]
|
|
190
|
+
parts << "#{diff[:diff_code_description]} at #{diff[:path]}" if diff[:path]
|
|
191
|
+
|
|
192
|
+
if diff[:value1] && diff[:value2]
|
|
193
|
+
parts << "(#{truncate_preview(diff[:value1].to_s)} vs #{truncate_preview(diff[:value2].to_s)})"
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
parts.size > 1 ? parts.join(" ") : "Not equivalent: values differ"
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
# Truncate a string for preview display.
|
|
200
|
+
#
|
|
201
|
+
# @param text [String] Text to truncate
|
|
202
|
+
# @param max_len [Integer] Maximum length
|
|
203
|
+
# @return [String] Truncated text with ellipsis if needed
|
|
204
|
+
def truncate_preview(text, max_len = 40)
|
|
205
|
+
stripped = text.strip.gsub(/\s+/, " ")
|
|
206
|
+
if stripped.length > max_len
|
|
207
|
+
"#{stripped[0...(max_len - 3)]}..."
|
|
208
|
+
else
|
|
209
|
+
stripped
|
|
210
|
+
end
|
|
211
|
+
end
|
|
119
212
|
end
|
|
120
213
|
end
|
|
121
214
|
end
|
|
@@ -13,6 +13,7 @@ require_relative "../diff/diff_classifier"
|
|
|
13
13
|
require_relative "strategies/match_strategy_factory"
|
|
14
14
|
require_relative "../html/data_model"
|
|
15
15
|
require_relative "xml_node_comparison"
|
|
16
|
+
require_relative "xml_comparator/diff_node_builder"
|
|
16
17
|
# Whitespace sensitivity module (single source of truth for sensitive elements)
|
|
17
18
|
require_relative "whitespace_sensitivity"
|
|
18
19
|
|
|
@@ -150,6 +151,8 @@ module Canon
|
|
|
150
151
|
html_version: detect_html_version_from_node(node1),
|
|
151
152
|
match_options: match_opts_hash,
|
|
152
153
|
algorithm: :dom,
|
|
154
|
+
parse_errors_expected: Comparison.parse_errors_for(node1),
|
|
155
|
+
parse_errors_received: Comparison.parse_errors_for(node2),
|
|
153
156
|
)
|
|
154
157
|
elsif result != Comparison::EQUIVALENT && !differences.empty?
|
|
155
158
|
# Non-verbose mode: check equivalence
|
|
@@ -172,10 +175,42 @@ module Canon
|
|
|
172
175
|
# @param node2 [Object] Second node
|
|
173
176
|
# @return [Boolean] true if both are document fragments
|
|
174
177
|
def fragment_nodes?(node1, node2)
|
|
175
|
-
(node1
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
178
|
+
fragment_node?(node1) && fragment_node?(node2)
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
# Check if a single node is a recognised document fragment.
|
|
182
|
+
# All three Nokogiri fragment types (XML, HTML4, HTML5) must be
|
|
183
|
+
# accepted: dom_diff routes html/html4/html5 input through
|
|
184
|
+
# Nokogiri::HTML5.fragment per #118.
|
|
185
|
+
def fragment_node?(node)
|
|
186
|
+
node.is_a?(Nokogiri::XML::DocumentFragment) ||
|
|
187
|
+
node.is_a?(Nokogiri::HTML4::DocumentFragment) ||
|
|
188
|
+
node.is_a?(Nokogiri::HTML5::DocumentFragment)
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
# Record a DiffNode for a fragment-level child-count mismatch.
|
|
192
|
+
# Each surplus child becomes its own MISSING_NODE diff so the
|
|
193
|
+
# downstream report shows what was added or removed.
|
|
194
|
+
def record_fragment_length_mismatch(_node1, _node2, children1,
|
|
195
|
+
children2, differences)
|
|
196
|
+
longer, shorter, side = if children1.length > children2.length
|
|
197
|
+
[children1, children2, :removed]
|
|
198
|
+
else
|
|
199
|
+
[children2, children1, :added]
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
longer[shorter.length...].each do |orphan|
|
|
203
|
+
n1 = side == :removed ? orphan : nil
|
|
204
|
+
n2 = side == :removed ? nil : orphan
|
|
205
|
+
differences <<
|
|
206
|
+
Canon::Comparison::DiffNodeBuilder.build(
|
|
207
|
+
node1: n1,
|
|
208
|
+
node2: n2,
|
|
209
|
+
diff1: Comparison::MISSING_NODE,
|
|
210
|
+
diff2: Comparison::MISSING_NODE,
|
|
211
|
+
dimension: :element_structure,
|
|
212
|
+
)
|
|
213
|
+
end
|
|
179
214
|
end
|
|
180
215
|
|
|
181
216
|
# Compare children of document fragments
|
|
@@ -196,6 +231,13 @@ module Canon
|
|
|
196
231
|
children2 = XmlNodeComparison.filter_children(all_children2, opts)
|
|
197
232
|
|
|
198
233
|
if children1.length != children2.length
|
|
234
|
+
# Record the length mismatch as a DiffNode so verbose mode
|
|
235
|
+
# surfaces it. Without this, equivalent? wraps an empty
|
|
236
|
+
# differences array and incorrectly reports the inputs as
|
|
237
|
+
# equivalent.
|
|
238
|
+
record_fragment_length_mismatch(node1, node2,
|
|
239
|
+
children1, children2,
|
|
240
|
+
differences)
|
|
199
241
|
return Comparison::UNEQUAL_ELEMENTS
|
|
200
242
|
elsif children1.empty?
|
|
201
243
|
return Comparison::EQUIVALENT
|
|
@@ -260,6 +302,8 @@ module Canon
|
|
|
260
302
|
html_version: html_version,
|
|
261
303
|
match_options: match_opts_hash.merge(strategy.metadata),
|
|
262
304
|
algorithm: :semantic,
|
|
305
|
+
parse_errors_expected: Comparison.parse_errors_for(node1),
|
|
306
|
+
parse_errors_received: Comparison.parse_errors_for(node2),
|
|
263
307
|
)
|
|
264
308
|
else
|
|
265
309
|
# Simple boolean result - equivalent if no normative differences
|
|
@@ -291,10 +335,12 @@ module Canon
|
|
|
291
335
|
node.to_html
|
|
292
336
|
end
|
|
293
337
|
|
|
294
|
-
# Use XML fragment parser to preserve structure without auto-generated elements
|
|
295
|
-
#
|
|
296
|
-
#
|
|
297
|
-
frag = Nokogiri::XML.fragment(
|
|
338
|
+
# Use XML fragment parser to preserve structure without auto-generated elements.
|
|
339
|
+
# Decode HTML named entities (&nbsp; etc.) to UTF-8 characters since XML
|
|
340
|
+
# parser only understands the five XML entities.
|
|
341
|
+
frag = Nokogiri::XML.fragment(
|
|
342
|
+
decode_html_named_entities(html_string),
|
|
343
|
+
)
|
|
298
344
|
|
|
299
345
|
# Apply preprocessing if needed
|
|
300
346
|
if preprocessing == :rendered
|
|
@@ -448,8 +494,12 @@ module Canon
|
|
|
448
494
|
end
|
|
449
495
|
|
|
450
496
|
# Parse as Nokogiri fragment for DOM comparison
|
|
451
|
-
# Use XML fragment parser to avoid auto-inserted meta tags
|
|
452
|
-
|
|
497
|
+
# Use XML fragment parser to avoid auto-inserted meta tags.
|
|
498
|
+
# Decode HTML named entities (&nbsp; etc.) to UTF-8 characters since
|
|
499
|
+
# XML parser only understands the five XML entities.
|
|
500
|
+
frag = Nokogiri::XML.fragment(
|
|
501
|
+
decode_html_named_entities(html_string),
|
|
502
|
+
)
|
|
453
503
|
|
|
454
504
|
# Apply post-parsing filtering for :normalize, :format, and :rendered preprocessing
|
|
455
505
|
if %i[normalize format rendered].include?(preprocessing)
|
|
@@ -496,6 +546,33 @@ module Canon
|
|
|
496
546
|
|
|
497
547
|
# Detect HTML version from content
|
|
498
548
|
#
|
|
549
|
+
# Decode HTML named entities to their UTF-8 character equivalents.
|
|
550
|
+
# This is a targeted replacement that only changes entity references,
|
|
551
|
+
# preserving all tag structure. Needed because Nokogiri::XML.fragment
|
|
552
|
+
# only understands the five XML entities (&amp; &lt; &gt; &quot; &apos;).
|
|
553
|
+
#
|
|
554
|
+
# @param str [String] HTML string possibly containing named entities
|
|
555
|
+
# @return [String] String with named entities replaced by UTF-8 chars
|
|
556
|
+
def decode_html_named_entities(str)
|
|
557
|
+
return str unless str.include?("&")
|
|
558
|
+
|
|
559
|
+
str.gsub(/&nbsp;/i, "\u00A0")
|
|
560
|
+
.gsub(/&ensp;/i, "\u2002")
|
|
561
|
+
.gsub(/&emsp;/i, "\u2003")
|
|
562
|
+
.gsub(/&thinsp;/i, "\u2009")
|
|
563
|
+
.gsub(/&copy;/i, "\u00A9")
|
|
564
|
+
.gsub(/&reg;/i, "\u00AE")
|
|
565
|
+
.gsub(/&trade;/i, "\u2122")
|
|
566
|
+
.gsub(/&mdash;/i, "\u2014")
|
|
567
|
+
.gsub(/&ndash;/i, "\u2013")
|
|
568
|
+
.gsub(/&lsquo;/i, "\u2018")
|
|
569
|
+
.gsub(/&rsquo;/i, "\u2019")
|
|
570
|
+
.gsub(/&ldquo;/i, "\u201C")
|
|
571
|
+
.gsub(/&rdquo;/i, "\u201D")
|
|
572
|
+
.gsub(/&bull;/i, "\u2022")
|
|
573
|
+
.gsub(/&hellip;/i, "\u2026")
|
|
574
|
+
end
|
|
575
|
+
|
|
499
576
|
# @param content [String] HTML content
|
|
500
577
|
# @return [Symbol] :html5 or :html4
|
|
501
578
|
def detect_html_version(content)
|
|
@@ -721,8 +798,16 @@ compare_profile = nil)
|
|
|
721
798
|
parent = text_node.parent
|
|
722
799
|
next if ancestor_preserves_whitespace?(parent, preserve_whitespace)
|
|
723
800
|
|
|
801
|
+
content = text_node.content
|
|
802
|
+
|
|
803
|
+
# NBSP (U+00A0) is never insignificant — don't remove
|
|
804
|
+
next if content.include?("\u00A0")
|
|
805
|
+
|
|
806
|
+
# Whitespace between inline siblings is significant — don't remove
|
|
807
|
+
next if WhitespaceSensitivity.inline_whitespace_significant?(text_node)
|
|
808
|
+
|
|
724
809
|
# Remove if the text is only whitespace (after normalization)
|
|
725
|
-
if
|
|
810
|
+
if content.strip.empty?
|
|
726
811
|
text_node.remove
|
|
727
812
|
end
|
|
728
813
|
end
|