canon 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +31 -149
- data/README.adoc +9 -0
- data/docs/advanced/semantic-diff-report.adoc +96 -0
- data/docs/features/configuration-profiles.adoc +4 -2
- data/docs/features/diff-formatting/index.adoc +3 -0
- data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
- data/docs/features/match-options/html-policies.adoc +2 -0
- data/docs/features/match-options/index.adoc +40 -0
- data/docs/guides/choosing-configuration.adoc +12 -1
- data/docs/reference/cli-options.adoc +3 -0
- data/docs/reference/environment-variables.adoc +3 -1
- data/docs/reference/options-across-interfaces.adoc +7 -1
- data/docs/understanding/formats/html.adoc +9 -2
- data/lib/canon/cli.rb +4 -0
- data/lib/canon/commands/diff_command.rb +1 -0
- data/lib/canon/comparison/comparison_result.rb +95 -2
- data/lib/canon/comparison/html_comparator.rb +96 -11
- data/lib/canon/comparison/markup_comparator.rb +68 -71
- data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
- data/lib/canon/comparison/match_options.rb +23 -2
- data/lib/canon/comparison/node_inspector.rb +103 -0
- data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +133 -55
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
- data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
- data/lib/canon/comparison/xml_comparator.rb +174 -7
- data/lib/canon/comparison/xml_node_comparison.rb +48 -66
- data/lib/canon/comparison.rb +143 -22
- data/lib/canon/config/env_schema.rb +2 -1
- data/lib/canon/config/profiles/metanorma.yml +3 -0
- data/lib/canon/config.rb +51 -5
- data/lib/canon/diff/diff_classifier.rb +55 -41
- data/lib/canon/diff/diff_line_builder.rb +9 -8
- data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
- data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
- data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +184 -26
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +92 -4
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
- data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
- data/lib/canon/diff_formatter.rb +128 -175
- data/lib/canon/html/data_model.rb +10 -4
- data/lib/canon/pretty_printer/html.rb +76 -14
- data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
- data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
- data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
- data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/c14n.rb +59 -5
- data/lib/canon/xml/data_model.rb +13 -1
- data/lib/canon/xml/element_matcher.rb +3 -0
- data/lib/canon/xml/node.rb +23 -1
- data/lib/canon/xml/nodes/comment_node.rb +4 -0
- data/lib/canon/xml/nodes/element_node.rb +4 -0
- data/lib/canon/xml/nodes/text_node.rb +4 -0
- data/lib/canon/xml/sax_builder.rb +29 -2
- data/lib/canon/xml/xpath_engine.rb +238 -0
- metadata +9 -2
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "../comparison" # Load base module with constants
|
|
4
|
+
require_relative "node_inspector"
|
|
4
5
|
require_relative "../diff/diff_node"
|
|
5
6
|
require_relative "../diff/path_builder"
|
|
6
7
|
|
|
@@ -87,23 +88,20 @@ module Canon
|
|
|
87
88
|
return nil if node.nil?
|
|
88
89
|
|
|
89
90
|
# Canon::Xml::Node types
|
|
90
|
-
|
|
91
|
+
case node
|
|
92
|
+
when Canon::Xml::Nodes::RootNode
|
|
91
93
|
# Serialize all children of root
|
|
92
94
|
node.children.map { |child| serialize_node(child) }.join
|
|
93
|
-
|
|
95
|
+
when Canon::Xml::Nodes::ElementNode
|
|
94
96
|
serialize_element_node(node)
|
|
95
|
-
|
|
97
|
+
when Canon::Xml::Nodes::TextNode
|
|
96
98
|
# Use original text (with entity references) if available,
|
|
97
99
|
# otherwise fall back to value (decoded text)
|
|
98
100
|
node.original || node.value
|
|
99
|
-
|
|
101
|
+
when Canon::Xml::Nodes::CommentNode
|
|
100
102
|
"<!--#{node.value}-->"
|
|
101
|
-
|
|
103
|
+
when Canon::Xml::Nodes::ProcessingInstructionNode
|
|
102
104
|
"<?#{node.target} #{node.data}?>"
|
|
103
|
-
elsif node.respond_to?(:to_xml)
|
|
104
|
-
node.to_xml
|
|
105
|
-
elsif node.respond_to?(:to_html)
|
|
106
|
-
node.to_html
|
|
107
105
|
else
|
|
108
106
|
node.to_s
|
|
109
107
|
end
|
|
@@ -121,8 +119,8 @@ module Canon
|
|
|
121
119
|
node.attribute_nodes.to_h do |attr|
|
|
122
120
|
[attr.name, attr.value]
|
|
123
121
|
end
|
|
124
|
-
# Nokogiri
|
|
125
|
-
elsif node.
|
|
122
|
+
# Nokogiri elements
|
|
123
|
+
elsif node.is_a?(Nokogiri::XML::Element)
|
|
126
124
|
node.attributes.to_h do |_, attr|
|
|
127
125
|
[attr.name, attr.value]
|
|
128
126
|
end
|
|
@@ -182,6 +180,25 @@ module Canon
|
|
|
182
180
|
return false unless text_node?(node) && node.parent
|
|
183
181
|
return false unless MatchOptions.normalize_text(node_text(node)).empty?
|
|
184
182
|
|
|
183
|
+
# NBSP (U+00A0) is never insignificant whitespace —
|
|
184
|
+
# it always renders as a visible non-breaking space.
|
|
185
|
+
# For HTML: always preserve NBSP nodes.
|
|
186
|
+
# For XML with whitespace_type: :strict: preserve NBSP nodes so
|
|
187
|
+
# different Unicode whitespace types remain distinguishable.
|
|
188
|
+
format = opts[:format] || match_opts[:format]
|
|
189
|
+
whitespace_type = match_opts[:whitespace_type] || :strict
|
|
190
|
+
if (%i[html html4
|
|
191
|
+
html5].include?(format) || whitespace_type == :strict) && WhitespaceSensitivity.contains_nbsp?(node_text(node))
|
|
192
|
+
return false
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
if %i[html html4
|
|
196
|
+
html5].include?(format) && WhitespaceSensitivity.inline_whitespace_significant?(node)
|
|
197
|
+
# Whitespace between inline element siblings is semantically
|
|
198
|
+
# significant (renders as a visible gap) and must not be stripped.
|
|
199
|
+
return false
|
|
200
|
+
end
|
|
201
|
+
|
|
185
202
|
return true unless WhitespaceSensitivity.whitespace_preserved?(
|
|
186
203
|
node.parent, match_opts
|
|
187
204
|
)
|
|
@@ -208,8 +225,8 @@ module Canon
|
|
|
208
225
|
def same_node_type?(node1, node2)
|
|
209
226
|
return false if node1.class != node2.class
|
|
210
227
|
|
|
211
|
-
|
|
212
|
-
|
|
228
|
+
case node1
|
|
229
|
+
when Canon::Xml::Node, Nokogiri::XML::Node
|
|
213
230
|
node1.node_type == node2.node_type
|
|
214
231
|
else
|
|
215
232
|
true
|
|
@@ -226,20 +243,7 @@ module Canon
|
|
|
226
243
|
# @param node [Object] Node to check
|
|
227
244
|
# @return [Boolean] true if node is a comment
|
|
228
245
|
def comment_node?(node)
|
|
229
|
-
|
|
230
|
-
return true if node.respond_to?(:node_type) && node.node_type == :comment
|
|
231
|
-
|
|
232
|
-
# HTML comments are parsed as TEXT nodes by Nokogiri
|
|
233
|
-
# Check if this is a text node with HTML comment content
|
|
234
|
-
if text_node?(node)
|
|
235
|
-
text = node_text(node)
|
|
236
|
-
# Strip whitespace and backslashes for comparison
|
|
237
|
-
# Nokogiri escapes HTML comments as "<\\!-- comment -->" in full documents
|
|
238
|
-
text_stripped = text.to_s.strip.gsub("\\", "")
|
|
239
|
-
return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
|
|
240
|
-
end
|
|
241
|
-
|
|
242
|
-
false
|
|
246
|
+
NodeInspector.comment_node?(node)
|
|
243
247
|
end
|
|
244
248
|
|
|
245
249
|
# Check if a node is a text node
|
|
@@ -247,9 +251,7 @@ module Canon
|
|
|
247
251
|
# @param node [Object] Node to check
|
|
248
252
|
# @return [Boolean] true if node is a text node
|
|
249
253
|
def text_node?(node)
|
|
250
|
-
|
|
251
|
-
!node.respond_to?(:element?)) ||
|
|
252
|
-
(node.respond_to?(:node_type) && node.node_type == :text)
|
|
254
|
+
NodeInspector.text_node?(node)
|
|
253
255
|
end
|
|
254
256
|
|
|
255
257
|
# Get text content from a node
|
|
@@ -257,15 +259,7 @@ module Canon
|
|
|
257
259
|
# @param node [Object] Node to get text from
|
|
258
260
|
# @return [String] Text content
|
|
259
261
|
def node_text(node)
|
|
260
|
-
|
|
261
|
-
if node.respond_to?(:value)
|
|
262
|
-
node.value.to_s
|
|
263
|
-
# Nokogiri nodes use .content
|
|
264
|
-
elsif node.respond_to?(:content)
|
|
265
|
-
node.content.to_s
|
|
266
|
-
else
|
|
267
|
-
node.to_s
|
|
268
|
-
end
|
|
262
|
+
NodeInspector.text_content(node)
|
|
269
263
|
end
|
|
270
264
|
|
|
271
265
|
# Check if difference between two texts is only whitespace
|
|
@@ -309,7 +303,7 @@ module Canon
|
|
|
309
303
|
if diff1 == Canon::Comparison::MISSING_NODE && diff2 == Canon::Comparison::MISSING_NODE
|
|
310
304
|
"element structure mismatch (children differ)"
|
|
311
305
|
else
|
|
312
|
-
|
|
306
|
+
Canon::Comparison.code_pair_label(diff1, diff2)
|
|
313
307
|
end
|
|
314
308
|
end
|
|
315
309
|
|
|
@@ -352,26 +346,18 @@ module Canon
|
|
|
352
346
|
def extract_text_content_from_node(node)
|
|
353
347
|
return nil if node.nil?
|
|
354
348
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
# For nodes with value method (other types)
|
|
368
|
-
return node.value if node.respond_to?(:value)
|
|
369
|
-
|
|
370
|
-
# For simple text nodes or strings
|
|
371
|
-
return node.to_s if node.is_a?(String)
|
|
372
|
-
|
|
373
|
-
# For other node types, try to_s
|
|
374
|
-
node.to_s
|
|
349
|
+
case node
|
|
350
|
+
when Canon::Xml::Nodes::TextNode
|
|
351
|
+
node.value
|
|
352
|
+
when Canon::Xml::Node
|
|
353
|
+
node.text_content
|
|
354
|
+
when Nokogiri::XML::Node
|
|
355
|
+
node.content.to_s
|
|
356
|
+
when String
|
|
357
|
+
node
|
|
358
|
+
else
|
|
359
|
+
node.to_s
|
|
360
|
+
end
|
|
375
361
|
rescue StandardError
|
|
376
362
|
nil
|
|
377
363
|
end
|
|
@@ -425,26 +411,37 @@ module Canon
|
|
|
425
411
|
|
|
426
412
|
# Determine the appropriate dimension for a node type
|
|
427
413
|
#
|
|
414
|
+
# Used by ChildComparison to tag per-child orphan diffs with a
|
|
415
|
+
# dimension that matches what the node *is*, so the formatter
|
|
416
|
+
# renders correctly. An element orphan tagged :text_content
|
|
417
|
+
# would otherwise route through PR #126's one-sided text
|
|
418
|
+
# formatter and render as +text ""+ instead of as the actual
|
|
419
|
+
# element (see lutaml/canon#125 follow-up).
|
|
420
|
+
#
|
|
428
421
|
# @param node [Object] The node to check
|
|
429
422
|
# @return [Symbol] The dimension symbol
|
|
430
423
|
def determine_node_dimension(node)
|
|
431
|
-
|
|
432
|
-
|
|
424
|
+
case node
|
|
425
|
+
when Canon::Xml::Node
|
|
433
426
|
case node.node_type
|
|
427
|
+
when :element then :element_structure
|
|
434
428
|
when :comment then :comments
|
|
435
429
|
when :text, :cdata then :text_content
|
|
436
430
|
when :processing_instruction then :processing_instructions
|
|
437
431
|
else :text_content
|
|
438
432
|
end
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
433
|
+
when Nokogiri::XML::Node
|
|
434
|
+
if node.comment?
|
|
435
|
+
:comments
|
|
436
|
+
elsif node.text? || node.cdata?
|
|
437
|
+
:text_content
|
|
438
|
+
elsif node.processing_instruction?
|
|
439
|
+
:processing_instructions
|
|
440
|
+
elsif node.element?
|
|
441
|
+
:element_structure
|
|
442
|
+
else
|
|
443
|
+
:text_content
|
|
444
|
+
end
|
|
448
445
|
else
|
|
449
446
|
:text_content
|
|
450
447
|
end
|
|
@@ -24,6 +24,7 @@ module Canon
|
|
|
24
24
|
attribute_values: :strict,
|
|
25
25
|
element_position: :ignore,
|
|
26
26
|
comments: :ignore,
|
|
27
|
+
whitespace_type: :strict,
|
|
27
28
|
},
|
|
28
29
|
xml: {
|
|
29
30
|
preprocessing: :none,
|
|
@@ -34,6 +35,7 @@ module Canon
|
|
|
34
35
|
attribute_values: :strict,
|
|
35
36
|
element_position: :strict,
|
|
36
37
|
comments: :strict,
|
|
38
|
+
whitespace_type: :strict,
|
|
37
39
|
},
|
|
38
40
|
}.freeze
|
|
39
41
|
|
|
@@ -51,6 +53,7 @@ module Canon
|
|
|
51
53
|
attribute_values: :strict,
|
|
52
54
|
element_position: :strict,
|
|
53
55
|
comments: :strict,
|
|
56
|
+
whitespace_type: :strict,
|
|
54
57
|
},
|
|
55
58
|
|
|
56
59
|
# Rendered: Match rendered output (HTML default)
|
|
@@ -64,6 +67,7 @@ module Canon
|
|
|
64
67
|
attribute_values: :strict,
|
|
65
68
|
element_position: :strict,
|
|
66
69
|
comments: :ignore,
|
|
70
|
+
whitespace_type: :strict,
|
|
67
71
|
},
|
|
68
72
|
|
|
69
73
|
# HTML4: Match HTML4 rendered output
|
|
@@ -77,6 +81,7 @@ module Canon
|
|
|
77
81
|
attribute_values: :normalize,
|
|
78
82
|
element_position: :ignore,
|
|
79
83
|
comments: :ignore,
|
|
84
|
+
whitespace_type: :strict,
|
|
80
85
|
},
|
|
81
86
|
|
|
82
87
|
# HTML5: Match HTML5 rendered output (same as rendered)
|
|
@@ -89,6 +94,7 @@ module Canon
|
|
|
89
94
|
attribute_values: :strict,
|
|
90
95
|
element_position: :ignore,
|
|
91
96
|
comments: :ignore,
|
|
97
|
+
whitespace_type: :strict,
|
|
92
98
|
},
|
|
93
99
|
|
|
94
100
|
# Spec-friendly: Formatting doesn't matter
|
|
@@ -102,6 +108,7 @@ module Canon
|
|
|
102
108
|
attribute_values: :normalize,
|
|
103
109
|
element_position: :ignore,
|
|
104
110
|
comments: :ignore,
|
|
111
|
+
whitespace_type: :strict,
|
|
105
112
|
},
|
|
106
113
|
|
|
107
114
|
# Content-only: Only content matters
|
|
@@ -114,6 +121,7 @@ module Canon
|
|
|
114
121
|
attribute_values: :normalize,
|
|
115
122
|
element_position: :ignore,
|
|
116
123
|
comments: :ignore,
|
|
124
|
+
whitespace_type: :strict,
|
|
117
125
|
},
|
|
118
126
|
}.freeze
|
|
119
127
|
|
|
@@ -69,13 +69,18 @@ module Canon
|
|
|
69
69
|
# @param text1 [String] First text
|
|
70
70
|
# @param text2 [String] Second text
|
|
71
71
|
# @param behavior [Symbol] Match behavior (:strict, :normalize, :ignore)
|
|
72
|
+
# @param whitespace_type [Symbol] Whitespace type handling (:strict, :normalize)
|
|
72
73
|
# @return [Boolean] true if texts match according to behavior
|
|
73
|
-
def match_text?(text1, text2, behavior)
|
|
74
|
+
def match_text?(text1, text2, behavior, whitespace_type: :strict)
|
|
74
75
|
case behavior
|
|
75
76
|
when :strict
|
|
76
77
|
text1 == text2
|
|
77
78
|
when :normalize
|
|
78
|
-
|
|
79
|
+
if whitespace_type == :normalize
|
|
80
|
+
normalize_text(text1) == normalize_text(text2)
|
|
81
|
+
else
|
|
82
|
+
normalize_text_preserving_type(text1) == normalize_text_preserving_type(text2)
|
|
83
|
+
end
|
|
79
84
|
when :ignore
|
|
80
85
|
true
|
|
81
86
|
else
|
|
@@ -101,6 +106,22 @@ module Canon
|
|
|
101
106
|
.strip # Remove leading/trailing whitespace
|
|
102
107
|
end
|
|
103
108
|
|
|
109
|
+
# Normalize text preserving Unicode whitespace type distinctions.
|
|
110
|
+
#
|
|
111
|
+
# Only ASCII whitespace (space, tab, newline, etc.) is collapsed.
|
|
112
|
+
# Unicode whitespace (NBSP, ideographic space, etc.) is preserved,
|
|
113
|
+
# so different whitespace types remain distinguishable.
|
|
114
|
+
#
|
|
115
|
+
# @param text [String] Text to normalize
|
|
116
|
+
# @return [String] Normalized text with preserved whitespace types
|
|
117
|
+
def normalize_text_preserving_type(text)
  # nil normalizes to the empty string; anything else is coerced via to_s.
  return "" if text.nil?

  # Only runs of ASCII whitespace are folded into a single space. Characters
  # outside this class (e.g. U+00A0) are not matched and pass through as-is.
  ascii_whitespace_run = /[ \t\r\n\f\v]+/
  collapsed = text.to_s.gsub(ascii_whitespace_run, " ")
  collapsed.strip
end
|
|
124
|
+
|
|
104
125
|
# Process attribute value according to match behavior
|
|
105
126
|
#
|
|
106
127
|
# @param value [String] Attribute value to process
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
  module Comparison
    # Single source of truth for cross-backend node type operations.
    #
    # The comparison pipeline handles nodes from two backends:
    #   * Canon::Xml::Node (+ RootNode, ElementNode, TextNode, etc.) —
    #     custom DOM built by SAX builder and DataModel.
    #   * Nokogiri::XML::Node (+ subclasses) — native Nokogiri nodes used
    #     by the HTML comparator and some legacy paths.
    #
    # Every method here dispatches on type via +case/when+ (+is_a?+).
    # No +respond_to?+ — the types are known at every call site.
    # Objects from neither backend fall through to the +else+ branch of
    # each method (false, +to_s+, or an empty Array).
    module NodeInspector
      # node_type value that marks a text node in the Canon DOM.
      CANON_TEXT_TYPE = :text

      # node_type value that marks a text node in Nokogiri. Falls back to
      # the literal 3 when the Nokogiri constant is not loaded at the time
      # this file is evaluated.
      NOKOGIRI_TEXT_TYPE = defined?(Nokogiri::XML::Node::TEXT_NODE) ? Nokogiri::XML::Node::TEXT_NODE : 3

      # True when +node+ is a text node (whitespace, content, etc.).
      #
      # @param node [Object] Node to check
      # @return [Boolean] false for anything that is not a Canon or
      #   Nokogiri node
      def self.text_node?(node)
        case node
        when Canon::Xml::Node
          node.node_type == CANON_TEXT_TYPE
        when Nokogiri::XML::Node
          node.node_type == NOKOGIRI_TEXT_TYPE
        else
          false
        end
      end

      # Extract the text content of +node+ as a String.
      #
      # Canon nodes are read through +value+, Nokogiri nodes through
      # +content+; any other object is converted with +to_s+.
      #
      # @param node [Object] Node to read
      # @return [String]
      def self.text_content(node)
        case node
        when Canon::Xml::Node
          node.value.to_s
        when Nokogiri::XML::Node
          node.content.to_s
        else
          node.to_s
        end
      end

      # True when +node+ is a text node whose content is whitespace-only.
      # Empty-string text nodes return false — those represent genuine
      # empty-vs-content asymmetry, not pretty-print indentation.
      #
      # @param node [Object] Node to check
      # @return [Boolean]
      def self.whitespace_only_text?(node)
        return false unless text_node?(node)

        text = text_content(node)
        !text.empty? && text.strip.empty?
      end

      # True when +node+ is a comment node.
      # For HTML, also detects comments that Nokogiri parses as TEXT nodes
      # (content like "<!-- comment -->" or escaped "<\\!-- comment -->").
      #
      # @param node [Object] Node to check
      # @return [Boolean]
      def self.comment_node?(node)
        case node
        when Canon::Xml::Node
          node.node_type == :comment
        when Nokogiri::XML::Node
          return true if node.comment?
          return false unless node.text?

          # Drop surrounding whitespace and any backslashes before looking
          # for the comment delimiters, so the escaped form matches too.
          text_stripped = text_content(node).strip.gsub("\\", "")
          text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
        else
          false
        end
      end

      # True when +node+ is an element node.
      #
      # @param node [Object] Node to check
      # @return [Boolean]
      def self.element_node?(node)
        case node
        when Canon::Xml::Node
          node.node_type == :element
        when Nokogiri::XML::Node
          node.element?
        else
          false
        end
      end

      # Extract parse-time errors carried on +node+.
      #
      # Canon nodes are asked for +parse_errors+; Nokogiri documents are
      # asked for +errors+. Every other object — including +nil+ and
      # Nokogiri nodes that are not documents — yields an empty Array.
      #
      # @param node [Object, nil] Node or document to inspect
      # @return [Array<String>] Error messages converted with +to_s+
      def self.parse_errors(node)
        case node
        when nil
          []
        when Canon::Xml::Node
          Array(node.parse_errors).map(&:to_s)
        when Nokogiri::XML::Document
          # Matching the XML base class is enough: Nokogiri's HTML4 and
          # HTML5 document classes descend from it. Naming
          # Nokogiri::HTML5::Document here would raise NameError for
          # non-document arguments whenever Nokogiri is loaded without its
          # HTML5 component.
          Array(node.errors).map(&:to_s)
        else
          []
        end
      end
    end
  end
end
|
|
@@ -50,6 +50,15 @@ module Canon
|
|
|
50
50
|
# HTML elements where every whitespace character is significant.
|
|
51
51
|
HTML_PRESERVE_ELEMENTS = %w[pre code textarea script style].freeze
|
|
52
52
|
|
|
53
|
+
# HTML inline elements — whitespace between these is semantically
|
|
54
|
+
# significant (renders as a visible space). Whitespace-only text
|
|
55
|
+
# nodes that sit between two inline siblings must not be stripped.
|
|
56
|
+
INLINE_ELEMENTS = %w[
|
|
57
|
+
a abbr acronym b bdo big br button cite code dfn em i img input kbd
|
|
58
|
+
label map object output q s samp select small span strong sub sup
|
|
59
|
+
time tt u var wbr
|
|
60
|
+
].freeze
|
|
61
|
+
|
|
53
62
|
class << self
|
|
54
63
|
# Classify the whitespace behaviour for an element using ancestor walk.
|
|
55
64
|
#
|
|
@@ -213,6 +222,69 @@ module Canon
|
|
|
213
222
|
.include?(element_name.to_sym)
|
|
214
223
|
end
|
|
215
224
|
|
|
225
|
+
# Check if a whitespace-only text node sits between two inline element
|
|
226
|
+
# siblings, making the whitespace semantically significant.
|
|
227
|
+
#
|
|
228
|
+
# In HTML rendering, a space between <span>A</span> <span>B</span>
|
|
229
|
+
# produces visible output. Stripping such nodes produces false
|
|
230
|
+
# equivalence.
|
|
231
|
+
#
|
|
232
|
+
# Works with any parent type (element, DocumentFragment, RootNode)
|
|
233
|
+
# since the check is about sibling context, not parent type.
|
|
234
|
+
#
|
|
235
|
+
# @param text_node [Object] Text node (Nokogiri or Canon::Xml::Node)
|
|
236
|
+
# @return [Boolean] true if whitespace is between inline siblings
|
|
237
|
+
def inline_whitespace_significant?(text_node)
  # Without a parent that exposes its children there is no sibling
  # context to inspect, so the whitespace cannot be judged significant.
  return false unless text_node.respond_to?(:parent)

  container = text_node.parent
  return false unless container && container.respond_to?(:children)

  nodes = container.children
  position = nodes.index(text_node)
  return false unless position

  # Only the nearest non-whitespace neighbour on each side counts; other
  # siblings further away do not influence the result. Both neighbours
  # must be inline elements for the whitespace to be kept.
  before, after = [-1, 1].map do |step|
    nearest_non_whitespace_sibling(nodes, position, step)
  end

  inline_element?(before) && inline_element?(after)
end
|
|
260
|
+
|
|
261
|
+
# Walk outward from +idx+ in +direction+ (+1 forward, -1 back),
|
|
262
|
+
# skipping whitespace-only text nodes, and return the first
|
|
263
|
+
# non-whitespace sibling found. Returns nil if none.
|
|
264
|
+
def nearest_non_whitespace_sibling(siblings, idx, direction)
  cursor = idx + direction

  until cursor.negative? || cursor >= siblings.length
    candidate = siblings[cursor]

    # A sibling is skipped only when it answers both text? and content,
    # reports itself as text, and its content strips down to nothing.
    blank_text = candidate.respond_to?(:text?) && candidate.text? &&
      candidate.respond_to?(:content) && candidate.content.to_s.strip.empty?
    return candidate unless blank_text

    cursor += direction
  end

  # Ran off either end of the sibling list without finding a candidate.
  nil
end
|
|
277
|
+
|
|
278
|
+
# Check if text content contains a non-breaking space (U+00A0).
|
|
279
|
+
# NBSP is NOT collapsible whitespace in HTML — it always renders as
|
|
280
|
+
# a visible space and must never be stripped.
|
|
281
|
+
#
|
|
282
|
+
# @param text [String] Text content to check
|
|
283
|
+
# @return [Boolean] true if text contains U+00A0
|
|
284
|
+
def contains_nbsp?(text)
  # The argument is coerced with to_s first, so nil and non-String
  # values are accepted.
  nbsp = "\u00A0"
  text.to_s.include?(nbsp)
end
|
|
287
|
+
|
|
216
288
|
private
|
|
217
289
|
|
|
218
290
|
# Build the Set of preserve whitespace element names (strings).
|
|
@@ -336,6 +408,30 @@ module Canon
|
|
|
336
408
|
# Nokogiri compatibility
|
|
337
409
|
parent.respond_to?(:node_type) && parent.node_type == :element
|
|
338
410
|
end
|
|
411
|
+
|
|
412
|
+
# Get the parent element of a text node, or nil.
|
|
413
|
+
# Works with both Nokogiri and Canon::Xml::Node types.
|
|
414
|
+
def parent_element_of(text_node)
  return nil unless text_node.respond_to?(:parent)

  owner = text_node.parent
  return nil unless owner

  # The parent counts as an element when it is a Canon ElementNode, when
  # it answers element? truthily, or when its node_type is :element.
  element_like =
    owner.is_a?(Canon::Xml::Nodes::ElementNode) ||
    (owner.respond_to?(:element?) && owner.element?) ||
    (owner.respond_to?(:node_type) && owner.node_type == :element)

  owner if element_like
end
|
|
428
|
+
|
|
429
|
+
# Check if a node is an HTML inline element.
|
|
430
|
+
def inline_element?(node)
  # Objects without a name (including nil) are never inline elements.
  if node.respond_to?(:name)
    # The name is stringified and lowercased before the lookup.
    tag = node.name.to_s.downcase
    INLINE_ELEMENTS.include?(tag)
  else
    false
  end
end
|
|
339
435
|
end
|
|
340
436
|
end
|
|
341
437
|
end
|