canon 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec-opal +7 -0
- data/.rubocop_todo.yml +16 -61
- data/README.adoc +5 -0
- data/Rakefile +17 -0
- data/docs/features/diff-formatting/comment-asymmetry.adoc +160 -0
- data/lib/canon/cli.rb +1 -1
- data/lib/canon/color_detector.rb +3 -5
- data/lib/canon/comparison/child_realignment.rb +140 -0
- data/lib/canon/comparison/compare_profile.rb +1 -4
- data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +2 -6
- data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +2 -6
- data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +2 -6
- data/lib/canon/comparison/dimensions/comments_dimension.rb +2 -6
- data/lib/canon/comparison/dimensions/element_position_dimension.rb +2 -6
- data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +2 -6
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +3 -5
- data/lib/canon/comparison/format_detector.rb +29 -20
- data/lib/canon/comparison/html_comparator.rb +36 -75
- data/lib/canon/comparison/html_compare_profile.rb +3 -10
- data/lib/canon/comparison/html_parser.rb +1 -1
- data/lib/canon/comparison/json_comparator.rb +8 -0
- data/lib/canon/comparison/node_inspector.rb +150 -58
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +6 -8
- data/lib/canon/comparison/whitespace_sensitivity.rb +55 -193
- data/lib/canon/comparison/xml_comparator/attribute_filter.rb +5 -10
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +32 -77
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +43 -8
- data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +14 -28
- data/lib/canon/comparison/xml_comparator/node_parser.rb +12 -11
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +30 -58
- data/lib/canon/comparison/xml_comparator.rb +89 -83
- data/lib/canon/comparison/xml_node_comparison.rb +15 -15
- data/lib/canon/comparison/yaml_comparator.rb +8 -0
- data/lib/canon/comparison.rb +25 -23
- data/lib/canon/config/profile_loader.rb +13 -13
- data/lib/canon/config.rb +29 -5
- data/lib/canon/diff/diff_classifier.rb +16 -42
- data/lib/canon/diff/diff_line.rb +1 -1
- data/lib/canon/diff/diff_node_enricher.rb +22 -24
- data/lib/canon/diff/node_serializer.rb +23 -30
- data/lib/canon/diff/path_builder.rb +24 -37
- data/lib/canon/diff/source_locator.rb +0 -3
- data/lib/canon/diff/xml_serialization_formatter.rb +8 -81
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +7 -7
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +1 -1
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +1 -1
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +2 -2
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +1 -1
- data/lib/canon/diff_formatter/by_line_formatter.rb +1 -1
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +11 -15
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +8 -10
- data/lib/canon/diff_formatter/by_object_formatter.rb +1 -1
- data/lib/canon/diff_formatter/debug_output.rb +12 -24
- data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +2 -2
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +146 -318
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +28 -20
- data/lib/canon/diff_formatter/legend.rb +2 -2
- data/lib/canon/diff_formatter/pretty_diff_formatter.rb +2 -2
- data/lib/canon/diff_formatter/theme.rb +4 -4
- data/lib/canon/diff_formatter.rb +2 -2
- data/lib/canon/formatters/html_formatter.rb +1 -1
- data/lib/canon/formatters/html_formatter_base.rb +1 -1
- data/lib/canon/formatters/xml_formatter.rb +7 -32
- data/lib/canon/html/data_model.rb +1 -1
- data/lib/canon/pretty_printer/html.rb +1 -1
- data/lib/canon/pretty_printer/xml.rb +16 -7
- data/lib/canon/pretty_printer/xml_normalized.rb +9 -3
- data/lib/canon/rspec_matchers.rb +2 -2
- data/lib/canon/tree_diff/adapters/html_adapter.rb +1 -1
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +1 -1
- data/lib/canon/tree_diff/core/tree_node.rb +1 -3
- data/lib/canon/validators/html_validator.rb +1 -1
- data/lib/canon/validators/xml_validator.rb +1 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +131 -137
- data/lib/canon/xml/namespace_helper.rb +5 -0
- data/lib/canon/xml/node.rb +2 -1
- data/lib/canon/xml/nodes/root_node.rb +4 -0
- data/lib/canon/xml/nodes/text_node.rb +6 -1
- data/lib/canon/xml/sax_builder.rb +4 -6
- data/lib/canon/xml_backend.rb +49 -0
- data/lib/canon/xml_parsing.rb +271 -0
- data/lib/canon.rb +3 -1
- data/lib/tasks/benchmark_runner.rb +1 -1
- data/lib/tasks/performance_helpers.rb +1 -1
- metadata +7 -2
|
@@ -23,14 +23,10 @@ module Canon
|
|
|
23
23
|
def extract_data(node)
|
|
24
24
|
return 0 unless node
|
|
25
25
|
|
|
26
|
-
|
|
27
|
-
if node.is_a?(Moxml::Node)
|
|
28
|
-
extract_from_moxml(node)
|
|
29
|
-
# Handle Nokogiri nodes
|
|
30
|
-
elsif node.is_a?(Nokogiri::XML::Node)
|
|
26
|
+
if Canon::XmlBackend.nokogiri?
|
|
31
27
|
extract_from_nokogiri(node)
|
|
32
28
|
else
|
|
33
|
-
|
|
29
|
+
extract_from_moxml(node)
|
|
34
30
|
end
|
|
35
31
|
end
|
|
36
32
|
|
|
@@ -25,14 +25,10 @@ module Canon
|
|
|
25
25
|
def extract_data(node)
|
|
26
26
|
return [] unless node
|
|
27
27
|
|
|
28
|
-
|
|
29
|
-
if node.is_a?(Moxml::Node)
|
|
30
|
-
extract_from_moxml(node)
|
|
31
|
-
# Handle Nokogiri nodes
|
|
32
|
-
elsif node.is_a?(Nokogiri::XML::Node)
|
|
28
|
+
if Canon::XmlBackend.nokogiri?
|
|
33
29
|
extract_from_nokogiri(node)
|
|
34
30
|
else
|
|
35
|
-
|
|
31
|
+
extract_from_moxml(node)
|
|
36
32
|
end
|
|
37
33
|
end
|
|
38
34
|
|
|
@@ -23,12 +23,10 @@ module Canon
|
|
|
23
23
|
def extract_data(node)
|
|
24
24
|
return nil unless node
|
|
25
25
|
|
|
26
|
-
|
|
27
|
-
if node.is_a?(Moxml::Node)
|
|
28
|
-
extract_from_moxml(node)
|
|
29
|
-
# Handle Nokogiri nodes
|
|
30
|
-
elsif node.is_a?(Nokogiri::XML::Node)
|
|
26
|
+
if Canon::XmlBackend.nokogiri?
|
|
31
27
|
extract_from_nokogiri(node)
|
|
28
|
+
else
|
|
29
|
+
extract_from_moxml(node)
|
|
32
30
|
end
|
|
33
31
|
end
|
|
34
32
|
|
|
@@ -22,27 +22,36 @@ module Canon
|
|
|
22
22
|
# @param obj [Object] Object to detect format of
|
|
23
23
|
# @return [Symbol] Format type (:xml, :html, :json, :yaml, :ruby_object, :string)
|
|
24
24
|
def detect(obj)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
obj.html? ? :html : :xml
|
|
37
|
-
when Nokogiri::HTML::Document, Nokogiri::HTML5::Document
|
|
38
|
-
:html
|
|
39
|
-
when String
|
|
40
|
-
detect_string(obj)
|
|
41
|
-
when Hash, Array
|
|
42
|
-
# Raw Ruby objects (from parsed JSON/YAML)
|
|
43
|
-
:ruby_object
|
|
25
|
+
if XmlBackend.moxml?
|
|
26
|
+
case obj
|
|
27
|
+
when Moxml::Node, Moxml::Document
|
|
28
|
+
:xml
|
|
29
|
+
when String
|
|
30
|
+
detect_string(obj)
|
|
31
|
+
when Hash, Array
|
|
32
|
+
:ruby_object
|
|
33
|
+
else
|
|
34
|
+
raise Canon::Error, "Unknown format for object: #{obj.class}"
|
|
35
|
+
end
|
|
44
36
|
else
|
|
45
|
-
|
|
37
|
+
case obj
|
|
38
|
+
when Moxml::Node, Moxml::Document
|
|
39
|
+
:xml
|
|
40
|
+
when Nokogiri::HTML::DocumentFragment, Nokogiri::HTML5::DocumentFragment
|
|
41
|
+
:html
|
|
42
|
+
when Nokogiri::XML::DocumentFragment
|
|
43
|
+
obj.document&.html? ? :html : :xml
|
|
44
|
+
when Nokogiri::XML::Document, Nokogiri::XML::Node
|
|
45
|
+
obj.html? ? :html : :xml
|
|
46
|
+
when Nokogiri::HTML::Document, Nokogiri::HTML5::Document
|
|
47
|
+
:html
|
|
48
|
+
when String
|
|
49
|
+
detect_string(obj)
|
|
50
|
+
when Hash, Array
|
|
51
|
+
:ruby_object
|
|
52
|
+
else
|
|
53
|
+
raise Canon::Error, "Unknown format for object: #{obj.class}"
|
|
54
|
+
end
|
|
46
55
|
end
|
|
47
56
|
end
|
|
48
57
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require "nokogiri"
|
|
3
|
+
require "nokogiri" unless RUBY_ENGINE == "opal"
|
|
4
4
|
require_relative "../comparison" # Load base module with constants first
|
|
5
5
|
require_relative "markup_comparator"
|
|
6
6
|
require_relative "xml_comparator"
|
|
@@ -167,6 +167,11 @@ module Canon
|
|
|
167
167
|
end
|
|
168
168
|
end
|
|
169
169
|
|
|
170
|
+
# Public parsing API for external callers
|
|
171
|
+
def parse(html, preprocessing = :none)
|
|
172
|
+
parse_node_for_semantic(html, preprocessing)
|
|
173
|
+
end
|
|
174
|
+
|
|
170
175
|
private
|
|
171
176
|
|
|
172
177
|
# Check if both nodes are document fragments
|
|
@@ -188,32 +193,9 @@ module Canon
|
|
|
188
193
|
node.is_a?(Nokogiri::HTML5::DocumentFragment)
|
|
189
194
|
end
|
|
190
195
|
|
|
191
|
-
#
|
|
192
|
-
#
|
|
193
|
-
#
|
|
194
|
-
def record_fragment_length_mismatch(_node1, _node2, children1,
|
|
195
|
-
children2, differences)
|
|
196
|
-
longer, shorter, side = if children1.length > children2.length
|
|
197
|
-
[children1, children2, :removed]
|
|
198
|
-
else
|
|
199
|
-
[children2, children1, :added]
|
|
200
|
-
end
|
|
201
|
-
|
|
202
|
-
longer[shorter.length...].each do |orphan|
|
|
203
|
-
n1 = side == :removed ? orphan : nil
|
|
204
|
-
n2 = side == :removed ? nil : orphan
|
|
205
|
-
differences <<
|
|
206
|
-
Canon::Comparison::DiffNodeBuilder.build(
|
|
207
|
-
node1: n1,
|
|
208
|
-
node2: n2,
|
|
209
|
-
diff1: Comparison::MISSING_NODE,
|
|
210
|
-
diff2: Comparison::MISSING_NODE,
|
|
211
|
-
dimension: :element_structure,
|
|
212
|
-
)
|
|
213
|
-
end
|
|
214
|
-
end
|
|
215
|
-
|
|
216
|
-
# Compare children of document fragments
|
|
196
|
+
# Compare children of document fragments using the shared
|
|
197
|
+
# +ChildRealignment+ walk. Structural orphans are emitted here
|
|
198
|
+
# (the HTML fragment path has no separate length-mismatch step).
|
|
217
199
|
#
|
|
218
200
|
# @param node1 [Nokogiri::DocumentFragment] First fragment
|
|
219
201
|
# @param node2 [Nokogiri::DocumentFragment] Second fragment
|
|
@@ -230,29 +212,24 @@ module Canon
|
|
|
230
212
|
children1 = XmlNodeComparison.filter_children(all_children1, opts)
|
|
231
213
|
children2 = XmlNodeComparison.filter_children(all_children2, opts)
|
|
232
214
|
|
|
233
|
-
if children1.
|
|
234
|
-
# Record the length mismatch as a DiffNode so verbose mode
|
|
235
|
-
# surfaces it. Without this, equivalent? wraps an empty
|
|
236
|
-
# differences array and incorrectly reports the inputs as
|
|
237
|
-
# equivalent.
|
|
238
|
-
record_fragment_length_mismatch(node1, node2,
|
|
239
|
-
children1, children2,
|
|
240
|
-
differences)
|
|
241
|
-
return Comparison::UNEQUAL_ELEMENTS
|
|
242
|
-
elsif children1.empty?
|
|
243
|
-
return Comparison::EQUIVALENT
|
|
244
|
-
end
|
|
215
|
+
return Comparison::EQUIVALENT if children1.empty? && children2.empty?
|
|
245
216
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
differences)
|
|
252
|
-
return child_result if child_result != Comparison::EQUIVALENT
|
|
217
|
+
emitter = html_diff_emitter(differences)
|
|
218
|
+
ChildRealignment.walk(children1, children2, emitter,
|
|
219
|
+
emit_structural_orphans: true) do |c1, c2|
|
|
220
|
+
XmlNodeComparison.compare_nodes(c1, c2, opts, child_opts,
|
|
221
|
+
diff_children, differences)
|
|
253
222
|
end
|
|
223
|
+
end
|
|
254
224
|
|
|
255
|
-
|
|
225
|
+
# Build a diff emitter for the HTML comparator path that
|
|
226
|
+
# creates DiffNode objects via DiffNodeBuilder.
|
|
227
|
+
def html_diff_emitter(differences)
|
|
228
|
+
proc do |n1, n2, d1, d2, dim|
|
|
229
|
+
differences << Canon::Comparison::DiffNodeBuilder.build(
|
|
230
|
+
node1: n1, node2: n2, diff1: d1, diff2: d2, dimension: dim,
|
|
231
|
+
)
|
|
232
|
+
end
|
|
256
233
|
end
|
|
257
234
|
|
|
258
235
|
# Perform semantic tree diff using SemanticTreeMatchStrategy
|
|
@@ -365,13 +342,10 @@ module Canon
|
|
|
365
342
|
# Convert to string if needed
|
|
366
343
|
html_string = if html.is_a?(String)
|
|
367
344
|
html
|
|
368
|
-
elsif
|
|
345
|
+
elsif Canon::XmlParsing.xml_node?(html)
|
|
369
346
|
html.to_html
|
|
370
|
-
elsif html.respond_to?(:to_s)
|
|
371
|
-
html.to_s
|
|
372
347
|
else
|
|
373
|
-
|
|
374
|
-
"Unable to convert HTML to string: #{html.class}"
|
|
348
|
+
html.to_s
|
|
375
349
|
end
|
|
376
350
|
|
|
377
351
|
# Strip DOCTYPE for consistent parsing
|
|
@@ -520,22 +494,18 @@ module Canon
|
|
|
520
494
|
end
|
|
521
495
|
|
|
522
496
|
def find_and_normalize_style_script(node)
|
|
523
|
-
return unless node.
|
|
497
|
+
return unless node.is_a?(Canon::Xml::Node)
|
|
524
498
|
|
|
525
499
|
node.children.each do |child|
|
|
526
500
|
next unless child.is_a?(Canon::Xml::Nodes::ElementNode)
|
|
527
501
|
|
|
528
502
|
# If this is a style or script element, normalize its text content
|
|
529
503
|
if %w[style script].include?(child.name.downcase)
|
|
530
|
-
# Get text children and remove HTML comments from them
|
|
531
504
|
child.children.each do |text_child|
|
|
532
505
|
next unless text_child.is_a?(Canon::Xml::Nodes::TextNode)
|
|
533
506
|
|
|
534
|
-
# Remove HTML comments from text content without using regex
|
|
535
|
-
# to avoid ReDoS/incomplete sanitization vulnerabilities
|
|
536
507
|
normalized = remove_html_comments(text_child.value)
|
|
537
|
-
|
|
538
|
-
text_child.instance_variable_set(:@value, normalized)
|
|
508
|
+
text_child.value = normalized
|
|
539
509
|
end
|
|
540
510
|
end
|
|
541
511
|
|
|
@@ -612,13 +582,10 @@ module Canon
|
|
|
612
582
|
# @param node [Canon::Xml::Node, Nokogiri::HTML::Document] Parsed node
|
|
613
583
|
# @return [String] Serialized HTML string
|
|
614
584
|
def serialize_for_display(node)
|
|
615
|
-
# Use XmlNodeComparison's serializer for Canon::Xml::Node
|
|
616
585
|
if node.is_a?(Canon::Xml::Node)
|
|
617
586
|
XmlNodeComparison.serialize_node_to_xml(node)
|
|
618
|
-
elsif
|
|
619
|
-
node.to_html
|
|
620
|
-
elsif node.respond_to?(:to_xml)
|
|
621
|
-
node.to_xml
|
|
587
|
+
elsif Canon::XmlParsing.xml_node?(node)
|
|
588
|
+
Canon::XmlBackend.nokogiri? ? node.to_html : Canon::XmlParsing.serialize(node)
|
|
622
589
|
else
|
|
623
590
|
node.to_s
|
|
624
591
|
end
|
|
@@ -633,16 +600,11 @@ module Canon
|
|
|
633
600
|
if html.is_a?(String)
|
|
634
601
|
html
|
|
635
602
|
elsif html.is_a?(Canon::Xml::Node)
|
|
636
|
-
# Serialize Canon nodes to string
|
|
637
603
|
Canon::Xml::DataModel.serialize(html)
|
|
638
|
-
elsif
|
|
639
|
-
|
|
640
|
-
html.to_html
|
|
641
|
-
elsif html.respond_to?(:to_s)
|
|
642
|
-
html.to_s
|
|
604
|
+
elsif Canon::XmlParsing.xml_node?(html)
|
|
605
|
+
Canon::XmlBackend.nokogiri? ? html.to_html : html.to_s
|
|
643
606
|
else
|
|
644
|
-
|
|
645
|
-
"Unable to extract original string from: #{html.class}"
|
|
607
|
+
html.to_s
|
|
646
608
|
end
|
|
647
609
|
end
|
|
648
610
|
|
|
@@ -755,11 +717,10 @@ compare_profile = nil)
|
|
|
755
717
|
# Check if any ancestor of the given node preserves whitespace
|
|
756
718
|
def ancestor_preserves_whitespace?(node, preserve_list)
|
|
757
719
|
current = node
|
|
758
|
-
while current.
|
|
720
|
+
while current.is_a?(Canon::Xml::Node) || Canon::XmlParsing.xml_node?(current)
|
|
759
721
|
return true if preserve_list.include?(current.name.downcase)
|
|
760
722
|
|
|
761
|
-
|
|
762
|
-
break if current.is_a?(Nokogiri::XML::Document)
|
|
723
|
+
break if Canon::XmlParsing.document?(current)
|
|
763
724
|
|
|
764
725
|
current = current.parent
|
|
765
726
|
end
|
|
@@ -839,7 +800,7 @@ compare_profile = nil)
|
|
|
839
800
|
end
|
|
840
801
|
|
|
841
802
|
# Check if it's a fragment that contains XML processing instructions
|
|
842
|
-
if node.
|
|
803
|
+
if (node.is_a?(Canon::Xml::Node) || Canon::XmlParsing.xml_node?(node)) && node.children.any? do |child|
|
|
843
804
|
child.is_a?(Nokogiri::XML::ProcessingInstruction) &&
|
|
844
805
|
child.name == "xml"
|
|
845
806
|
end
|
|
@@ -48,9 +48,8 @@ module Canon
|
|
|
48
48
|
|
|
49
49
|
# If key exists, check if it's :strict
|
|
50
50
|
return match_options[:comments] == :strict
|
|
51
|
-
elsif match_options.
|
|
51
|
+
elsif match_options.is_a?(ResolvedMatchOptions)
|
|
52
52
|
behavior = behavior_for(dimension)
|
|
53
|
-
# In HTML, only :strict makes comments affect equivalence
|
|
54
53
|
return behavior == :strict
|
|
55
54
|
end
|
|
56
55
|
# Default: comments don't affect equivalence in HTML
|
|
@@ -106,14 +105,8 @@ module Canon
|
|
|
106
105
|
def has_explicit_option?(dimension)
|
|
107
106
|
if match_options.is_a?(Hash)
|
|
108
107
|
match_options.key?(dimension)
|
|
109
|
-
elsif match_options.
|
|
110
|
-
|
|
111
|
-
begin
|
|
112
|
-
match_options[dimension]
|
|
113
|
-
true
|
|
114
|
-
rescue StandardError
|
|
115
|
-
false
|
|
116
|
-
end
|
|
108
|
+
elsif match_options.is_a?(ResolvedMatchOptions)
|
|
109
|
+
!match_options.options[dimension].nil?
|
|
117
110
|
else
|
|
118
111
|
false
|
|
119
112
|
end
|
|
@@ -26,6 +26,14 @@ module Canon
|
|
|
26
26
|
}.freeze
|
|
27
27
|
|
|
28
28
|
class << self
|
|
29
|
+
# Parse JSON from string or return as-is
|
|
30
|
+
#
|
|
31
|
+
# @param obj [String, Hash, Array] JSON string or parsed object
|
|
32
|
+
# @return [Object] Parsed JSON object
|
|
33
|
+
def parse(obj)
|
|
34
|
+
parse_json(obj)
|
|
35
|
+
end
|
|
36
|
+
|
|
29
37
|
# Compare two JSON objects for equivalence
|
|
30
38
|
#
|
|
31
39
|
# @param json1 [String, Hash, Array] First JSON
|
|
@@ -4,42 +4,76 @@ module Canon
|
|
|
4
4
|
module Comparison
|
|
5
5
|
# Single source of truth for cross-backend node type operations.
|
|
6
6
|
#
|
|
7
|
-
# The comparison pipeline handles nodes from
|
|
7
|
+
# The comparison pipeline handles nodes from multiple sources:
|
|
8
8
|
# * Canon::Xml::Node (+ RootNode, ElementNode, TextNode, etc.) —
|
|
9
9
|
# custom DOM built by SAX builder and DataModel.
|
|
10
|
-
# *
|
|
11
|
-
#
|
|
10
|
+
# * Canon::TreeDiff::Core::TreeNode — semantic tree diff nodes.
|
|
11
|
+
# * Backend-specific nodes (Nokogiri or Moxml) — live parsed nodes.
|
|
12
12
|
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
13
|
+
# All type dispatch uses backend-branching (`if XmlBackend.nokogiri?`)
|
|
14
|
+
# rather than `case/when` with constant references. This prevents
|
|
15
|
+
# NameError when Nokogiri constants are undefined under Opal.
|
|
16
|
+
#
|
|
17
|
+
# Every node query in the codebase should go through this module.
|
|
18
|
+
# Do not create private dispatch methods in consumers.
|
|
15
19
|
module NodeInspector
|
|
16
|
-
CANON_TEXT_TYPE = :text
|
|
17
20
|
NOKOGIRI_TEXT_TYPE = defined?(Nokogiri::XML::Node::TEXT_NODE) ? Nokogiri::XML::Node::TEXT_NODE : 3
|
|
18
21
|
|
|
19
|
-
#
|
|
22
|
+
# --- Type predicates ---
|
|
23
|
+
|
|
20
24
|
def self.text_node?(node)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
node.
|
|
25
|
+
return false unless node
|
|
26
|
+
return node.node_type == :text if node.is_a?(Canon::Xml::Node)
|
|
27
|
+
|
|
28
|
+
if XmlBackend.nokogiri?
|
|
29
|
+
node.is_a?(Nokogiri::XML::Text) || node.is_a?(Moxml::Text)
|
|
26
30
|
else
|
|
27
|
-
|
|
31
|
+
node.is_a?(Moxml::Text)
|
|
28
32
|
end
|
|
29
33
|
end
|
|
30
34
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
35
|
+
def self.element_node?(node)
|
|
36
|
+
return false unless node
|
|
37
|
+
return node.node_type == :element if node.is_a?(Canon::Xml::Node)
|
|
38
|
+
|
|
39
|
+
if XmlBackend.nokogiri?
|
|
40
|
+
node.is_a?(Nokogiri::XML::Element) || node.is_a?(Moxml::Element)
|
|
41
|
+
else
|
|
42
|
+
node.is_a?(Moxml::Element)
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def self.comment_node?(node)
|
|
47
|
+
return false unless node
|
|
48
|
+
return node.node_type == :comment if node.is_a?(Canon::Xml::Node)
|
|
49
|
+
|
|
50
|
+
if XmlBackend.nokogiri?
|
|
51
|
+
return true if node.is_a?(Nokogiri::XML::Node) && node.comment?
|
|
52
|
+
|
|
53
|
+
# HTML comments are parsed as TEXT nodes by Nokogiri
|
|
54
|
+
if node.is_a?(Nokogiri::XML::Node) && node.text?
|
|
55
|
+
text_stripped = text_content(node).to_s.strip.gsub("\\", "")
|
|
56
|
+
return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
|
|
57
|
+
end
|
|
58
|
+
false
|
|
38
59
|
else
|
|
39
|
-
node.
|
|
60
|
+
node.is_a?(Moxml::Comment)
|
|
40
61
|
end
|
|
41
62
|
end
|
|
42
63
|
|
|
64
|
+
def self.document?(node)
|
|
65
|
+
return node.node_type == :root if node.is_a?(Canon::Xml::Node)
|
|
66
|
+
|
|
67
|
+
XmlParsing.document?(node)
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def self.document_fragment?(node)
|
|
71
|
+
return false unless node
|
|
72
|
+
return false unless node.is_a?(Canon::Xml::Nodes::RootNode)
|
|
73
|
+
|
|
74
|
+
node.fragment?
|
|
75
|
+
end
|
|
76
|
+
|
|
43
77
|
# True when +node+ is a text node whose content is whitespace-only.
|
|
44
78
|
# Empty-string text nodes return false — those represent genuine
|
|
45
79
|
# empty-vs-content asymmetry, not pretty-print indentation.
|
|
@@ -50,62 +84,120 @@ module Canon
|
|
|
50
84
|
!text.empty? && text.strip.empty?
|
|
51
85
|
end
|
|
52
86
|
|
|
53
|
-
#
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
87
|
+
# --- Noise classification ---
|
|
88
|
+
|
|
89
|
+
def self.noise_dimension_for(node)
|
|
90
|
+
if whitespace_only_text?(node)
|
|
91
|
+
:whitespace_adjacency
|
|
92
|
+
elsif comment_node?(node)
|
|
93
|
+
:comments
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def self.noise_node?(node)
|
|
98
|
+
!noise_dimension_for(node).nil?
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# --- Node queries ---
|
|
102
|
+
|
|
103
|
+
# Unified node name extraction across all node types.
|
|
104
|
+
def self.name(node)
|
|
105
|
+
return nil unless node
|
|
106
|
+
return node.name if node.is_a?(Canon::Xml::Node)
|
|
107
|
+
return node.label if node.is_a?(Canon::TreeDiff::Core::TreeNode)
|
|
108
|
+
|
|
109
|
+
XmlParsing.name(node)
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
# Unified parent access across all node types.
|
|
113
|
+
def self.parent(node)
|
|
114
|
+
return nil unless node
|
|
115
|
+
return node.parent if node.is_a?(Canon::Xml::Node)
|
|
116
|
+
return node.parent if node.is_a?(Canon::TreeDiff::Core::TreeNode)
|
|
117
|
+
|
|
118
|
+
XmlParsing.parent(node)
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
# Unified children access across all node types.
|
|
122
|
+
def self.children(node)
|
|
123
|
+
return [] unless node
|
|
124
|
+
return node.children if node.is_a?(Canon::Xml::Node)
|
|
125
|
+
return node.children || [] if node.is_a?(Canon::TreeDiff::Core::TreeNode)
|
|
126
|
+
|
|
127
|
+
XmlParsing.children(node)
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
# Extract the text content of +node+ as a String.
|
|
131
|
+
def self.text_content(node)
|
|
57
132
|
case node
|
|
133
|
+
when Canon::Xml::Nodes::TextNode
|
|
134
|
+
node.value.to_s
|
|
58
135
|
when Canon::Xml::Node
|
|
59
|
-
node.
|
|
60
|
-
when
|
|
61
|
-
|
|
136
|
+
node.text_content.to_s
|
|
137
|
+
when Moxml::Text
|
|
138
|
+
node.content.to_s
|
|
139
|
+
else
|
|
140
|
+
XmlParsing.text_content(node).to_s
|
|
141
|
+
end
|
|
142
|
+
end
|
|
62
143
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
144
|
+
# Unified node type that always returns a symbol.
|
|
145
|
+
# Returns nil for unrecognised nodes.
|
|
146
|
+
def self.node_type(node)
|
|
147
|
+
return nil unless node
|
|
148
|
+
return node.node_type if node.is_a?(Canon::Xml::Node)
|
|
149
|
+
|
|
150
|
+
if node.is_a?(Canon::TreeDiff::Core::TreeNode)
|
|
151
|
+
node.type&.to_sym
|
|
69
152
|
else
|
|
70
|
-
|
|
153
|
+
XmlParsing.node_type(node)
|
|
71
154
|
end
|
|
72
155
|
end
|
|
73
156
|
|
|
74
|
-
#
|
|
75
|
-
def self.
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
157
|
+
# Unified attribute value access.
|
|
158
|
+
def self.attribute_value(node, attr_name)
|
|
159
|
+
return nil unless node
|
|
160
|
+
|
|
161
|
+
if node.is_a?(Canon::Xml::Nodes::ElementNode)
|
|
162
|
+
attr = node.attribute_nodes.find { |a| a.name == attr_name.to_s }
|
|
163
|
+
attr&.value
|
|
164
|
+
elsif node.is_a?(Canon::Xml::Node)
|
|
165
|
+
nil
|
|
81
166
|
else
|
|
82
|
-
|
|
167
|
+
XmlParsing.attribute_value(node, attr_name)
|
|
168
|
+
end
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
# Unified namespace URI access.
|
|
172
|
+
def self.namespace_uri(node)
|
|
173
|
+
return nil unless node
|
|
174
|
+
|
|
175
|
+
if node.is_a?(Canon::Xml::Node)
|
|
176
|
+
node.is_a?(Canon::Xml::Nodes::ElementNode) ? node.namespace_uri : nil
|
|
177
|
+
else
|
|
178
|
+
XmlParsing.namespace_uri(node)
|
|
83
179
|
end
|
|
84
180
|
end
|
|
85
181
|
|
|
86
182
|
# Extract parse-time errors carried on a node or its owning document.
|
|
87
|
-
# Returns an Array of Strings.
|
|
88
183
|
def self.parse_errors(node)
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
184
|
+
return [] if node.nil?
|
|
185
|
+
return Array(node.parse_errors).map(&:to_s) if node.is_a?(Canon::Xml::Node)
|
|
186
|
+
|
|
187
|
+
if XmlBackend.nokogiri?
|
|
188
|
+
if node.is_a?(Nokogiri::XML::Document) || node.is_a?(Nokogiri::HTML5::Document)
|
|
189
|
+
Array(node.errors).map(&:to_s)
|
|
190
|
+
else
|
|
191
|
+
[]
|
|
192
|
+
end
|
|
97
193
|
else
|
|
98
194
|
[]
|
|
99
195
|
end
|
|
100
196
|
end
|
|
101
197
|
|
|
102
|
-
#
|
|
103
|
-
# recognised DOM backend type or has no parent.
|
|
198
|
+
# Deprecated: use NodeInspector.parent instead.
|
|
104
199
|
def self.parent_of(node)
|
|
105
|
-
|
|
106
|
-
when Canon::Xml::Node, Nokogiri::XML::Node
|
|
107
|
-
node.parent
|
|
108
|
-
end
|
|
200
|
+
parent(node)
|
|
109
201
|
end
|
|
110
202
|
end
|
|
111
203
|
end
|
|
@@ -125,20 +125,18 @@ module Canon
|
|
|
125
125
|
# @param doc2 [Object] Second XML document
|
|
126
126
|
# @return [Array<String>] Preprocessed strings
|
|
127
127
|
def preprocess_xml(doc1, doc2)
|
|
128
|
-
# Serialize XML to string
|
|
129
|
-
# Use XmlNodeComparison's serializer for Canon::Xml::Node
|
|
130
128
|
xml1 = if doc1.is_a?(Canon::Xml::Node)
|
|
131
129
|
XmlNodeComparison.serialize_node_to_xml(doc1)
|
|
132
|
-
elsif
|
|
133
|
-
doc1
|
|
130
|
+
elsif Canon::XmlParsing.xml_node?(doc1)
|
|
131
|
+
Canon::XmlParsing.serialize(doc1)
|
|
134
132
|
else
|
|
135
133
|
doc1.to_s
|
|
136
134
|
end
|
|
137
135
|
|
|
138
136
|
xml2 = if doc2.is_a?(Canon::Xml::Node)
|
|
139
137
|
XmlNodeComparison.serialize_node_to_xml(doc2)
|
|
140
|
-
elsif
|
|
141
|
-
doc2
|
|
138
|
+
elsif Canon::XmlParsing.xml_node?(doc2)
|
|
139
|
+
Canon::XmlParsing.serialize(doc2)
|
|
142
140
|
else
|
|
143
141
|
doc2.to_s
|
|
144
142
|
end
|
|
@@ -167,7 +165,7 @@ module Canon
|
|
|
167
165
|
XmlNodeComparison.serialize_node_to_xml(doc1)
|
|
168
166
|
elsif doc1.is_a?(Nokogiri::XML::DocumentFragment)
|
|
169
167
|
doc1.to_s
|
|
170
|
-
elsif
|
|
168
|
+
elsif Canon::XmlParsing.xml_node?(doc1)
|
|
171
169
|
doc1.to_html
|
|
172
170
|
else
|
|
173
171
|
doc1.to_s
|
|
@@ -177,7 +175,7 @@ module Canon
|
|
|
177
175
|
XmlNodeComparison.serialize_node_to_xml(doc2)
|
|
178
176
|
elsif doc2.is_a?(Nokogiri::XML::DocumentFragment)
|
|
179
177
|
doc2.to_s
|
|
180
|
-
elsif
|
|
178
|
+
elsif Canon::XmlParsing.xml_node?(doc2)
|
|
181
179
|
doc2.to_html
|
|
182
180
|
else
|
|
183
181
|
doc2.to_s
|