canon 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +83 -22
- data/docs/Gemfile +1 -0
- data/docs/_config.yml +90 -1
- data/docs/advanced/diff-classification.adoc +196 -24
- data/docs/features/match-options/index.adoc +239 -1
- data/lib/canon/comparison/format_detector.rb +2 -1
- data/lib/canon/comparison/html_comparator.rb +19 -8
- data/lib/canon/comparison/html_compare_profile.rb +8 -2
- data/lib/canon/comparison/markup_comparator.rb +109 -2
- data/lib/canon/comparison/match_options/base_resolver.rb +7 -0
- data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +15 -7
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +108 -0
- data/lib/canon/comparison/xml_comparator/node_parser.rb +10 -5
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +14 -7
- data/lib/canon/comparison/xml_comparator.rb +240 -23
- data/lib/canon/comparison/xml_node_comparison.rb +25 -3
- data/lib/canon/diff/diff_classifier.rb +119 -5
- data/lib/canon/diff/formatting_detector.rb +1 -1
- data/lib/canon/diff/xml_serialization_formatter.rb +153 -0
- data/lib/canon/rspec_matchers.rb +37 -8
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +24 -13
- metadata +4 -78
- data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +0 -250
- data/false_positive_analysis.txt +0 -0
- data/file1.html +0 -1
- data/file2.html +0 -1
- data/old-docs/ADVANCED_TOPICS.adoc +0 -20
- data/old-docs/BASIC_USAGE.adoc +0 -16
- data/old-docs/CHARACTER_VISUALIZATION.adoc +0 -567
- data/old-docs/CLI.adoc +0 -497
- data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/old-docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/old-docs/DIFF_FORMATTING.adoc +0 -540
- data/old-docs/DIFF_PARAMETERS.adoc +0 -261
- data/old-docs/DOM_DIFF.adoc +0 -1017
- data/old-docs/ENV_CONFIG.adoc +0 -876
- data/old-docs/FORMATS.adoc +0 -867
- data/old-docs/INPUT_VALIDATION.adoc +0 -477
- data/old-docs/MATCHER_BEHAVIOR.adoc +0 -90
- data/old-docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/old-docs/MATCH_OPTIONS.adoc +0 -912
- data/old-docs/MODES.adoc +0 -432
- data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/old-docs/OPTIONS.adoc +0 -1387
- data/old-docs/PREPROCESSING.adoc +0 -491
- data/old-docs/README.old.adoc +0 -2831
- data/old-docs/RSPEC.adoc +0 -814
- data/old-docs/RUBY_API.adoc +0 -485
- data/old-docs/SEMANTIC_DIFF_REPORT.adoc +0 -646
- data/old-docs/SEMANTIC_TREE_DIFF.adoc +0 -765
- data/old-docs/STRING_COMPARE.adoc +0 -345
- data/old-docs/TMP.adoc +0 -3384
- data/old-docs/TREE_DIFF.adoc +0 -1080
- data/old-docs/UNDERSTANDING_CANON.adoc +0 -17
- data/old-docs/VERBOSE.adoc +0 -482
- data/old-docs/VISUALIZATION_MAP.adoc +0 -625
- data/old-docs/WHITESPACE_TREATMENT.adoc +0 -1155
- data/scripts/analyze_current_state.rb +0 -85
- data/scripts/analyze_false_positives.rb +0 -114
- data/scripts/analyze_remaining_failures.rb +0 -105
- data/scripts/compare_current_failures.rb +0 -95
- data/scripts/compare_dom_tree_diff.rb +0 -158
- data/scripts/compare_failures.rb +0 -151
- data/scripts/debug_attribute_extraction.rb +0 -66
- data/scripts/debug_blocks_839.rb +0 -115
- data/scripts/debug_meta_matching.rb +0 -52
- data/scripts/debug_p_matching.rb +0 -192
- data/scripts/debug_signature_matching.rb +0 -118
- data/scripts/debug_sourcecode_124.rb +0 -32
- data/scripts/debug_whitespace_sensitive.rb +0 -192
- data/scripts/extract_false_positives.rb +0 -138
- data/scripts/find_actual_false_positives.rb +0 -125
- data/scripts/investigate_all_false_positives.rb +0 -161
- data/scripts/investigate_batch1.rb +0 -127
- data/scripts/investigate_classification.rb +0 -150
- data/scripts/investigate_classification_detailed.rb +0 -190
- data/scripts/investigate_common_failures.rb +0 -342
- data/scripts/investigate_false_negative.rb +0 -80
- data/scripts/investigate_false_positive.rb +0 -83
- data/scripts/investigate_false_positives.rb +0 -227
- data/scripts/investigate_false_positives_batch.rb +0 -163
- data/scripts/investigate_mixed_content.rb +0 -125
- data/scripts/investigate_remaining_16.rb +0 -214
- data/scripts/run_single_test.rb +0 -29
- data/scripts/test_all_false_positives.rb +0 -95
- data/scripts/test_attribute_details.rb +0 -61
- data/scripts/test_both_algorithms.rb +0 -49
- data/scripts/test_both_simple.rb +0 -49
- data/scripts/test_enhanced_semantic_output.rb +0 -125
- data/scripts/test_readme_examples.rb +0 -131
- data/scripts/test_semantic_tree_diff.rb +0 -99
- data/scripts/test_semantic_ux_improvements.rb +0 -135
- data/scripts/test_single_false_positive.rb +0 -119
- data/scripts/test_size_limits.rb +0 -99
- data/test_html_1.html +0 -21
- data/test_html_2.html +0 -21
- data/test_nokogiri.rb +0 -33
- data/test_normalize.rb +0 -45
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
|
|
4
|
+
module Diff
|
|
5
|
+
# Detects and classifies XML serialization-level formatting differences.
|
|
6
|
+
#
|
|
7
|
+
# Serialization-level formatting differences are differences in XML syntax
|
|
8
|
+
# that do not affect the semantic content of the document. These differences
|
|
9
|
+
# arise from different valid ways to serialize the same semantic content.
|
|
10
|
+
#
|
|
11
|
+
# These differences are ALWAYS non-normative (formatting-only) regardless
|
|
12
|
+
# of match options, because they are purely syntactic variations.
|
|
13
|
+
#
|
|
14
|
+
# Examples:
|
|
15
|
+
# - Self-closing vs explicit closing tags: <tag/> vs <tag></tag>
|
|
16
|
+
# - Attribute quote style: attr="value" vs attr='value' (parser-normalized)
|
|
17
|
+
# - Whitespace within tags: <tag a="1" b="2"> vs <tag  a="1"   b="2" > (parser-normalized)
|
|
18
|
+
#
|
|
19
|
+
# Note: Some serialization differences are normalized away by XML parsers
|
|
20
|
+
# (attribute quotes, tag spacing). This class focuses on differences that
|
|
21
|
+
# survive parsing and comparison, such as self-closing vs explicit closing.
|
|
22
|
+
class XmlSerializationFormatter
  # Accessors probed, in priority order, when reading the text of a node that
  # is not a Canon::Xml::Nodes::TextNode. `content` is deliberately tried
  # before `text` because Moxml::Text#text returns "".
  TEXT_READERS = %i[text_content content text value].freeze
  private_constant :TEXT_READERS

  # Detect if a diff node represents an XML serialization formatting difference.
  #
  # Serialization formatting differences are ALWAYS non-normative because they
  # represent different valid serializations of the same semantic content.
  # Only the :text_content dimension is currently inspected; every other
  # dimension yields false.
  #
  # @param diff_node [DiffNode] The diff node to check (must expose
  #   `dimension`, `node1` and `node2`)
  # @return [Boolean] true if this is a serialization formatting difference
  def self.serialization_formatting?(diff_node)
    return false unless diff_node.dimension == :text_content

    empty_text_content_serialization_diff?(diff_node)
  end

  # Check if a text_content difference is from XML serialization format.
  #
  # Specifically detects self-closing tags (<tag/>) vs explicit closing tags
  # (<tag></tag>), which create different text node structures:
  # - Self-closing: no text node (nil)
  # - Explicit closing: empty or whitespace-only text node ("", " ", "\n", etc.)
  #
  # The difference qualifies when at least one side is present and every
  # present side is a text node whose content is blank. A side whose content
  # cannot be read never qualifies, so a real difference is not silently
  # downgraded to "formatting only".
  #
  # @param diff_node [DiffNode] The diff node to check
  # @return [Boolean] true if this is a serialization formatting difference
  def self.empty_text_content_serialization_diff?(diff_node)
    return false unless diff_node.dimension == :text_content

    # A nil side means "no text node on that side" (e.g. a self-closing tag).
    present = [diff_node.node1, diff_node.node2].compact

    # Both sides absent: there is no actual difference to classify.
    return false if present.empty?

    present.all? { |node| blank_text_node?(node) }
  end

  # Check if a node is a text node whose content is blank.
  #
  # @param node [Object] The node to check
  # @return [Boolean] true only if the node is text-like AND its content was
  #   read successfully AND that content is blank
  def self.blank_text_node?(node)
    return false unless text_node?(node)

    begin
      blank?(read_text_content(node))
    rescue StandardError
      # Unreadable content cannot be proven blank, so it must not be reported
      # as a serialization-only difference.
      false
    end
  end

  # Check if a value is blank (nil or whitespace-only)
  # @param value [String, nil] Value to check
  # @return [Boolean] true if blank
  def self.blank?(value)
    value.nil? ||
      (value.respond_to?(:empty?) && value.empty?) ||
      (value.respond_to?(:strip) && value.strip.empty?)
  end

  # Check if a node is a text node
  #
  # Recognised forms: Canon::Xml::Nodes::TextNode, Moxml::Text, any object
  # whose `node_type` is Nokogiri's TEXT_NODE integer or the symbol :text,
  # plain Strings, and (as a last resort, e.g. test doubles) any object
  # that responds to `value`.
  #
  # @param node [Object] The node to check
  # @return [Boolean] true if the node is a text node
  def self.text_node?(node)
    return false if node.nil?

    return true if node.is_a?(Canon::Xml::Nodes::TextNode)

    # Moxml::Text (check before generic node_type check)
    return true if node.is_a?(Moxml::Text)

    if node.respond_to?(:node_type)
      type = node.node_type
      # Nokogiri reports an Integer constant, Moxml adapters report a Symbol.
      return true if type.is_a?(Integer) &&
        type == Nokogiri::XML::Node::TEXT_NODE
      return true if type == :text
    end

    return true if node.is_a?(String)

    # Test doubles or objects with a text node-like interface
    node.respond_to?(:value)
  end

  # Extract text content from a node, swallowing read errors.
  #
  # Keeps the original nil-on-failure contract. The detection path uses
  # read_text_content instead, so that a failed read is not mistaken for
  # blank content.
  #
  # @param node [Object] The node to extract text from
  # @return [String, nil] The text content, or nil if the node is nil or
  #   extraction raised
  def self.extract_text_content(node)
    read_text_content(node)
  rescue StandardError
    nil
  end

  # Read text content from a node, letting accessor errors propagate.
  #
  # @param node [Object] The node to read text from
  # @return [String, nil] The text content (nil for a nil node, or when the
  #   chosen accessor itself returns nil)
  # @raise [StandardError] whatever the node's accessor raises
  def self.read_text_content(node)
    return nil if node.nil?

    # For TextNode with value attribute (Canon::Xml::Nodes::TextNode)
    if node.respond_to?(:value) && node.is_a?(Canon::Xml::Nodes::TextNode)
      return node.value
    end

    reader = TEXT_READERS.find { |name| node.respond_to?(name) }

    # Strings and any other object without a text accessor fall back to to_s.
    reader ? node.public_send(reader) : node.to_s
  end

  private_class_method :blank?, :text_node?, :extract_text_content,
                       :read_text_content, :blank_text_node?,
                       :empty_text_content_serialization_diff?
end
|
|
152
|
+
end
|
|
153
|
+
end
|
data/lib/canon/rspec_matchers.rb
CHANGED
|
@@ -63,6 +63,15 @@ module Canon
|
|
|
63
63
|
self
|
|
64
64
|
end
|
|
65
65
|
|
|
66
|
+
# Chain method for setting match options
|
|
67
|
+
# @param match_opts [Hash] match options
|
|
68
|
+
# @return [SerializationMatcher] self for chaining
|
|
69
|
+
def with_match(**match_opts)
|
|
70
|
+
@match ||= {}
|
|
71
|
+
@match = @match.merge(match_opts)
|
|
72
|
+
self
|
|
73
|
+
end
|
|
74
|
+
|
|
66
75
|
def matches?(target)
|
|
67
76
|
@target = target
|
|
68
77
|
|
|
@@ -252,12 +261,22 @@ module Canon
|
|
|
252
261
|
diff_algorithm: diff_algorithm)
|
|
253
262
|
end
|
|
254
263
|
|
|
255
|
-
def be_yaml_equivalent_to(expected
|
|
256
|
-
|
|
264
|
+
def be_yaml_equivalent_to(expected, match_profile: nil, match: nil,
|
|
265
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
266
|
+
SerializationMatcher.new(expected, :yaml,
|
|
267
|
+
match_profile: match_profile,
|
|
268
|
+
match: match,
|
|
269
|
+
preprocessing: preprocessing,
|
|
270
|
+
diff_algorithm: diff_algorithm)
|
|
257
271
|
end
|
|
258
272
|
|
|
259
|
-
def be_json_equivalent_to(expected
|
|
260
|
-
|
|
273
|
+
def be_json_equivalent_to(expected, match_profile: nil, match: nil,
|
|
274
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
275
|
+
SerializationMatcher.new(expected, :json,
|
|
276
|
+
match_profile: match_profile,
|
|
277
|
+
match: match,
|
|
278
|
+
preprocessing: preprocessing,
|
|
279
|
+
diff_algorithm: diff_algorithm)
|
|
261
280
|
end
|
|
262
281
|
|
|
263
282
|
def be_html_equivalent_to(expected, match_profile: nil, match: nil,
|
|
@@ -287,12 +306,22 @@ module Canon
|
|
|
287
306
|
diff_algorithm: diff_algorithm)
|
|
288
307
|
end
|
|
289
308
|
|
|
290
|
-
def be_equivalent_to(expected
|
|
291
|
-
|
|
309
|
+
def be_equivalent_to(expected, match_profile: nil, match: nil,
|
|
310
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
311
|
+
SerializationMatcher.new(expected, nil,
|
|
312
|
+
match_profile: match_profile,
|
|
313
|
+
match: match,
|
|
314
|
+
preprocessing: preprocessing,
|
|
315
|
+
diff_algorithm: diff_algorithm)
|
|
292
316
|
end
|
|
293
317
|
|
|
294
|
-
def be_string_equivalent_to(expected
|
|
295
|
-
|
|
318
|
+
def be_string_equivalent_to(expected, match_profile: nil, match: nil,
|
|
319
|
+
preprocessing: nil, diff_algorithm: nil)
|
|
320
|
+
SerializationMatcher.new(expected, :string,
|
|
321
|
+
match_profile: match_profile,
|
|
322
|
+
match: match,
|
|
323
|
+
preprocessing: preprocessing,
|
|
324
|
+
diff_algorithm: diff_algorithm)
|
|
296
325
|
end
|
|
297
326
|
|
|
298
327
|
if defined?(::RSpec) && ::RSpec.respond_to?(:configure)
|
data/lib/canon/version.rb
CHANGED
data/lib/canon/xml/data_model.rb
CHANGED
|
@@ -18,8 +18,9 @@ module Canon
|
|
|
18
18
|
# Build XPath data model from XML string
|
|
19
19
|
#
|
|
20
20
|
# @param xml_string [String] XML content to parse
|
|
21
|
+
# @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
|
|
21
22
|
# @return [Nodes::RootNode] Root of the data model tree
|
|
22
|
-
def self.from_xml(xml_string)
|
|
23
|
+
def self.from_xml(xml_string, preserve_whitespace: false)
|
|
23
24
|
# Parse with Nokogiri
|
|
24
25
|
doc = Nokogiri::XML(xml_string) do |config|
|
|
25
26
|
config.nonet # Disable network access
|
|
@@ -30,7 +31,7 @@ module Canon
|
|
|
30
31
|
check_for_relative_namespace_uris(doc)
|
|
31
32
|
|
|
32
33
|
# Convert to XPath data model
|
|
33
|
-
build_from_nokogiri(doc)
|
|
34
|
+
build_from_nokogiri(doc, preserve_whitespace: preserve_whitespace)
|
|
34
35
|
end
|
|
35
36
|
|
|
36
37
|
# Alias for compatibility with base class interface
|
|
@@ -74,19 +75,21 @@ module Canon
|
|
|
74
75
|
|
|
75
76
|
# Build XPath data model from Nokogiri document or fragment
|
|
76
77
|
# rubocop:disable Metrics/MethodLength
|
|
77
|
-
def self.build_from_nokogiri(nokogiri_doc)
|
|
78
|
+
def self.build_from_nokogiri(nokogiri_doc, preserve_whitespace: false)
|
|
78
79
|
root = Nodes::RootNode.new
|
|
79
80
|
|
|
80
81
|
if nokogiri_doc.respond_to?(:root) && nokogiri_doc.root
|
|
81
82
|
# For Documents (XML, HTML4, HTML5, Moxml): process the root element
|
|
82
|
-
root.add_child(build_element_node(nokogiri_doc.root
|
|
83
|
+
root.add_child(build_element_node(nokogiri_doc.root,
|
|
84
|
+
preserve_whitespace: preserve_whitespace))
|
|
83
85
|
|
|
84
86
|
# Process PIs and comments outside doc element
|
|
85
87
|
nokogiri_doc.children.each do |child|
|
|
86
88
|
next if child == nokogiri_doc.root
|
|
87
89
|
next if child.is_a?(Nokogiri::XML::DTD)
|
|
88
90
|
|
|
89
|
-
node = build_node_from_nokogiri(child
|
|
91
|
+
node = build_node_from_nokogiri(child,
|
|
92
|
+
preserve_whitespace: preserve_whitespace)
|
|
90
93
|
root.add_child(node) if node
|
|
91
94
|
end
|
|
92
95
|
else
|
|
@@ -95,7 +98,8 @@ module Canon
|
|
|
95
98
|
nokogiri_doc.children.each do |child|
|
|
96
99
|
next if child.is_a?(Nokogiri::XML::DTD)
|
|
97
100
|
|
|
98
|
-
node = build_node_from_nokogiri(child
|
|
101
|
+
node = build_node_from_nokogiri(child,
|
|
102
|
+
preserve_whitespace: preserve_whitespace)
|
|
99
103
|
root.add_child(node) if node
|
|
100
104
|
end
|
|
101
105
|
end
|
|
@@ -104,12 +108,15 @@ module Canon
|
|
|
104
108
|
end
|
|
105
109
|
|
|
106
110
|
# Build node from Nokogiri node
|
|
107
|
-
def self.build_node_from_nokogiri(nokogiri_node
|
|
111
|
+
def self.build_node_from_nokogiri(nokogiri_node,
|
|
112
|
+
preserve_whitespace: false)
|
|
108
113
|
case nokogiri_node
|
|
109
114
|
when Nokogiri::XML::Element
|
|
110
|
-
build_element_node(nokogiri_node
|
|
115
|
+
build_element_node(nokogiri_node,
|
|
116
|
+
preserve_whitespace: preserve_whitespace)
|
|
111
117
|
when Nokogiri::XML::Text
|
|
112
|
-
build_text_node(nokogiri_node
|
|
118
|
+
build_text_node(nokogiri_node,
|
|
119
|
+
preserve_whitespace: preserve_whitespace)
|
|
113
120
|
when Nokogiri::XML::Comment
|
|
114
121
|
build_comment_node(nokogiri_node)
|
|
115
122
|
when Nokogiri::XML::ProcessingInstruction
|
|
@@ -119,7 +126,7 @@ module Canon
|
|
|
119
126
|
|
|
120
127
|
# Build element node from Nokogiri element
|
|
121
128
|
# rubocop:disable Metrics/MethodLength
|
|
122
|
-
def self.build_element_node(nokogiri_element)
|
|
129
|
+
def self.build_element_node(nokogiri_element, preserve_whitespace: false)
|
|
123
130
|
element = Nodes::ElementNode.new(
|
|
124
131
|
name: nokogiri_element.name,
|
|
125
132
|
namespace_uri: nokogiri_element.namespace&.href,
|
|
@@ -134,7 +141,8 @@ module Canon
|
|
|
134
141
|
|
|
135
142
|
# Build child nodes
|
|
136
143
|
nokogiri_element.children.each do |child|
|
|
137
|
-
node = build_node_from_nokogiri(child
|
|
144
|
+
node = build_node_from_nokogiri(child,
|
|
145
|
+
preserve_whitespace: preserve_whitespace)
|
|
138
146
|
element.add_child(node) if node
|
|
139
147
|
end
|
|
140
148
|
|
|
@@ -195,13 +203,16 @@ module Canon
|
|
|
195
203
|
end
|
|
196
204
|
|
|
197
205
|
# Build text node from Nokogiri text node
|
|
198
|
-
def self.build_text_node(nokogiri_text)
|
|
206
|
+
def self.build_text_node(nokogiri_text, preserve_whitespace: false)
|
|
199
207
|
# XML text nodes: preserve all content including whitespace
|
|
200
208
|
# Unlike HTML, XML treats all whitespace as significant
|
|
201
209
|
content = nokogiri_text.content
|
|
202
210
|
|
|
203
211
|
# Skip empty text nodes between elements (common formatting whitespace)
|
|
204
|
-
|
|
212
|
+
# UNLESS preserve_whitespace is true (for structural_whitespace: :strict)
|
|
213
|
+
if !preserve_whitespace && content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element)
|
|
214
|
+
return nil
|
|
215
|
+
end
|
|
205
216
|
|
|
206
217
|
# Nokogiri already handles CDATA conversion and entity resolution
|
|
207
218
|
Nodes::TextNode.new(value: content)
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: canon
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.8
|
|
4
|
+
version: 0.1.10
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01-
|
|
11
|
+
date: 2026-01-21 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: diff-lcs
|
|
@@ -174,7 +174,6 @@ files:
|
|
|
174
174
|
- docs/internals/diffnode-enrichment.adoc
|
|
175
175
|
- docs/internals/index.adoc
|
|
176
176
|
- docs/lychee.toml
|
|
177
|
-
- docs/plans/2025-01-17-html-parser-selection-fix.adoc
|
|
178
177
|
- docs/reference/cli-options.adoc
|
|
179
178
|
- docs/reference/environment-variables.adoc
|
|
180
179
|
- docs/reference/index.adoc
|
|
@@ -191,9 +190,6 @@ files:
|
|
|
191
190
|
- docs/understanding/formats/yaml.adoc
|
|
192
191
|
- docs/understanding/index.adoc
|
|
193
192
|
- exe/canon
|
|
194
|
-
- false_positive_analysis.txt
|
|
195
|
-
- file1.html
|
|
196
|
-
- file2.html
|
|
197
193
|
- lib/canon.rb
|
|
198
194
|
- lib/canon/cache.rb
|
|
199
195
|
- lib/canon/cli.rb
|
|
@@ -230,6 +226,7 @@ files:
|
|
|
230
226
|
- lib/canon/comparison/strategies/base_match_strategy.rb
|
|
231
227
|
- lib/canon/comparison/strategies/match_strategy_factory.rb
|
|
232
228
|
- lib/canon/comparison/strategies/semantic_tree_match_strategy.rb
|
|
229
|
+
- lib/canon/comparison/whitespace_sensitivity.rb
|
|
233
230
|
- lib/canon/comparison/xml_comparator.rb
|
|
234
231
|
- lib/canon/comparison/xml_comparator/attribute_comparator.rb
|
|
235
232
|
- lib/canon/comparison/xml_comparator/attribute_filter.rb
|
|
@@ -260,6 +257,7 @@ files:
|
|
|
260
257
|
- lib/canon/diff/formatting_detector.rb
|
|
261
258
|
- lib/canon/diff/node_serializer.rb
|
|
262
259
|
- lib/canon/diff/path_builder.rb
|
|
260
|
+
- lib/canon/diff/xml_serialization_formatter.rb
|
|
263
261
|
- lib/canon/diff_formatter.rb
|
|
264
262
|
- lib/canon/diff_formatter/by_line/base_formatter.rb
|
|
265
263
|
- lib/canon/diff_formatter/by_line/html_formatter.rb
|
|
@@ -344,79 +342,7 @@ files:
|
|
|
344
342
|
- lib/canon/xml/whitespace_normalizer.rb
|
|
345
343
|
- lib/canon/xml/xml_base_handler.rb
|
|
346
344
|
- lib/xml-c14n.rb
|
|
347
|
-
- old-docs/ADVANCED_TOPICS.adoc
|
|
348
|
-
- old-docs/BASIC_USAGE.adoc
|
|
349
|
-
- old-docs/CHARACTER_VISUALIZATION.adoc
|
|
350
|
-
- old-docs/CLI.adoc
|
|
351
|
-
- old-docs/CUSTOMIZING_BEHAVIOR.adoc
|
|
352
|
-
- old-docs/DIFF_ARCHITECTURE.adoc
|
|
353
|
-
- old-docs/DIFF_FORMATTING.adoc
|
|
354
|
-
- old-docs/DIFF_PARAMETERS.adoc
|
|
355
|
-
- old-docs/DOM_DIFF.adoc
|
|
356
|
-
- old-docs/ENV_CONFIG.adoc
|
|
357
|
-
- old-docs/FORMATS.adoc
|
|
358
|
-
- old-docs/INPUT_VALIDATION.adoc
|
|
359
|
-
- old-docs/MATCHER_BEHAVIOR.adoc
|
|
360
|
-
- old-docs/MATCH_ARCHITECTURE.adoc
|
|
361
|
-
- old-docs/MATCH_OPTIONS.adoc
|
|
362
|
-
- old-docs/MODES.adoc
|
|
363
|
-
- old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc
|
|
364
|
-
- old-docs/OPTIONS.adoc
|
|
365
|
-
- old-docs/PREPROCESSING.adoc
|
|
366
|
-
- old-docs/README.old.adoc
|
|
367
|
-
- old-docs/RSPEC.adoc
|
|
368
|
-
- old-docs/RUBY_API.adoc
|
|
369
|
-
- old-docs/SEMANTIC_DIFF_REPORT.adoc
|
|
370
|
-
- old-docs/SEMANTIC_TREE_DIFF.adoc
|
|
371
|
-
- old-docs/STRING_COMPARE.adoc
|
|
372
|
-
- old-docs/TMP.adoc
|
|
373
|
-
- old-docs/TREE_DIFF.adoc
|
|
374
|
-
- old-docs/UNDERSTANDING_CANON.adoc
|
|
375
|
-
- old-docs/VERBOSE.adoc
|
|
376
|
-
- old-docs/VISUALIZATION_MAP.adoc
|
|
377
|
-
- old-docs/WHITESPACE_TREATMENT.adoc
|
|
378
|
-
- scripts/analyze_current_state.rb
|
|
379
|
-
- scripts/analyze_false_positives.rb
|
|
380
|
-
- scripts/analyze_remaining_failures.rb
|
|
381
|
-
- scripts/compare_current_failures.rb
|
|
382
|
-
- scripts/compare_dom_tree_diff.rb
|
|
383
|
-
- scripts/compare_failures.rb
|
|
384
|
-
- scripts/debug_attribute_extraction.rb
|
|
385
|
-
- scripts/debug_blocks_839.rb
|
|
386
|
-
- scripts/debug_meta_matching.rb
|
|
387
|
-
- scripts/debug_p_matching.rb
|
|
388
|
-
- scripts/debug_signature_matching.rb
|
|
389
|
-
- scripts/debug_sourcecode_124.rb
|
|
390
|
-
- scripts/debug_whitespace_sensitive.rb
|
|
391
|
-
- scripts/extract_false_positives.rb
|
|
392
|
-
- scripts/find_actual_false_positives.rb
|
|
393
|
-
- scripts/investigate_all_false_positives.rb
|
|
394
|
-
- scripts/investigate_batch1.rb
|
|
395
|
-
- scripts/investigate_classification.rb
|
|
396
|
-
- scripts/investigate_classification_detailed.rb
|
|
397
|
-
- scripts/investigate_common_failures.rb
|
|
398
|
-
- scripts/investigate_false_negative.rb
|
|
399
|
-
- scripts/investigate_false_positive.rb
|
|
400
|
-
- scripts/investigate_false_positives.rb
|
|
401
|
-
- scripts/investigate_false_positives_batch.rb
|
|
402
|
-
- scripts/investigate_mixed_content.rb
|
|
403
|
-
- scripts/investigate_remaining_16.rb
|
|
404
|
-
- scripts/run_single_test.rb
|
|
405
|
-
- scripts/test_all_false_positives.rb
|
|
406
|
-
- scripts/test_attribute_details.rb
|
|
407
|
-
- scripts/test_both_algorithms.rb
|
|
408
|
-
- scripts/test_both_simple.rb
|
|
409
|
-
- scripts/test_enhanced_semantic_output.rb
|
|
410
|
-
- scripts/test_readme_examples.rb
|
|
411
|
-
- scripts/test_semantic_tree_diff.rb
|
|
412
|
-
- scripts/test_semantic_ux_improvements.rb
|
|
413
|
-
- scripts/test_single_false_positive.rb
|
|
414
|
-
- scripts/test_size_limits.rb
|
|
415
345
|
- sig/xml/c14n.rbs
|
|
416
|
-
- test_html_1.html
|
|
417
|
-
- test_html_2.html
|
|
418
|
-
- test_nokogiri.rb
|
|
419
|
-
- test_normalize.rb
|
|
420
346
|
homepage: https://github.com/lutaml/canon
|
|
421
347
|
licenses:
|
|
422
348
|
- BSD-2-Clause
|