canon 0.2.8 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec-opal +7 -0
- data/.rubocop_todo.yml +14 -71
- data/Rakefile +17 -0
- data/lib/canon/cli.rb +1 -1
- data/lib/canon/color_detector.rb +3 -5
- data/lib/canon/comparison/compare_profile.rb +1 -4
- data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +2 -6
- data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +2 -6
- data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +2 -6
- data/lib/canon/comparison/dimensions/comments_dimension.rb +2 -6
- data/lib/canon/comparison/dimensions/element_position_dimension.rb +2 -6
- data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +2 -6
- data/lib/canon/comparison/dimensions/text_content_dimension.rb +3 -5
- data/lib/canon/comparison/format_detector.rb +29 -20
- data/lib/canon/comparison/html_comparator.rb +18 -29
- data/lib/canon/comparison/html_compare_profile.rb +3 -10
- data/lib/canon/comparison/html_parser.rb +1 -1
- data/lib/canon/comparison/json_comparator.rb +8 -0
- data/lib/canon/comparison/node_inspector.rb +146 -80
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +6 -8
- data/lib/canon/comparison/whitespace_sensitivity.rb +55 -193
- data/lib/canon/comparison/xml_comparator/attribute_filter.rb +5 -10
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +4 -4
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +10 -8
- data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +14 -28
- data/lib/canon/comparison/xml_comparator/node_parser.rb +12 -11
- data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +30 -58
- data/lib/canon/comparison/xml_comparator.rb +61 -83
- data/lib/canon/comparison/xml_node_comparison.rb +15 -15
- data/lib/canon/comparison/yaml_comparator.rb +8 -0
- data/lib/canon/comparison.rb +23 -23
- data/lib/canon/config/profile_loader.rb +13 -13
- data/lib/canon/config.rb +29 -5
- data/lib/canon/diff/diff_classifier.rb +7 -41
- data/lib/canon/diff/diff_line.rb +1 -1
- data/lib/canon/diff/diff_node_enricher.rb +22 -24
- data/lib/canon/diff/node_serializer.rb +23 -30
- data/lib/canon/diff/path_builder.rb +24 -37
- data/lib/canon/diff/source_locator.rb +0 -3
- data/lib/canon/diff/xml_serialization_formatter.rb +8 -81
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +7 -7
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +1 -1
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +1 -1
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +2 -2
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +1 -1
- data/lib/canon/diff_formatter/by_line_formatter.rb +1 -1
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +11 -15
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +8 -10
- data/lib/canon/diff_formatter/by_object_formatter.rb +1 -1
- data/lib/canon/diff_formatter/debug_output.rb +12 -24
- data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +2 -2
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +146 -318
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +28 -20
- data/lib/canon/diff_formatter/legend.rb +2 -2
- data/lib/canon/diff_formatter/pretty_diff_formatter.rb +2 -2
- data/lib/canon/diff_formatter/theme.rb +4 -4
- data/lib/canon/diff_formatter.rb +2 -2
- data/lib/canon/formatters/html_formatter.rb +1 -1
- data/lib/canon/formatters/html_formatter_base.rb +1 -1
- data/lib/canon/formatters/xml_formatter.rb +7 -32
- data/lib/canon/html/data_model.rb +1 -1
- data/lib/canon/pretty_printer/html.rb +1 -1
- data/lib/canon/pretty_printer/xml.rb +16 -7
- data/lib/canon/pretty_printer/xml_normalized.rb +9 -3
- data/lib/canon/rspec_matchers.rb +2 -2
- data/lib/canon/tree_diff/adapters/html_adapter.rb +1 -1
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +1 -1
- data/lib/canon/tree_diff/core/tree_node.rb +1 -3
- data/lib/canon/validators/html_validator.rb +1 -1
- data/lib/canon/validators/xml_validator.rb +1 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +131 -137
- data/lib/canon/xml/namespace_helper.rb +5 -0
- data/lib/canon/xml/node.rb +2 -1
- data/lib/canon/xml/nodes/root_node.rb +4 -0
- data/lib/canon/xml/nodes/text_node.rb +6 -1
- data/lib/canon/xml/sax_builder.rb +4 -6
- data/lib/canon/xml_backend.rb +49 -0
- data/lib/canon/xml_parsing.rb +271 -0
- data/lib/canon.rb +3 -1
- data/lib/tasks/benchmark_runner.rb +1 -1
- data/lib/tasks/performance_helpers.rb +1 -1
- metadata +5 -2
data/lib/canon/xml/data_model.rb
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require "nokogiri"
|
|
3
|
+
require "nokogiri" unless RUBY_ENGINE == "opal"
|
|
4
4
|
require "set"
|
|
5
5
|
require_relative "../data_model"
|
|
6
|
+
require_relative "../xml_backend"
|
|
7
|
+
require_relative "../xml_parsing"
|
|
6
8
|
require_relative "nodes/root_node"
|
|
7
9
|
require_relative "nodes/element_node"
|
|
8
10
|
require_relative "nodes/namespace_node"
|
|
@@ -13,115 +15,59 @@ require_relative "nodes/processing_instruction_node"
|
|
|
13
15
|
|
|
14
16
|
module Canon
|
|
15
17
|
module Xml
|
|
16
|
-
# Builds XPath data model from XML
|
|
17
18
|
class DataModel < Canon::DataModel
|
|
18
|
-
# Build XPath data model from XML string
|
|
19
|
-
#
|
|
20
|
-
# @param xml_string [String] XML content to parse
|
|
21
|
-
# @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
|
|
22
|
-
# @return [Nodes::RootNode] Root of the data model tree
|
|
23
19
|
def self.from_xml(xml_string, preserve_whitespace: false)
|
|
24
|
-
# Normalize encoding before parsing
|
|
25
20
|
normalized_xml = normalize_encoding(xml_string)
|
|
26
21
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
result = build_from_nokogiri(doc,
|
|
35
|
-
preserve_whitespace: preserve_whitespace)
|
|
36
|
-
|
|
37
|
-
# Carry libxml's parse errors on the resulting tree so the diff
|
|
38
|
-
# report can surface them (see lutaml/canon#130). libxml's
|
|
39
|
-
# FATAL conditions (e.g. duplicate attributes) silently drop
|
|
40
|
-
# content from the parse tree; without surfacing the error
|
|
41
|
-
# list, downstream diffs describe the partial tree, not the
|
|
42
|
-
# input.
|
|
43
|
-
errors = Array(doc.errors).map(&:to_s)
|
|
44
|
-
result.parse_errors = errors if errors.any?
|
|
45
|
-
|
|
46
|
-
result
|
|
22
|
+
if Canon::XmlBackend.nokogiri?
|
|
23
|
+
from_nokogiri_xml(normalized_xml,
|
|
24
|
+
preserve_whitespace: preserve_whitespace)
|
|
25
|
+
else
|
|
26
|
+
from_moxml_xml(normalized_xml,
|
|
27
|
+
preserve_whitespace: preserve_whitespace)
|
|
28
|
+
end
|
|
47
29
|
end
|
|
48
30
|
|
|
49
|
-
# Normalize XML string encoding to UTF-8
|
|
50
|
-
#
|
|
51
|
-
# Handles cases where:
|
|
52
|
-
# 1. The XML declaration specifies an encoding that doesn't match the actual encoding
|
|
53
|
-
# 2. The string's internal encoding is non-UTF-8 (without a declaration)
|
|
54
|
-
#
|
|
55
|
-
# For case 1, we check if the declared encoding matches the actual bytes.
|
|
56
|
-
# If bytes are valid UTF-8 despite the declaration, we update the declaration to UTF-8.
|
|
57
|
-
#
|
|
58
|
-
# @param xml_string [String] XML string to normalize
|
|
59
|
-
# @return [String] Normalized XML string with UTF-8 encoding
|
|
60
31
|
def self.normalize_encoding(xml_string)
|
|
61
32
|
return xml_string unless xml_string.is_a?(String)
|
|
62
33
|
|
|
63
|
-
# Extract declared encoding from XML declaration
|
|
64
34
|
declared_encoding = extract_xml_encoding(xml_string)
|
|
65
35
|
|
|
66
36
|
if declared_encoding
|
|
67
|
-
# Case 1: XML has a declaration
|
|
68
37
|
if declared_encoding.upcase != "UTF-8"
|
|
69
|
-
# Check if bytes are actually valid UTF-8 despite the declaration
|
|
70
38
|
utf8_reinterpreted = try_utf8_reinterpretation(xml_string)
|
|
71
39
|
if utf8_reinterpreted
|
|
72
|
-
|
|
73
|
-
|
|
40
|
+
return update_xml_declaration(xml_string,
|
|
41
|
+
"UTF-8")
|
|
74
42
|
end
|
|
75
43
|
|
|
76
|
-
# Bytes aren't valid UTF-8 - must really be in declared encoding
|
|
77
44
|
return transcode_to_utf8(xml_string, declared_encoding)
|
|
78
45
|
end
|
|
79
46
|
elsif xml_string.encoding.name != "UTF-8"
|
|
80
|
-
# Case 2: No declaration but string encoding is non-UTF-8
|
|
81
|
-
# First, try to re-interpret bytes as UTF-8 (handles mislabeled strings)
|
|
82
47
|
reinterpreted = try_utf8_reinterpretation(xml_string)
|
|
83
48
|
return reinterpreted if reinterpreted
|
|
84
49
|
|
|
85
|
-
# If re-interpretation fails, try transcoding with the labeled encoding
|
|
86
50
|
return transcode_to_utf8(xml_string, xml_string.encoding.name)
|
|
87
51
|
end
|
|
88
52
|
|
|
89
53
|
xml_string
|
|
90
54
|
end
|
|
91
55
|
|
|
92
|
-
# Update the encoding declaration in an XML string
|
|
93
|
-
#
|
|
94
|
-
# @param xml_string [String] XML string
|
|
95
|
-
# @param new_encoding [String] New encoding to declare
|
|
96
|
-
# @return [String] XML string with updated declaration
|
|
97
56
|
def self.update_xml_declaration(xml_string, new_encoding)
|
|
98
57
|
xml_string.sub(/\bencoding\s*=\s*["'][^"']+["']/i) do |_match|
|
|
99
58
|
%(encoding="#{new_encoding}")
|
|
100
59
|
end
|
|
101
60
|
end
|
|
102
61
|
|
|
103
|
-
# Transcode string to UTF-8
|
|
104
|
-
#
|
|
105
|
-
# @param xml_string [String] String to transcode
|
|
106
|
-
# @param source_encoding [String] Source encoding to interpret bytes as
|
|
107
|
-
# @return [String] UTF-8 transcoded string
|
|
108
62
|
def self.transcode_to_utf8(xml_string, source_encoding)
|
|
109
|
-
# First, check if the bytes are actually valid UTF-8 despite the declared encoding
|
|
110
|
-
# If so, just re-interpret as UTF-8 (common case: declaration is wrong)
|
|
111
63
|
if source_encoding != "UTF-8"
|
|
112
|
-
# Force the bytes to be interpreted as the declared encoding, then check validity
|
|
113
64
|
forced = xml_string.dup.force_encoding(source_encoding)
|
|
114
65
|
if forced.valid_encoding?
|
|
115
|
-
# Now check if the same bytes are valid UTF-8
|
|
116
66
|
utf8_check = xml_string.dup.force_encoding("UTF-8")
|
|
117
67
|
if utf8_check.valid_encoding?
|
|
118
|
-
# Bytes are valid UTF-8 - the declaration is likely wrong
|
|
119
|
-
# Return the string as UTF-8 (already is)
|
|
120
68
|
return xml_string.dup.force_encoding("UTF-8")
|
|
121
69
|
end
|
|
122
70
|
|
|
123
|
-
# Bytes aren't valid UTF-8, so they must really be in source_encoding
|
|
124
|
-
# Proceed with transcoding
|
|
125
71
|
return forced.encode("UTF-8", source_encoding,
|
|
126
72
|
invalid: :replace,
|
|
127
73
|
undef: :replace,
|
|
@@ -129,41 +75,21 @@ module Canon
|
|
|
129
75
|
end
|
|
130
76
|
end
|
|
131
77
|
|
|
132
|
-
# Already UTF-8 or transcoding failed, return as-is
|
|
133
78
|
xml_string.dup.force_encoding("UTF-8")
|
|
134
79
|
rescue EncodingError
|
|
135
80
|
xml_string
|
|
136
81
|
end
|
|
137
82
|
|
|
138
|
-
# Attempt to re-interpret string as UTF-8 if bytes are valid UTF-8
|
|
139
|
-
#
|
|
140
|
-
# This handles the case where a string was incorrectly labeled with a different
|
|
141
|
-
# encoding (e.g., `.encode("Shift_JIS")` on a UTF-8 string) but the actual
|
|
142
|
-
# bytes are valid UTF-8.
|
|
143
|
-
#
|
|
144
|
-
# @param xml_string [String] XML string to check
|
|
145
|
-
# @return [String, nil] UTF-8 re-interpreted string, or nil if not possible
|
|
146
83
|
def self.try_utf8_reinterpretation(xml_string)
|
|
147
84
|
return xml_string if xml_string.encoding.name == "UTF-8"
|
|
148
85
|
|
|
149
|
-
# Try forcing to UTF-8 and see if it's valid
|
|
150
86
|
forced = xml_string.dup.force_encoding("UTF-8")
|
|
151
87
|
return forced if forced.valid_encoding?
|
|
152
88
|
|
|
153
89
|
nil
|
|
154
90
|
end
|
|
155
91
|
|
|
156
|
-
# Extract encoding from XML declaration
|
|
157
|
-
#
|
|
158
|
-
# @param xml_string [String] XML string
|
|
159
|
-
# @return [String, nil] Declared encoding or nil if not found
|
|
160
92
|
def self.extract_xml_encoding(xml_string)
|
|
161
|
-
# Match XML declaration with encoding attribute
|
|
162
|
-
# Handles: <?xml version="1.0" encoding="UTF-8"?>
|
|
163
|
-
# and: <?xml version='1.0' encoding='UTF-8'?>
|
|
164
|
-
#
|
|
165
|
-
# Use binary encoding to avoid encoding compatibility issues
|
|
166
|
-
# when the string has non-ASCII compatible encoding (e.g., UTF-16)
|
|
167
93
|
binary_string = xml_string.dup.force_encoding("BINARY")
|
|
168
94
|
if binary_string =~ /\A\s*<\?xml[^>]*\bencoding\s*=\s*["']([^"']+)["'][^>]*\?>/i
|
|
169
95
|
return Regexp.last_match(1)
|
|
@@ -172,31 +98,36 @@ module Canon
|
|
|
172
98
|
nil
|
|
173
99
|
end
|
|
174
100
|
|
|
175
|
-
# Alias for compatibility with base class interface
|
|
176
101
|
def self.parse(xml_string)
|
|
177
102
|
from_xml(xml_string)
|
|
178
103
|
end
|
|
179
104
|
|
|
180
|
-
# Serialize XML node to string
|
|
181
|
-
#
|
|
182
|
-
# @param node [Nodes::RootNode, Nodes::ElementNode] Node to serialize
|
|
183
|
-
# @return [String] Serialized XML string
|
|
184
105
|
def self.serialize(node)
|
|
185
|
-
# Implementation will delegate to existing XML serialization
|
|
186
|
-
# This is a placeholder for the base class interface
|
|
187
106
|
node.to_s
|
|
188
107
|
end
|
|
189
108
|
|
|
190
|
-
|
|
191
|
-
|
|
109
|
+
def self.relative_uri?(uri)
|
|
110
|
+
uri !~ %r{^[a-zA-Z][a-zA-Z0-9+.-]*:}
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
# --- Nokogiri path ---
|
|
114
|
+
|
|
115
|
+
def self.from_nokogiri_xml(xml_string, preserve_whitespace:)
|
|
116
|
+
doc = Nokogiri::XML(xml_string, &:nonet)
|
|
117
|
+
check_for_relative_namespace_uris(doc)
|
|
118
|
+
result = build_from_nokogiri(doc,
|
|
119
|
+
preserve_whitespace: preserve_whitespace)
|
|
120
|
+
errors = Array(doc.errors).map(&:to_s)
|
|
121
|
+
result.parse_errors = errors if errors.any?
|
|
122
|
+
result
|
|
123
|
+
end
|
|
124
|
+
|
|
192
125
|
def self.check_for_relative_namespace_uris(doc)
|
|
193
126
|
doc.traverse do |node|
|
|
194
127
|
next unless node.is_a?(Nokogiri::XML::Element)
|
|
195
128
|
|
|
196
129
|
node.namespace_definitions.each do |ns|
|
|
197
130
|
next if ns.href.nil? || ns.href.empty?
|
|
198
|
-
|
|
199
|
-
# Check if URI is relative
|
|
200
131
|
if relative_uri?(ns.href)
|
|
201
132
|
raise Canon::Error,
|
|
202
133
|
"Relative namespace URI not allowed: #{ns.href}"
|
|
@@ -205,23 +136,12 @@ module Canon
|
|
|
205
136
|
end
|
|
206
137
|
end
|
|
207
138
|
|
|
208
|
-
# Check if a URI is relative
|
|
209
|
-
def self.relative_uri?(uri)
|
|
210
|
-
# A URI is relative if it doesn't have a scheme
|
|
211
|
-
uri !~ %r{^[a-zA-Z][a-zA-Z0-9+.-]*:}
|
|
212
|
-
end
|
|
213
|
-
|
|
214
|
-
# Build XPath data model from Nokogiri document or fragment
|
|
215
|
-
# rubocop:disable Metrics/MethodLength
|
|
216
139
|
def self.build_from_nokogiri(nokogiri_doc, preserve_whitespace: false)
|
|
217
140
|
root = Nodes::RootNode.new
|
|
218
141
|
|
|
219
142
|
if nokogiri_doc.respond_to?(:root) && nokogiri_doc.root
|
|
220
|
-
# For Documents (XML, HTML4, HTML5, Moxml): process the root element
|
|
221
143
|
root.add_child(build_element_node(nokogiri_doc.root,
|
|
222
144
|
preserve_whitespace: preserve_whitespace))
|
|
223
|
-
|
|
224
|
-
# Process PIs and comments outside doc element
|
|
225
145
|
nokogiri_doc.children.each do |child|
|
|
226
146
|
next if child == nokogiri_doc.root
|
|
227
147
|
next if child.is_a?(Nokogiri::XML::DTD)
|
|
@@ -231,8 +151,6 @@ module Canon
|
|
|
231
151
|
root.add_child(node) if node
|
|
232
152
|
end
|
|
233
153
|
else
|
|
234
|
-
# For DocumentFragments: process all children directly
|
|
235
|
-
# Fragments don't have a single .root, they contain multiple top-level nodes
|
|
236
154
|
nokogiri_doc.children.each do |child|
|
|
237
155
|
next if child.is_a?(Nokogiri::XML::DTD)
|
|
238
156
|
|
|
@@ -245,7 +163,6 @@ module Canon
|
|
|
245
163
|
root
|
|
246
164
|
end
|
|
247
165
|
|
|
248
|
-
# Build node from Nokogiri node
|
|
249
166
|
def self.build_node_from_nokogiri(nokogiri_node,
|
|
250
167
|
preserve_whitespace: false)
|
|
251
168
|
case nokogiri_node
|
|
@@ -262,8 +179,6 @@ preserve_whitespace: false)
|
|
|
262
179
|
end
|
|
263
180
|
end
|
|
264
181
|
|
|
265
|
-
# Build element node from Nokogiri element
|
|
266
|
-
# rubocop:disable Metrics/MethodLength
|
|
267
182
|
def self.build_element_node(nokogiri_element, preserve_whitespace: false)
|
|
268
183
|
element = Nodes::ElementNode.new(
|
|
269
184
|
name: nokogiri_element.name,
|
|
@@ -271,13 +186,9 @@ preserve_whitespace: false)
|
|
|
271
186
|
prefix: nokogiri_element.namespace&.prefix,
|
|
272
187
|
)
|
|
273
188
|
|
|
274
|
-
# Build namespace nodes (includes inherited namespaces)
|
|
275
189
|
build_namespace_nodes(nokogiri_element, element)
|
|
276
|
-
|
|
277
|
-
# Build attribute nodes
|
|
278
190
|
build_attribute_nodes(nokogiri_element, element)
|
|
279
191
|
|
|
280
|
-
# Build child nodes
|
|
281
192
|
nokogiri_element.children.each do |child|
|
|
282
193
|
node = build_node_from_nokogiri(child,
|
|
283
194
|
preserve_whitespace: preserve_whitespace)
|
|
@@ -287,9 +198,7 @@ preserve_whitespace: false)
|
|
|
287
198
|
element
|
|
288
199
|
end
|
|
289
200
|
|
|
290
|
-
# Build namespace nodes for an element
|
|
291
201
|
def self.build_namespace_nodes(nokogiri_element, element)
|
|
292
|
-
# Collect all in-scope namespaces
|
|
293
202
|
namespaces = collect_in_scope_namespaces(nokogiri_element)
|
|
294
203
|
|
|
295
204
|
namespaces.each do |prefix, uri|
|
|
@@ -301,18 +210,14 @@ preserve_whitespace: false)
|
|
|
301
210
|
end
|
|
302
211
|
end
|
|
303
212
|
|
|
304
|
-
# Collect all in-scope namespaces for an element
|
|
305
|
-
# rubocop:disable Metrics/MethodLength
|
|
306
213
|
def self.collect_in_scope_namespaces(nokogiri_element)
|
|
307
214
|
namespaces = {}
|
|
308
215
|
|
|
309
|
-
# Walk up the tree to collect all namespace declarations
|
|
310
216
|
current = nokogiri_element
|
|
311
217
|
while current && !current.is_a?(Nokogiri::XML::Document)
|
|
312
218
|
if current.is_a?(Nokogiri::XML::Element)
|
|
313
219
|
current.namespace_definitions.each do |ns|
|
|
314
220
|
prefix = ns.prefix || ""
|
|
315
|
-
# Only add if not already defined (child overrides parent)
|
|
316
221
|
unless namespaces.key?(prefix)
|
|
317
222
|
namespaces[prefix] = ns.href
|
|
318
223
|
end
|
|
@@ -321,13 +226,11 @@ preserve_whitespace: false)
|
|
|
321
226
|
current = current.parent
|
|
322
227
|
end
|
|
323
228
|
|
|
324
|
-
# Always include xml namespace
|
|
325
229
|
namespaces["xml"] ||= "http://www.w3.org/XML/1998/namespace"
|
|
326
230
|
|
|
327
231
|
namespaces
|
|
328
232
|
end
|
|
329
233
|
|
|
330
|
-
# Build attribute nodes for an element
|
|
331
234
|
def self.build_attribute_nodes(nokogiri_element, element)
|
|
332
235
|
nokogiri_element.attributes.each_value do |attr|
|
|
333
236
|
attr_node = Nodes::AttributeNode.new(
|
|
@@ -340,39 +243,130 @@ preserve_whitespace: false)
|
|
|
340
243
|
end
|
|
341
244
|
end
|
|
342
245
|
|
|
343
|
-
# Build text node from Nokogiri text node
|
|
344
246
|
def self.build_text_node(nokogiri_text, preserve_whitespace: false)
|
|
345
|
-
# XML text nodes: preserve all content including whitespace
|
|
346
|
-
# Unlike HTML, XML treats all whitespace as significant
|
|
347
247
|
content = nokogiri_text.content
|
|
348
248
|
|
|
349
|
-
# Skip empty text nodes between elements (common formatting whitespace)
|
|
350
|
-
# UNLESS preserve_whitespace is true (for structural_whitespace: :strict)
|
|
351
249
|
if !preserve_whitespace && content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element)
|
|
352
250
|
return nil
|
|
353
251
|
end
|
|
354
252
|
|
|
355
|
-
# Capture original text with entity references preserved.
|
|
356
|
-
# nokogiri_text.to_xml returns the serialized text node which preserves
|
|
357
|
-
# entity forms like &#x201C; instead of the decoded character U+201C.
|
|
358
253
|
original = nokogiri_text.to_xml
|
|
359
|
-
|
|
360
|
-
# Nokogiri already handles CDATA conversion and entity resolution
|
|
361
254
|
Nodes::TextNode.new(value: content, original: original)
|
|
362
255
|
end
|
|
363
256
|
|
|
364
|
-
# Build comment node from Nokogiri comment
|
|
365
257
|
def self.build_comment_node(nokogiri_comment)
|
|
366
258
|
Nodes::CommentNode.new(value: nokogiri_comment.content)
|
|
367
259
|
end
|
|
368
260
|
|
|
369
|
-
# Build PI node from Nokogiri PI
|
|
370
261
|
def self.build_pi_node(nokogiri_pi)
|
|
371
262
|
Nodes::ProcessingInstructionNode.new(
|
|
372
263
|
target: nokogiri_pi.name,
|
|
373
264
|
data: nokogiri_pi.content,
|
|
374
265
|
)
|
|
375
266
|
end
|
|
267
|
+
|
|
268
|
+
# --- Moxml path ---
|
|
269
|
+
|
|
270
|
+
def self.from_moxml_xml(xml_string, preserve_whitespace:)
|
|
271
|
+
doc = Canon::XmlParsing.parse(xml_string)
|
|
272
|
+
build_from_moxml(doc, preserve_whitespace: preserve_whitespace)
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
def self.build_from_moxml(moxml_doc, preserve_whitespace: false)
|
|
276
|
+
root = Nodes::RootNode.new
|
|
277
|
+
|
|
278
|
+
if moxml_doc.respond_to?(:root) && moxml_doc.root
|
|
279
|
+
root.add_child(build_moxml_element_node(moxml_doc.root,
|
|
280
|
+
preserve_whitespace: preserve_whitespace))
|
|
281
|
+
end
|
|
282
|
+
|
|
283
|
+
root
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
def self.build_moxml_node(node, preserve_whitespace: false)
|
|
287
|
+
case node
|
|
288
|
+
when Moxml::Element
|
|
289
|
+
build_moxml_element_node(node,
|
|
290
|
+
preserve_whitespace: preserve_whitespace)
|
|
291
|
+
when Moxml::Text
|
|
292
|
+
build_moxml_text_node(node, preserve_whitespace: preserve_whitespace)
|
|
293
|
+
when Moxml::Comment
|
|
294
|
+
build_moxml_comment_node(node)
|
|
295
|
+
when Moxml::ProcessingInstruction
|
|
296
|
+
build_moxml_pi_node(node)
|
|
297
|
+
end
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
def self.build_moxml_element_node(moxml_element,
|
|
301
|
+
preserve_whitespace: false)
|
|
302
|
+
ns = moxml_element.namespace
|
|
303
|
+
element = Nodes::ElementNode.new(
|
|
304
|
+
name: moxml_element.name,
|
|
305
|
+
namespace_uri: ns&.uri,
|
|
306
|
+
prefix: ns&.prefix,
|
|
307
|
+
)
|
|
308
|
+
|
|
309
|
+
build_moxml_namespace_nodes(moxml_element, element)
|
|
310
|
+
build_moxml_attribute_nodes(moxml_element, element)
|
|
311
|
+
|
|
312
|
+
moxml_element.children.each do |child|
|
|
313
|
+
node = build_moxml_node(child,
|
|
314
|
+
preserve_whitespace: preserve_whitespace)
|
|
315
|
+
element.add_child(node) if node
|
|
316
|
+
end
|
|
317
|
+
|
|
318
|
+
element
|
|
319
|
+
end
|
|
320
|
+
|
|
321
|
+
def self.build_moxml_namespace_nodes(moxml_element, element)
|
|
322
|
+
moxml_element.namespace_definitions.each do |ns|
|
|
323
|
+
ns_node = Nodes::NamespaceNode.new(
|
|
324
|
+
prefix: ns.prefix || "",
|
|
325
|
+
uri: ns.uri,
|
|
326
|
+
)
|
|
327
|
+
element.add_namespace(ns_node)
|
|
328
|
+
end
|
|
329
|
+
|
|
330
|
+
unless element.namespaces.any? do |n|
|
|
331
|
+
n.prefix == "xml"
|
|
332
|
+
end
|
|
333
|
+
element.add_namespace(Nodes::NamespaceNode.new(
|
|
334
|
+
prefix: "xml",
|
|
335
|
+
uri: "http://www.w3.org/XML/1998/namespace",
|
|
336
|
+
))
|
|
337
|
+
end
|
|
338
|
+
end
|
|
339
|
+
|
|
340
|
+
def self.build_moxml_attribute_nodes(moxml_element, element)
|
|
341
|
+
moxml_element.attributes.each do |attr|
|
|
342
|
+
attr_node = Nodes::AttributeNode.new(
|
|
343
|
+
name: attr.name,
|
|
344
|
+
value: attr.value,
|
|
345
|
+
)
|
|
346
|
+
element.add_attribute(attr_node)
|
|
347
|
+
end
|
|
348
|
+
end
|
|
349
|
+
|
|
350
|
+
def self.build_moxml_text_node(moxml_text, preserve_whitespace: false)
|
|
351
|
+
content = moxml_text.text
|
|
352
|
+
|
|
353
|
+
if !preserve_whitespace && content.strip.empty? && moxml_text.parent.is_a?(Moxml::Element)
|
|
354
|
+
return nil
|
|
355
|
+
end
|
|
356
|
+
|
|
357
|
+
Nodes::TextNode.new(value: content, original: content)
|
|
358
|
+
end
|
|
359
|
+
|
|
360
|
+
def self.build_moxml_comment_node(moxml_comment)
|
|
361
|
+
Nodes::CommentNode.new(value: moxml_comment.text)
|
|
362
|
+
end
|
|
363
|
+
|
|
364
|
+
def self.build_moxml_pi_node(moxml_pi)
|
|
365
|
+
Nodes::ProcessingInstructionNode.new(
|
|
366
|
+
target: moxml_pi.target,
|
|
367
|
+
data: moxml_pi.data,
|
|
368
|
+
)
|
|
369
|
+
end
|
|
376
370
|
end
|
|
377
371
|
end
|
|
378
372
|
end
|
|
@@ -104,6 +104,11 @@ module Canon
|
|
|
104
104
|
namespace_uri.to_s
|
|
105
105
|
end
|
|
106
106
|
|
|
107
|
+
# Check if an attribute name is a namespace declaration (xmlns or xmlns:*)
|
|
108
|
+
def self.namespace_declaration?(attr_name)
|
|
109
|
+
attr_name == "xmlns" || attr_name.start_with?("xmlns:")
|
|
110
|
+
end
|
|
111
|
+
|
|
107
112
|
private_class_method :normalize_namespace
|
|
108
113
|
end
|
|
109
114
|
end
|
data/lib/canon/xml/node.rb
CHANGED
|
@@ -9,6 +9,7 @@ module Canon
|
|
|
9
9
|
def initialize
|
|
10
10
|
@parent = nil
|
|
11
11
|
@children = []
|
|
12
|
+
@in_node_set = true
|
|
12
13
|
end
|
|
13
14
|
|
|
14
15
|
def add_child(child)
|
|
@@ -17,7 +18,7 @@ module Canon
|
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
def in_node_set?
|
|
20
|
-
|
|
21
|
+
@in_node_set
|
|
21
22
|
end
|
|
22
23
|
|
|
23
24
|
def in_node_set=(value)
|
|
@@ -10,7 +10,12 @@ module Canon
|
|
|
10
10
|
# Stores both the decoded text value and the original text (with entity
|
|
11
11
|
# references preserved) to enable accurate round-trip serialization.
|
|
12
12
|
class TextNode < Node
|
|
13
|
-
|
|
13
|
+
attr_accessor :value
|
|
14
|
+
attr_reader :original
|
|
15
|
+
|
|
16
|
+
def original=(value)
|
|
17
|
+
@original = value
|
|
18
|
+
end
|
|
14
19
|
|
|
15
20
|
# @param value [String] Decoded text content (entity references resolved)
|
|
16
21
|
# @param original [String, nil] Original text as it appeared in source XML,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require "nokogiri"
|
|
3
|
+
require "nokogiri" unless RUBY_ENGINE == "opal"
|
|
4
4
|
require_relative "nodes/root_node"
|
|
5
5
|
require_relative "nodes/element_node"
|
|
6
6
|
require_relative "nodes/namespace_node"
|
|
@@ -190,10 +190,8 @@ strip_doctype: false)
|
|
|
190
190
|
last_child = parent.children.last
|
|
191
191
|
if last_child&.node_type == :text
|
|
192
192
|
# Combine both raw and decoded forms
|
|
193
|
-
last_child.
|
|
194
|
-
|
|
195
|
-
last_child.instance_variable_set(:@original,
|
|
196
|
-
(last_child.original || "") + raw_string)
|
|
193
|
+
last_child.value = last_child.value + decoded_string
|
|
194
|
+
last_child.original = (last_child.original || "") + raw_string
|
|
197
195
|
return
|
|
198
196
|
end
|
|
199
197
|
|
|
@@ -257,7 +255,7 @@ strip_doctype: false)
|
|
|
257
255
|
return unless doc_element
|
|
258
256
|
|
|
259
257
|
other_children = root.children.reject { |c| c.node_type == :element }
|
|
260
|
-
root.
|
|
258
|
+
root.children = [doc_element] + other_children
|
|
261
259
|
end
|
|
262
260
|
|
|
263
261
|
private
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
|
|
4
|
+
# Centralized XML backend detection for Canon.
|
|
5
|
+
#
|
|
6
|
+
# Canon supports two XML backends:
|
|
7
|
+
# - :nokogiri — MRI with Nokogiri installed (default, existing code path)
|
|
8
|
+
# - :moxml — Opal runtime or MRI without Nokogiri (uses Oga via moxml)
|
|
9
|
+
#
|
|
10
|
+
# The active backend is detected lazily on the first call to `active` and then cached (`reset!` clears the cache).
|
|
11
|
+
# All XML-related code should check `Canon::XmlBackend.moxml?` or
|
|
12
|
+
# `Canon::XmlBackend.nokogiri?` to select the appropriate code path.
|
|
13
|
+
#
|
|
14
|
+
# This module intentionally does NOT wrap Nokogiri through moxml.
|
|
15
|
+
# Each backend path is independent — the Nokogiri path is the existing
|
|
16
|
+
# battle-tested code; the moxml path is a parallel implementation for
|
|
17
|
+
# environments where Nokogiri is unavailable.
|
|
18
|
+
module XmlBackend
|
|
19
|
+
class << self
|
|
20
|
+
def active
|
|
21
|
+
@active ||= detect
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def nokogiri?
|
|
25
|
+
active == :nokogiri
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def moxml?
|
|
29
|
+
active == :moxml
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def reset!
|
|
33
|
+
@active = nil
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
private
|
|
37
|
+
|
|
38
|
+
def detect
|
|
39
|
+
if RUBY_ENGINE == "opal"
|
|
40
|
+
:moxml
|
|
41
|
+
elsif defined?(Nokogiri)
|
|
42
|
+
:nokogiri
|
|
43
|
+
else
|
|
44
|
+
:moxml
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|