canon 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +31 -149
- data/README.adoc +9 -0
- data/docs/advanced/semantic-diff-report.adoc +96 -0
- data/docs/features/configuration-profiles.adoc +4 -2
- data/docs/features/diff-formatting/index.adoc +3 -0
- data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
- data/docs/features/match-options/html-policies.adoc +2 -0
- data/docs/features/match-options/index.adoc +40 -0
- data/docs/guides/choosing-configuration.adoc +12 -1
- data/docs/reference/cli-options.adoc +3 -0
- data/docs/reference/environment-variables.adoc +3 -1
- data/docs/reference/options-across-interfaces.adoc +7 -1
- data/docs/understanding/formats/html.adoc +9 -2
- data/lib/canon/cli.rb +4 -0
- data/lib/canon/commands/diff_command.rb +1 -0
- data/lib/canon/comparison/comparison_result.rb +95 -2
- data/lib/canon/comparison/html_comparator.rb +96 -11
- data/lib/canon/comparison/markup_comparator.rb +68 -71
- data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
- data/lib/canon/comparison/match_options.rb +23 -2
- data/lib/canon/comparison/node_inspector.rb +103 -0
- data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +133 -55
- data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
- data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
- data/lib/canon/comparison/xml_comparator.rb +174 -7
- data/lib/canon/comparison/xml_node_comparison.rb +48 -66
- data/lib/canon/comparison.rb +143 -22
- data/lib/canon/config/env_schema.rb +2 -1
- data/lib/canon/config/profiles/metanorma.yml +3 -0
- data/lib/canon/config.rb +51 -5
- data/lib/canon/diff/diff_classifier.rb +55 -41
- data/lib/canon/diff/diff_line_builder.rb +9 -8
- data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
- data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
- data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +184 -26
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +92 -4
- data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
- data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
- data/lib/canon/diff_formatter.rb +128 -175
- data/lib/canon/html/data_model.rb +10 -4
- data/lib/canon/pretty_printer/html.rb +76 -14
- data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
- data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
- data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
- data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/c14n.rb +59 -5
- data/lib/canon/xml/data_model.rb +13 -1
- data/lib/canon/xml/element_matcher.rb +3 -0
- data/lib/canon/xml/node.rb +23 -1
- data/lib/canon/xml/nodes/comment_node.rb +4 -0
- data/lib/canon/xml/nodes/element_node.rb +4 -0
- data/lib/canon/xml/nodes/text_node.rb +4 -0
- data/lib/canon/xml/sax_builder.rb +29 -2
- data/lib/canon/xml/xpath_engine.rb +238 -0
- metadata +9 -2
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative "../node_inspector"
|
|
4
|
+
|
|
3
5
|
module Canon
|
|
4
6
|
module Comparison
|
|
5
7
|
module XmlComparatorHelpers
|
|
@@ -27,7 +29,10 @@ module Canon
|
|
|
27
29
|
# @param differences [Array] Array to collect differences
|
|
28
30
|
# @return [Integer] Comparison result code
|
|
29
31
|
def compare(node1, node2, comparator, opts, child_opts,
|
|
30
|
-
diff_children, differences)
|
|
32
|
+
diff_children, differences)
|
|
33
|
+
# FAST PATH: Object identity - same object means equivalent children
|
|
34
|
+
return Comparison::EQUIVALENT if node1.equal?(node2)
|
|
35
|
+
|
|
31
36
|
# Apply side-specific pretty-print heuristic when either flag is set:
|
|
32
37
|
# pretty_printed_expected → drop \n-starting whitespace nodes from node1
|
|
33
38
|
# pretty_printed_received → drop \n-starting whitespace nodes from node2
|
|
@@ -37,12 +42,15 @@ diff_children, differences)
|
|
|
37
42
|
opts1 = XmlNodeComparison.opts_for_side(opts, :expected)
|
|
38
43
|
opts2 = XmlNodeComparison.opts_for_side(opts, :received)
|
|
39
44
|
|
|
40
|
-
children1 = comparator.
|
|
41
|
-
children2 = comparator.
|
|
45
|
+
children1 = comparator.filter_children(node1.children, opts1)
|
|
46
|
+
children2 = comparator.filter_children(node2.children, opts2)
|
|
42
47
|
|
|
43
48
|
# Quick check: if both have no children, they're equivalent
|
|
44
49
|
return Comparison::EQUIVALENT if children1.empty? && children2.empty?
|
|
45
50
|
|
|
51
|
+
# FAST PATH: Identical children arrays mean equivalent subtrees
|
|
52
|
+
return Comparison::EQUIVALENT if children1.equal?(children2)
|
|
53
|
+
|
|
46
54
|
# Check if we can use ElementMatcher (requires Canon::Xml::DataModel nodes)
|
|
47
55
|
if can_use_element_matcher?(children1, children2)
|
|
48
56
|
use_element_matcher_comparison(children1, children2, node1, comparator,
|
|
@@ -91,9 +99,9 @@ diff_children, differences)
|
|
|
91
99
|
|
|
92
100
|
# If no matches and children exist, they're all different
|
|
93
101
|
if matches.empty? && (!children1.empty? || !children2.empty?)
|
|
94
|
-
comparator.
|
|
95
|
-
|
|
96
|
-
|
|
102
|
+
comparator.add_difference(parent_node, parent_node,
|
|
103
|
+
Comparison::MISSING_NODE, Comparison::MISSING_NODE,
|
|
104
|
+
:text_content, opts, differences)
|
|
97
105
|
return Comparison::UNEQUAL_ELEMENTS
|
|
98
106
|
end
|
|
99
107
|
|
|
@@ -116,30 +124,30 @@ diff_children, differences)
|
|
|
116
124
|
|
|
117
125
|
# Only create DiffNode if element_position is not :ignore
|
|
118
126
|
if position_behavior != :ignore
|
|
119
|
-
comparator.
|
|
120
|
-
|
|
121
|
-
|
|
127
|
+
comparator.add_difference(match.elem1, match.elem2,
|
|
128
|
+
"position #{match.pos1}", "position #{match.pos2}",
|
|
129
|
+
:element_position, opts, differences)
|
|
122
130
|
all_equivalent = false if position_behavior == :strict
|
|
123
131
|
end
|
|
124
132
|
end
|
|
125
133
|
|
|
126
134
|
# Compare the matched elements for content/attribute differences
|
|
127
|
-
result = comparator.
|
|
128
|
-
|
|
135
|
+
result = comparator.compare_nodes(match.elem1, match.elem2,
|
|
136
|
+
child_opts, child_opts, diff_children, differences)
|
|
129
137
|
all_equivalent = false unless result == Comparison::EQUIVALENT
|
|
130
138
|
|
|
131
139
|
when :deleted
|
|
132
140
|
# Element present in first tree but not second
|
|
133
|
-
comparator.
|
|
134
|
-
|
|
135
|
-
|
|
141
|
+
comparator.add_difference(match.elem1, nil,
|
|
142
|
+
Comparison::MISSING_NODE, Comparison::MISSING_NODE,
|
|
143
|
+
:element_structure, opts, differences)
|
|
136
144
|
all_equivalent = false
|
|
137
145
|
|
|
138
146
|
when :inserted
|
|
139
147
|
# Element present in second tree but not first
|
|
140
|
-
comparator.
|
|
141
|
-
|
|
142
|
-
|
|
148
|
+
comparator.add_difference(nil, match.elem2,
|
|
149
|
+
Comparison::MISSING_NODE, Comparison::MISSING_NODE,
|
|
150
|
+
:element_structure, opts, differences)
|
|
143
151
|
all_equivalent = false
|
|
144
152
|
end
|
|
145
153
|
end
|
|
@@ -147,9 +155,16 @@ diff_children, differences)
|
|
|
147
155
|
all_equivalent ? Comparison::EQUIVALENT : Comparison::UNEQUAL_ELEMENTS
|
|
148
156
|
end
|
|
149
157
|
|
|
150
|
-
# Use simple positional comparison for children
|
|
158
|
+
# Use simple positional comparison for children, with
|
|
159
|
+
# whitespace-asymmetry-aware re-alignment. When positional
|
|
160
|
+
# +zip()+ would pair a whitespace-only text node on one side
|
|
161
|
+
# against a content node on the other, treat the whitespace
|
|
162
|
+
# node as a single-side gap: emit one +:whitespace_adjacency+
|
|
163
|
+
# diff anchored at the whitespace node and advance only the
|
|
164
|
+
# cursor carrying the whitespace, so the next iteration aligns
|
|
165
|
+
# content against content. See lutaml/canon#137.
|
|
151
166
|
def use_positional_comparison(
|
|
152
|
-
children1, children2,
|
|
167
|
+
children1, children2, parent_node, comparator,
|
|
153
168
|
opts, child_opts, diff_children, differences
|
|
154
169
|
)
|
|
155
170
|
has_mismatch = false
|
|
@@ -157,53 +172,120 @@ diff_children, differences)
|
|
|
157
172
|
# Length check
|
|
158
173
|
unless children1.length == children2.length
|
|
159
174
|
has_mismatch = true
|
|
160
|
-
|
|
161
|
-
|
|
175
|
+
|
|
176
|
+
ws_asymmetric = asymmetric_whitespace_explains_length_diff?(
|
|
177
|
+
children1, children2
|
|
162
178
|
)
|
|
163
179
|
|
|
164
|
-
|
|
165
|
-
|
|
180
|
+
if ws_asymmetric
|
|
181
|
+
dimension = nil
|
|
182
|
+
mismatched_children = []
|
|
183
|
+
else
|
|
184
|
+
dimension = determine_dimension_for_mismatch(
|
|
166
185
|
children1, children2, comparator
|
|
167
186
|
)
|
|
187
|
+
mismatched_children, children1, children2 =
|
|
188
|
+
determine_mismatch_children(
|
|
189
|
+
children1, children2, comparator
|
|
190
|
+
)
|
|
191
|
+
end
|
|
168
192
|
|
|
169
193
|
if mismatched_children.empty?
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
194
|
+
unless ws_asymmetric
|
|
195
|
+
comparator.add_difference(parent_node, parent_node,
|
|
196
|
+
Comparison::MISSING_NODE, Comparison::MISSING_NODE,
|
|
197
|
+
dimension, opts, differences)
|
|
198
|
+
end
|
|
173
199
|
else
|
|
174
200
|
mismatched_children.each do |child|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
201
|
+
child_dim = comparator.determine_node_dimension(child)
|
|
202
|
+
if children1.length > children2.length
|
|
203
|
+
comparator.add_difference(child, nil,
|
|
204
|
+
Comparison::MISSING_NODE,
|
|
205
|
+
Comparison::MISSING_NODE,
|
|
206
|
+
child_dim, opts, differences)
|
|
179
207
|
else
|
|
180
|
-
comparator.
|
|
181
|
-
|
|
182
|
-
|
|
208
|
+
comparator.add_difference(nil, child,
|
|
209
|
+
Comparison::MISSING_NODE,
|
|
210
|
+
Comparison::MISSING_NODE,
|
|
211
|
+
child_dim, opts, differences)
|
|
183
212
|
end
|
|
184
213
|
end
|
|
185
214
|
end
|
|
186
|
-
# Continue comparing children to find deeper differences like attribute values
|
|
187
|
-
# Use zip to compare up to the shorter length
|
|
188
215
|
end
|
|
189
216
|
|
|
190
|
-
# Compare children pairwise by position
|
|
191
217
|
result = has_mismatch ? Comparison::UNEQUAL_ELEMENTS : Comparison::EQUIVALENT
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
218
|
+
walk_result = walk_children_with_realignment(
|
|
219
|
+
children1, children2, comparator,
|
|
220
|
+
child_opts, diff_children, opts, differences
|
|
221
|
+
)
|
|
222
|
+
result = walk_result unless walk_result == Comparison::EQUIVALENT
|
|
223
|
+
result
|
|
224
|
+
end
|
|
195
225
|
|
|
196
|
-
|
|
197
|
-
|
|
226
|
+
# Two-cursor walk over paired children that re-aligns past
|
|
227
|
+
# asymmetric whitespace-only text nodes. Returns the worst
|
|
228
|
+
# child result encountered.
|
|
229
|
+
def walk_children_with_realignment(
|
|
230
|
+
children1, children2, comparator,
|
|
231
|
+
child_opts, diff_children, opts, differences
|
|
232
|
+
)
|
|
233
|
+
result = Comparison::EQUIVALENT
|
|
234
|
+
i = 0
|
|
235
|
+
j = 0
|
|
236
|
+
|
|
237
|
+
while i < children1.length || j < children2.length
|
|
238
|
+
c1 = children1[i]
|
|
239
|
+
c2 = children2[j]
|
|
240
|
+
|
|
241
|
+
if c1.nil?
|
|
242
|
+
j += 1
|
|
243
|
+
next
|
|
244
|
+
elsif c2.nil?
|
|
245
|
+
i += 1
|
|
246
|
+
next
|
|
247
|
+
end
|
|
198
248
|
|
|
199
|
-
|
|
200
|
-
|
|
249
|
+
ws1 = NodeInspector.whitespace_only_text?(c1)
|
|
250
|
+
ws2 = NodeInspector.whitespace_only_text?(c2)
|
|
251
|
+
|
|
252
|
+
if ws1 && !ws2
|
|
253
|
+
comparator.add_difference(c1, c2,
|
|
254
|
+
Comparison::UNEQUAL_TEXT_CONTENTS,
|
|
255
|
+
Comparison::UNEQUAL_TEXT_CONTENTS,
|
|
256
|
+
:whitespace_adjacency, opts, differences)
|
|
257
|
+
result = Comparison::UNEQUAL_TEXT_CONTENTS
|
|
258
|
+
i += 1
|
|
259
|
+
next
|
|
260
|
+
elsif ws2 && !ws1
|
|
261
|
+
comparator.add_difference(c1, c2,
|
|
262
|
+
Comparison::UNEQUAL_TEXT_CONTENTS,
|
|
263
|
+
Comparison::UNEQUAL_TEXT_CONTENTS,
|
|
264
|
+
:whitespace_adjacency, opts, differences)
|
|
265
|
+
result = Comparison::UNEQUAL_TEXT_CONTENTS
|
|
266
|
+
j += 1
|
|
267
|
+
next
|
|
201
268
|
end
|
|
269
|
+
|
|
270
|
+
child_result = comparator.compare_nodes(c1, c2,
|
|
271
|
+
child_opts, child_opts,
|
|
272
|
+
diff_children, differences)
|
|
273
|
+
result = child_result unless child_result == Comparison::EQUIVALENT
|
|
274
|
+
i += 1
|
|
275
|
+
j += 1
|
|
202
276
|
end
|
|
203
277
|
|
|
204
278
|
result
|
|
205
279
|
end
|
|
206
280
|
|
|
281
|
+
# True when the length difference between the two child arrays
|
|
282
|
+
# is fully explained by asymmetric whitespace-only text nodes.
|
|
283
|
+
def asymmetric_whitespace_explains_length_diff?(children1, children2)
|
|
284
|
+
non_ws1 = children1.reject { |c| NodeInspector.whitespace_only_text?(c) }
|
|
285
|
+
non_ws2 = children2.reject { |c| NodeInspector.whitespace_only_text?(c) }
|
|
286
|
+
non_ws1.length == non_ws2.length
|
|
287
|
+
end
|
|
288
|
+
|
|
207
289
|
# Determine dimension for length mismatch
|
|
208
290
|
def determine_dimension_for_mismatch(children1, children2, comparator)
|
|
209
291
|
dimension = :text_content # default
|
|
@@ -213,22 +295,17 @@ diff_children, differences)
|
|
|
213
295
|
(0...max_len).each do |i|
|
|
214
296
|
if i >= children1.length
|
|
215
297
|
# Extra child in children2
|
|
216
|
-
dimension = comparator.
|
|
217
|
-
children2[i])
|
|
298
|
+
dimension = comparator.determine_node_dimension(children2[i])
|
|
218
299
|
break
|
|
219
300
|
elsif i >= children2.length
|
|
220
301
|
# Extra child in children1
|
|
221
|
-
dimension = comparator.
|
|
222
|
-
children1[i])
|
|
302
|
+
dimension = comparator.determine_node_dimension(children1[i])
|
|
223
303
|
break
|
|
224
|
-
elsif !comparator.
|
|
225
|
-
children2[i])
|
|
304
|
+
elsif !comparator.same_node_type?(children1[i], children2[i])
|
|
226
305
|
# Different node types at same position
|
|
227
306
|
# Check both nodes - if either is a comment, use :comments dimension
|
|
228
|
-
dim1 = comparator.
|
|
229
|
-
|
|
230
|
-
dim2 = comparator.send(:determine_node_dimension,
|
|
231
|
-
children2[i])
|
|
307
|
+
dim1 = comparator.determine_node_dimension(children1[i])
|
|
308
|
+
dim2 = comparator.determine_node_dimension(children2[i])
|
|
232
309
|
dimension = [dim1, dim2].include?(:comments) ? :comments : dim1
|
|
233
310
|
break
|
|
234
311
|
end
|
|
@@ -250,7 +327,7 @@ diff_children, differences)
|
|
|
250
327
|
end
|
|
251
328
|
|
|
252
329
|
smaller_set_names = smaller_set.filter_map do |c|
|
|
253
|
-
next nil unless c.
|
|
330
|
+
next nil unless c.is_a?(Canon::Xml::Node) || c.is_a?(Nokogiri::XML::Node)
|
|
254
331
|
# Exclude generic node-type names (e.g. "#text") that are
|
|
255
332
|
# shared by all text nodes and cannot be used for matching.
|
|
256
333
|
next nil if c.name.start_with?("#")
|
|
@@ -265,7 +342,8 @@ diff_children, differences)
|
|
|
265
342
|
# If the smaller set has no child at this position,
|
|
266
343
|
# consider it a mismatch
|
|
267
344
|
mismatch_children << larger_set[i]
|
|
268
|
-
elsif larger_set[i].
|
|
345
|
+
elsif (larger_set[i].is_a?(Canon::Xml::Node) ||
|
|
346
|
+
larger_set[i].is_a?(Nokogiri::XML::Node)) &&
|
|
269
347
|
!larger_set[i].name.start_with?("#") &&
|
|
270
348
|
!smaller_set_names.include?(larger_set[i].name)
|
|
271
349
|
# If the name of the node is not found in the smaller set,
|
|
@@ -4,6 +4,7 @@ require "set"
|
|
|
4
4
|
require_relative "../../diff/diff_node"
|
|
5
5
|
require_relative "../../diff/path_builder"
|
|
6
6
|
require_relative "../../diff/node_serializer"
|
|
7
|
+
require_relative "../node_inspector"
|
|
7
8
|
|
|
8
9
|
module Canon
|
|
9
10
|
module Comparison
|
|
@@ -52,14 +53,15 @@ module Canon
|
|
|
52
53
|
# For deleted/inserted nodes, include namespace information if available
|
|
53
54
|
if dimension == :text_content && (node1.nil? || node2.nil?)
|
|
54
55
|
node = node1 || node2
|
|
55
|
-
if node.
|
|
56
|
+
if node.is_a?(Canon::Xml::Node) || node.is_a?(Nokogiri::XML::Node)
|
|
56
57
|
ns = node.namespace_uri
|
|
57
58
|
ns_info = if ns.nil? || ns.empty?
|
|
58
59
|
""
|
|
59
60
|
else
|
|
60
61
|
" (namespace: #{ns})"
|
|
61
62
|
end
|
|
62
|
-
|
|
63
|
+
label = Canon::Comparison.code_pair_label(diff1, diff2)
|
|
64
|
+
return "element '#{node.name}'#{ns_info}: #{label}"
|
|
63
65
|
end
|
|
64
66
|
end
|
|
65
67
|
|
|
@@ -87,8 +89,15 @@ module Canon
|
|
|
87
89
|
# Default reason
|
|
88
90
|
if diff1 == Canon::Comparison::MISSING_NODE && diff2 == Canon::Comparison::MISSING_NODE
|
|
89
91
|
"element structure mismatch (children differ)"
|
|
92
|
+
elsif dimension == :element_structure &&
|
|
93
|
+
diff1 == Canon::Comparison::UNEQUAL_ELEMENTS &&
|
|
94
|
+
diff2 == Canon::Comparison::UNEQUAL_ELEMENTS &&
|
|
95
|
+
(node1.is_a?(Canon::Xml::Node) || node1.is_a?(Nokogiri::XML::Node)) &&
|
|
96
|
+
(node2.is_a?(Canon::Xml::Node) || node2.is_a?(Nokogiri::XML::Node)) &&
|
|
97
|
+
node1.name && node2.name && node1.name != node2.name
|
|
98
|
+
"different element name (<#{node1.name}> vs <#{node2.name}>)"
|
|
90
99
|
else
|
|
91
|
-
|
|
100
|
+
Canon::Comparison.code_pair_label(diff1, diff2)
|
|
92
101
|
end
|
|
93
102
|
end
|
|
94
103
|
|
|
@@ -176,26 +185,18 @@ module Canon
|
|
|
176
185
|
def self.extract_text_content(node)
|
|
177
186
|
return nil if node.nil?
|
|
178
187
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
# For nodes with value method (other types)
|
|
192
|
-
return node.value if node.respond_to?(:value)
|
|
193
|
-
|
|
194
|
-
# For simple text nodes or strings
|
|
195
|
-
return node.to_s if node.is_a?(String)
|
|
196
|
-
|
|
197
|
-
# For other node types, try to_s
|
|
198
|
-
node.to_s
|
|
188
|
+
case node
|
|
189
|
+
when Canon::Xml::Nodes::TextNode
|
|
190
|
+
node.value
|
|
191
|
+
when Canon::Xml::Node
|
|
192
|
+
node.text_content
|
|
193
|
+
when Nokogiri::XML::Node
|
|
194
|
+
node.content.to_s
|
|
195
|
+
when String
|
|
196
|
+
node
|
|
197
|
+
else
|
|
198
|
+
node.to_s
|
|
199
|
+
end
|
|
199
200
|
rescue StandardError
|
|
200
201
|
nil
|
|
201
202
|
end
|
|
@@ -14,15 +14,18 @@ module Canon
|
|
|
14
14
|
# @param node [String, Object] Node to parse
|
|
15
15
|
# @param preprocessing [Symbol] Preprocessing mode (:none, :normalize, :c14n, :format)
|
|
16
16
|
# @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
|
|
17
|
+
# @param parser [Symbol] Parser backend (:sax or :dom, default from config)
|
|
17
18
|
# @return [Canon::Xml::Node] Parsed node
|
|
18
|
-
def self.parse(node, preprocessing = :none, preserve_whitespace: false
|
|
19
|
+
def self.parse(node, preprocessing = :none, preserve_whitespace: false,
|
|
20
|
+
parser: nil)
|
|
19
21
|
# If already a Canon::Xml::Node, return as-is
|
|
20
22
|
return node if node.is_a?(Canon::Xml::Node)
|
|
21
23
|
|
|
22
24
|
# If it's a Nokogiri or Moxml node, convert to DataModel
|
|
23
25
|
unless node.is_a?(String)
|
|
24
26
|
return convert_from_node(node,
|
|
25
|
-
preserve_whitespace: preserve_whitespace
|
|
27
|
+
preserve_whitespace: preserve_whitespace,
|
|
28
|
+
parser: parser)
|
|
26
29
|
end
|
|
27
30
|
|
|
28
31
|
# Normalize encoding before preprocessing (UTF-16 strings can't use strip, etc.)
|
|
@@ -31,9 +34,17 @@ module Canon
|
|
|
31
34
|
# Apply preprocessing to XML string before parsing
|
|
32
35
|
xml_string = apply_preprocessing(node, preprocessing).strip
|
|
33
36
|
|
|
34
|
-
#
|
|
35
|
-
|
|
37
|
+
# Select parser backend
|
|
38
|
+
resolved_parser = parser || resolve_parser_config
|
|
39
|
+
|
|
40
|
+
if resolved_parser == :sax
|
|
41
|
+
require_relative "../../xml/sax_builder"
|
|
42
|
+
Canon::Xml::SaxBuilder.parse(xml_string,
|
|
36
43
|
preserve_whitespace: preserve_whitespace)
|
|
44
|
+
else
|
|
45
|
+
Canon::Xml::DataModel.from_xml(xml_string,
|
|
46
|
+
preserve_whitespace: preserve_whitespace)
|
|
47
|
+
end
|
|
37
48
|
end
|
|
38
49
|
|
|
39
50
|
# Apply preprocessing transformation to XML string
|
|
@@ -62,9 +73,18 @@ module Canon
|
|
|
62
73
|
#
|
|
63
74
|
# @param node [Object] Nokogiri or Moxml node
|
|
64
75
|
# @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
|
|
76
|
+
# @param parser [Symbol, nil] Parser backend override
|
|
65
77
|
# @return [Canon::Xml::Node] Converted node
|
|
66
|
-
def self.convert_from_node(node, preserve_whitespace: false
|
|
67
|
-
|
|
78
|
+
def self.convert_from_node(node, preserve_whitespace: false,
|
|
79
|
+
parser: nil)
|
|
80
|
+
# FAST PATH: Convert Nokogiri/Moxml nodes directly without string round-trip
|
|
81
|
+
if defined?(Nokogiri::XML::Node) && node.is_a?(Nokogiri::XML::Node)
|
|
82
|
+
return Canon::Xml::DataModel.build_from_nokogiri(
|
|
83
|
+
node, preserve_whitespace: preserve_whitespace
|
|
84
|
+
)
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# SLOW PATH: Fallback to string serialization for unknown node types
|
|
68
88
|
xml_str = if node.respond_to?(:to_xml)
|
|
69
89
|
node.to_xml
|
|
70
90
|
elsif node.respond_to?(:to_s)
|
|
@@ -73,8 +93,26 @@ module Canon
|
|
|
73
93
|
raise Canon::Error,
|
|
74
94
|
"Unable to convert node to string: #{node.class}"
|
|
75
95
|
end
|
|
76
|
-
|
|
96
|
+
|
|
97
|
+
resolved_parser = parser || resolve_parser_config
|
|
98
|
+
|
|
99
|
+
if resolved_parser == :sax
|
|
100
|
+
require_relative "../../xml/sax_builder"
|
|
101
|
+
Canon::Xml::SaxBuilder.parse(xml_str,
|
|
77
102
|
preserve_whitespace: preserve_whitespace)
|
|
103
|
+
else
|
|
104
|
+
Canon::Xml::DataModel.from_xml(xml_str,
|
|
105
|
+
preserve_whitespace: preserve_whitespace)
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
# Resolve parser config from global config
|
|
110
|
+
#
|
|
111
|
+
# @return [Symbol] :sax or :dom
|
|
112
|
+
def self.resolve_parser_config
|
|
113
|
+
Canon::Config.instance.xml.diff.parser
|
|
114
|
+
rescue StandardError
|
|
115
|
+
:sax
|
|
78
116
|
end
|
|
79
117
|
end
|
|
80
118
|
end
|