canon 0.1.22 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +174 -25
  3. data/docs/INDEX.adoc +4 -0
  4. data/docs/advanced/diff-classification.adoc +3 -2
  5. data/docs/features/configuration-profiles.adoc +288 -0
  6. data/docs/features/diff-formatting/character-visualization.adoc +153 -454
  7. data/docs/features/diff-formatting/display-filtering.adoc +44 -0
  8. data/docs/features/diff-formatting/display-preprocessing.adoc +656 -0
  9. data/docs/features/diff-formatting/index.adoc +47 -0
  10. data/docs/features/diff-formatting/pretty-diff-mode.adoc +154 -0
  11. data/docs/features/environment-configuration/override-system.adoc +10 -3
  12. data/docs/features/index.adoc +9 -0
  13. data/docs/features/match-options/index.adoc +32 -42
  14. data/docs/features/match-options/pretty-printed-fixtures.adoc +270 -0
  15. data/docs/guides/choosing-configuration.adoc +22 -0
  16. data/docs/reference/environment-variables.adoc +121 -1
  17. data/docs/reference/options-across-interfaces.adoc +182 -2
  18. data/lib/canon/cli.rb +20 -0
  19. data/lib/canon/commands/diff_command.rb +7 -2
  20. data/lib/canon/commands/format_command.rb +1 -1
  21. data/lib/canon/comparison/html_comparator.rb +20 -15
  22. data/lib/canon/comparison/html_compare_profile.rb +4 -4
  23. data/lib/canon/comparison/markup_comparator.rb +12 -3
  24. data/lib/canon/comparison/match_options/base_resolver.rb +29 -7
  25. data/lib/canon/comparison/match_options/json_resolver.rb +9 -0
  26. data/lib/canon/comparison/match_options/xml_resolver.rb +16 -2
  27. data/lib/canon/comparison/match_options/yaml_resolver.rb +10 -0
  28. data/lib/canon/comparison/match_options.rb +4 -1
  29. data/lib/canon/comparison/whitespace_sensitivity.rb +189 -137
  30. data/lib/canon/comparison/xml_comparator/child_comparison.rb +21 -4
  31. data/lib/canon/comparison/xml_comparator.rb +14 -12
  32. data/lib/canon/comparison/xml_node_comparison.rb +51 -6
  33. data/lib/canon/comparison.rb +52 -9
  34. data/lib/canon/config/env_schema.rb +32 -4
  35. data/lib/canon/config/override_resolver.rb +16 -3
  36. data/lib/canon/config/profile_loader.rb +135 -0
  37. data/lib/canon/config/profiles/metanorma.yml +74 -0
  38. data/lib/canon/config/profiles/metanorma_debug.yml +8 -0
  39. data/lib/canon/config/type_converter.rb +8 -0
  40. data/lib/canon/config.rb +469 -5
  41. data/lib/canon/diff/diff_classifier.rb +41 -11
  42. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +48 -17
  43. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +58 -0
  44. data/lib/canon/diff_formatter/diff_detail_formatter.rb +22 -7
  45. data/lib/canon/diff_formatter/theme.rb +24 -17
  46. data/lib/canon/diff_formatter.rb +493 -36
  47. data/lib/canon/pretty_printer/xml_normalized.rb +395 -0
  48. data/lib/canon/rspec_matchers.rb +36 -0
  49. data/lib/canon/tree_diff/matchers/hash_matcher.rb +26 -11
  50. data/lib/canon/version.rb +1 -1
  51. data/lib/canon/xml/nodes/namespace_node.rb +4 -0
  52. data/lib/canon/xml/nodes/processing_instruction_node.rb +4 -0
  53. data/lib/canon/xml/nodes/root_node.rb +4 -0
  54. data/lib/canon/xml/nodes/text_node.rb +4 -0
  55. data/lib/tasks/performance_helpers.rb +2 -2
  56. metadata +24 -2
@@ -0,0 +1,395 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
+ module Canon
6
+ module PrettyPrinter
7
+ # Mixed-content-aware XML serializer for diff display preprocessing.
8
+ #
9
+ # == The mixed-content problem
10
+ #
11
+ # Standard XML pretty-printers (including Nokogiri's built-in serializer)
12
+ # keep elements that contain both text and child elements on a single line.
13
+ # They have no choice: inserting a newline between, say, `<p>See ` and
14
+ # `<xref.../>` would create a new whitespace text node, changing the
15
+ # document's semantic content. The result for line-by-line diffs is that
16
+ # any change inside such an element forces the entire line — potentially
17
+ # hundreds or thousands of characters — to be marked as changed. Issue #53
18
+ # documented this as "1000-character long lines" from HTML diffs.
19
+ #
20
+ # == Three-way whitespace classification
21
+ #
22
+ # This serializer distinguishes three categories of element-level whitespace
23
+ # behaviour, configured via element-name lists:
24
+ #
25
+ # * **Preserve** (`preserve_whitespace_elements`) — every whitespace character is
26
+ # significant. `" "` ≠ `"\n"`. Typical: `<pre>`, `<code>`, `<textarea>`.
27
+ # Whitespace-only text nodes are visualized character-by-character.
28
+ #
29
+ # * **Collapse** (`collapse_whitespace_elements`) — presence ≠ absence,
30
+ # but all whitespace forms are equivalent: `" "` == `"\n "` == `"\t"`.
31
+ # Typical: `<p>`, `<li>`, `<td>`, heading elements.
32
+ # Whitespace-only text nodes are collapsed to a single `░` visualization,
33
+ # so `<p>\n <em>` (indented fixture) and `<p> <em>` (compact source)
34
+ # both render as `<p>░<em>` — identical display lines, no spurious diff.
35
+ #
36
+ # * **Strip** (everything else, or explicit `strip_whitespace_elements`) —
37
+ # all whitespace between child elements is structural formatting noise.
38
+ # `" "` == `"\n "` == nothing. Whitespace-only text nodes are silently
39
+ # dropped. Typical: `<section>`, `<ul>`, `<formattedref>`, `<bibitem>`.
40
+ #
41
+ # Classification is **ancestor-based**: a text node's class is determined
42
+ # by the closest matching ancestor. This means `<em>` inside `<p>` inherits
43
+ # `<p>`'s collapse behaviour without needing to be listed explicitly.
44
+ #
45
+ # == Format defaults
46
+ #
47
+ # * **XML**: all three lists are empty by default — insensitive everywhere.
48
+ # Whitespace sensitivity is opt-in, consistent with XML's data-first usage.
49
+ #
50
+ # * **HTML**: built-in defaults are provided (but overridable):
51
+ # - preserve: `pre`, `code`, `textarea`, `script`, `style`
52
+ # - collapse: `p`, `li`, `dt`, `dd`, `td`, `th`, `h1`–`h6`, `caption`,
53
+ # `figcaption`, `label`, `legend`, `summary`, `blockquote`, `address`
54
+ #
55
+ # == Structural vs. content whitespace
56
+ #
57
+ # * **Structural whitespace** — indentation characters emitted by the
58
+ # serializer itself. These do not exist in the source document.
59
+ # They are rendered as ordinary ASCII space and newline characters.
60
+ # * **Content whitespace** — whitespace that exists as text-node content
61
+ # in the source document. Classification (above) decides how to render it.
62
+ #
63
+ # The invariant is: every XML element always starts on its own line.
64
+ # Content whitespace is never confused with structural indentation.
65
+ #
66
+ # == Example (collapse element <p>)
67
+ #
68
+ # Input — compact source (Metanorma-style):
69
+ # <p>See <xref target="M"/></p>
70
+ #
71
+ # Input — indented fixture heredoc:
72
+ # <p>
73
+ # See
74
+ # <xref target="M"/>
75
+ # </p>
76
+ #
77
+ # Both serialize to:
78
+ # <p>
79
+ # See░
80
+ # <xref target="M"/>
81
+ # </p>
82
+ #
83
+ # Result: zero diff lines for a semantically identical document.
84
+ #
85
+ # == Example (strip element <formattedref>)
86
+ #
87
+ # Input — compact source:
88
+ # <formattedref><em>Cereals</em>.</formattedref>
89
+ #
90
+ # Input — indented fixture:
91
+ # <formattedref>
92
+ # <em>Cereals</em>.
93
+ # </formattedref>
94
+ #
95
+ # Both serialize to (whitespace-only nodes silently dropped):
96
+ # <formattedref>
97
+ # <em>Cereals</em>
98
+ # .
99
+ # </formattedref>
100
+ #
101
+ # Result: zero diff lines.
102
+ #
103
+ # == Usage
104
+ #
105
+ # printer = Canon::PrettyPrinter::XmlNormalized.new
106
+ # formatted = printer.format(xml_string)
107
+ #
108
+ # # With element lists (XML):
109
+ # printer = Canon::PrettyPrinter::XmlNormalized.new(
110
+ # collapse_whitespace_elements: %w[p formattedref title],
111
+ # preserve_whitespace_elements: %w[sourcecode pre],
112
+ # )
113
+ #
114
+ class XmlNormalized
115
+ # @param indent [Integer] number of indent characters per level (default 2)
116
+ # @param indent_type [String] "space" or "tab"
117
+ # @param visualization_map [Hash, nil] character visualization map
118
+ # @param preserve_whitespace_elements [Array<String>] element names where
119
+ # every whitespace character is significant (e.g. pre, code).
120
+ # @param collapse_whitespace_elements [Array<String>] element names where
121
+ # presence of whitespace matters but all forms are equivalent (e.g. p, li).
122
+ # @param strip_whitespace_elements [Array<String>] explicit blacklist — these
123
+ # elements and their descendants have whitespace dropped (unless a nearer preserve/collapse element intervenes), even if an
124
+ # ancestor would otherwise be preserve or collapse.
125
+ # @param pretty_printed [Boolean] when true, whitespace-only text nodes
126
+ # that begin with "\n" inside +:collapse+ elements are treated as
127
+ # structural indentation and silently dropped. This matches the
128
+ # comparison-side behaviour activated by +pretty_printed_expected+ /
129
+ # +pretty_printed_received+ match options. Nodes under +:preserve+ elements
130
+ # are always preserved; nodes under +:strip+ elements are already dropped.
131
+ def initialize(indent: 2, indent_type: "space", visualization_map: nil,
132
+ preserve_whitespace_elements: [],
133
+ collapse_whitespace_elements: [],
134
+ strip_whitespace_elements: [],
135
+ pretty_printed: false,
136
+ sort_attributes: false)
137
+ @indent = indent.to_i
138
+ @indent_char = indent_type == "tab" ? "\t" : " "
139
+ @vis_map = visualization_map || default_vis_map
140
+ @pretty_printed = pretty_printed
141
+ @sort_attributes = sort_attributes
142
+
143
+ @strict_ws = Set.new((preserve_whitespace_elements || []).map(&:to_s))
144
+ @norm_ws = Set.new((collapse_whitespace_elements || []).map(&:to_s))
145
+ @insens_ws = Set.new((strip_whitespace_elements || []).map(&:to_s))
146
+ end
147
+
148
+ # Format an XML string with mixed-content-aware serialization.
149
+ #
150
+ # @param xml_string [String] Input XML
151
+ # @return [String] Serialized XML, one node per line, with content
152
+ # whitespace visualized at line boundaries
153
+ def format(xml_string)
154
+ doc = Nokogiri::XML(xml_string)
155
+ lines = []
156
+
157
+ if doc.version
158
+ enc = doc.encoding ? " encoding=\"#{doc.encoding}\"" : ""
159
+ lines << "<?xml version=\"#{doc.version}\"#{enc}?>"
160
+ end
161
+
162
+ lines << serialize_element(doc.root, 0) if doc.root
163
+ lines.join("\n")
164
+ end
165
+
166
+ private
167
+
168
+ # Return indent string for depth.
169
+ def ind(depth)
170
+ @indent_char * (@indent * depth)
171
+ end
172
+
173
+ # Classify the whitespace behaviour for a given Nokogiri element node.
174
+ #
175
+ # Walks up the ancestor chain from the element itself. The first
176
+ # matching ancestor determines the class. Insensitive blacklist wins
177
+ # over any sensitive ancestor.
178
+ #
179
+ # @param element [Nokogiri::XML::Element] The element to classify
180
+ # @return [Symbol] :strict, :normalize, or :drop
181
+ def classify_whitespace(element)
182
+ current = element
183
+ while current && !current.is_a?(Nokogiri::XML::Document)
184
+ name = current.name.to_s
185
+ return :drop if @insens_ws.include?(name)
186
+ return :strict if @strict_ws.include?(name)
187
+ return :normalize if @norm_ws.include?(name)
188
+
189
+ current = current.parent
190
+ end
191
+ # No matching ancestor — default: drop (insensitive)
192
+ :drop
193
+ end
194
+
195
+ # Serialize a single element node.
196
+ def serialize_element(node, depth)
197
+ # Filter out empty text nodes (zero-length, not whitespace-only).
198
+ children = node.children.reject { |c| c.text? && c.content.empty? }
199
+
200
+ if children.empty?
201
+ return "#{ind(depth)}#{open_tag(node,
202
+ self_close: true)}"
203
+ end
204
+
205
+ elem_children = children.select(&:element?)
206
+ text_with_content = children.select do |c|
207
+ c.text? && !c.content.strip.empty?
208
+ end
209
+
210
+ if elem_children.empty?
211
+ # Pure-text element — keep on one line.
212
+ return "#{ind(depth)}#{open_tag(node)}#{node.text}</#{node.name}>"
213
+ end
214
+
215
+ if text_with_content.empty?
216
+ # Element-only children (may have whitespace-only text nodes between them).
217
+ # Apply classification to decide whether to drop or visualize them.
218
+ ws_class = classify_whitespace(node)
219
+ lines = ["#{ind(depth)}#{open_tag(node)}"]
220
+ children.each do |child|
221
+ if child.text?
222
+ # Whitespace-only text node between element children
223
+ vis = render_whitespace_only(child.content, ws_class)
224
+ next if vis.nil? # :drop
225
+
226
+ # Append to previous line (do not create a new line)
227
+ lines[-1] = lines[-1] + vis
228
+ else
229
+ lines << serialize_element(child, depth + 1)
230
+ end
231
+ end
232
+ lines << "#{ind(depth)}</#{node.name}>"
233
+ return lines.join("\n")
234
+ end
235
+
236
+ # Mixed content: both text-with-content and element children.
237
+ serialize_mixed(node, children, depth)
238
+ end
239
+
240
+ # Serialize a mixed-content element.
241
+ #
242
+ # Each child is processed in document order. Text nodes are split into:
243
+ # * leading whitespace → rendered according to whitespace classification
244
+ # * non-whitespace content → put on its OWN indented line
245
+ # * trailing whitespace → rendered according to classification, appended
246
+ #
247
+ # Element children flush the current accumulated line, then are
248
+ # serialized recursively.
249
+ def serialize_mixed(node, children, depth)
250
+ child_depth = depth + 1
251
+ lines = []
252
+ current_line = "#{ind(depth)}#{open_tag(node)}"
253
+ ws_class = classify_whitespace(node)
254
+
255
+ children.each do |child|
256
+ if child.text?
257
+ process_text_node(child.content, child_depth, lines, current_line,
258
+ ws_class) do |nl|
259
+ current_line = nl
260
+ end
261
+ else
262
+ lines << current_line
263
+ current_line = serialize_element(child, child_depth)
264
+ end
265
+ end
266
+
267
+ lines << current_line
268
+ lines << "#{ind(depth)}</#{node.name}>"
269
+ lines.join("\n")
270
+ end
271
+
272
+ # Render a whitespace-only string according to classification.
273
+ #
274
+ # When +@pretty_printed+ is true and +ws_class+ is +:normalize+:
275
+ # * Content starting with "\n" (e.g. "\n " indentation) is treated as
276
+ # structural pretty-print formatting and **dropped** (returns nil).
277
+ # * All other whitespace (e.g. " " inline space) is still rendered as the
278
+ # usual single-space visualization.
279
+ # This aligns display output with the comparison-side behaviour controlled
280
+ # by +pretty_printed_expected+ / +pretty_printed_received+.
281
+ #
282
+ # @param content [String] Whitespace-only string
283
+ # @param ws_class [Symbol] :strict, :normalize, or :drop
284
+ # @return [String, nil] Rendered string, or nil to indicate "drop"
285
+ def render_whitespace_only(content, ws_class)
286
+ case ws_class
287
+ when :strict
288
+ visualize(content)
289
+ when :normalize
290
+ # In pretty_printed mode, \n-leading whitespace is structural — drop it
291
+ return nil if @pretty_printed && content.start_with?("\n")
292
+
293
+ # Any other whitespace → single space visualization
294
+ content.empty? ? nil : @vis_map.fetch(" ", "░")
295
+ # :drop — fall through to nil
296
+ end
297
+ end
298
+
299
+ # Process a text node in mixed-content context.
300
+ #
301
+ # Yields the new current_line (string the caller should adopt).
302
+ #
303
+ # === Pure-whitespace text nodes
304
+ #
305
+ # Whitespace-only text nodes are rendered via +render_whitespace_only+
306
+ # according to the element's whitespace classification:
307
+ # - :strict → visualize every character (e.g. ↵░░░)
308
+ # - :normalize → single ░ regardless of whitespace form
309
+ # - :drop → silently discarded
310
+ #
311
+ # === Text nodes with printable content
312
+ #
313
+ # Leading and trailing whitespace are split off and rendered according
314
+ # to the whitespace classification at line boundaries. The printable
315
+ # content occupies its own indented line.
316
+ def process_text_node(content, child_depth, lines, current_line, ws_class)
317
+ stripped = content.strip
318
+
319
+ if stripped.empty?
320
+ # Pure whitespace between elements
321
+ vis = render_whitespace_only(content, ws_class)
322
+ if vis.nil?
323
+ yield current_line # :drop — no change
324
+ else
325
+ yield current_line + vis
326
+ end
327
+ return
328
+ end
329
+
330
+ leading = content[/\A\s*/]
331
+ trailing = content[/\s*\z/]
332
+ middle = stripped
333
+
334
+ # Leading whitespace: append to current line (then flush), or drop
335
+ unless leading.empty?
336
+ vis = render_whitespace_only(leading, ws_class)
337
+ current_line += vis unless vis.nil?
338
+ end
339
+ lines << current_line
340
+
341
+ # Trailing whitespace visualization
342
+ trailing_vis = if trailing.empty?
343
+ ""
344
+ else
345
+ v = render_whitespace_only(trailing, ws_class)
346
+ v.nil? ? "" : v
347
+ end
348
+ yield "#{ind(child_depth)}#{middle}#{trailing_vis}"
349
+ end
350
+
351
+ # Build an opening XML tag with namespace declarations and attributes.
352
+ def open_tag(node, self_close: false)
353
+ ns_decls = node.namespace_definitions.map do |ns|
354
+ ns.prefix ? " xmlns:#{ns.prefix}=\"#{ns.href}\"" : " xmlns=\"#{ns.href}\""
355
+ end.join
356
+
357
+ attr_nodes = node.attribute_nodes
358
+ if @sort_attributes
359
+ attr_nodes = attr_nodes.sort_by do |a|
360
+ [a.namespace&.href.to_s, a.name]
361
+ end
362
+ end
363
+
364
+ attrs = attr_nodes.map do |a|
365
+ prefix = a.namespace&.prefix ? "#{a.namespace.prefix}:" : ""
366
+ " #{prefix}#{a.name}=\"#{escape_attr(a.value)}\""
367
+ end.join
368
+
369
+ close = self_close ? "/>" : ">"
370
+ "<#{node.name}#{ns_decls}#{attrs}#{close}"
371
+ end
372
+
373
+ # Escape characters that are special inside attribute values.
374
+ def escape_attr(value)
375
+ value.gsub("&", "&amp;").gsub('"', "&quot;").gsub("<", "&lt;")
376
+ end
377
+
378
+ # Visualize a whitespace string using the character map.
379
+ # Non-whitespace characters are passed through unchanged (safety net).
380
+ def visualize(str)
381
+ return "" if str.nil? || str.empty?
382
+
383
+ str.chars.map { |c| @vis_map.fetch(c, c) }.join
384
+ end
385
+
386
+ # Load the default visualization map from DiffFormatter constants.
387
+ def default_vis_map
388
+ require_relative "../diff_formatter"
389
+ Canon::DiffFormatter::DEFAULT_VISUALIZATION_MAP
390
+ rescue LoadError, NameError
391
+ { " " => "░", "\t" => "⇥", "\n" => "↵", "\r" => "⏎", "\u00A0" => "␣" }
392
+ end
393
+ end
394
+ end
395
+ end
@@ -219,6 +219,20 @@ module Canon
219
219
  context_lines: diff_config.context_lines,
220
220
  diff_grouping_lines: diff_config.grouping_lines,
221
221
  show_diffs: diff_config.show_diffs,
222
+ show_raw_inputs: diff_config.show_raw_inputs,
223
+ show_raw_expected: diff_config.show_raw_expected,
224
+ show_raw_received: diff_config.show_raw_received,
225
+ show_preprocessed_inputs: diff_config.show_preprocessed_inputs,
226
+ show_preprocessed_expected: diff_config.show_preprocessed_expected,
227
+ show_preprocessed_received: diff_config.show_preprocessed_received,
228
+ show_prettyprint_inputs: diff_config.show_prettyprint_inputs,
229
+ show_prettyprint_expected: diff_config.show_prettyprint_expected,
230
+ show_prettyprint_received: diff_config.show_prettyprint_received,
231
+ show_line_numbered_inputs: diff_config.show_line_numbered_inputs,
232
+ character_visualization: diff_config.character_visualization,
233
+ display_preprocessing: diff_config.display_preprocessing,
234
+ pretty_printer_indent: diff_config.pretty_printer.indent,
235
+ pretty_printer_indent_type: diff_config.pretty_printer.indent_type,
222
236
  )
223
237
 
224
238
  return formatter.format([], :string, doc1: @expected.to_s,
@@ -237,6 +251,28 @@ module Canon
237
251
  diff_grouping_lines: diff_config.grouping_lines,
238
252
  show_diffs: diff_config.show_diffs,
239
253
  verbose_diff: diff_config.verbose_diff,
254
+ show_raw_inputs: diff_config.show_raw_inputs,
255
+ show_raw_expected: diff_config.show_raw_expected,
256
+ show_raw_received: diff_config.show_raw_received,
257
+ show_preprocessed_inputs: diff_config.show_preprocessed_inputs,
258
+ show_preprocessed_expected: diff_config.show_preprocessed_expected,
259
+ show_preprocessed_received: diff_config.show_preprocessed_received,
260
+ show_prettyprint_inputs: diff_config.show_prettyprint_inputs,
261
+ show_prettyprint_expected: diff_config.show_prettyprint_expected,
262
+ show_prettyprint_received: diff_config.show_prettyprint_received,
263
+ show_line_numbered_inputs: diff_config.show_line_numbered_inputs,
264
+ character_visualization: diff_config.character_visualization,
265
+ display_preprocessing: diff_config.display_preprocessing,
266
+ pretty_printer_indent: diff_config.pretty_printer.indent,
267
+ pretty_printer_indent_type: diff_config.pretty_printer.indent_type,
268
+ preserve_whitespace_elements: diff_config.preserve_whitespace_elements,
269
+ collapse_whitespace_elements: diff_config.collapse_whitespace_elements,
270
+ strip_whitespace_elements: diff_config.strip_whitespace_elements,
271
+ pretty_printed_expected: diff_config.pretty_printed_expected,
272
+ pretty_printed_received: diff_config.pretty_printed_received,
273
+ pretty_printer_sort_attributes: diff_config.pretty_printer_sort_attributes,
274
+ compact_semantic_report: diff_config.compact_semantic_report,
275
+ expand_difference: diff_config.expand_difference,
240
276
  )
241
277
 
242
278
  # Format the diff using the comparison result
@@ -93,19 +93,34 @@ module Canon
93
93
  end
94
94
  return if candidates.empty?
95
95
 
96
- best_match = find_best_match(node2, candidates)
97
- return unless best_match
98
-
99
- if @matching.add(best_match, node2)
100
- @matched_tree1 << best_match
101
- @matched_tree2 << node2
102
- propagate_to_ancestors(best_match, node2)
96
+ # When multiple candidates have identical signatures (common with
97
+ # duplicate subtrees like MathML formulas), sort by sibling position
98
+ # proximity to prefer matching nodes at the same position within
99
+ # their parent. This reduces cross-matching that causes cascading
100
+ # prefix closure failures.
101
+ if candidates.size > 1
102
+ pos2 = node2.position || 0
103
+ candidates = candidates.sort_by do |c|
104
+ pos1 = c.position || 0
105
+ (pos1 - pos2).abs
106
+ end
103
107
  end
104
- end
105
108
 
106
- # @return [TreeNode, nil]
107
- def find_best_match(node2, candidates)
108
- candidates.find { |node1| subtrees_match?(node1, node2) }
109
+ # Try each candidate until one passes both subtree matching
110
+ # AND the prefix closure constraint in matching.add.
111
+ # When multiple candidates have identical subtrees (e.g., labels
112
+ # with the same text child), the first may fail prefix closure
113
+ # due to ancestor cross-matching, but a later candidate succeeds.
114
+ candidates.each do |candidate|
115
+ next unless subtrees_match?(candidate, node2)
116
+
117
+ if @matching.add(candidate, node2)
118
+ @matched_tree1 << candidate
119
+ @matched_tree2 << node2
120
+ propagate_to_ancestors(candidate, node2)
121
+ return
122
+ end
123
+ end
109
124
  end
110
125
 
111
126
  def subtrees_match?(node1, node2)
data/lib/canon/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Canon
4
- VERSION = "0.1.22"
4
+ VERSION = "0.2.0"
5
5
  end
@@ -15,6 +15,10 @@ module Canon
15
15
  @uri = uri
16
16
  end
17
17
 
18
+ def name
19
+ prefix.to_s
20
+ end
21
+
18
22
  def node_type
19
23
  :namespace
20
24
  end
@@ -15,6 +15,10 @@ module Canon
15
15
  @data = data
16
16
  end
17
17
 
18
+ def name
19
+ target
20
+ end
21
+
18
22
  def node_type
19
23
  :processing_instruction
20
24
  end
@@ -7,6 +7,10 @@ module Canon
7
7
  module Nodes
8
8
  # Root node representing the document root
9
9
  class RootNode < Node
10
+ def name
11
+ "#document"
12
+ end
13
+
10
14
  def node_type
11
15
  :root
12
16
  end
@@ -22,6 +22,10 @@ module Canon
22
22
  @original = original || value
23
23
  end
24
24
 
25
+ def name
26
+ "#text"
27
+ end
28
+
25
29
  def node_type
26
30
  :text
27
31
  end
@@ -52,7 +52,7 @@ module PerformanceHelpers
52
52
 
53
53
  class << self
54
54
  def load_into_namespace(module_obj, file_path)
55
- content = File.read(file_path)
55
+ content = File.read(file_path, encoding: "utf-8")
56
56
  module_obj.module_eval(content, file_path)
57
57
  end
58
58
 
@@ -85,7 +85,7 @@ module PerformanceHelpers
85
85
  bench_copy_dir = File.join(clone_dir, "tmp", "performance")
86
86
  FileUtils.mkdir_p(bench_copy_dir)
87
87
  bench_copy = File.join(bench_copy_dir, "benchmark_runner.rb")
88
- File.write(bench_copy, File.read(script))
88
+ File.write(bench_copy, File.read(script, encoding: "utf-8"))
89
89
  load_into_namespace(Base, bench_copy)
90
90
  end
91
91
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: canon
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.22
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-03-31 00:00:00.000000000 Z
11
+ date: 2026-04-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: diff-lcs
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rainbow
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: table_tennis
85
99
  requirement: !ruby/object:Gem::Requirement
@@ -149,12 +163,15 @@ files:
149
163
  - docs/advanced/index.adoc
150
164
  - docs/advanced/semantic-diff-report.adoc
151
165
  - docs/advanced/verbose-mode-architecture.adoc
166
+ - docs/features/configuration-profiles.adoc
152
167
  - docs/features/diff-formatting/algorithm-specific-output.adoc
153
168
  - docs/features/diff-formatting/character-visualization.adoc
154
169
  - docs/features/diff-formatting/colors-and-symbols.adoc
155
170
  - docs/features/diff-formatting/context-and-grouping.adoc
156
171
  - docs/features/diff-formatting/display-filtering.adoc
172
+ - docs/features/diff-formatting/display-preprocessing.adoc
157
173
  - docs/features/diff-formatting/index.adoc
174
+ - docs/features/diff-formatting/pretty-diff-mode.adoc
158
175
  - docs/features/diff-formatting/themes.adoc
159
176
  - docs/features/environment-configuration/index.adoc
160
177
  - docs/features/environment-configuration/override-system.adoc
@@ -164,6 +181,7 @@ files:
164
181
  - docs/features/match-options/algorithm-specific-behavior.adoc
165
182
  - docs/features/match-options/html-policies.adoc
166
183
  - docs/features/match-options/index.adoc
184
+ - docs/features/match-options/pretty-printed-fixtures.adoc
167
185
  - docs/features/performance.adoc
168
186
  - docs/getting-started/index.adoc
169
187
  - docs/getting-started/quick-start.adoc
@@ -247,6 +265,9 @@ files:
247
265
  - lib/canon/config/env_provider.rb
248
266
  - lib/canon/config/env_schema.rb
249
267
  - lib/canon/config/override_resolver.rb
268
+ - lib/canon/config/profile_loader.rb
269
+ - lib/canon/config/profiles/metanorma.yml
270
+ - lib/canon/config/profiles/metanorma_debug.yml
250
271
  - lib/canon/config/type_converter.rb
251
272
  - lib/canon/data_model.rb
252
273
  - lib/canon/diff/diff_block.rb
@@ -304,6 +325,7 @@ files:
304
325
  - lib/canon/pretty_printer/html.rb
305
326
  - lib/canon/pretty_printer/json.rb
306
327
  - lib/canon/pretty_printer/xml.rb
328
+ - lib/canon/pretty_printer/xml_normalized.rb
307
329
  - lib/canon/rspec_matchers.rb
308
330
  - lib/canon/tree_diff.rb
309
331
  - lib/canon/tree_diff/adapters/html_adapter.rb