canon 0.1.22 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +174 -25
- data/docs/INDEX.adoc +4 -0
- data/docs/advanced/diff-classification.adoc +3 -2
- data/docs/features/configuration-profiles.adoc +288 -0
- data/docs/features/diff-formatting/character-visualization.adoc +153 -454
- data/docs/features/diff-formatting/display-filtering.adoc +44 -0
- data/docs/features/diff-formatting/display-preprocessing.adoc +656 -0
- data/docs/features/diff-formatting/index.adoc +47 -0
- data/docs/features/diff-formatting/pretty-diff-mode.adoc +154 -0
- data/docs/features/environment-configuration/override-system.adoc +10 -3
- data/docs/features/index.adoc +9 -0
- data/docs/features/match-options/index.adoc +32 -42
- data/docs/features/match-options/pretty-printed-fixtures.adoc +270 -0
- data/docs/guides/choosing-configuration.adoc +22 -0
- data/docs/reference/environment-variables.adoc +121 -1
- data/docs/reference/options-across-interfaces.adoc +182 -2
- data/lib/canon/cli.rb +20 -0
- data/lib/canon/commands/diff_command.rb +7 -2
- data/lib/canon/commands/format_command.rb +1 -1
- data/lib/canon/comparison/html_comparator.rb +20 -15
- data/lib/canon/comparison/html_compare_profile.rb +4 -4
- data/lib/canon/comparison/markup_comparator.rb +12 -3
- data/lib/canon/comparison/match_options/base_resolver.rb +29 -7
- data/lib/canon/comparison/match_options/json_resolver.rb +9 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +16 -2
- data/lib/canon/comparison/match_options/yaml_resolver.rb +10 -0
- data/lib/canon/comparison/match_options.rb +4 -1
- data/lib/canon/comparison/whitespace_sensitivity.rb +189 -137
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +21 -4
- data/lib/canon/comparison/xml_comparator.rb +14 -12
- data/lib/canon/comparison/xml_node_comparison.rb +51 -6
- data/lib/canon/comparison.rb +52 -9
- data/lib/canon/config/env_schema.rb +32 -4
- data/lib/canon/config/override_resolver.rb +16 -3
- data/lib/canon/config/profile_loader.rb +135 -0
- data/lib/canon/config/profiles/metanorma.yml +74 -0
- data/lib/canon/config/profiles/metanorma_debug.yml +8 -0
- data/lib/canon/config/type_converter.rb +8 -0
- data/lib/canon/config.rb +469 -5
- data/lib/canon/diff/diff_classifier.rb +41 -11
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +48 -17
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +58 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +22 -7
- data/lib/canon/diff_formatter/theme.rb +24 -17
- data/lib/canon/diff_formatter.rb +493 -36
- data/lib/canon/pretty_printer/xml_normalized.rb +395 -0
- data/lib/canon/rspec_matchers.rb +36 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +26 -11
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/nodes/namespace_node.rb +4 -0
- data/lib/canon/xml/nodes/processing_instruction_node.rb +4 -0
- data/lib/canon/xml/nodes/root_node.rb +4 -0
- data/lib/canon/xml/nodes/text_node.rb +4 -0
- data/lib/tasks/performance_helpers.rb +2 -2
- metadata +24 -2
|
@@ -0,0 +1,395 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
5
|
+
module Canon
|
|
6
|
+
module PrettyPrinter
|
|
7
|
+
# Mixed-content-aware XML serializer for diff display preprocessing.
|
|
8
|
+
#
|
|
9
|
+
# == The mixed-content problem
|
|
10
|
+
#
|
|
11
|
+
# Standard XML pretty-printers (including Nokogiri's built-in serializer)
|
|
12
|
+
# keep elements that contain both text and child elements on a single line.
|
|
13
|
+
# They have no choice: inserting a newline between, say, `<p>See ` and
|
|
14
|
+
# `<xref.../>` would create a new whitespace text node, changing the
|
|
15
|
+
# document's semantic content. The result for line-by-line diffs is that
|
|
16
|
+
# any change inside such an element forces the entire line — potentially
|
|
17
|
+
# hundreds or thousands of characters — to be marked as changed. Issue #53
|
|
18
|
+
# documented this as "1000-character long lines" from HTML diffs.
|
|
19
|
+
#
|
|
20
|
+
# == Three-way whitespace classification
|
|
21
|
+
#
|
|
22
|
+
# This serializer distinguishes three categories of element-level whitespace
|
|
23
|
+
# behaviour, configured via element-name lists:
|
|
24
|
+
#
|
|
25
|
+
# * **Preserve** (`preserve_whitespace_elements`) — every whitespace character is
|
|
26
|
+
# significant. `" "` ≠ `"\n"`. Typical: `<pre>`, `<code>`, `<textarea>`.
|
|
27
|
+
# Whitespace-only text nodes are visualized character-by-character.
|
|
28
|
+
#
|
|
29
|
+
# * **Collapse** (`collapse_whitespace_elements`) — presence ≠ absence,
|
|
30
|
+
# but all whitespace forms are equivalent: `" "` == `"\n "` == `"\t"`.
|
|
31
|
+
# Typical: `<p>`, `<li>`, `<td>`, heading elements.
|
|
32
|
+
# Whitespace-only text nodes are collapsed to a single `░` visualization,
|
|
33
|
+
# so `<p>\n <em>` (indented fixture) and `<p> <em>` (compact source)
|
|
34
|
+
# both render as `<p>░<em>` — identical display lines, no spurious diff.
|
|
35
|
+
#
|
|
36
|
+
# * **Strip** (everything else, or explicit `strip_whitespace_elements`) —
|
|
37
|
+
# all whitespace between child elements is structural formatting noise.
|
|
38
|
+
# `" "` == `"\n "` == nothing. Whitespace-only text nodes are silently
|
|
39
|
+
# dropped. Typical: `<section>`, `<ul>`, `<formattedref>`, `<bibitem>`.
|
|
40
|
+
#
|
|
41
|
+
# Classification is **ancestor-based**: a text node's class is determined
|
|
42
|
+
# by the closest matching ancestor. This means `<em>` inside `<p>` inherits
|
|
43
|
+
# `<p>`'s collapse behaviour without needing to be listed explicitly.
|
|
44
|
+
#
|
|
45
|
+
# == Format defaults
|
|
46
|
+
#
|
|
47
|
+
# * **XML**: all three lists are empty by default — insensitive everywhere.
|
|
48
|
+
# Whitespace sensitivity is opt-in, consistent with XML's data-first usage.
|
|
49
|
+
#
|
|
50
|
+
# * **HTML**: built-in defaults are provided (but overridable):
|
|
51
|
+
# - preserve: `pre`, `code`, `textarea`, `script`, `style`
|
|
52
|
+
# - collapse: `p`, `li`, `dt`, `dd`, `td`, `th`, `h1`–`h6`, `caption`,
|
|
53
|
+
# `figcaption`, `label`, `legend`, `summary`, `blockquote`, `address`
|
|
54
|
+
#
|
|
55
|
+
# == Structural vs. content whitespace
|
|
56
|
+
#
|
|
57
|
+
# * **Structural whitespace** — indentation characters emitted by the
|
|
58
|
+
# serializer itself. These do not exist in the source document.
|
|
59
|
+
# They are rendered as ordinary ASCII space and newline characters.
|
|
60
|
+
# * **Content whitespace** — whitespace that exists as text-node content
|
|
61
|
+
# in the source document. Classification (above) decides how to render it.
|
|
62
|
+
#
|
|
63
|
+
# The invariant is: every XML element always starts on its own line.
|
|
64
|
+
# Content whitespace is never confused with structural indentation.
|
|
65
|
+
#
|
|
66
|
+
# == Example (collapse element <p>)
|
|
67
|
+
#
|
|
68
|
+
# Input — compact source (Metanorma-style):
|
|
69
|
+
# <p>See <xref target="M"/></p>
|
|
70
|
+
#
|
|
71
|
+
# Input — indented fixture heredoc:
|
|
72
|
+
# <p>
|
|
73
|
+
# See
|
|
74
|
+
# <xref target="M"/>
|
|
75
|
+
# </p>
|
|
76
|
+
#
|
|
77
|
+
# Both serialize to:
|
|
78
|
+
# <p>
|
|
79
|
+
# See░
|
|
80
|
+
# <xref target="M"/>
|
|
81
|
+
# </p>
|
|
82
|
+
#
|
|
83
|
+
# Result: zero diff lines for a semantically identical document.
|
|
84
|
+
#
|
|
85
|
+
# == Example (strip element <formattedref>)
|
|
86
|
+
#
|
|
87
|
+
# Input — compact source:
|
|
88
|
+
# <formattedref><em>Cereals</em>.</formattedref>
|
|
89
|
+
#
|
|
90
|
+
# Input — indented fixture:
|
|
91
|
+
# <formattedref>
|
|
92
|
+
# <em>Cereals</em>.
|
|
93
|
+
# </formattedref>
|
|
94
|
+
#
|
|
95
|
+
# Both serialize to (whitespace-only nodes silently dropped):
|
|
96
|
+
# <formattedref>
|
|
97
|
+
# <em>Cereals</em>
|
|
98
|
+
# .
|
|
99
|
+
# </formattedref>
|
|
100
|
+
#
|
|
101
|
+
# Result: zero diff lines.
|
|
102
|
+
#
|
|
103
|
+
# == Usage
|
|
104
|
+
#
|
|
105
|
+
# printer = Canon::PrettyPrinter::XmlNormalized.new
|
|
106
|
+
# formatted = printer.format(xml_string)
|
|
107
|
+
#
|
|
108
|
+
# # With element lists (XML):
|
|
109
|
+
# printer = Canon::PrettyPrinter::XmlNormalized.new(
|
|
110
|
+
# collapse_whitespace_elements: %w[p formattedref title],
|
|
111
|
+
# preserve_whitespace_elements: %w[sourcecode pre],
|
|
112
|
+
# )
|
|
113
|
+
#
|
|
114
|
+
class XmlNormalized
|
|
115
|
+
# @param indent [Integer] number of indent characters per level (default 2)
|
|
116
|
+
# @param indent_type [String] "space" or "tab"
|
|
117
|
+
# @param visualization_map [Hash, nil] character visualization map
|
|
118
|
+
# @param preserve_whitespace_elements [Array<String>] element names where
|
|
119
|
+
# every whitespace character is significant (e.g. pre, code).
|
|
120
|
+
# @param collapse_whitespace_elements [Array<String>] element names where
|
|
121
|
+
# presence of whitespace matters but all forms are equivalent (e.g. p, li).
|
|
122
|
+
# @param strip_whitespace_elements [Array<String>] explicit blacklist — these
|
|
123
|
+
# elements and their children always have whitespace dropped, even if an
|
|
124
|
+
# ancestor would otherwise be preserve or collapse.
|
|
125
|
+
# @param pretty_printed [Boolean] when true, whitespace-only text nodes
|
|
126
|
+
# that begin with "\n" inside +:collapse+ elements are treated as
|
|
127
|
+
# structural indentation and silently dropped. This matches the
|
|
128
|
+
# comparison-side behaviour activated by +pretty_printed_expected+ /
|
|
129
|
+
# +pretty_printed_received+ match options. Nodes under +:preserve+ elements
|
|
130
|
+
# are always preserved; nodes under +:strip+ elements are already dropped.
|
|
131
|
+
def initialize(indent: 2, indent_type: "space", visualization_map: nil,
|
|
132
|
+
preserve_whitespace_elements: [],
|
|
133
|
+
collapse_whitespace_elements: [],
|
|
134
|
+
strip_whitespace_elements: [],
|
|
135
|
+
pretty_printed: false,
|
|
136
|
+
sort_attributes: false)
|
|
137
|
+
@indent = indent.to_i
|
|
138
|
+
@indent_char = indent_type == "tab" ? "\t" : " "
|
|
139
|
+
@vis_map = visualization_map || default_vis_map
|
|
140
|
+
@pretty_printed = pretty_printed
|
|
141
|
+
@sort_attributes = sort_attributes
|
|
142
|
+
|
|
143
|
+
@strict_ws = Set.new((preserve_whitespace_elements || []).map(&:to_s))
|
|
144
|
+
@norm_ws = Set.new((collapse_whitespace_elements || []).map(&:to_s))
|
|
145
|
+
@insens_ws = Set.new((strip_whitespace_elements || []).map(&:to_s))
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
# Format an XML string with mixed-content-aware serialization.
|
|
149
|
+
#
|
|
150
|
+
# @param xml_string [String] Input XML
|
|
151
|
+
# @return [String] Serialized XML, one node per line, with content
|
|
152
|
+
# whitespace visualized at line boundaries
|
|
153
|
+
def format(xml_string)
|
|
154
|
+
doc = Nokogiri::XML(xml_string)
|
|
155
|
+
lines = []
|
|
156
|
+
|
|
157
|
+
if doc.version
|
|
158
|
+
enc = doc.encoding ? " encoding=\"#{doc.encoding}\"" : ""
|
|
159
|
+
lines << "<?xml version=\"#{doc.version}\"#{enc}?>"
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
lines << serialize_element(doc.root, 0) if doc.root
|
|
163
|
+
lines.join("\n")
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
private
|
|
167
|
+
|
|
168
|
+
# Return indent string for depth.
|
|
169
|
+
def ind(depth)
|
|
170
|
+
@indent_char * (@indent * depth)
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
# Classify the whitespace behaviour for a given Nokogiri element node.
|
|
174
|
+
#
|
|
175
|
+
# Walks up the ancestor chain from the element itself. The first
|
|
176
|
+
# matching ancestor determines the class. Insensitive blacklist wins
|
|
177
|
+
# over any sensitive ancestor.
|
|
178
|
+
#
|
|
179
|
+
# @param element [Nokogiri::XML::Element] The element to classify
|
|
180
|
+
# @return [Symbol] :strict, :normalize, or :drop
|
|
181
|
+
def classify_whitespace(element)
|
|
182
|
+
current = element
|
|
183
|
+
while current && !current.is_a?(Nokogiri::XML::Document)
|
|
184
|
+
name = current.name.to_s
|
|
185
|
+
return :drop if @insens_ws.include?(name)
|
|
186
|
+
return :strict if @strict_ws.include?(name)
|
|
187
|
+
return :normalize if @norm_ws.include?(name)
|
|
188
|
+
|
|
189
|
+
current = current.parent
|
|
190
|
+
end
|
|
191
|
+
# No matching ancestor — default: drop (insensitive)
|
|
192
|
+
:drop
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
# Serialize a single element node.
|
|
196
|
+
def serialize_element(node, depth)
|
|
197
|
+
# Filter out empty text nodes (zero-length, not whitespace-only).
|
|
198
|
+
children = node.children.reject { |c| c.text? && c.content.empty? }
|
|
199
|
+
|
|
200
|
+
if children.empty?
|
|
201
|
+
return "#{ind(depth)}#{open_tag(node,
|
|
202
|
+
self_close: true)}"
|
|
203
|
+
end
|
|
204
|
+
|
|
205
|
+
elem_children = children.select(&:element?)
|
|
206
|
+
text_with_content = children.select do |c|
|
|
207
|
+
c.text? && !c.content.strip.empty?
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
if elem_children.empty?
|
|
211
|
+
# Pure-text element — keep on one line.
|
|
212
|
+
return "#{ind(depth)}#{open_tag(node)}#{node.text}</#{node.name}>"
|
|
213
|
+
end
|
|
214
|
+
|
|
215
|
+
if text_with_content.empty?
|
|
216
|
+
# Element-only children (may have whitespace-only text nodes between them).
|
|
217
|
+
# Apply classification to decide whether to drop or visualize them.
|
|
218
|
+
ws_class = classify_whitespace(node)
|
|
219
|
+
lines = ["#{ind(depth)}#{open_tag(node)}"]
|
|
220
|
+
children.each do |child|
|
|
221
|
+
if child.text?
|
|
222
|
+
# Whitespace-only text node between element children
|
|
223
|
+
vis = render_whitespace_only(child.content, ws_class)
|
|
224
|
+
next if vis.nil? # :drop
|
|
225
|
+
|
|
226
|
+
# Append to previous line (do not create a new line)
|
|
227
|
+
lines[-1] = lines[-1] + vis
|
|
228
|
+
else
|
|
229
|
+
lines << serialize_element(child, depth + 1)
|
|
230
|
+
end
|
|
231
|
+
end
|
|
232
|
+
lines << "#{ind(depth)}</#{node.name}>"
|
|
233
|
+
return lines.join("\n")
|
|
234
|
+
end
|
|
235
|
+
|
|
236
|
+
# Mixed content: both text-with-content and element children.
|
|
237
|
+
serialize_mixed(node, children, depth)
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
# Serialize a mixed-content element.
|
|
241
|
+
#
|
|
242
|
+
# Each child is processed in document order. Text nodes are split into:
|
|
243
|
+
# * leading whitespace → rendered according to whitespace classification
|
|
244
|
+
# * non-whitespace content → put on its OWN indented line
|
|
245
|
+
# * trailing whitespace → rendered according to classification, appended
|
|
246
|
+
#
|
|
247
|
+
# Element children flush the current accumulated line, then are
|
|
248
|
+
# serialized recursively.
|
|
249
|
+
def serialize_mixed(node, children, depth)
|
|
250
|
+
child_depth = depth + 1
|
|
251
|
+
lines = []
|
|
252
|
+
current_line = "#{ind(depth)}#{open_tag(node)}"
|
|
253
|
+
ws_class = classify_whitespace(node)
|
|
254
|
+
|
|
255
|
+
children.each do |child|
|
|
256
|
+
if child.text?
|
|
257
|
+
process_text_node(child.content, child_depth, lines, current_line,
|
|
258
|
+
ws_class) do |nl|
|
|
259
|
+
current_line = nl
|
|
260
|
+
end
|
|
261
|
+
else
|
|
262
|
+
lines << current_line
|
|
263
|
+
current_line = serialize_element(child, child_depth)
|
|
264
|
+
end
|
|
265
|
+
end
|
|
266
|
+
|
|
267
|
+
lines << current_line
|
|
268
|
+
lines << "#{ind(depth)}</#{node.name}>"
|
|
269
|
+
lines.join("\n")
|
|
270
|
+
end
|
|
271
|
+
|
|
272
|
+
# Render a whitespace-only string according to classification.
|
|
273
|
+
#
|
|
274
|
+
# When +@pretty_printed+ is true and +ws_class+ is +:normalize+:
|
|
275
|
+
# * Content starting with "\n" (e.g. "\n " indentation) is treated as
|
|
276
|
+
# structural pretty-print formatting and **dropped** (returns nil).
|
|
277
|
+
# * All other whitespace (e.g. " " inline space) is still rendered as the
|
|
278
|
+
# usual single-space visualization.
|
|
279
|
+
# This aligns display output with the comparison-side behaviour controlled
|
|
280
|
+
# by +pretty_printed_expected+ / +pretty_printed_received+.
|
|
281
|
+
#
|
|
282
|
+
# @param content [String] Whitespace-only string
|
|
283
|
+
# @param ws_class [Symbol] :strict, :normalize, or :drop
|
|
284
|
+
# @return [String, nil] Rendered string, or nil to indicate "drop"
|
|
285
|
+
def render_whitespace_only(content, ws_class)
|
|
286
|
+
case ws_class
|
|
287
|
+
when :strict
|
|
288
|
+
visualize(content)
|
|
289
|
+
when :normalize
|
|
290
|
+
# In pretty_printed mode, \n-leading whitespace is structural — drop it
|
|
291
|
+
return nil if @pretty_printed && content.start_with?("\n")
|
|
292
|
+
|
|
293
|
+
# Any other whitespace → single space visualization
|
|
294
|
+
content.empty? ? nil : @vis_map.fetch(" ", "░")
|
|
295
|
+
# :drop — fall through to nil
|
|
296
|
+
end
|
|
297
|
+
end
|
|
298
|
+
|
|
299
|
+
# Process a text node in mixed-content context.
|
|
300
|
+
#
|
|
301
|
+
# Yields the new current_line (string the caller should adopt).
|
|
302
|
+
#
|
|
303
|
+
# === Pure-whitespace text nodes
|
|
304
|
+
#
|
|
305
|
+
# Whitespace-only text nodes are rendered via +render_whitespace_only+
|
|
306
|
+
# according to the element's whitespace classification:
|
|
307
|
+
# - :strict → visualize every character (e.g. ↵░░░)
|
|
308
|
+
# - :normalize → single ░ regardless of whitespace form
|
|
309
|
+
# - :drop → silently discarded
|
|
310
|
+
#
|
|
311
|
+
# === Text nodes with printable content
|
|
312
|
+
#
|
|
313
|
+
# Leading and trailing whitespace are split off and rendered according
|
|
314
|
+
# to the whitespace classification at line boundaries. The printable
|
|
315
|
+
# content occupies its own indented line.
|
|
316
|
+
def process_text_node(content, child_depth, lines, current_line, ws_class)
|
|
317
|
+
stripped = content.strip
|
|
318
|
+
|
|
319
|
+
if stripped.empty?
|
|
320
|
+
# Pure whitespace between elements
|
|
321
|
+
vis = render_whitespace_only(content, ws_class)
|
|
322
|
+
if vis.nil?
|
|
323
|
+
yield current_line # :drop — no change
|
|
324
|
+
else
|
|
325
|
+
yield current_line + vis
|
|
326
|
+
end
|
|
327
|
+
return
|
|
328
|
+
end
|
|
329
|
+
|
|
330
|
+
leading = content[/\A\s*/]
|
|
331
|
+
trailing = content[/\s*\z/]
|
|
332
|
+
middle = stripped
|
|
333
|
+
|
|
334
|
+
# Leading whitespace: append to current line (then flush), or drop
|
|
335
|
+
unless leading.empty?
|
|
336
|
+
vis = render_whitespace_only(leading, ws_class)
|
|
337
|
+
current_line += vis unless vis.nil?
|
|
338
|
+
end
|
|
339
|
+
lines << current_line
|
|
340
|
+
|
|
341
|
+
# Trailing whitespace visualization
|
|
342
|
+
trailing_vis = if trailing.empty?
|
|
343
|
+
""
|
|
344
|
+
else
|
|
345
|
+
v = render_whitespace_only(trailing, ws_class)
|
|
346
|
+
v.nil? ? "" : v
|
|
347
|
+
end
|
|
348
|
+
yield "#{ind(child_depth)}#{middle}#{trailing_vis}"
|
|
349
|
+
end
|
|
350
|
+
|
|
351
|
+
# Build an opening XML tag with namespace declarations and attributes.
|
|
352
|
+
def open_tag(node, self_close: false)
|
|
353
|
+
ns_decls = node.namespace_definitions.map do |ns|
|
|
354
|
+
ns.prefix ? " xmlns:#{ns.prefix}=\"#{ns.href}\"" : " xmlns=\"#{ns.href}\""
|
|
355
|
+
end.join
|
|
356
|
+
|
|
357
|
+
attr_nodes = node.attribute_nodes
|
|
358
|
+
if @sort_attributes
|
|
359
|
+
attr_nodes = attr_nodes.sort_by do |a|
|
|
360
|
+
[a.namespace&.href.to_s, a.name]
|
|
361
|
+
end
|
|
362
|
+
end
|
|
363
|
+
|
|
364
|
+
attrs = attr_nodes.map do |a|
|
|
365
|
+
prefix = a.namespace&.prefix ? "#{a.namespace.prefix}:" : ""
|
|
366
|
+
" #{prefix}#{a.name}=\"#{escape_attr(a.value)}\""
|
|
367
|
+
end.join
|
|
368
|
+
|
|
369
|
+
close = self_close ? "/>" : ">"
|
|
370
|
+
"<#{node.name}#{ns_decls}#{attrs}#{close}"
|
|
371
|
+
end
|
|
372
|
+
|
|
373
|
+
# Escape characters that are special inside attribute values.
|
|
374
|
+
def escape_attr(value)
|
|
375
|
+
value.gsub("&", "&amp;").gsub('"', "&quot;").gsub("<", "&lt;")
|
|
376
|
+
end
|
|
377
|
+
|
|
378
|
+
# Visualize a whitespace string using the character map.
|
|
379
|
+
# Non-whitespace characters are passed through unchanged (safety net).
|
|
380
|
+
def visualize(str)
|
|
381
|
+
return "" if str.nil? || str.empty?
|
|
382
|
+
|
|
383
|
+
str.chars.map { |c| @vis_map.fetch(c, c) }.join
|
|
384
|
+
end
|
|
385
|
+
|
|
386
|
+
# Load the default visualization map from DiffFormatter constants.
|
|
387
|
+
def default_vis_map
|
|
388
|
+
require_relative "../diff_formatter"
|
|
389
|
+
Canon::DiffFormatter::DEFAULT_VISUALIZATION_MAP
|
|
390
|
+
rescue LoadError, NameError
|
|
391
|
+
{ " " => "░", "\t" => "⇥", "\n" => "↵", "\r" => "⏎", "\u00A0" => "␣" }
|
|
392
|
+
end
|
|
393
|
+
end
|
|
394
|
+
end
|
|
395
|
+
end
|
data/lib/canon/rspec_matchers.rb
CHANGED
|
@@ -219,6 +219,20 @@ module Canon
|
|
|
219
219
|
context_lines: diff_config.context_lines,
|
|
220
220
|
diff_grouping_lines: diff_config.grouping_lines,
|
|
221
221
|
show_diffs: diff_config.show_diffs,
|
|
222
|
+
show_raw_inputs: diff_config.show_raw_inputs,
|
|
223
|
+
show_raw_expected: diff_config.show_raw_expected,
|
|
224
|
+
show_raw_received: diff_config.show_raw_received,
|
|
225
|
+
show_preprocessed_inputs: diff_config.show_preprocessed_inputs,
|
|
226
|
+
show_preprocessed_expected: diff_config.show_preprocessed_expected,
|
|
227
|
+
show_preprocessed_received: diff_config.show_preprocessed_received,
|
|
228
|
+
show_prettyprint_inputs: diff_config.show_prettyprint_inputs,
|
|
229
|
+
show_prettyprint_expected: diff_config.show_prettyprint_expected,
|
|
230
|
+
show_prettyprint_received: diff_config.show_prettyprint_received,
|
|
231
|
+
show_line_numbered_inputs: diff_config.show_line_numbered_inputs,
|
|
232
|
+
character_visualization: diff_config.character_visualization,
|
|
233
|
+
display_preprocessing: diff_config.display_preprocessing,
|
|
234
|
+
pretty_printer_indent: diff_config.pretty_printer.indent,
|
|
235
|
+
pretty_printer_indent_type: diff_config.pretty_printer.indent_type,
|
|
222
236
|
)
|
|
223
237
|
|
|
224
238
|
return formatter.format([], :string, doc1: @expected.to_s,
|
|
@@ -237,6 +251,28 @@ module Canon
|
|
|
237
251
|
diff_grouping_lines: diff_config.grouping_lines,
|
|
238
252
|
show_diffs: diff_config.show_diffs,
|
|
239
253
|
verbose_diff: diff_config.verbose_diff,
|
|
254
|
+
show_raw_inputs: diff_config.show_raw_inputs,
|
|
255
|
+
show_raw_expected: diff_config.show_raw_expected,
|
|
256
|
+
show_raw_received: diff_config.show_raw_received,
|
|
257
|
+
show_preprocessed_inputs: diff_config.show_preprocessed_inputs,
|
|
258
|
+
show_preprocessed_expected: diff_config.show_preprocessed_expected,
|
|
259
|
+
show_preprocessed_received: diff_config.show_preprocessed_received,
|
|
260
|
+
show_prettyprint_inputs: diff_config.show_prettyprint_inputs,
|
|
261
|
+
show_prettyprint_expected: diff_config.show_prettyprint_expected,
|
|
262
|
+
show_prettyprint_received: diff_config.show_prettyprint_received,
|
|
263
|
+
show_line_numbered_inputs: diff_config.show_line_numbered_inputs,
|
|
264
|
+
character_visualization: diff_config.character_visualization,
|
|
265
|
+
display_preprocessing: diff_config.display_preprocessing,
|
|
266
|
+
pretty_printer_indent: diff_config.pretty_printer.indent,
|
|
267
|
+
pretty_printer_indent_type: diff_config.pretty_printer.indent_type,
|
|
268
|
+
preserve_whitespace_elements: diff_config.preserve_whitespace_elements,
|
|
269
|
+
collapse_whitespace_elements: diff_config.collapse_whitespace_elements,
|
|
270
|
+
strip_whitespace_elements: diff_config.strip_whitespace_elements,
|
|
271
|
+
pretty_printed_expected: diff_config.pretty_printed_expected,
|
|
272
|
+
pretty_printed_received: diff_config.pretty_printed_received,
|
|
273
|
+
pretty_printer_sort_attributes: diff_config.pretty_printer_sort_attributes,
|
|
274
|
+
compact_semantic_report: diff_config.compact_semantic_report,
|
|
275
|
+
expand_difference: diff_config.expand_difference,
|
|
240
276
|
)
|
|
241
277
|
|
|
242
278
|
# Format the diff using the comparison result
|
|
data/lib/canon/tree_diff/matchers/hash_matcher.rb
CHANGED
|
@@ -93,19 +93,34 @@ module Canon
|
|
|
93
93
|
end
|
|
94
94
|
return if candidates.empty?
|
|
95
95
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
96
|
+
# When multiple candidates have identical signatures (common with
|
|
97
|
+
# duplicate subtrees like MathML formulas), sort by sibling position
|
|
98
|
+
# proximity to prefer matching nodes at the same position within
|
|
99
|
+
# their parent. This reduces cross-matching that causes cascading
|
|
100
|
+
# prefix closure failures.
|
|
101
|
+
if candidates.size > 1
|
|
102
|
+
pos2 = node2.position || 0
|
|
103
|
+
candidates = candidates.sort_by do |c|
|
|
104
|
+
pos1 = c.position || 0
|
|
105
|
+
(pos1 - pos2).abs
|
|
106
|
+
end
|
|
103
107
|
end
|
|
104
|
-
end
|
|
105
108
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
candidates
|
|
109
|
+
# Try each candidate until one passes both subtree matching
|
|
110
|
+
# AND the prefix closure constraint in matching.add.
|
|
111
|
+
# When multiple candidates have identical subtrees (e.g., labels
|
|
112
|
+
# with the same text child), the first may fail prefix closure
|
|
113
|
+
# due to ancestor cross-matching, but a later candidate succeeds.
|
|
114
|
+
candidates.each do |candidate|
|
|
115
|
+
next unless subtrees_match?(candidate, node2)
|
|
116
|
+
|
|
117
|
+
if @matching.add(candidate, node2)
|
|
118
|
+
@matched_tree1 << candidate
|
|
119
|
+
@matched_tree2 << node2
|
|
120
|
+
propagate_to_ancestors(candidate, node2)
|
|
121
|
+
return
|
|
122
|
+
end
|
|
123
|
+
end
|
|
109
124
|
end
|
|
110
125
|
|
|
111
126
|
def subtrees_match?(node1, node2)
|
data/lib/canon/version.rb
CHANGED
|
data/lib/tasks/performance_helpers.rb
CHANGED
|
@@ -52,7 +52,7 @@ module PerformanceHelpers
|
|
|
52
52
|
|
|
53
53
|
class << self
|
|
54
54
|
def load_into_namespace(module_obj, file_path)
|
|
55
|
-
content = File.read(file_path)
|
|
55
|
+
content = File.read(file_path, encoding: "utf-8")
|
|
56
56
|
module_obj.module_eval(content, file_path)
|
|
57
57
|
end
|
|
58
58
|
|
|
@@ -85,7 +85,7 @@ module PerformanceHelpers
|
|
|
85
85
|
bench_copy_dir = File.join(clone_dir, "tmp", "performance")
|
|
86
86
|
FileUtils.mkdir_p(bench_copy_dir)
|
|
87
87
|
bench_copy = File.join(bench_copy_dir, "benchmark_runner.rb")
|
|
88
|
-
File.write(bench_copy, File.read(script))
|
|
88
|
+
File.write(bench_copy, File.read(script, encoding: "utf-8"))
|
|
89
89
|
load_into_namespace(Base, bench_copy)
|
|
90
90
|
end
|
|
91
91
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: canon
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.22
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-04-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: diff-lcs
|
|
@@ -80,6 +80,20 @@ dependencies:
|
|
|
80
80
|
- - ">="
|
|
81
81
|
- !ruby/object:Gem::Version
|
|
82
82
|
version: '0'
|
|
83
|
+
- !ruby/object:Gem::Dependency
|
|
84
|
+
name: rainbow
|
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
|
86
|
+
requirements:
|
|
87
|
+
- - ">="
|
|
88
|
+
- !ruby/object:Gem::Version
|
|
89
|
+
version: '0'
|
|
90
|
+
type: :runtime
|
|
91
|
+
prerelease: false
|
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
93
|
+
requirements:
|
|
94
|
+
- - ">="
|
|
95
|
+
- !ruby/object:Gem::Version
|
|
96
|
+
version: '0'
|
|
83
97
|
- !ruby/object:Gem::Dependency
|
|
84
98
|
name: table_tennis
|
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -149,12 +163,15 @@ files:
|
|
|
149
163
|
- docs/advanced/index.adoc
|
|
150
164
|
- docs/advanced/semantic-diff-report.adoc
|
|
151
165
|
- docs/advanced/verbose-mode-architecture.adoc
|
|
166
|
+
- docs/features/configuration-profiles.adoc
|
|
152
167
|
- docs/features/diff-formatting/algorithm-specific-output.adoc
|
|
153
168
|
- docs/features/diff-formatting/character-visualization.adoc
|
|
154
169
|
- docs/features/diff-formatting/colors-and-symbols.adoc
|
|
155
170
|
- docs/features/diff-formatting/context-and-grouping.adoc
|
|
156
171
|
- docs/features/diff-formatting/display-filtering.adoc
|
|
172
|
+
- docs/features/diff-formatting/display-preprocessing.adoc
|
|
157
173
|
- docs/features/diff-formatting/index.adoc
|
|
174
|
+
- docs/features/diff-formatting/pretty-diff-mode.adoc
|
|
158
175
|
- docs/features/diff-formatting/themes.adoc
|
|
159
176
|
- docs/features/environment-configuration/index.adoc
|
|
160
177
|
- docs/features/environment-configuration/override-system.adoc
|
|
@@ -164,6 +181,7 @@ files:
|
|
|
164
181
|
- docs/features/match-options/algorithm-specific-behavior.adoc
|
|
165
182
|
- docs/features/match-options/html-policies.adoc
|
|
166
183
|
- docs/features/match-options/index.adoc
|
|
184
|
+
- docs/features/match-options/pretty-printed-fixtures.adoc
|
|
167
185
|
- docs/features/performance.adoc
|
|
168
186
|
- docs/getting-started/index.adoc
|
|
169
187
|
- docs/getting-started/quick-start.adoc
|
|
@@ -247,6 +265,9 @@ files:
|
|
|
247
265
|
- lib/canon/config/env_provider.rb
|
|
248
266
|
- lib/canon/config/env_schema.rb
|
|
249
267
|
- lib/canon/config/override_resolver.rb
|
|
268
|
+
- lib/canon/config/profile_loader.rb
|
|
269
|
+
- lib/canon/config/profiles/metanorma.yml
|
|
270
|
+
- lib/canon/config/profiles/metanorma_debug.yml
|
|
250
271
|
- lib/canon/config/type_converter.rb
|
|
251
272
|
- lib/canon/data_model.rb
|
|
252
273
|
- lib/canon/diff/diff_block.rb
|
|
@@ -304,6 +325,7 @@ files:
|
|
|
304
325
|
- lib/canon/pretty_printer/html.rb
|
|
305
326
|
- lib/canon/pretty_printer/json.rb
|
|
306
327
|
- lib/canon/pretty_printer/xml.rb
|
|
328
|
+
- lib/canon/pretty_printer/xml_normalized.rb
|
|
307
329
|
- lib/canon/rspec_matchers.rb
|
|
308
330
|
- lib/canon/tree_diff.rb
|
|
309
331
|
- lib/canon/tree_diff/adapters/html_adapter.rb
|