canon 0.1.23 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +155 -30
- data/docs/INDEX.adoc +4 -0
- data/docs/advanced/diff-classification.adoc +3 -2
- data/docs/advanced/verbose-mode-architecture.adoc +23 -0
- data/docs/features/configuration-profiles.adoc +288 -0
- data/docs/features/diff-formatting/character-visualization.adoc +153 -454
- data/docs/features/diff-formatting/display-filtering.adoc +44 -0
- data/docs/features/diff-formatting/display-preprocessing.adoc +656 -0
- data/docs/features/diff-formatting/index.adoc +47 -0
- data/docs/features/diff-formatting/pretty-diff-mode.adoc +154 -0
- data/docs/features/environment-configuration/override-system.adoc +10 -3
- data/docs/features/index.adoc +9 -0
- data/docs/features/match-options/html-policies.adoc +3 -0
- data/docs/features/match-options/index.adoc +32 -42
- data/docs/features/match-options/pretty-printed-fixtures.adoc +270 -0
- data/docs/guides/choosing-configuration.adoc +22 -0
- data/docs/reference/environment-variables.adoc +121 -1
- data/docs/reference/options-across-interfaces.adoc +182 -2
- data/lib/canon/cli.rb +20 -0
- data/lib/canon/commands/diff_command.rb +7 -2
- data/lib/canon/commands/format_command.rb +1 -1
- data/lib/canon/comparison/html_comparator.rb +29 -19
- data/lib/canon/comparison/html_compare_profile.rb +4 -4
- data/lib/canon/comparison/markup_comparator.rb +12 -3
- data/lib/canon/comparison/match_options/base_resolver.rb +29 -7
- data/lib/canon/comparison/match_options/json_resolver.rb +9 -0
- data/lib/canon/comparison/match_options/xml_resolver.rb +16 -2
- data/lib/canon/comparison/match_options/yaml_resolver.rb +10 -0
- data/lib/canon/comparison/match_options.rb +4 -1
- data/lib/canon/comparison/whitespace_sensitivity.rb +189 -137
- data/lib/canon/comparison/xml_comparator/child_comparison.rb +21 -4
- data/lib/canon/comparison/xml_comparator.rb +14 -12
- data/lib/canon/comparison/xml_node_comparison.rb +51 -6
- data/lib/canon/comparison.rb +52 -9
- data/lib/canon/config/env_schema.rb +32 -4
- data/lib/canon/config/override_resolver.rb +16 -3
- data/lib/canon/config/profile_loader.rb +135 -0
- data/lib/canon/config/profiles/metanorma.yml +74 -0
- data/lib/canon/config/profiles/metanorma_debug.yml +8 -0
- data/lib/canon/config/type_converter.rb +8 -0
- data/lib/canon/config.rb +469 -5
- data/lib/canon/diff/diff_classifier.rb +41 -11
- data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +48 -17
- data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +58 -0
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +73 -17
- data/lib/canon/diff_formatter.rb +493 -36
- data/lib/canon/pretty_printer/xml_normalized.rb +395 -0
- data/lib/canon/rspec_matchers.rb +36 -0
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/nodes/namespace_node.rb +4 -0
- data/lib/canon/xml/nodes/processing_instruction_node.rb +4 -0
- data/lib/canon/xml/nodes/root_node.rb +4 -0
- data/lib/canon/xml/nodes/text_node.rb +4 -0
- data/lib/tasks/performance_helpers.rb +2 -2
- metadata +24 -2
|
@@ -26,8 +26,8 @@ module Canon
|
|
|
26
26
|
check_file_size(file2, format2)
|
|
27
27
|
|
|
28
28
|
# Read raw content for potential by-line diff
|
|
29
|
-
content1 = File.read(file1)
|
|
30
|
-
content2 = File.read(file2)
|
|
29
|
+
content1 = File.read(file1, encoding: "utf-8")
|
|
30
|
+
content2 = File.read(file2, encoding: "utf-8")
|
|
31
31
|
|
|
32
32
|
# Parse documents
|
|
33
33
|
doc1 = parse_document_content(content1, format1)
|
|
@@ -56,6 +56,11 @@ module Canon
|
|
|
56
56
|
show_diffs: @options[:show_diffs]&.to_sym || :all,
|
|
57
57
|
show_raw_inputs: @options[:show_raw_inputs] || false,
|
|
58
58
|
show_preprocessed_inputs: @options[:show_preprocessed_inputs] || false,
|
|
59
|
+
show_preprocessed_expected: @options[:show_preprocessed_expected] || false,
|
|
60
|
+
show_preprocessed_received: @options[:show_preprocessed_received] || false,
|
|
61
|
+
show_prettyprint_inputs: @options[:show_prettyprint_inputs] || false,
|
|
62
|
+
show_prettyprint_expected: @options[:show_prettyprint_expected] || false,
|
|
63
|
+
show_prettyprint_received: @options[:show_prettyprint_received] || false,
|
|
59
64
|
show_line_numbered_inputs: @options[:show_line_numbered_inputs] || false,
|
|
60
65
|
)
|
|
61
66
|
|
|
@@ -15,7 +15,7 @@ module Canon
|
|
|
15
15
|
# rubocop:disable Metrics/MethodLength
|
|
16
16
|
def run(input_file)
|
|
17
17
|
# Read input file
|
|
18
|
-
content = File.read(input_file)
|
|
18
|
+
content = File.read(input_file, encoding: "utf-8")
|
|
19
19
|
|
|
20
20
|
# Detect or use specified format
|
|
21
21
|
format = detect_format(input_file)
|
|
@@ -60,10 +60,14 @@ module Canon
|
|
|
60
60
|
def equivalent?(html1, html2, opts = {}, child_opts = {})
|
|
61
61
|
opts = DEFAULT_OPTS.merge(opts)
|
|
62
62
|
|
|
63
|
-
# Capture original HTML strings
|
|
64
|
-
#
|
|
65
|
-
|
|
66
|
-
|
|
63
|
+
# Capture original HTML strings for display.
|
|
64
|
+
# Prefer the true originals preserved by dom_diff (before
|
|
65
|
+
# HtmlParser.parse mutated the DOM), falling back to
|
|
66
|
+
# extract_original_string for callers that bypass dom_diff.
|
|
67
|
+
original_str1 = opts.delete(:_original_str1) ||
|
|
68
|
+
extract_original_string(html1)
|
|
69
|
+
original_str2 = opts.delete(:_original_str2) ||
|
|
70
|
+
extract_original_string(html2)
|
|
67
71
|
|
|
68
72
|
# Resolve match options with format-specific defaults
|
|
69
73
|
match_opts_hash = MatchOptions::Xml.resolve(
|
|
@@ -217,10 +221,11 @@ module Canon
|
|
|
217
221
|
# @param match_opts_hash [Hash] Resolved match options
|
|
218
222
|
# @return [Boolean, ComparisonResult] Result of tree diff comparison
|
|
219
223
|
def perform_semantic_tree_diff(html1, html2, opts, match_opts_hash)
|
|
220
|
-
# Capture original HTML strings
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
original_str2 =
|
|
224
|
+
# Capture original HTML strings for display (see equivalent? for details).
|
|
225
|
+
original_str1 = opts.delete(:_original_str1) ||
|
|
226
|
+
extract_original_string(html1)
|
|
227
|
+
original_str2 = opts.delete(:_original_str2) ||
|
|
228
|
+
extract_original_string(html2)
|
|
224
229
|
|
|
225
230
|
# Parse to Canon::Xml::Node (preserves preprocessing)
|
|
226
231
|
# For HTML, we parse as XML to get Canon::Xml::Node structure
|
|
@@ -388,12 +393,17 @@ module Canon
|
|
|
388
393
|
end
|
|
389
394
|
end
|
|
390
395
|
|
|
391
|
-
# For
|
|
392
|
-
|
|
393
|
-
|
|
396
|
+
# For preprocessing modes that require whitespace filtering,
|
|
397
|
+
# apply the same post-parsing normalization used for string inputs.
|
|
398
|
+
# This is needed because dom_diff() pre-parses HTML5 strings into
|
|
399
|
+
# Nokogiri fragments before calling HtmlComparator, bypassing the
|
|
400
|
+
# string-input path where these filters are normally applied.
|
|
401
|
+
if %i[normalize format rendered].include?(preprocessing)
|
|
394
402
|
frag = node.is_a?(Nokogiri::XML::DocumentFragment) ? node : Nokogiri::XML.fragment(node.to_html)
|
|
395
403
|
normalize_html_style_script_comments(frag)
|
|
396
|
-
|
|
404
|
+
if preprocessing == :rendered
|
|
405
|
+
normalize_rendered_whitespace(frag, match_opts)
|
|
406
|
+
end
|
|
397
407
|
remove_whitespace_only_text_nodes(frag)
|
|
398
408
|
return frag
|
|
399
409
|
end
|
|
@@ -628,22 +638,22 @@ compare_profile = nil)
|
|
|
628
638
|
return if match_opts[:text_content] == :strict
|
|
629
639
|
|
|
630
640
|
# Elements where whitespace is significant - don't normalize
|
|
631
|
-
# SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.
|
|
641
|
+
# SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_preserve_elements
|
|
632
642
|
# This ensures consistency between preprocessing and comparison logic
|
|
633
|
-
# SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.
|
|
643
|
+
# SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_preserve_elements
|
|
634
644
|
# This ensures consistency between preprocessing and comparison logic
|
|
635
645
|
preserve_whitespace = if compare_profile.is_a?(HtmlCompareProfile)
|
|
636
646
|
# Profile handles HTML-specific whitespace rules
|
|
637
647
|
# Get default list and filter by profile
|
|
638
648
|
WhitespaceSensitivity
|
|
639
|
-
.
|
|
649
|
+
.format_default_preserve_elements(match_opts)
|
|
640
650
|
.select do |elem|
|
|
641
651
|
compare_profile.preserve_whitespace?(elem.to_s)
|
|
642
652
|
end
|
|
643
653
|
.map(&:to_s)
|
|
644
654
|
else
|
|
645
655
|
# Use default list from WhitespaceSensitivity (single source of truth)
|
|
646
|
-
WhitespaceSensitivity.
|
|
656
|
+
WhitespaceSensitivity.format_default_preserve_elements(match_opts).map(&:to_s)
|
|
647
657
|
end
|
|
648
658
|
|
|
649
659
|
# Walk all text nodes
|
|
@@ -700,11 +710,11 @@ compare_profile = nil)
|
|
|
700
710
|
# CRITICAL: Do NOT remove whitespace-only text nodes from whitespace-sensitive
|
|
701
711
|
# elements like <pre>, <code>, <textarea>, <script>, <style>
|
|
702
712
|
#
|
|
703
|
-
# SINGLE SOURCE OF TRUTH: Uses WhitespaceSensitivity.
|
|
713
|
+
# SINGLE SOURCE OF TRUTH: Uses WhitespaceSensitivity.format_default_preserve_elements
|
|
704
714
|
def remove_whitespace_only_text_nodes(doc)
|
|
705
715
|
# Elements where whitespace is significant - don't remove whitespace-only nodes
|
|
706
|
-
# SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.
|
|
707
|
-
preserve_whitespace = WhitespaceSensitivity.
|
|
716
|
+
# SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_preserve_elements
|
|
717
|
+
preserve_whitespace = WhitespaceSensitivity.format_default_preserve_elements(format: :html).map(&:to_s)
|
|
708
718
|
|
|
709
719
|
doc.xpath(".//text()").each do |text_node|
|
|
710
720
|
# CRITICAL: Skip if this text node is inside a whitespace-preserving element
|
|
@@ -69,7 +69,7 @@ module Canon
|
|
|
69
69
|
# @param element_name [String] The element name to check
|
|
70
70
|
# @return [Boolean] true if whitespace should be preserved
|
|
71
71
|
def preserve_whitespace?(element_name)
|
|
72
|
-
|
|
72
|
+
html_preserve_elements.include?(element_name.to_s.downcase)
|
|
73
73
|
end
|
|
74
74
|
|
|
75
75
|
# Check if element names should be compared case-sensitively
|
|
@@ -85,12 +85,12 @@ module Canon
|
|
|
85
85
|
|
|
86
86
|
# Elements where whitespace is semantically significant in HTML
|
|
87
87
|
#
|
|
88
|
-
# SINGLE SOURCE OF TRUTH: Delegates to WhitespaceSensitivity.
|
|
88
|
+
# SINGLE SOURCE OF TRUTH: Delegates to WhitespaceSensitivity.format_default_preserve_elements
|
|
89
89
|
# This ensures consistency across the codebase.
|
|
90
90
|
#
|
|
91
91
|
# @return [Array<String>] List of element names (as strings)
|
|
92
|
-
def
|
|
93
|
-
WhitespaceSensitivity.
|
|
92
|
+
def html_preserve_elements
|
|
93
|
+
WhitespaceSensitivity.format_default_preserve_elements(format: @html_version).map(&:to_s)
|
|
94
94
|
end
|
|
95
95
|
|
|
96
96
|
# Check if a dimension is explicitly set to :strict
|
|
@@ -177,8 +177,8 @@ module Canon
|
|
|
177
177
|
end
|
|
178
178
|
|
|
179
179
|
# Strip whitespace-only text nodes based on parent element configuration.
|
|
180
|
-
# Use
|
|
181
|
-
# Blacklist (
|
|
180
|
+
# Use preserve_whitespace_elements / strip_whitespace_elements to control.
|
|
181
|
+
# Blacklist (strip) > preserve > collapse > format defaults.
|
|
182
182
|
return false unless text_node?(node) && node.parent
|
|
183
183
|
return false unless MatchOptions.normalize_text(node_text(node)).empty?
|
|
184
184
|
|
|
@@ -186,7 +186,16 @@ module Canon
|
|
|
186
186
|
node.parent, match_opts
|
|
187
187
|
)
|
|
188
188
|
|
|
189
|
-
|
|
189
|
+
# When the pretty-print-side flag is active (set by opts_for_side in
|
|
190
|
+
# ChildComparison.compare), drop whitespace-only text nodes that start
|
|
191
|
+
# with "\n" inside :collapse elements — they are structural indentation
|
|
192
|
+
# from the pretty-printer, not content. Space-only nodes (no initial "\n") are
|
|
193
|
+
# real inline content and are kept for normalised comparison.
|
|
194
|
+
# :preserve elements are always left unchanged.
|
|
195
|
+
if match_opts[:_pretty_print_side_active]
|
|
196
|
+
ws_class = WhitespaceSensitivity.classify_text_node(node, opts)
|
|
197
|
+
return true if ws_class == :collapse && node_text(node).start_with?("\n")
|
|
198
|
+
end
|
|
190
199
|
|
|
191
200
|
false
|
|
192
201
|
end
|
|
@@ -95,6 +95,25 @@ module Canon
|
|
|
95
95
|
|
|
96
96
|
protected
|
|
97
97
|
|
|
98
|
+
# Valid match behaviors per dimension for this format.
|
|
99
|
+
# Override in subclasses to provide format-specific behaviors.
|
|
100
|
+
# Used for per-dimension validation in validate_match_options!
|
|
101
|
+
#
|
|
102
|
+
# @return [Hash{Symbol => Array<Symbol>}] Dimension to valid behaviors mapping
|
|
103
|
+
def dimension_behaviors
|
|
104
|
+
# Default: XML/HTML behaviors (override in JSON/YAML resolvers)
|
|
105
|
+
{
|
|
106
|
+
text_content: %i[strict normalize ignore].freeze,
|
|
107
|
+
structural_whitespace: %i[strict normalize ignore].freeze,
|
|
108
|
+
attribute_presence: %i[strict ignore].freeze,
|
|
109
|
+
attribute_order: %i[strict ignore].freeze,
|
|
110
|
+
attribute_values: %i[strict strip compact normalize
|
|
111
|
+
ignore].freeze,
|
|
112
|
+
element_position: %i[strict ignore].freeze,
|
|
113
|
+
comments: %i[strict ignore].freeze,
|
|
114
|
+
}
|
|
115
|
+
end
|
|
116
|
+
|
|
98
117
|
# Validate preprocessing option
|
|
99
118
|
#
|
|
100
119
|
# @param preprocessing [Symbol] Preprocessing option
|
|
@@ -107,7 +126,7 @@ module Canon
|
|
|
107
126
|
end
|
|
108
127
|
end
|
|
109
128
|
|
|
110
|
-
# Validate match options
|
|
129
|
+
# Validate match options using per-dimension behavior validation
|
|
111
130
|
#
|
|
112
131
|
# @param match_options [Hash] Options to validate
|
|
113
132
|
# @raise [Canon::Error] If invalid dimension or behavior
|
|
@@ -121,11 +140,12 @@ module Canon
|
|
|
121
140
|
hash_matching
|
|
122
141
|
similarity_matching
|
|
123
142
|
propagation
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
whitespace_insensitive_elements
|
|
143
|
+
preserve_whitespace_elements
|
|
144
|
+
collapse_whitespace_elements
|
|
145
|
+
strip_whitespace_elements
|
|
128
146
|
respect_xml_space
|
|
147
|
+
pretty_printed_expected
|
|
148
|
+
pretty_printed_received
|
|
129
149
|
]
|
|
130
150
|
|
|
131
151
|
match_options.each do |dimension, behavior|
|
|
@@ -138,10 +158,12 @@ module Canon
|
|
|
138
158
|
"Valid dimensions: #{match_dimensions.join(', ')}"
|
|
139
159
|
end
|
|
140
160
|
|
|
141
|
-
|
|
161
|
+
# Per-dimension behavior validation using overridable method
|
|
162
|
+
valid_behaviors = dimension_behaviors[dimension]
|
|
163
|
+
unless valid_behaviors&.include?(behavior)
|
|
142
164
|
raise Canon::Error,
|
|
143
165
|
"Unknown match behavior: #{behavior} for #{dimension}. " \
|
|
144
|
-
"Valid behaviors: #{
|
|
166
|
+
"Valid behaviors for #{dimension}: #{valid_behaviors&.join(', ')}"
|
|
145
167
|
end
|
|
146
168
|
end
|
|
147
169
|
end
|
|
@@ -75,6 +75,15 @@ module Canon
|
|
|
75
75
|
end
|
|
76
76
|
MATCH_PROFILES[profile].dup
|
|
77
77
|
end
|
|
78
|
+
|
|
79
|
+
# JSON-specific dimension behaviors
|
|
80
|
+
def dimension_behaviors
|
|
81
|
+
{
|
|
82
|
+
text_content: %i[strict normalize ignore].freeze,
|
|
83
|
+
structural_whitespace: %i[strict normalize ignore].freeze,
|
|
84
|
+
key_order: %i[strict ignore].freeze,
|
|
85
|
+
}
|
|
86
|
+
end
|
|
78
87
|
end
|
|
79
88
|
end
|
|
80
89
|
end
|
|
@@ -12,7 +12,7 @@ module Canon
|
|
|
12
12
|
# Sensitive elements (preserve structural whitespace):
|
|
13
13
|
# - XML: none by default — all structural whitespace stripped
|
|
14
14
|
# - HTML: pre, code, textarea, script, style by default
|
|
15
|
-
# Use
|
|
15
|
+
# Use preserve_whitespace_elements option to add elements that preserve whitespace.
|
|
16
16
|
#
|
|
17
17
|
FORMAT_DEFAULTS = {
|
|
18
18
|
html: {
|
|
@@ -41,7 +41,7 @@ module Canon
|
|
|
41
41
|
MATCH_PROFILES = {
|
|
42
42
|
# Strict: Match exactly as written in source (XML default).
|
|
43
43
|
# Structural whitespace is stripped by default for XML.
|
|
44
|
-
# Use
|
|
44
|
+
# Use preserve_whitespace_elements to preserve structural whitespace in specific elements.
|
|
45
45
|
strict: {
|
|
46
46
|
preprocessing: :none,
|
|
47
47
|
text_content: :strict,
|
|
@@ -152,6 +152,20 @@ module Canon
|
|
|
152
152
|
end
|
|
153
153
|
MATCH_PROFILES[profile].dup
|
|
154
154
|
end
|
|
155
|
+
|
|
156
|
+
# XML/HTML-specific dimension behaviors
|
|
157
|
+
def dimension_behaviors
|
|
158
|
+
{
|
|
159
|
+
text_content: %i[strict normalize ignore].freeze,
|
|
160
|
+
structural_whitespace: %i[strict normalize ignore].freeze,
|
|
161
|
+
attribute_presence: %i[strict ignore].freeze,
|
|
162
|
+
attribute_order: %i[strict ignore].freeze,
|
|
163
|
+
attribute_values: %i[strict strip compact normalize
|
|
164
|
+
ignore].freeze,
|
|
165
|
+
element_position: %i[strict ignore].freeze,
|
|
166
|
+
comments: %i[strict ignore].freeze,
|
|
167
|
+
}
|
|
168
|
+
end
|
|
155
169
|
end
|
|
156
170
|
end
|
|
157
171
|
end
|
|
@@ -80,6 +80,16 @@ module Canon
|
|
|
80
80
|
end
|
|
81
81
|
MATCH_PROFILES[profile].dup
|
|
82
82
|
end
|
|
83
|
+
|
|
84
|
+
# YAML-specific dimension behaviors
|
|
85
|
+
def dimension_behaviors
|
|
86
|
+
{
|
|
87
|
+
text_content: %i[strict normalize ignore].freeze,
|
|
88
|
+
structural_whitespace: %i[strict normalize ignore].freeze,
|
|
89
|
+
key_order: %i[strict ignore].freeze,
|
|
90
|
+
comments: %i[strict ignore].freeze,
|
|
91
|
+
}
|
|
92
|
+
end
|
|
83
93
|
end
|
|
84
94
|
end
|
|
85
95
|
end
|
|
@@ -57,7 +57,10 @@ module Canon
|
|
|
57
57
|
# Preprocessing options - what to do before comparison
|
|
58
58
|
PREPROCESSING_OPTIONS = %i[none c14n normalize format rendered].freeze
|
|
59
59
|
|
|
60
|
-
# Matching behaviors (
|
|
60
|
+
# Matching behaviors (deprecated - use per-dimension validation instead)
|
|
61
|
+
# This universal constant is kept for backward compatibility but should not
|
|
62
|
+
# be used for validation. Use BaseResolver.dimension_behaviors instead.
|
|
63
|
+
# Note: :strip and :compact are only valid for attribute_values dimension.
|
|
61
64
|
MATCH_BEHAVIORS = %i[strict strip compact normalize ignore].freeze
|
|
62
65
|
|
|
63
66
|
class << self
|