canon 0.1.23 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +155 -30
  3. data/docs/INDEX.adoc +4 -0
  4. data/docs/advanced/diff-classification.adoc +3 -2
  5. data/docs/advanced/verbose-mode-architecture.adoc +23 -0
  6. data/docs/features/configuration-profiles.adoc +288 -0
  7. data/docs/features/diff-formatting/character-visualization.adoc +153 -454
  8. data/docs/features/diff-formatting/display-filtering.adoc +44 -0
  9. data/docs/features/diff-formatting/display-preprocessing.adoc +656 -0
  10. data/docs/features/diff-formatting/index.adoc +47 -0
  11. data/docs/features/diff-formatting/pretty-diff-mode.adoc +154 -0
  12. data/docs/features/environment-configuration/override-system.adoc +10 -3
  13. data/docs/features/index.adoc +9 -0
  14. data/docs/features/match-options/html-policies.adoc +3 -0
  15. data/docs/features/match-options/index.adoc +32 -42
  16. data/docs/features/match-options/pretty-printed-fixtures.adoc +270 -0
  17. data/docs/guides/choosing-configuration.adoc +22 -0
  18. data/docs/reference/environment-variables.adoc +121 -1
  19. data/docs/reference/options-across-interfaces.adoc +182 -2
  20. data/lib/canon/cli.rb +20 -0
  21. data/lib/canon/commands/diff_command.rb +7 -2
  22. data/lib/canon/commands/format_command.rb +1 -1
  23. data/lib/canon/comparison/html_comparator.rb +29 -19
  24. data/lib/canon/comparison/html_compare_profile.rb +4 -4
  25. data/lib/canon/comparison/markup_comparator.rb +12 -3
  26. data/lib/canon/comparison/match_options/base_resolver.rb +29 -7
  27. data/lib/canon/comparison/match_options/json_resolver.rb +9 -0
  28. data/lib/canon/comparison/match_options/xml_resolver.rb +16 -2
  29. data/lib/canon/comparison/match_options/yaml_resolver.rb +10 -0
  30. data/lib/canon/comparison/match_options.rb +4 -1
  31. data/lib/canon/comparison/whitespace_sensitivity.rb +189 -137
  32. data/lib/canon/comparison/xml_comparator/child_comparison.rb +21 -4
  33. data/lib/canon/comparison/xml_comparator.rb +14 -12
  34. data/lib/canon/comparison/xml_node_comparison.rb +51 -6
  35. data/lib/canon/comparison.rb +52 -9
  36. data/lib/canon/config/env_schema.rb +32 -4
  37. data/lib/canon/config/override_resolver.rb +16 -3
  38. data/lib/canon/config/profile_loader.rb +135 -0
  39. data/lib/canon/config/profiles/metanorma.yml +74 -0
  40. data/lib/canon/config/profiles/metanorma_debug.yml +8 -0
  41. data/lib/canon/config/type_converter.rb +8 -0
  42. data/lib/canon/config.rb +469 -5
  43. data/lib/canon/diff/diff_classifier.rb +41 -11
  44. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +48 -17
  45. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +58 -0
  46. data/lib/canon/diff_formatter/diff_detail_formatter.rb +73 -17
  47. data/lib/canon/diff_formatter.rb +493 -36
  48. data/lib/canon/pretty_printer/xml_normalized.rb +395 -0
  49. data/lib/canon/rspec_matchers.rb +36 -0
  50. data/lib/canon/version.rb +1 -1
  51. data/lib/canon/xml/nodes/namespace_node.rb +4 -0
  52. data/lib/canon/xml/nodes/processing_instruction_node.rb +4 -0
  53. data/lib/canon/xml/nodes/root_node.rb +4 -0
  54. data/lib/canon/xml/nodes/text_node.rb +4 -0
  55. data/lib/tasks/performance_helpers.rb +2 -2
  56. metadata +24 -2
@@ -26,8 +26,8 @@ module Canon
26
26
  check_file_size(file2, format2)
27
27
 
28
28
  # Read raw content for potential by-line diff
29
- content1 = File.read(file1)
30
- content2 = File.read(file2)
29
+ content1 = File.read(file1, encoding: "utf-8")
30
+ content2 = File.read(file2, encoding: "utf-8")
31
31
 
32
32
  # Parse documents
33
33
  doc1 = parse_document_content(content1, format1)
@@ -56,6 +56,11 @@ module Canon
56
56
  show_diffs: @options[:show_diffs]&.to_sym || :all,
57
57
  show_raw_inputs: @options[:show_raw_inputs] || false,
58
58
  show_preprocessed_inputs: @options[:show_preprocessed_inputs] || false,
59
+ show_preprocessed_expected: @options[:show_preprocessed_expected] || false,
60
+ show_preprocessed_received: @options[:show_preprocessed_received] || false,
61
+ show_prettyprint_inputs: @options[:show_prettyprint_inputs] || false,
62
+ show_prettyprint_expected: @options[:show_prettyprint_expected] || false,
63
+ show_prettyprint_received: @options[:show_prettyprint_received] || false,
59
64
  show_line_numbered_inputs: @options[:show_line_numbered_inputs] || false,
60
65
  )
61
66
 
@@ -15,7 +15,7 @@ module Canon
15
15
  # rubocop:disable Metrics/MethodLength
16
16
  def run(input_file)
17
17
  # Read input file
18
- content = File.read(input_file)
18
+ content = File.read(input_file, encoding: "utf-8")
19
19
 
20
20
  # Detect or use specified format
21
21
  format = detect_format(input_file)
@@ -60,10 +60,14 @@ module Canon
60
60
  def equivalent?(html1, html2, opts = {}, child_opts = {})
61
61
  opts = DEFAULT_OPTS.merge(opts)
62
62
 
63
- # Capture original HTML strings BEFORE any parsing/transformation
64
- # These are used for display to preserve original formatting
65
- original_str1 = extract_original_string(html1)
66
- original_str2 = extract_original_string(html2)
63
+ # Capture original HTML strings for display.
64
+ # Prefer the true originals preserved by dom_diff (before
65
+ # HtmlParser.parse mutated the DOM), falling back to
66
+ # extract_original_string for callers that bypass dom_diff.
67
+ original_str1 = opts.delete(:_original_str1) ||
68
+ extract_original_string(html1)
69
+ original_str2 = opts.delete(:_original_str2) ||
70
+ extract_original_string(html2)
67
71
 
68
72
  # Resolve match options with format-specific defaults
69
73
  match_opts_hash = MatchOptions::Xml.resolve(
@@ -217,10 +221,11 @@ module Canon
217
221
  # @param match_opts_hash [Hash] Resolved match options
218
222
  # @return [Boolean, ComparisonResult] Result of tree diff comparison
219
223
  def perform_semantic_tree_diff(html1, html2, opts, match_opts_hash)
220
- # Capture original HTML strings BEFORE any parsing/transformation
221
- # These are used for display to preserve original formatting
222
- original_str1 = extract_original_string(html1)
223
- original_str2 = extract_original_string(html2)
224
+ # Capture original HTML strings for display (see equivalent? for details).
225
+ original_str1 = opts.delete(:_original_str1) ||
226
+ extract_original_string(html1)
227
+ original_str2 = opts.delete(:_original_str2) ||
228
+ extract_original_string(html2)
224
229
 
225
230
  # Parse to Canon::Xml::Node (preserves preprocessing)
226
231
  # For HTML, we parse as XML to get Canon::Xml::Node structure
@@ -388,12 +393,17 @@ module Canon
388
393
  end
389
394
  end
390
395
 
391
- # For :rendered preprocessing with Nokogiri nodes
392
- if preprocessing == :rendered
393
- # Normalize and return
396
+ # For preprocessing modes that require whitespace filtering,
397
+ # apply the same post-parsing normalization used for string inputs.
398
+ # This is needed because dom_diff() pre-parses HTML5 strings into
399
+ # Nokogiri fragments before calling HtmlComparator, bypassing the
400
+ # string-input path where these filters are normally applied.
401
+ if %i[normalize format rendered].include?(preprocessing)
394
402
  frag = node.is_a?(Nokogiri::XML::DocumentFragment) ? node : Nokogiri::XML.fragment(node.to_html)
395
403
  normalize_html_style_script_comments(frag)
396
- normalize_rendered_whitespace(frag, match_opts)
404
+ if preprocessing == :rendered
405
+ normalize_rendered_whitespace(frag, match_opts)
406
+ end
397
407
  remove_whitespace_only_text_nodes(frag)
398
408
  return frag
399
409
  end
@@ -628,22 +638,22 @@ compare_profile = nil)
628
638
  return if match_opts[:text_content] == :strict
629
639
 
630
640
  # Elements where whitespace is significant - don't normalize
631
- # SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
641
+ # SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_preserve_elements
632
642
  # This ensures consistency between preprocessing and comparison logic
633
- # SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
643
+ # SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_preserve_elements
634
644
  # This ensures consistency between preprocessing and comparison logic
635
645
  preserve_whitespace = if compare_profile.is_a?(HtmlCompareProfile)
636
646
  # Profile handles HTML-specific whitespace rules
637
647
  # Get default list and filter by profile
638
648
  WhitespaceSensitivity
639
- .format_default_sensitive_elements(match_opts)
649
+ .format_default_preserve_elements(match_opts)
640
650
  .select do |elem|
641
651
  compare_profile.preserve_whitespace?(elem.to_s)
642
652
  end
643
653
  .map(&:to_s)
644
654
  else
645
655
  # Use default list from WhitespaceSensitivity (single source of truth)
646
- WhitespaceSensitivity.format_default_sensitive_elements(match_opts).map(&:to_s)
656
+ WhitespaceSensitivity.format_default_preserve_elements(match_opts).map(&:to_s)
647
657
  end
648
658
 
649
659
  # Walk all text nodes
@@ -700,11 +710,11 @@ compare_profile = nil)
700
710
  # CRITICAL: Do NOT remove whitespace-only text nodes from whitespace-sensitive
701
711
  # elements like <pre>, <code>, <textarea>, <script>, <style>
702
712
  #
703
- # SINGLE SOURCE OF TRUTH: Uses WhitespaceSensitivity.format_default_sensitive_elements
713
+ # SINGLE SOURCE OF TRUTH: Uses WhitespaceSensitivity.format_default_preserve_elements
704
714
  def remove_whitespace_only_text_nodes(doc)
705
715
  # Elements where whitespace is significant - don't remove whitespace-only nodes
706
- # SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_sensitive_elements
707
- preserve_whitespace = WhitespaceSensitivity.format_default_sensitive_elements(format: :html).map(&:to_s)
716
+ # SINGLE SOURCE OF TRUTH: WhitespaceSensitivity.format_default_preserve_elements
717
+ preserve_whitespace = WhitespaceSensitivity.format_default_preserve_elements(format: :html).map(&:to_s)
708
718
 
709
719
  doc.xpath(".//text()").each do |text_node|
710
720
  # CRITICAL: Skip if this text node is inside a whitespace-preserving element
@@ -69,7 +69,7 @@ module Canon
69
69
  # @param element_name [String] The element name to check
70
70
  # @return [Boolean] true if whitespace should be preserved
71
71
  def preserve_whitespace?(element_name)
72
- whitespace_sensitive_elements.include?(element_name.to_s.downcase)
72
+ html_preserve_elements.include?(element_name.to_s.downcase)
73
73
  end
74
74
 
75
75
  # Check if element names should be compared case-sensitively
@@ -85,12 +85,12 @@ module Canon
85
85
 
86
86
  # Elements where whitespace is semantically significant in HTML
87
87
  #
88
- # SINGLE SOURCE OF TRUTH: Delegates to WhitespaceSensitivity.format_default_sensitive_elements
88
+ # SINGLE SOURCE OF TRUTH: Delegates to WhitespaceSensitivity.format_default_preserve_elements
89
89
  # This ensures consistency across the codebase.
90
90
  #
91
91
  # @return [Array<String>] List of element names (as strings)
92
- def whitespace_sensitive_elements
93
- WhitespaceSensitivity.format_default_sensitive_elements(format: @html_version).map(&:to_s)
92
+ def html_preserve_elements
93
+ WhitespaceSensitivity.format_default_preserve_elements(format: @html_version).map(&:to_s)
94
94
  end
95
95
 
96
96
  # Check if a dimension is explicitly set to :strict
@@ -177,8 +177,8 @@ module Canon
177
177
  end
178
178
 
179
179
  # Strip whitespace-only text nodes based on parent element configuration.
180
- # Use sensitive_elements / insensitive_elements to control.
181
- # Blacklist (insensitive) > whitelist (sensitive) > format defaults.
180
+ # Use preserve_whitespace_elements / strip_whitespace_elements to control.
181
+ # Blacklist (strip) > preserve > collapse > format defaults.
182
182
  return false unless text_node?(node) && node.parent
183
183
  return false unless MatchOptions.normalize_text(node_text(node)).empty?
184
184
 
@@ -186,7 +186,16 @@ module Canon
186
186
  node.parent, match_opts
187
187
  )
188
188
 
189
- false
189
+ # When the pretty-print-side flag is active (set by opts_for_side in
190
+ # ChildComparison.compare), drop whitespace-only text nodes that start
191
+ # with "\n" inside :collapse elements — they are structural indentation
192
+ # from the pretty-printer, not content. Space-only nodes (no initial "\n") are
193
+ # real inline content and are kept for normalised comparison.
194
+ # :preserve elements are always left unchanged.
195
+ if match_opts[:_pretty_print_side_active]
196
+ ws_class = WhitespaceSensitivity.classify_text_node(node, opts)
197
+ return true if ws_class == :collapse && node_text(node).start_with?("\n")
198
+ end
190
199
 
191
200
  false
192
201
  end
@@ -95,6 +95,25 @@ module Canon
95
95
 
96
96
  protected
97
97
 
98
+ # Valid match behaviors per dimension for this format.
99
+ # Override in subclasses to provide format-specific behaviors.
100
+ # Used for per-dimension validation in validate_match_options!
101
+ #
102
+ # @return [Hash{Symbol => Array<Symbol>}] Dimension to valid behaviors mapping
103
+ def dimension_behaviors
104
+ # Default: XML/HTML behaviors (override in JSON/YAML resolvers)
105
+ {
106
+ text_content: %i[strict normalize ignore].freeze,
107
+ structural_whitespace: %i[strict normalize ignore].freeze,
108
+ attribute_presence: %i[strict ignore].freeze,
109
+ attribute_order: %i[strict ignore].freeze,
110
+ attribute_values: %i[strict strip compact normalize
111
+ ignore].freeze,
112
+ element_position: %i[strict ignore].freeze,
113
+ comments: %i[strict ignore].freeze,
114
+ }
115
+ end
116
+
98
117
  # Validate preprocessing option
99
118
  #
100
119
  # @param preprocessing [Symbol] Preprocessing option
@@ -107,7 +126,7 @@ module Canon
107
126
  end
108
127
  end
109
128
 
110
- # Validate match options
129
+ # Validate match options using per-dimension behavior validation
111
130
  #
112
131
  # @param match_options [Hash] Options to validate
113
132
  # @raise [Canon::Error] If invalid dimension or behavior
@@ -121,11 +140,12 @@ module Canon
121
140
  hash_matching
122
141
  similarity_matching
123
142
  propagation
124
- sensitive_elements
125
- insensitive_elements
126
- whitespace_sensitive_elements
127
- whitespace_insensitive_elements
143
+ preserve_whitespace_elements
144
+ collapse_whitespace_elements
145
+ strip_whitespace_elements
128
146
  respect_xml_space
147
+ pretty_printed_expected
148
+ pretty_printed_received
129
149
  ]
130
150
 
131
151
  match_options.each do |dimension, behavior|
@@ -138,10 +158,12 @@ module Canon
138
158
  "Valid dimensions: #{match_dimensions.join(', ')}"
139
159
  end
140
160
 
141
- unless MatchOptions::MATCH_BEHAVIORS.include?(behavior)
161
+ # Per-dimension behavior validation using overridable method
162
+ valid_behaviors = dimension_behaviors[dimension]
163
+ unless valid_behaviors&.include?(behavior)
142
164
  raise Canon::Error,
143
165
  "Unknown match behavior: #{behavior} for #{dimension}. " \
144
- "Valid behaviors: #{MatchOptions::MATCH_BEHAVIORS.join(', ')}"
166
+ "Valid behaviors for #{dimension}: #{valid_behaviors&.join(', ')}"
145
167
  end
146
168
  end
147
169
  end
@@ -75,6 +75,15 @@ module Canon
75
75
  end
76
76
  MATCH_PROFILES[profile].dup
77
77
  end
78
+
79
+ # JSON-specific dimension behaviors
80
+ def dimension_behaviors
81
+ {
82
+ text_content: %i[strict normalize ignore].freeze,
83
+ structural_whitespace: %i[strict normalize ignore].freeze,
84
+ key_order: %i[strict ignore].freeze,
85
+ }
86
+ end
78
87
  end
79
88
  end
80
89
  end
@@ -12,7 +12,7 @@ module Canon
12
12
  # Sensitive elements (preserve structural whitespace):
13
13
  # - XML: none by default — all structural whitespace stripped
14
14
  # - HTML: pre, code, textarea, script, style by default
15
- # Use sensitive_elements option to add elements that preserve whitespace.
15
+ # Use preserve_whitespace_elements option to add elements that preserve whitespace.
16
16
  #
17
17
  FORMAT_DEFAULTS = {
18
18
  html: {
@@ -41,7 +41,7 @@ module Canon
41
41
  MATCH_PROFILES = {
42
42
  # Strict: Match exactly as written in source (XML default).
43
43
  # Structural whitespace is stripped by default for XML.
44
- # Use sensitive_elements to preserve structural whitespace in specific elements.
44
+ # Use preserve_whitespace_elements to preserve structural whitespace in specific elements.
45
45
  strict: {
46
46
  preprocessing: :none,
47
47
  text_content: :strict,
@@ -152,6 +152,20 @@ module Canon
152
152
  end
153
153
  MATCH_PROFILES[profile].dup
154
154
  end
155
+
156
+ # XML/HTML-specific dimension behaviors
157
+ def dimension_behaviors
158
+ {
159
+ text_content: %i[strict normalize ignore].freeze,
160
+ structural_whitespace: %i[strict normalize ignore].freeze,
161
+ attribute_presence: %i[strict ignore].freeze,
162
+ attribute_order: %i[strict ignore].freeze,
163
+ attribute_values: %i[strict strip compact normalize
164
+ ignore].freeze,
165
+ element_position: %i[strict ignore].freeze,
166
+ comments: %i[strict ignore].freeze,
167
+ }
168
+ end
155
169
  end
156
170
  end
157
171
  end
@@ -80,6 +80,16 @@ module Canon
80
80
  end
81
81
  MATCH_PROFILES[profile].dup
82
82
  end
83
+
84
+ # YAML-specific dimension behaviors
85
+ def dimension_behaviors
86
+ {
87
+ text_content: %i[strict normalize ignore].freeze,
88
+ structural_whitespace: %i[strict normalize ignore].freeze,
89
+ key_order: %i[strict ignore].freeze,
90
+ comments: %i[strict ignore].freeze,
91
+ }
92
+ end
83
93
  end
84
94
  end
85
95
  end
@@ -57,7 +57,10 @@ module Canon
57
57
  # Preprocessing options - what to do before comparison
58
58
  PREPROCESSING_OPTIONS = %i[none c14n normalize format rendered].freeze
59
59
 
60
- # Matching behaviors (mutually exclusive)
60
+ # Matching behaviors (deprecated - use per-dimension validation instead)
61
+ # This universal constant is kept for backward compatibility but should not
62
+ # be used for validation. Use BaseResolver.dimension_behaviors instead.
63
+ # Note: :strip and :compact are only valid for attribute_values dimension.
61
64
  MATCH_BEHAVIORS = %i[strict strip compact normalize ignore].freeze
62
65
 
63
66
  class << self