canon 0.1.22 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +174 -25
  3. data/docs/INDEX.adoc +4 -0
  4. data/docs/advanced/diff-classification.adoc +3 -2
  5. data/docs/features/configuration-profiles.adoc +288 -0
  6. data/docs/features/diff-formatting/character-visualization.adoc +153 -454
  7. data/docs/features/diff-formatting/display-filtering.adoc +44 -0
  8. data/docs/features/diff-formatting/display-preprocessing.adoc +656 -0
  9. data/docs/features/diff-formatting/index.adoc +47 -0
  10. data/docs/features/diff-formatting/pretty-diff-mode.adoc +154 -0
  11. data/docs/features/environment-configuration/override-system.adoc +10 -3
  12. data/docs/features/index.adoc +9 -0
  13. data/docs/features/match-options/index.adoc +32 -42
  14. data/docs/features/match-options/pretty-printed-fixtures.adoc +270 -0
  15. data/docs/guides/choosing-configuration.adoc +22 -0
  16. data/docs/reference/environment-variables.adoc +121 -1
  17. data/docs/reference/options-across-interfaces.adoc +182 -2
  18. data/lib/canon/cli.rb +20 -0
  19. data/lib/canon/commands/diff_command.rb +7 -2
  20. data/lib/canon/commands/format_command.rb +1 -1
  21. data/lib/canon/comparison/html_comparator.rb +20 -15
  22. data/lib/canon/comparison/html_compare_profile.rb +4 -4
  23. data/lib/canon/comparison/markup_comparator.rb +12 -3
  24. data/lib/canon/comparison/match_options/base_resolver.rb +29 -7
  25. data/lib/canon/comparison/match_options/json_resolver.rb +9 -0
  26. data/lib/canon/comparison/match_options/xml_resolver.rb +16 -2
  27. data/lib/canon/comparison/match_options/yaml_resolver.rb +10 -0
  28. data/lib/canon/comparison/match_options.rb +4 -1
  29. data/lib/canon/comparison/whitespace_sensitivity.rb +189 -137
  30. data/lib/canon/comparison/xml_comparator/child_comparison.rb +21 -4
  31. data/lib/canon/comparison/xml_comparator.rb +14 -12
  32. data/lib/canon/comparison/xml_node_comparison.rb +51 -6
  33. data/lib/canon/comparison.rb +52 -9
  34. data/lib/canon/config/env_schema.rb +32 -4
  35. data/lib/canon/config/override_resolver.rb +16 -3
  36. data/lib/canon/config/profile_loader.rb +135 -0
  37. data/lib/canon/config/profiles/metanorma.yml +74 -0
  38. data/lib/canon/config/profiles/metanorma_debug.yml +8 -0
  39. data/lib/canon/config/type_converter.rb +8 -0
  40. data/lib/canon/config.rb +469 -5
  41. data/lib/canon/diff/diff_classifier.rb +41 -11
  42. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +48 -17
  43. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +58 -0
  44. data/lib/canon/diff_formatter/diff_detail_formatter.rb +22 -7
  45. data/lib/canon/diff_formatter/theme.rb +24 -17
  46. data/lib/canon/diff_formatter.rb +493 -36
  47. data/lib/canon/pretty_printer/xml_normalized.rb +395 -0
  48. data/lib/canon/rspec_matchers.rb +36 -0
  49. data/lib/canon/tree_diff/matchers/hash_matcher.rb +26 -11
  50. data/lib/canon/version.rb +1 -1
  51. data/lib/canon/xml/nodes/namespace_node.rb +4 -0
  52. data/lib/canon/xml/nodes/processing_instruction_node.rb +4 -0
  53. data/lib/canon/xml/nodes/root_node.rb +4 -0
  54. data/lib/canon/xml/nodes/text_node.rb +4 -0
  55. data/lib/tasks/performance_helpers.rb +2 -2
  56. metadata +24 -2
@@ -273,8 +273,10 @@ module Canon
273
273
  text_content structural_whitespace attribute_presence
274
274
  attribute_order attribute_values element_position
275
275
  comments format similarity_threshold hash_matching
276
- similarity_matching propagation sensitive_elements
277
- whitespace_sensitive_elements respect_xml_space]
276
+ similarity_matching propagation
277
+ preserve_whitespace_elements
278
+ collapse_whitespace_elements
279
+ strip_whitespace_elements respect_xml_space]
278
280
  match_options_only = match_opts_hash.slice(*match_only_keys)
279
281
 
280
282
  # Convert operations to DiffNodes for unified pipeline
@@ -596,10 +598,30 @@ module Canon
596
598
  format1 = format2 = opts[:format]
597
599
  # Parse HTML strings if format is html/html4/html5
598
600
  if %i[html html4 html5].include?(opts[:format])
599
- obj1 = HtmlParser.parse(obj1, opts[:format]) if obj1.is_a?(String)
600
- obj2 = HtmlParser.parse(obj2, opts[:format]) if obj2.is_a?(String)
601
- # Note: We preserve html4/html5 format instead of normalizing to :html
602
- # This allows HtmlComparator to use the correct parsing behavior
601
+ # Preserve original strings for display (HTML fragment
602
+ # parsers can mutate the DOM).
603
+ opts[:_original_str1] = obj1.dup if obj1.is_a?(String)
604
+ opts[:_original_str2] = obj2.dup if obj2.is_a?(String)
605
+ if opts[:format] == :html5
606
+ # HTML5 fragment parsing is safe — it normalizes without
607
+ # destructive content-model mutations.
608
+ obj1 = HtmlParser.parse(obj1, :html5) if obj1.is_a?(String)
609
+ obj2 = HtmlParser.parse(obj2, :html5) if obj2.is_a?(String)
610
+ else
611
+ # HTML4 fragment parsing mutates the DOM (strips <body>
612
+ # attributes, re-parents <h1> content, etc.). Use XML
613
+ # fragment parsing which preserves structure faithfully.
614
+ if obj1.is_a?(String)
615
+ obj1 = Nokogiri::XML.fragment(
616
+ strip_xml_preamble(obj1),
617
+ )
618
+ end
619
+ if obj2.is_a?(String)
620
+ obj2 = Nokogiri::XML.fragment(
621
+ strip_xml_preamble(obj2),
622
+ )
623
+ end
624
+ end
603
625
  end
604
626
  else
605
627
  format1 = FormatDetector.detect(obj1)
@@ -638,12 +660,21 @@ module Canon
638
660
 
639
661
  # get match_profile if it is not defined in options
640
662
  # but defined in config
641
- if opts[:match_profile].nil? &&
642
- Canon::Config.instance.respond_to?(comparison_format)
663
+ if Canon::Config.instance.respond_to?(comparison_format)
643
664
  format_config = Canon::Config.instance.public_send(comparison_format)
644
- if format_config.match.profile
665
+ if opts[:match_profile].nil? && format_config.match.profile
645
666
  opts[:match_profile] = format_config.match.profile
646
667
  end
668
+ # Pass YAML profile's extra match options (e.g., preserve_whitespace_elements)
669
+ # that are stored in MatchConfig's resolver but not exposed via the
670
+ # built-in MATCH_PROFILES system. These supplement the built-in profile.
671
+ profile_opts = format_config.match.profile_options
672
+ if profile_opts.any? && opts[:global_options].nil?
673
+ opts[:global_options] = profile_opts
674
+ elsif profile_opts.any?
675
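+ # Merge with global_options that are already set (e.g., per-call). NOTE: Hash#merge lets profile_opts win on key conflicts; use profile_opts.merge(opts[:global_options]) if per-call options are meant to take precedence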
676
+ opts[:global_options] = opts[:global_options].merge(profile_opts)
677
+ end
647
678
  end
648
679
 
649
680
  case comparison_format
@@ -658,6 +689,18 @@ module Canon
658
689
  end
659
690
  end
660
691
 
692
# Strip a leading XML declaration and the first DOCTYPE declaration from an
# HTML string so it can be parsed with Nokogiri::XML.fragment without
# producing processing-instruction nodes.
#
# The DOCTYPE match is bracket-aware: an internal subset such as
# `<!DOCTYPE r [ <!ENTITY a "b"> ]>` is removed as a whole instead of being
# cut at the first `>` found inside the subset. If the bracket-aware match
# fails (e.g. an unterminated `[`), the declaration is cut at the first `>`.
#
# @param str [String] markup that may start with an XML/DOCTYPE preamble
# @return [String] the markup without the preamble; when a DOCTYPE was
#   removed the result is also stripped of surrounding whitespace, otherwise
#   the string is returned with only the XML declaration removed
def strip_xml_preamble(str)
  str = str.sub(/\A\s*<\?xml[^?]*\?>\s*/m, "")

  doctype = str.match(/<!DOCTYPE(?:[^>\[]|\[[^\]]*\])*>/i) ||
    str.match(/<!DOCTYPE[^>]*>/i)
  return str unless doctype

  (doctype.pre_match + doctype.post_match).strip
end
703
+
661
704
  # Detect the format of an object (delegates to FormatDetector)
662
705
  #
663
706
  # @param obj [Object] Object to detect format of
@@ -15,9 +15,28 @@ module Canon
15
15
  verbose_diff: :boolean,
16
16
  algorithm: :symbol,
17
17
  show_raw_inputs: :boolean,
18
+ show_raw_expected: :boolean,
19
+ show_raw_received: :boolean,
18
20
  show_preprocessed_inputs: :boolean,
21
+ show_preprocessed_expected: :boolean,
22
+ show_preprocessed_received: :boolean,
23
+ show_prettyprint_inputs: :boolean,
24
+ show_prettyprint_expected: :boolean,
25
+ show_prettyprint_received: :boolean,
19
26
  show_line_numbered_inputs: :boolean,
27
+ character_visualization: :symbol, # true, false, :content_only
20
28
  display_format: :symbol,
29
+ display_preprocessing: :symbol, # :none, :pretty_print, :normalize_pretty_print, :c14n
30
+ pretty_printer_indent: :integer,
31
+ pretty_printer_indent_type: :symbol, # :space or :tab
32
+ preserve_whitespace_elements: :string_array,
33
+ collapse_whitespace_elements: :string_array,
34
+ strip_whitespace_elements: :string_array,
35
+ pretty_printed_expected: :boolean,
36
+ pretty_printed_received: :boolean,
37
+ pretty_printer_sort_attributes: :boolean,
38
+ compact_semantic_report: :boolean,
39
+ expand_difference: :boolean,
21
40
  theme: :symbol,
22
41
 
23
42
  # MatchConfig attributes
@@ -47,13 +66,22 @@ module Canon
47
66
 
48
67
  def all_diff_attributes
49
68
  %i[mode use_color context_lines grouping_lines show_diffs
50
- verbose_diff algorithm show_raw_inputs show_preprocessed_inputs
51
- show_line_numbered_inputs display_format max_file_size max_node_count max_diff_lines
52
- theme]
69
+ verbose_diff algorithm show_raw_inputs show_raw_expected show_raw_received
70
+ show_preprocessed_inputs show_preprocessed_expected show_preprocessed_received
71
+ show_prettyprint_inputs show_prettyprint_expected show_prettyprint_received
72
+ show_line_numbered_inputs character_visualization
73
+ display_format display_preprocessing
74
+ pretty_printer_indent pretty_printer_indent_type
75
+ preserve_whitespace_elements collapse_whitespace_elements strip_whitespace_elements
76
+ pretty_printed_expected pretty_printed_received
77
+ pretty_printer_sort_attributes
78
+ compact_semantic_report expand_difference
79
+ max_file_size max_node_count max_diff_lines theme]
53
80
  end
54
81
 
55
82
  def all_match_attributes
56
- %i[profile]
83
+ %i[profile
84
+ preserve_whitespace_elements collapse_whitespace_elements strip_whitespace_elements]
57
85
  end
58
86
 
59
87
  def all_format_attributes
@@ -3,14 +3,15 @@
3
3
  module Canon
4
4
  class Config
5
5
  # Resolves configuration values using priority chain
6
- # Priority: ENV > programmatic > defaults
6
+ # Priority: ENV > programmatic > profile > defaults
7
7
  class OverrideResolver
8
- attr_reader :defaults, :programmatic, :env
8
+ attr_reader :defaults, :programmatic, :env, :profile
9
9
 
10
- def initialize(defaults: {}, programmatic: {}, env: {})
10
+ def initialize(defaults: {}, programmatic: {}, env: {}, profile: {})
11
11
  @defaults = defaults
12
12
  @programmatic = programmatic
13
13
  @env = env
14
+ @profile = profile
14
15
  end
15
16
 
16
17
  # Resolve a single value using priority chain
@@ -18,6 +19,7 @@ module Canon
18
19
  def resolve(key)
19
20
  return @env[key] if @env.key?(key)
20
21
  return @programmatic[key] if @programmatic.key?(key)
22
+ return @profile[key] if @profile.key?(key)
21
23
 
22
24
  @defaults[key]
23
25
  end
@@ -32,6 +34,16 @@ module Canon
32
34
  @env[key] = value
33
35
  end
34
36
 
37
+ # Update profile value
38
+ def set_profile(key, value)
39
+ @profile[key] = value
40
+ end
41
+
42
+ # Clear all profile values
43
+ def clear_profile!
44
+ @profile = {}
45
+ end
46
+
35
47
  # Check if value is set by ENV
36
48
  def env_set?(key)
37
49
  @env.key?(key)
@@ -46,6 +58,7 @@ module Canon
46
58
  def source_for(key)
47
59
  return :env if @env.key?(key)
48
60
  return :programmatic if @programmatic.key?(key)
61
+ return :profile if @profile.key?(key)
49
62
  return :default if @defaults.key?(key)
50
63
 
51
64
  nil
@@ -0,0 +1,135 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yaml"
4
+ require_relative "env_schema"
5
+
6
module Canon
  class Config
    # Loads configuration profiles from YAML files.
    #
    # A profile is identified either by a Symbol (a built-in profile stored in
    # the +profiles+ directory next to this file) or by a String file path.
    # A profile may name a parent through its +inherits+ key; the parent is
    # resolved first and the child is deep-merged on top of it.
    class ProfileLoader
      PROFILES_DIR = File.expand_path("profiles", __dir__).freeze

      class << self
        # Load a profile by name (Symbol for built-in) or file path (String).
        #
        # Resolved profiles are cached per name / expanded path. Each call
        # returns a fresh deep copy, so mutating the result never alters the
        # cached profile seen by later callers.
        #
        # @param name_or_path [Symbol, String] built-in name or file path
        # @return [Hash] merged profile data with inheritance resolved and
        #   the +inherits+ key removed
        # @raise [Canon::Error] if the profile (or an inherited one) cannot
        #   be found, is not a YAML mapping, or the inheritance chain loops
        def load(name_or_path)
          key = cache_key(name_or_path)
          cache[key] ||= resolve(name_or_path, [])
          deep_dup(cache[key])
        end

        # List available built-in profile names.
        #
        # @return [Array<Symbol>] sorted names of the +*.yml+ files found in
        #   PROFILES_DIR, or an empty array when that directory is missing
        def available_profiles
          return [] unless Dir.exist?(PROFILES_DIR)

          Dir.glob(File.join(PROFILES_DIR, "*.yml")).map do |path|
            File.basename(path, ".yml").to_sym
          end.sort
        end

        # Drop every cached profile so the next +load+ re-reads from disk.
        def reset_cache!
          @cache = nil
        end

        private

        def cache
          @cache ||= {}
        end

        # Symbols are cached as-is; anything else is cached under its
        # expanded path so different spellings of one file share an entry.
        def cache_key(name_or_path)
          if name_or_path.is_a?(Symbol)
            name_or_path
          else
            File.expand_path(name_or_path.to_s)
          end
        end

        # Resolve a profile, following the inheritance chain.
        # +seen+ tracks visited profile paths for cycle detection.
        def resolve(name_or_path, seen)
          resolve_from_path(resolve_path(name_or_path), seen)
        end

        def resolve_from_path(path, seen)
          canonical = File.expand_path(path)

          if seen.include?(canonical)
            chain = (seen + [canonical]).map do |visited|
              File.basename(visited, ".yml")
            end
            raise Canon::Error,
                  "Profile inheritance cycle detected: #{chain.join(' -> ')}"
          end

          data = load_yaml(canonical)
          parent_ref = data.delete("inherits")
          return data unless parent_ref

          # Relative parents may live next to the inheriting profile.
          parent_path = resolve_inherits_path(parent_ref,
                                              File.dirname(canonical))
          parent = resolve_from_path(parent_path, seen + [canonical])
          deep_merge(parent, data)
        end

        # Determine the YAML file path from a name or path value.
        # Symbols are looked up as built-in profiles; anything else is
        # treated as a file path.
        def resolve_path(name_or_path)
          if name_or_path.is_a?(Symbol)
            path = File.join(PROFILES_DIR, "#{name_or_path}.yml")
            unless File.exist?(path)
              available = available_profiles.join(", ")
              raise Canon::Error,
                    "Unknown config profile: #{name_or_path}. Available: #{available}"
            end

            path
          else
            expanded = File.expand_path(name_or_path.to_s)
            unless File.exist?(expanded)
              raise Canon::Error, "Profile file not found: #{expanded}"
            end

            expanded
          end
        end

        # Resolve an +inherits+ value from YAML.
        #
        # Lookup order: built-in profile name, then a path relative to the
        # current working directory, then (when +base_dir+ is given) a path
        # relative to the directory of the inheriting profile. The value is
        # converted with +to_s+ because Symbols are permitted in the YAML.
        def resolve_inherits_path(value, base_dir = nil)
          name = value.to_s
          builtin = File.join(PROFILES_DIR, "#{name}.yml")
          return builtin if File.exist?(builtin)

          candidates = [File.expand_path(name)]
          candidates << File.expand_path(name, base_dir) if base_dir
          found = candidates.find { |candidate| File.exist?(candidate) }
          return found if found

          raise Canon::Error, "Inherited profile not found: #{value}"
        end

        # Parse a profile file. An empty document yields an empty Hash; any
        # other non-mapping document is rejected here so later key lookups
        # do not fail with an unrelated error.
        def load_yaml(path)
          data = YAML.safe_load(File.read(path),
                                permitted_classes: [Symbol]) || {}
          unless data.is_a?(Hash)
            raise Canon::Error, "Profile must be a YAML mapping: #{path}"
          end

          data
        end

        # Deep merge two hashes. Arrays are replaced (not concatenated).
        def deep_merge(base, overlay)
          result = base.dup
          overlay.each do |key, value|
            result[key] = if result[key].is_a?(Hash) && value.is_a?(Hash)
                            deep_merge(result[key], value)
                          else
                            value
                          end
          end
          result
        end

        # Recursively copy Hashes, Arrays and Strings; other values
        # (Symbols, numbers, booleans, nil) are returned as-is.
        def deep_dup(value)
          case value
          when Hash
            value.each_with_object({}) do |(key, item), copy|
              copy[key] = deep_dup(item)
            end
          when Array then value.map { |item| deep_dup(item) }
          when String then value.dup
          else value
          end
        end
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ ---
2
+ name: metanorma
3
+ description: Standard Metanorma spec configuration for XML/HTML comparison
4
+
5
+ shared:
6
+ preprocessing: format
7
+ match:
8
+ profile: spec_friendly
9
+ diff:
10
+ show_diffs: normative
11
+ verbose_diff: false
12
+ show_line_numbered_inputs: false
13
+ show_raw_inputs: false
14
+ show_raw_expected: false
15
+ show_raw_received: false
16
+ show_prettyprint_expected: false
17
+ context_lines: 5
18
+ mode: pretty_diff
19
+ algorithm: dom
20
+ display_format: canonical
21
+ display_preprocessing: normalize_pretty_print
22
+ compact_semantic_report: true
23
+ pretty_printed_expected: true
24
+ expand_difference: true
25
+ pretty_printer_sort_attributes: true
26
+
27
+ formats:
28
+ xml:
29
+ match:
30
+ profile: spec_friendly
31
+ # Elements where whitespace is PRESERVED exactly (no manipulation)
32
+ # All whitespace characters are significant in these elements
33
+ preserve_whitespace_elements:
34
+ - body
35
+ - passthrough
36
+ # Elements where whitespace is COLLAPSED (HTML-style behavior)
37
+ # Multiple whitespace chars collapse to single space; boundaries preserved
38
+ collapse_whitespace_elements:
39
+ - p
40
+ - title
41
+ - name
42
+ - td
43
+ - th
44
+ - dt
45
+ - form
46
+ - floating-title
47
+ - variant-title
48
+ - field-of-application
49
+ - usage-info
50
+ - pronunciation
51
+ - domain
52
+ - subject
53
+ - fmt-title
54
+ - fmt-name
55
+ - semx
56
+ - fmt-identifier
57
+ - fmt-xref-label
58
+ - fmt-definition
59
+ - fmt-fn-label
60
+ - fmt-sourcecode
61
+ - fmt-preferred
62
+ - fmt-admitted
63
+ - fmt-deprecates
64
+ - note
65
+ - abstract
66
+ - formattedref
67
+ - description
68
+ - identifier
69
+ # Elements where whitespace-only nodes are STRIPPED
70
+ # Default: none (elements not in any list use xml:space or format defaults)
71
+ strip_whitespace_elements: []
72
+ html:
73
+ diff:
74
+ show_raw_inputs: true
@@ -0,0 +1,8 @@
1
+ ---
2
+ name: metanorma_debug
3
+ description: Metanorma profile with debug output enabled
4
+ inherits: metanorma
5
+
6
+ shared:
7
+ diff:
8
+ show_prettyprint_received: true
@@ -28,6 +28,8 @@ module Canon
28
28
  convert_symbol(value)
29
29
  when :string
30
30
  value
31
+ when :string_array
32
+ convert_string_array(value)
31
33
  else
32
34
  value
33
35
  end
@@ -53,6 +55,12 @@ module Canon
53
55
  def convert_symbol(value)
54
56
  value.to_sym
55
57
  end
58
+
59
# Parse a list of element names.
#
# A String is split on commas (the ENV form), e.g.
# "p,li,td,th" → ["p", "li", "td", "th"]. An Array is also accepted, and
# each of its entries may itself be comma-separated; nil yields an empty
# list. Names are stripped of surrounding whitespace and blank names are
# dropped.
#
# @param value [String, Array, nil] raw value to convert
# @return [Array<String>] cleaned element names
def convert_string_array(value)
  Array(value)
    .flat_map { |entry| entry.to_s.split(",") }
    .map(&:strip)
    .reject(&:empty?)
end
56
64
  end
57
65
  end
58
66
  end