canon 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +31 -149
  3. data/README.adoc +9 -0
  4. data/docs/advanced/semantic-diff-report.adoc +31 -0
  5. data/docs/features/configuration-profiles.adoc +4 -2
  6. data/docs/features/match-options/html-policies.adoc +2 -0
  7. data/docs/features/match-options/index.adoc +40 -0
  8. data/docs/guides/choosing-configuration.adoc +12 -1
  9. data/docs/reference/cli-options.adoc +3 -0
  10. data/docs/reference/options-across-interfaces.adoc +7 -1
  11. data/docs/understanding/formats/html.adoc +9 -2
  12. data/lib/canon/cli.rb +4 -0
  13. data/lib/canon/commands/diff_command.rb +1 -0
  14. data/lib/canon/comparison/comparison_result.rb +79 -0
  15. data/lib/canon/comparison/html_comparator.rb +92 -11
  16. data/lib/canon/comparison/markup_comparator.rb +19 -0
  17. data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
  18. data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
  19. data/lib/canon/comparison/match_options.rb +23 -2
  20. data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
  21. data/lib/canon/comparison/xml_comparator/child_comparison.rb +6 -0
  22. data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
  23. data/lib/canon/comparison/xml_comparator.rb +80 -4
  24. data/lib/canon/comparison/xml_node_comparison.rb +29 -3
  25. data/lib/canon/comparison.rb +84 -22
  26. data/lib/canon/config/env_schema.rb +2 -1
  27. data/lib/canon/config/profiles/metanorma.yml +3 -0
  28. data/lib/canon/config.rb +51 -5
  29. data/lib/canon/diff/diff_classifier.rb +18 -2
  30. data/lib/canon/diff/diff_line_builder.rb +9 -8
  31. data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
  32. data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
  33. data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
  34. data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
  35. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +65 -17
  36. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +17 -0
  37. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
  38. data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
  39. data/lib/canon/diff_formatter.rb +57 -173
  40. data/lib/canon/html/data_model.rb +10 -4
  41. data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
  42. data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
  43. data/lib/canon/version.rb +1 -1
  44. data/lib/canon/xml/c14n.rb +59 -5
  45. data/lib/canon/xml/element_matcher.rb +3 -0
  46. data/lib/canon/xml/node.rb +8 -1
  47. data/lib/canon/xml/nodes/comment_node.rb +4 -0
  48. data/lib/canon/xml/nodes/element_node.rb +4 -0
  49. data/lib/canon/xml/nodes/text_node.rb +4 -0
  50. data/lib/canon/xml/sax_builder.rb +11 -2
  51. data/lib/canon/xml/xpath_engine.rb +238 -0
  52. metadata +6 -2
@@ -232,6 +232,17 @@ diff_children, differences)
232
232
  return false unless text_node?(node) && node.parent
233
233
  return false unless MatchOptions.normalize_text(node_text(node)).empty?
234
234
 
235
+ # HTML-specific: NBSP (U+00A0) is never insignificant whitespace —
236
+ # it always renders as a visible non-breaking space.
237
+ format = opts[:format] || match_opts[:format]
238
+ if %i[html html4 html5].include?(format)
239
+ return false if WhitespaceSensitivity.contains_nbsp?(node_text(node))
240
+
241
+ # Whitespace between inline element siblings is semantically
242
+ # significant (renders as a visible gap) and must not be stripped.
243
+ return false if WhitespaceSensitivity.inline_whitespace_significant?(node)
244
+ end
245
+
235
246
  return true unless WhitespaceSensitivity.whitespace_preserved?(
236
247
  node.parent, match_opts
237
248
  )
@@ -329,9 +340,24 @@ diff_children, differences)
329
340
  # @param node [Object] Node to check
330
341
  # @return [Boolean] true if node is a text node
331
342
  def self.text_node?(node)
332
- (node.respond_to?(:text?) && node.text? &&
333
- !node.respond_to?(:element?)) ||
334
- (node.respond_to?(:node_type) && node.node_type == :text)
343
+ return false unless node
344
+
345
+ # For Nokogiri nodes (XML, HTML4, HTML5), call element? directly
346
+ # instead of respond_to?(:element?): every Nokogiri::XML::Node
347
+ # responds to element?, so the old check was vacuously false
348
+ # for every Nokogiri text node. See issue #118.
349
+ if node.is_a?(Nokogiri::XML::Node)
350
+ return node.text? && !node.element?
351
+ end
352
+
353
+ # Canon::Xml::Nodes types and other ducktyped nodes.
354
+ if node.respond_to?(:text?) && node.text? &&
355
+ !node.respond_to?(:element?)
356
+ return true
357
+ end
358
+
359
+ # Symbol-style node_type (Canon's own node objects).
360
+ node.respond_to?(:node_type) && node.node_type == :text
335
361
  end
336
362
 
337
363
  # Extract text content from a node
@@ -144,6 +144,35 @@ module Canon
144
144
  dom_diff(obj1, obj2, opts)
145
145
  end
146
146
 
147
+ # Summarize the first difference between two documents.
148
+ #
149
+ # Returns a human-readable string describing the first difference
150
+ # when documents differ, or "Equivalent" when they match.
151
+ # This is a lightweight alternative to +equivalent?+ with +verbose: true+.
152
+ #
153
+ # @param obj1 [Object] First object to compare
154
+ # @param obj2 [Object] Second object to compare
155
+ # @param opts [Hash] Comparison options (same as +equivalent?+)
156
+ # @return [String] Summary string
157
+ #
158
+ # @example
159
+ # Canon::Comparison.summarize("<p>Hello</p>", "<p>World</p>")
160
+ # # => "Not equivalent: text content differs at /p[1] (Hello vs World)"
161
+ #
162
+ # Canon::Comparison.summarize("<p>Hello</p>", "<p>Hello</p>")
163
+ # # => "Equivalent"
164
+ def summarize(obj1, obj2, opts = {})
165
+ result = equivalent?(obj1, obj2, opts.merge(verbose: true))
166
+
167
+ if result.is_a?(ComparisonResult)
168
+ result.summary
169
+ elsif result == true
170
+ "Equivalent"
171
+ else
172
+ "Not equivalent"
173
+ end
174
+ end
175
+
147
176
  # Define a custom comparison profile with DSL syntax
148
177
  #
149
178
  # @param name [Symbol] Profile name
@@ -602,26 +631,26 @@ module Canon
602
631
  # parsers can mutate the DOM).
603
632
  opts[:_original_str1] = obj1.dup if obj1.is_a?(String)
604
633
  opts[:_original_str2] = obj2.dup if obj2.is_a?(String)
605
- if opts[:format] == :html5
606
- # HTML5 fragment parsing is safe — it normalizes without
607
- # destructive content-model mutations.
608
- obj1 = HtmlParser.parse(obj1, :html5) if obj1.is_a?(String)
609
- obj2 = HtmlParser.parse(obj2, :html5) if obj2.is_a?(String)
610
- else
611
- # HTML4 fragment parsing mutates the DOM (strips <body>
612
- # attributes, re-parents <h1> content, etc.). Use XML
613
- # fragment parsing which preserves structure faithfully.
614
- if obj1.is_a?(String)
615
- obj1 = Nokogiri::XML.fragment(
616
- strip_xml_preamble(obj1),
617
- )
618
- end
619
- if obj2.is_a?(String)
620
- obj2 = Nokogiri::XML.fragment(
621
- strip_xml_preamble(obj2),
622
- )
623
- end
624
- end
634
+ # Parse all HTML formats (:html, :html4, :html5) with
635
+ # Nokogiri::HTML5 so that html4 and html5 share HTML's
636
+ # whitespace-sensitivity semantics (issue #118).
637
+ #
638
+ # The previous html/html4 branch used Nokogiri::XML.fragment
639
+ # to dodge Nokogiri::HTML4.fragment's destructive DOM
640
+ # mutations. That avoided one problem but introduced a
641
+ # bigger one: XML whitespace rules were being applied to
642
+ # HTML content. HTML's content model — identical between
643
+ # HTML4 and HTML5 — treats whitespace-only text between
644
+ # block-level children as insignificant; XML treats every
645
+ # whitespace text node as significant. Routing html4 input
646
+ # through an XML parser therefore made
647
+ # be_html4_equivalent_to reject inputs that
648
+ # be_html5_equivalent_to (correctly) accepts.
649
+ # Nokogiri::HTML5.fragment is non-destructive (the original
650
+ # HTML4.fragment concern does not apply to it) and applies
651
+ # HTML's content model uniformly.
652
+ obj1 = HtmlParser.parse(obj1, :html5) if obj1.is_a?(String)
653
+ obj2 = HtmlParser.parse(obj2, :html5) if obj2.is_a?(String)
625
654
  end
626
655
  else
627
656
  format1 = FormatDetector.detect(obj1)
@@ -662,8 +691,14 @@ module Canon
662
691
  # but defined in config
663
692
  if Canon::Config.instance.respond_to?(comparison_format)
664
693
  format_config = Canon::Config.instance.public_send(comparison_format)
665
- if opts[:match_profile].nil? && format_config.match.profile
666
- opts[:match_profile] = format_config.match.profile
694
+ if opts[:global_profile].nil? && format_config.match.profile
695
+ # Config-sourced profile has *global* priority (applied before
696
+ # global_options), so that YAML profile_options like
697
+ # whitespace_type: :normalize can override the whitespace_type:
698
+ # :strict of a built-in profile (e.g. :spec_friendly). Writing to
699
+ # :match_profile here gave the config profile per-call priority,
700
+ # which incorrectly overrode the YAML's own overrides.
701
+ opts[:global_profile] = format_config.match.profile
667
702
  end
668
703
  # Pass YAML profile's extra match options (e.g., preserve_whitespace_elements)
669
704
  # that are stored in MatchConfig's resolver but not exposed via the
@@ -701,6 +736,33 @@ module Canon
701
736
  str
702
737
  end
703
738
 
739
+ # Decode HTML named entities (&nbsp; etc.) to their numeric
740
+ # character reference equivalents so that Nokogiri::XML.fragment
741
+ # (which only understands the five XML entities) preserves them
742
+ # as text nodes instead of silently dropping them.
743
+ #
744
+ # Uses Nokogiri's HTML4 parser to resolve the entities — the
745
+ # text is extracted from a fragment so no structural tags are added.
746
+ #
747
+ # @param str [String] HTML string potentially containing named entities
748
+ # @return [String] String with named entities replaced by characters
749
+ def decode_html_entities(str)
750
+ # Fast path: skip if no ampersands present
751
+ return str unless str.include?("&")
752
+
753
+ # Parse as HTML fragment to resolve named entities, then
754
+ # re-serialize as text. This converts &nbsp; → U+00A0, etc.
755
+ doc = Nokogiri::HTML4.fragment(str)
756
+
757
+ # Serialize back, preserving the resolved characters.
758
+ # to_html re-encodes characters, so use inner_html which
759
+ # keeps the character form.
760
+ doc.inner_html
761
+
762
+ # If the serialization re-encoded characters as entities,
763
+ # that's fine — the XML parser understands numeric refs like &#160;
764
+ end
765
+
704
766
  # Detect the format of an object (delegates to FormatDetector)
705
767
  #
706
768
  # @param obj [Object] Object to detect format of
@@ -14,6 +14,7 @@ module Canon
14
14
  show_diffs: :symbol,
15
15
  verbose_diff: :boolean,
16
16
  algorithm: :symbol,
17
+ parser: :symbol,
17
18
  show_raw_inputs: :boolean,
18
19
  show_raw_expected: :boolean,
19
20
  show_raw_received: :boolean,
@@ -66,7 +67,7 @@ module Canon
66
67
 
67
68
  def all_diff_attributes
68
69
  %i[mode use_color context_lines grouping_lines show_diffs
69
- verbose_diff algorithm show_raw_inputs show_raw_expected show_raw_received
70
+ verbose_diff algorithm parser show_raw_inputs show_raw_expected show_raw_received
70
71
  show_preprocessed_inputs show_preprocessed_expected show_preprocessed_received
71
72
  show_prettyprint_inputs show_prettyprint_expected show_prettyprint_received
72
73
  show_line_numbered_inputs character_visualization
@@ -28,6 +28,9 @@ formats:
28
28
  xml:
29
29
  match:
30
30
  profile: spec_friendly
31
+ # Treat different Unicode whitespace types (space, NBSP, ideographic space, etc.)
32
+ # as equivalent — useful for spec comparisons where whitespace type doesn't matter
33
+ whitespace_type: :normalize
31
34
  # Elements where whitespace is PRESERVED exactly (no manipulation)
32
35
  # All whitespace characters are significant in these elements
33
36
  preserve_whitespace_elements:
data/lib/canon/config.rb CHANGED
@@ -285,6 +285,7 @@ module Canon
285
285
  end
286
286
 
287
287
  def indent_type=(value)
288
+ DiffConfig.validate_config_value!(:pretty_printer_indent_type, value)
288
289
  @resolver.set_programmatic(:pretty_printer_indent_type, value)
289
290
  end
290
291
  end
@@ -293,6 +294,20 @@ module Canon
293
294
  class DiffConfig
294
295
  attr_reader :pretty_printer
295
296
 
297
+ # Valid values for enum-like configuration options
298
+ VALID_ENUM_VALUES = {
299
+ mode: %i[by_line by_object pretty_diff],
300
+ show_diffs: %i[all normative informative],
301
+ algorithm: %i[dom semantic],
302
+ parser: %i[sax dom],
303
+ display_preprocessing: %i[none pretty_print normalize_pretty_print
304
+ c14n],
305
+ display_format: %i[raw canonical],
306
+ pretty_printer_indent_type: %i[space tab],
307
+ character_visualization: [true, false, :content_only],
308
+ theme: %i[light dark retro claude cyberpunk],
309
+ }.freeze
310
+
296
311
  def initialize(format = nil)
297
312
  @format = format
298
313
  @resolver = build_resolver(format)
@@ -309,7 +324,9 @@ module Canon
309
324
 
310
325
  data.each do |key, value|
311
326
  sym_key = key.to_sym
312
- @resolver.set_profile(sym_key, coerce_profile_value(sym_key, value))
327
+ coerced = coerce_profile_value(sym_key, value)
328
+ self.class.validate_config_value!(sym_key, coerced)
329
+ @resolver.set_profile(sym_key, coerced)
313
330
  end
314
331
  end
315
332
 
@@ -317,12 +334,25 @@ module Canon
317
334
  @resolver.clear_profile!
318
335
  end
319
336
 
337
+ # Validate a config value against its allowed enum values
338
+ def self.validate_config_value!(key, value)
339
+ valid = VALID_ENUM_VALUES[key]
340
+ return unless valid
341
+
342
+ return if valid.include?(value)
343
+
344
+ raise ArgumentError,
345
+ "Invalid value #{value.inspect} for #{key}. " \
346
+ "Valid values: #{valid.map(&:inspect).join(', ')}"
347
+ end
348
+
320
349
  # Accessors with ENV override support
321
350
  def mode
322
351
  @resolver.resolve(:mode)
323
352
  end
324
353
 
325
354
  def mode=(value)
355
+ self.class.validate_config_value!(:mode, value)
326
356
  @resolver.set_programmatic(:mode, value)
327
357
  end
328
358
 
@@ -355,6 +385,7 @@ module Canon
355
385
  end
356
386
 
357
387
  def show_diffs=(value)
388
+ self.class.validate_config_value!(:show_diffs, value)
358
389
  @resolver.set_programmatic(:show_diffs, value)
359
390
  end
360
391
 
@@ -495,6 +526,7 @@ module Canon
495
526
  end
496
527
 
497
528
  def display_format=(value)
529
+ self.class.validate_config_value!(:display_format, value)
498
530
  @resolver.set_programmatic(:display_format, value)
499
531
  end
500
532
 
@@ -511,6 +543,7 @@ module Canon
511
543
  end
512
544
 
513
545
  def display_preprocessing=(value)
546
+ self.class.validate_config_value!(:display_preprocessing, value)
514
547
  @resolver.set_programmatic(:display_preprocessing, value)
515
548
  end
516
549
 
@@ -620,10 +653,8 @@ module Canon
620
653
  # Values:
621
654
  # true - apply the full default visualization map (default)
622
655
  # false - disable visualization; output plain text
623
- # :content_only - reserved for future use; currently behaves as +true+.
624
- # Future intent: apply visualization only to DOM text
625
- # node content, not to structural indentation whitespace.
626
- # (TODO: implement DOM-level pre-serialization pass)
656
+ # :content_only - apply visualization only to text content, not
657
+ # to structural indentation whitespace.
627
658
  def character_visualization
628
659
  val = @resolver.resolve(:character_visualization)
629
660
  # Coerce symbol booleans that may arrive via ENV (env_schema uses :symbol type
@@ -636,6 +667,7 @@ module Canon
636
667
  end
637
668
 
638
669
  def character_visualization=(value)
670
+ self.class.validate_config_value!(:character_visualization, value)
639
671
  @resolver.set_programmatic(:character_visualization, value)
640
672
  end
641
673
 
@@ -644,15 +676,27 @@ module Canon
644
676
  end
645
677
 
646
678
  def algorithm=(value)
679
+ self.class.validate_config_value!(:algorithm, value)
647
680
  @resolver.set_programmatic(:algorithm, value)
648
681
  end
649
682
 
683
+ # XML parser backend (:sax or :dom, default :sax)
684
+ def parser
685
+ @resolver.resolve(:parser)
686
+ end
687
+
688
+ def parser=(value)
689
+ self.class.validate_config_value!(:parser, value)
690
+ @resolver.set_programmatic(:parser, value)
691
+ end
692
+
650
693
  # Theme name (:light, :dark, :retro, :claude, :cyberpunk)
651
694
  def theme
652
695
  @resolver.resolve(:theme)
653
696
  end
654
697
 
655
698
  def theme=(value)
699
+ self.class.validate_config_value!(:theme, value)
656
700
  @resolver.set_programmatic(:theme, value)
657
701
  end
658
702
 
@@ -693,6 +737,7 @@ module Canon
693
737
  show_diffs: show_diffs,
694
738
  verbose_diff: verbose_diff,
695
739
  diff_algorithm: algorithm,
740
+ parser: parser,
696
741
  show_raw_inputs: show_raw_inputs,
697
742
  show_raw_expected: show_raw_expected,
698
743
  show_raw_received: show_raw_received,
@@ -733,6 +778,7 @@ module Canon
733
778
  show_diffs: :all,
734
779
  verbose_diff: false,
735
780
  algorithm: :dom,
781
+ parser: :sax,
736
782
  show_raw_inputs: false,
737
783
  show_raw_expected: false,
738
784
  show_raw_received: false,
@@ -150,8 +150,9 @@ module Canon
150
150
  end
151
151
 
152
152
  # Check if the text node is inside a whitespace-sensitive element
153
- # (preserve/collapse classification or xml:space='preserve').
154
- # In these elements, whitespace presence is meaningful and should
153
+ # (preserve/collapse classification, xml:space='preserve', or
154
+ # between inline element siblings in HTML).
155
+ # In these contexts, whitespace presence is meaningful and should
155
156
  # not be dismissed as serialization formatting.
156
157
  # @param diff_node [DiffNode] The diff node to check
157
158
  # @return [Boolean] true if whitespace is preserved for this element
@@ -159,6 +160,21 @@ module Canon
159
160
  node = diff_node.node1 || diff_node.node2
160
161
  return false unless node
161
162
 
163
+ # HTML: whitespace between inline element siblings is significant
164
+ if Canon::Comparison::WhitespaceSensitivity.inline_whitespace_significant?(node)
165
+ return true
166
+ end
167
+
168
+ # HTML: non-breaking space (U+00A0) is never insignificant
169
+ text = if node.respond_to?(:content)
170
+ node.content
171
+ elsif node.respond_to?(:value)
172
+ node.value
173
+ end
174
+ if text && Canon::Comparison::WhitespaceSensitivity.contains_nbsp?(text)
175
+ return true
176
+ end
177
+
162
178
  return false unless node.respond_to?(:parent)
163
179
 
164
180
  parent = node.parent
@@ -858,12 +858,14 @@ new_line_ranges)
858
858
  # The DiffNode's explicit formatting? flag takes precedence:
859
859
  # - If formatting? == true: return true (explicitly formatting-only)
860
860
  #
861
- # If node exists and is normative (formatting? is nil but norm is true):
862
- # - Check line-level formatting via FormattingDetector for whitespace-only changes
863
- # - But NOT via comment_only_line? heuristic because comment content is different
861
+ # If node exists and is normative:
862
+ # - Return false — normative DiffNodes are NEVER formatting-only.
863
+ # Even if the serialized content looks whitespace-equivalent,
864
+ # the comparison classified it as a normative change and it MUST
865
+ # be visible in by_line output (especially with show_diffs: :normative).
864
866
  #
865
867
  # If node exists and is informative (norm=false):
866
- # - Return false (informative diffs are always shown as informative)
868
+ # - Return false (informative diffs are shown as informative)
867
869
  #
868
870
  # If NO node exists (diff_node is nil):
869
871
  # - Use heuristics: comment-only lines and FormattingDetector
@@ -877,11 +879,10 @@ new_line_ranges)
877
879
  return true if diff_node&.formatting?
878
880
 
879
881
  if diff_node
880
- # Node exists - use node classification
881
- return false unless diff_node.normative?
882
+ # Normative nodes are never formatting-only
883
+ return false if diff_node.normative?
882
884
 
883
- # For normative nodes, check line-level formatting
884
- # (but NOT comment_only_line? which would misclassify comment content changes)
885
+ # Informative nodes: check line-level formatting
885
886
  elsif comment_only_line?(line1) || comment_only_line?(line2)
886
887
  # No DiffNode: use heuristics
887
888
  return true
@@ -45,11 +45,13 @@ module Canon
45
45
  end
46
46
  end
47
47
 
48
+ # rubocop:disable Metrics/ParameterLists
48
49
  def initialize(use_color: true, context_lines: 3,
49
50
  diff_grouping_lines: nil, visualization_map: nil,
50
51
  show_diffs: :all, differences: [],
51
52
  diff_mode: :separate, legacy_terminal: false,
52
- equivalent: nil, theme: nil)
53
+ equivalent: nil, theme: nil,
54
+ character_visualization: true)
53
55
  @use_color = use_color
54
56
  @context_lines = context_lines
55
57
  @diff_grouping_lines = diff_grouping_lines
@@ -61,7 +63,9 @@ module Canon
61
63
  @legacy_terminal = legacy_terminal
62
64
  @equivalent = equivalent
63
65
  @theme = theme
66
+ @character_visualization = character_visualization
64
67
  end
68
+ # rubocop:enable Metrics/ParameterLists
65
69
 
66
70
  # Get the resolved theme hash
67
71
  # @return [Hash] Theme hash
@@ -644,15 +648,23 @@ module Canon
644
648
 
645
649
  # Apply character visualization
646
650
  #
651
+ # When +character_visualization+ is +:content_only+, leading
652
+ # structural whitespace (indentation) is left plain while content
653
+ # whitespace is visualized.
654
+ #
647
655
  # @param token [String] The token to apply visualization to
648
656
  # @param color [Symbol, nil] Optional color to apply
649
657
  # @return [String] Visualized and optionally colored token
650
658
  def apply_visualization(token, color = nil)
651
659
  return "" if token.nil?
652
660
 
653
- visual = token.to_s.chars.map do |char|
654
- @visualization_map.fetch(char, char)
655
- end.join
661
+ visual = if @character_visualization == :content_only
662
+ visualize_content_only(token.to_s)
663
+ else
664
+ token.to_s.chars.map do |char|
665
+ @visualization_map.fetch(char, char)
666
+ end.join
667
+ end
656
668
 
657
669
  if color && @use_color
658
670
  require "rainbow"
@@ -678,6 +690,29 @@ module Canon
678
690
  end
679
691
  end
680
692
 
693
+ # Visualize only content portion, leaving structural indentation plain.
694
+ #
695
+ # Splits the token into leading whitespace (structural indentation)
696
+ # and the rest (content). Only the content portion gets character
697
+ # visualization.
698
+ #
699
+ # @param token [String] The full line token
700
+ # @return [String] Token with content-only visualization
701
+ def visualize_content_only(token)
702
+ # Leading whitespace is structural indentation — keep it plain
703
+ indent_end = token.index(/[^\s]/) || token.length
704
+ indent = token[0...indent_end]
705
+ content = token[indent_end..]
706
+
707
+ if content.nil? || content.empty?
708
+ indent
709
+ else
710
+ indent + content.chars.map { |char|
711
+ @visualization_map.fetch(char, char)
712
+ }.join
713
+ end
714
+ end
715
+
681
716
  # Get max diff lines limit
682
717
  #
683
718
  # @return [Integer, nil] Max diff output lines
@@ -12,19 +12,22 @@ module Canon
12
12
  class HtmlFormatter < BaseFormatter
13
13
  attr_reader :html_version
14
14
 
15
+ # rubocop:disable Metrics/ParameterLists
15
16
  def initialize(use_color: true, context_lines: 3,
16
17
  diff_grouping_lines: nil, visualization_map: nil,
17
18
  html_version: :html4, show_diffs: :all, differences: [],
18
19
  diff_mode: :separate, legacy_terminal: false,
19
- equivalent: nil)
20
+ equivalent: nil, character_visualization: true)
20
21
  super(use_color: use_color, context_lines: context_lines,
21
22
  diff_grouping_lines: diff_grouping_lines,
22
23
  visualization_map: visualization_map,
23
24
  show_diffs: show_diffs, differences: differences,
24
25
  diff_mode: diff_mode, legacy_terminal: legacy_terminal,
25
- equivalent: equivalent)
26
+ equivalent: equivalent,
27
+ character_visualization: character_visualization)
26
28
  @html_version = html_version
27
29
  end
30
+ # rubocop:enable Metrics/ParameterLists
28
31
 
29
32
  # Format DOM-guided HTML diff
30
33
  #
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "paint"
4
+
5
+ module Canon
6
+ class DiffFormatter
7
+ # Handles the by_line rendering pipeline for line-by-line diffs.
8
+ #
9
+ # Receives preprocessed document strings from the DiffFormatter facade
10
+ # and delegates to format-specific ByLine formatters (XML, HTML, JSON, YAML).
11
+ class ByLineFormatter
12
+ # rubocop:disable Metrics/ParameterLists
13
+ def initialize(use_color:, visualization_map:, context_lines:,
14
+ diff_grouping_lines:, show_diffs:, character_visualization:,
15
+ legacy_terminal:, diff_mode:)
16
+ @use_color = use_color
17
+ @visualization_map = visualization_map
18
+ @context_lines = context_lines
19
+ @diff_grouping_lines = diff_grouping_lines
20
+ @show_diffs = show_diffs
21
+ @character_visualization = character_visualization
22
+ @legacy_terminal = legacy_terminal
23
+ @diff_mode = diff_mode
24
+ end
25
+ # rubocop:enable Metrics/ParameterLists
26
+
27
+ # Format a line-by-line diff between two documents.
28
+ #
29
+ # @param doc1 [String] First document (already preprocessed)
30
+ # @param doc2 [String] Second document (already preprocessed)
31
+ # @param format [Symbol] Document format (:xml, :html, :json, :yaml, etc.)
32
+ # @param html_version [Symbol, nil] HTML version override (:html4, :html5)
33
+ # @param differences [Array, ComparisonResult] Differences from comparison
34
+ # @return [String] Formatted diff output
35
+ def format(doc1, doc2, format:, html_version: nil, differences: [])
36
+ resolved_format = format == :html && html_version ? html_version : format
37
+ format_name = resolved_format.to_s.upcase
38
+
39
+ output = []
40
+ output << colorize("Line-by-line diff (#{format_name} mode):", :cyan,
41
+ :bold)
42
+
43
+ return output.join("\n") if doc1.nil? || doc2.nil?
44
+
45
+ diffs_array = extract_differences(differences)
46
+
47
+ formatter = ByLine::BaseFormatter.for_format(
48
+ resolved_format,
49
+ use_color: @use_color,
50
+ context_lines: @context_lines,
51
+ diff_grouping_lines: @diff_grouping_lines,
52
+ visualization_map: @visualization_map,
53
+ show_diffs: @show_diffs,
54
+ differences: diffs_array,
55
+ diff_mode: @legacy_terminal ? :separate : @diff_mode,
56
+ legacy_terminal: @legacy_terminal,
57
+ equivalent: @comparison_equivalent,
58
+ character_visualization: @character_visualization,
59
+ )
60
+
61
+ output << formatter.format(doc1, doc2)
62
+ output.join("\n")
63
+ end
64
+
65
+ private
66
+
67
+ def extract_differences(differences)
68
+ if differences.is_a?(Canon::Comparison::ComparisonResult)
69
+ @comparison_equivalent = differences.equivalent?
70
+ differences.differences
71
+ else
72
+ @comparison_equivalent = nil
73
+ differences
74
+ end
75
+ end
76
+
77
+ def colorize(text, *colors)
78
+ return text unless @use_color
79
+
80
+ "\e[0m#{Paint[text, *colors]}"
81
+ end
82
+ end
83
+ end
84
+ end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "paint"
4
+
5
+ module Canon
6
+ class DiffFormatter
7
+ # Handles the by_object rendering pipeline for tree-based semantic diffs.
8
+ #
9
+ # Delegates to format-specific ByObject formatters (XML, JSON, YAML)
10
+ # which produce visual tree output with box-drawing characters.
11
+ class ByObjectFormatter
12
+ def initialize(use_color:, visualization_map:, show_diffs:)
13
+ @use_color = use_color
14
+ @visualization_map = visualization_map
15
+ @show_diffs = show_diffs
16
+ end
17
+
18
+ # Format a tree-based object diff.
19
+ #
20
+ # @param differences [Array, ComparisonResult] Differences from comparison
21
+ # @param format [Symbol] Document format (:xml, :json, :yaml)
22
+ # @return [String] Formatted diff output
23
+ def format(differences, format)
24
+ output = []
25
+ output << colorize("Visual Diff:", :cyan, :bold)
26
+
27
+ diffs_array = if differences.is_a?(Canon::Comparison::ComparisonResult)
28
+ differences.differences
29
+ else
30
+ differences
31
+ end
32
+
33
+ formatter = ByObject::BaseFormatter.for_format(
34
+ format,
35
+ use_color: @use_color,
36
+ visualization_map: @visualization_map,
37
+ show_diffs: @show_diffs,
38
+ )
39
+
40
+ output << formatter.format(diffs_array, format)
41
+ output.join("\n")
42
+ end
43
+
44
+ private
45
+
46
+ def colorize(text, *colors)
47
+ return text unless @use_color
48
+
49
+ "\e[0m#{Paint[text, *colors]}"
50
+ end
51
+ end
52
+ end
53
+ end