canon 0.1.22 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +174 -25
  3. data/docs/INDEX.adoc +4 -0
  4. data/docs/advanced/diff-classification.adoc +3 -2
  5. data/docs/features/configuration-profiles.adoc +288 -0
  6. data/docs/features/diff-formatting/character-visualization.adoc +153 -454
  7. data/docs/features/diff-formatting/display-filtering.adoc +44 -0
  8. data/docs/features/diff-formatting/display-preprocessing.adoc +656 -0
  9. data/docs/features/diff-formatting/index.adoc +47 -0
  10. data/docs/features/diff-formatting/pretty-diff-mode.adoc +154 -0
  11. data/docs/features/environment-configuration/override-system.adoc +10 -3
  12. data/docs/features/index.adoc +9 -0
  13. data/docs/features/match-options/index.adoc +32 -42
  14. data/docs/features/match-options/pretty-printed-fixtures.adoc +270 -0
  15. data/docs/guides/choosing-configuration.adoc +22 -0
  16. data/docs/reference/environment-variables.adoc +121 -1
  17. data/docs/reference/options-across-interfaces.adoc +182 -2
  18. data/lib/canon/cli.rb +20 -0
  19. data/lib/canon/commands/diff_command.rb +7 -2
  20. data/lib/canon/commands/format_command.rb +1 -1
  21. data/lib/canon/comparison/html_comparator.rb +20 -15
  22. data/lib/canon/comparison/html_compare_profile.rb +4 -4
  23. data/lib/canon/comparison/markup_comparator.rb +12 -3
  24. data/lib/canon/comparison/match_options/base_resolver.rb +29 -7
  25. data/lib/canon/comparison/match_options/json_resolver.rb +9 -0
  26. data/lib/canon/comparison/match_options/xml_resolver.rb +16 -2
  27. data/lib/canon/comparison/match_options/yaml_resolver.rb +10 -0
  28. data/lib/canon/comparison/match_options.rb +4 -1
  29. data/lib/canon/comparison/whitespace_sensitivity.rb +189 -137
  30. data/lib/canon/comparison/xml_comparator/child_comparison.rb +21 -4
  31. data/lib/canon/comparison/xml_comparator.rb +14 -12
  32. data/lib/canon/comparison/xml_node_comparison.rb +51 -6
  33. data/lib/canon/comparison.rb +52 -9
  34. data/lib/canon/config/env_schema.rb +32 -4
  35. data/lib/canon/config/override_resolver.rb +16 -3
  36. data/lib/canon/config/profile_loader.rb +135 -0
  37. data/lib/canon/config/profiles/metanorma.yml +74 -0
  38. data/lib/canon/config/profiles/metanorma_debug.yml +8 -0
  39. data/lib/canon/config/type_converter.rb +8 -0
  40. data/lib/canon/config.rb +469 -5
  41. data/lib/canon/diff/diff_classifier.rb +41 -11
  42. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +48 -17
  43. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +58 -0
  44. data/lib/canon/diff_formatter/diff_detail_formatter.rb +22 -7
  45. data/lib/canon/diff_formatter/theme.rb +24 -17
  46. data/lib/canon/diff_formatter.rb +493 -36
  47. data/lib/canon/pretty_printer/xml_normalized.rb +395 -0
  48. data/lib/canon/rspec_matchers.rb +36 -0
  49. data/lib/canon/tree_diff/matchers/hash_matcher.rb +26 -11
  50. data/lib/canon/version.rb +1 -1
  51. data/lib/canon/xml/nodes/namespace_node.rb +4 -0
  52. data/lib/canon/xml/nodes/processing_instruction_node.rb +4 -0
  53. data/lib/canon/xml/nodes/root_node.rb +4 -0
  54. data/lib/canon/xml/nodes/text_node.rb +4 -0
  55. data/lib/tasks/performance_helpers.rb +2 -2
  56. metadata +24 -2
@@ -167,8 +167,27 @@ module Canon
167
167
  diff_grouping_lines: nil, visualization_map: nil,
168
168
  character_map_file: nil, character_definitions: nil,
169
169
  show_diffs: :all, verbose_diff: false,
170
- show_raw_inputs: false, show_preprocessed_inputs: false,
170
+ show_raw_inputs: false, show_raw_expected: false,
171
+ show_raw_received: false,
172
+ show_preprocessed_inputs: false,
173
+ show_preprocessed_expected: false,
174
+ show_preprocessed_received: false,
175
+ show_prettyprint_inputs: false,
176
+ show_prettyprint_expected: false,
177
+ show_prettyprint_received: false,
171
178
  show_line_numbered_inputs: false,
179
+ character_visualization: true,
180
+ display_preprocessing: :none,
181
+ pretty_printer_indent: 2,
182
+ pretty_printer_indent_type: :space,
183
+ preserve_whitespace_elements: [],
184
+ collapse_whitespace_elements: [],
185
+ strip_whitespace_elements: [],
186
+ pretty_printed_expected: false,
187
+ pretty_printed_received: false,
188
+ pretty_printer_sort_attributes: false,
189
+ compact_semantic_report: false,
190
+ expand_difference: false,
172
191
  diff_mode: :separate, legacy_terminal: false)
173
192
  # rubocop:enable Metrics/ParameterLists
174
193
  @use_color = use_color
@@ -178,11 +197,31 @@ module Canon
178
197
  @show_diffs = show_diffs
179
198
  @verbose_diff = verbose_diff
180
199
  @show_raw_inputs = show_raw_inputs
200
+ @show_raw_expected = show_raw_expected
201
+ @show_raw_received = show_raw_received
181
202
  @show_preprocessed_inputs = show_preprocessed_inputs
203
+ @show_preprocessed_expected = show_preprocessed_expected
204
+ @show_preprocessed_received = show_preprocessed_received
205
+ @show_prettyprint_inputs = show_prettyprint_inputs
206
+ @show_prettyprint_expected = show_prettyprint_expected
207
+ @show_prettyprint_received = show_prettyprint_received
182
208
  @show_line_numbered_inputs = show_line_numbered_inputs
209
+ @character_visualization = character_visualization
210
+ @display_preprocessing = display_preprocessing
211
+ @pretty_printer_indent = pretty_printer_indent
212
+ @pretty_printer_indent_type = pretty_printer_indent_type
213
+ @preserve_whitespace_elements = Array(preserve_whitespace_elements).map(&:to_s)
214
+ @collapse_whitespace_elements = Array(collapse_whitespace_elements).map(&:to_s)
215
+ @strip_whitespace_elements = Array(strip_whitespace_elements).map(&:to_s)
216
+ @pretty_printed_expected = pretty_printed_expected
217
+ @pretty_printed_received = pretty_printed_received
218
+ @pretty_printer_sort_attributes = pretty_printer_sort_attributes
219
+ @compact_semantic_report = compact_semantic_report
220
+ @expand_difference = expand_difference
183
221
  @diff_mode = legacy_terminal ? :separate : diff_mode
184
222
  @legacy_terminal = legacy_terminal
185
223
  @visualization_map = build_visualization_map(
224
+ character_visualization: character_visualization,
186
225
  visualization_map: visualization_map,
187
226
  character_map_file: character_map_file,
188
227
  character_definitions: character_definitions,
@@ -269,6 +308,12 @@ module Canon
269
308
  differences: differences)
270
309
  end
271
310
 
311
+ # In pretty_diff mode, always use text-LCS diff (bypasses DiffNodeMapper).
312
+ # pretty_diff_format handles nil doc1/doc2 itself (emits header only).
313
+ if @mode == :pretty_diff
314
+ return pretty_diff_format(doc1, doc2, format: format)
315
+ end
316
+
272
317
  no_diffs = if differences.respond_to?(:equivalent?)
273
318
  differences.equivalent?
274
319
  else
@@ -280,6 +325,8 @@ module Canon
280
325
  when :by_line
281
326
  by_line_diff(doc1, doc2, format: format, html_version: html_version,
282
327
  differences: differences)
328
+ when :pretty_diff
329
+ pretty_diff_format(doc1, doc2, format: format)
283
330
  else
284
331
  by_object_diff(differences, format)
285
332
  end
@@ -333,30 +380,65 @@ module Canon
333
380
  output << DiffDetailFormatter.format_report(
334
381
  comparison_result.differences,
335
382
  use_color: @use_color,
383
+ show_diffs: @show_diffs,
384
+ compact_semantic_report: @compact_semantic_report,
385
+ expand_difference: @expand_difference,
336
386
  )
337
387
  end
338
388
 
339
- # verbose_diff enables all three input displays as a convenience
340
- verbose = @verbose_diff || @show_raw_inputs
341
- show_prep = @verbose_diff || @show_preprocessed_inputs
389
+ # verbose_diff / show_raw_inputs shows both sides as a convenience shorthand.
390
+ # show_raw_expected / show_raw_received give per-side control.
391
+ combined_raw = @verbose_diff || @show_raw_inputs
392
+ show_raw_exp = combined_raw || @show_raw_expected
393
+ show_raw_rec = combined_raw || @show_raw_received
394
+ verbose = show_raw_exp || show_raw_rec
395
+ # verbose_diff / show_preprocessed_inputs shows both sides as a shorthand.
396
+ # show_preprocessed_expected / show_preprocessed_received give per-side control.
397
+ combined_prep = @verbose_diff || @show_preprocessed_inputs
398
+ show_prep_exp = combined_prep || @show_preprocessed_expected
399
+ show_prep_rec = combined_prep || @show_preprocessed_received
400
+ show_prep = show_prep_exp || show_prep_rec
342
401
  show_line = @verbose_diff || @show_line_numbered_inputs
343
402
 
344
- # 3. Raw/Original Input Display (when show_raw_inputs is enabled)
403
+ # 3. Raw/Original Input Display (when show_raw_inputs/show_raw_expected/show_raw_received enabled)
345
404
  if verbose && comparison_result.is_a?(Canon::Comparison::ComparisonResult)
346
405
  original1, original2 = comparison_result.original_strings
347
406
  if original1 && original2
348
- output << format_raw_inputs(original1, original2)
407
+ output << format_raw_inputs(original1, original2,
408
+ show_expected: show_raw_exp,
409
+ show_received: show_raw_rec)
349
410
  end
350
411
  end
351
412
 
352
- # 4. Preprocessed Input Display (when show_preprocessed_inputs is enabled)
413
+ # 4. Preprocessed Input Display (when show_preprocessed_inputs/expected/received enabled)
353
414
  if show_prep && comparison_result.is_a?(Canon::Comparison::ComparisonResult)
354
415
  preprocessed1, preprocessed2 = comparison_result.preprocessed_strings
355
416
  if preprocessed1 && preprocessed2
356
417
  preprocessing_info = comparison_result.match_options&.dig(:match,
357
418
  :preprocessing)
358
419
  output << format_preprocessed_inputs(preprocessed1, preprocessed2,
359
- preprocessing_info)
420
+ preprocessing_info,
421
+ show_expected: show_prep_exp,
422
+ show_received: show_prep_rec)
423
+ end
424
+ end
425
+
426
+ # 4.5. Pretty-printed Input Display (when show_prettyprint_inputs/expected/received enabled)
427
+ # Pretty-prints the ORIGINAL strings (not preprocessed) through PrettyPrinter::Xml/Html
428
+ # with NO character visualization — output is plain ASCII suitable for copy-pasting
429
+ # into RSpec fixture heredocs. verbose_diff does NOT enable these options.
430
+ show_pp_inp = @show_prettyprint_inputs
431
+ show_pp_exp = show_pp_inp || @show_prettyprint_expected
432
+ show_pp_rec = show_pp_inp || @show_prettyprint_received
433
+ show_pp = show_pp_exp || show_pp_rec
434
+
435
+ if show_pp && comparison_result.is_a?(Canon::Comparison::ComparisonResult)
436
+ orig1, orig2 = comparison_result.original_strings
437
+ if orig1 && orig2
438
+ pp1, pp2 = prettyprint_for_display(orig1, orig2, format)
439
+ output << format_prettyprint_inputs(pp1, pp2,
440
+ show_expected: show_pp_exp,
441
+ show_received: show_pp_rec)
360
442
  end
361
443
  end
362
444
 
@@ -497,41 +579,65 @@ module Canon
497
579
  end
498
580
 
499
581
  # Format raw/original inputs for display (user-friendly copyable format)
500
- # Shows the raw file contents before any preprocessing
582
+ # Shows the raw file contents before any preprocessing.
583
+ #
584
+ # Use +show_expected:+ and +show_received:+ to control which side is
585
+ # rendered. Both default to +true+ so existing callers are unaffected.
586
+ # Pass +show_expected: false+ to suppress the fixture/expected block while
587
+ # still showing the received output (useful when the fixture is very long
588
+ # and the user only wants to see what the generator produced).
501
589
  #
502
- # @param raw1 [String] First raw input string
503
- # @param raw2 [String] Second raw input string
590
+ # @param raw1 [String] First raw input string (expected / fixture)
591
+ # @param raw2 [String] Second raw input string (received / actual)
592
+ # @param show_expected [Boolean] Render the EXPECTED block
593
+ # @param show_received [Boolean] Render the RECEIVED block
504
594
  # @return [String] Formatted display of raw inputs
505
- def format_raw_inputs(raw1, raw2)
595
+ def format_raw_inputs(raw1, raw2, show_expected: true, show_received: true)
506
596
  return "" if raw1.nil? || raw2.nil?
597
+ return "" unless show_expected || show_received
507
598
 
508
599
  output = []
509
600
  output << ""
510
601
  output << colorize("=== ORIGINAL INPUTS (Raw) ===", :cyan, :bold)
511
602
  output << ""
512
- output << colorize("EXPECTED:", :yellow, :bold)
513
- output << ("-" * 70)
514
- output << raw1
515
- output << ""
516
- output << colorize("RECEIVED:", :yellow, :bold)
517
- output << ("-" * 70)
518
- output << raw2
519
- output << ""
520
- output << ""
521
603
 
604
+ if show_expected
605
+ output << colorize("EXPECTED:", :yellow, :bold)
606
+ output << ("-" * 70)
607
+ output << raw1
608
+ output << ""
609
+ end
610
+
611
+ if show_received
612
+ output << colorize("RECEIVED:", :yellow, :bold)
613
+ output << ("-" * 70)
614
+ output << raw2
615
+ output << ""
616
+ end
617
+
618
+ output << ""
522
619
  output.join("\n")
523
620
  end
524
621
 
525
622
  # Format preprocessed inputs for display (what was actually compared)
526
623
  # Shows the content after preprocessing (c14n, normalize, format, etc.)
527
624
  #
528
- # @param preprocessed1 [String] First preprocessed string
529
- # @param preprocessed2 [String] Second preprocessed string
625
+ # Use +show_expected:+ and +show_received:+ to control which side is rendered.
626
+ # Both default to +true+ so existing callers are unaffected.
627
+ # Pass +show_expected: false+ to suppress the fixture/expected block while
628
+ # still showing the preprocessed received output.
629
+ #
630
+ # @param preprocessed1 [String] First preprocessed string (expected / fixture)
631
+ # @param preprocessed2 [String] Second preprocessed string (received / actual)
530
632
  # @param preprocessing_info [Symbol, nil] Preprocessing mode (:c14n, :normalize, :format, etc.)
633
+ # @param show_expected [Boolean] Render the EXPECTED block
634
+ # @param show_received [Boolean] Render the RECEIVED block
531
635
  # @return [String] Formatted display of preprocessed inputs
532
636
  def format_preprocessed_inputs(preprocessed1, preprocessed2,
533
- preprocessing_info = nil)
637
+ preprocessing_info = nil,
638
+ show_expected: true, show_received: true)
534
639
  return "" if preprocessed1.nil? || preprocessed2.nil?
640
+ return "" unless show_expected || show_received
535
641
 
536
642
  output = []
537
643
  output << ""
@@ -542,16 +648,22 @@ preprocessing_info = nil)
542
648
  output << "Preprocessing: #{preprocessing_info}"
543
649
  end
544
650
  output << ""
545
- output << colorize("EXPECTED:", :yellow, :bold)
546
- output << ("-" * 70)
547
- output << preprocessed1
548
- output << ""
549
- output << colorize("RECEIVED:", :yellow, :bold)
550
- output << ("-" * 70)
551
- output << preprocessed2
552
- output << ""
553
- output << ""
554
651
 
652
+ if show_expected
653
+ output << colorize("EXPECTED:", :yellow, :bold)
654
+ output << ("-" * 70)
655
+ output << preprocessed1
656
+ output << ""
657
+ end
658
+
659
+ if show_received
660
+ output << colorize("RECEIVED:", :yellow, :bold)
661
+ output << ("-" * 70)
662
+ output << preprocessed2
663
+ output << ""
664
+ end
665
+
666
+ output << ""
555
667
  output.join("\n")
556
668
  end
557
669
 
@@ -561,12 +673,23 @@ preprocessing_info = nil)
561
673
  # @param character_map_file [String, nil] Path to custom YAML file
562
674
  # @param character_definitions [Array<Hash>, nil] Individual character definitions
563
675
  # @return [Hash] Final visualization map
564
- def build_visualization_map(visualization_map: nil, character_map_file: nil,
676
+ def build_visualization_map(character_visualization: true,
677
+ visualization_map: nil,
678
+ character_map_file: nil,
565
679
  character_definitions: nil)
566
680
  # Priority order:
681
+ # 0. character_visualization: false → return empty map (no substitution)
567
682
  # 1. If visualization_map is provided, use it as complete replacement
568
683
  # 2. Otherwise, start with defaults and apply customizations
569
684
 
685
+ # false disables all visualization
686
+ return {} if character_visualization == false
687
+
688
+ # :content_only currently behaves as true (full map)
689
+ # TODO: apply visualization at DOM text-node level pre-serialization,
690
+ # keeping structural indentation whitespace plain.
691
+ # See docs/features/diff-formatting/character-visualization.adoc
692
+
570
693
  return visualization_map if visualization_map
571
694
 
572
695
  # Start with defaults
@@ -644,6 +767,8 @@ differences: [])
644
767
 
645
768
  return output.join("\n") if doc1.nil? || doc2.nil?
646
769
 
770
+ # Apply display preprocessing (format both sides identically before diff)
771
+ doc1, doc2 = apply_display_preprocessing(doc1, doc2, format)
647
772
  # Extract differences array and equivalent status from ComparisonResult if needed
648
773
  diffs_array = if differences.is_a?(Canon::Comparison::ComparisonResult)
649
774
  @comparison_equivalent = differences.equivalent?
@@ -672,8 +797,340 @@ differences: [])
672
797
  output.join("\n")
673
798
  end
674
799
 
675
- # Colorize text if color is enabled
676
- # RSpec-aware: resets any existing ANSI codes before applying new colors
800
+ # Generate a text-LCS diff against preprocessed lines (pretty_diff mode).
801
+ #
802
+ # This mode bypasses DiffNodeMapper entirely: it applies display_preprocessing
803
+ # to both sides, then runs Diff::LCS.sdiff on the resulting plain-text lines.
804
+ # It is a reliable short-term workaround for #85 (normative changes invisible
805
+ # in :by_line mode when DiffNodeMapper's DOM-address correlation is off).
806
+ #
807
+ # Limitations:
808
+ # - show_diffs :normative / :informative filter is ignored (no DiffNodes)
809
+ # - No inline character highlighting (whole-line granularity only)
810
+ #
811
+ # @param doc1 [String] First document
812
+ # @param doc2 [String] Second document
813
+ # @param format [Symbol] Document format
814
+ # @return [String] Formatted diff output
815
+ def pretty_diff_format(doc1, doc2, format:)
816
+ require "diff/lcs"
817
+
818
+ resolved_format = format
819
+
820
+ format_name = resolved_format.to_s.upcase
821
+ output = []
822
+ output << colorize("Pretty diff (#{format_name} mode):", :cyan, :bold)
823
+
824
+ return output.join("\n") if doc1.nil? || doc2.nil?
825
+
826
+ # Apply display preprocessing — same transforms as by_line_diff
827
+ d1, d2 = apply_display_preprocessing(doc1, doc2, resolved_format)
828
+
829
+ lines1 = d1.lines.map(&:chomp)
830
+ lines2 = d2.lines.map(&:chomp)
831
+
832
+ hunks = ::Diff::LCS.sdiff(lines1, lines2)
833
+
834
+ output << render_pretty_diff(hunks)
835
+ output.join("\n")
836
+ end
837
+
838
+ # Render sdiff hunks with context windowing and colorization.
839
+ #
840
+ # Uses the same context_lines setting as by_line_diff. Changed hunks
841
+ # (action != "=") are expanded by context_lines in each direction; nearby
842
+ # windows are merged; a separator is emitted between non-adjacent blocks.
843
+ #
844
+ # @param hunks [Array<Diff::LCS::ContextChange>] Output of Diff::LCS.sdiff
845
+ # @return [String] Rendered diff lines joined with "\n"
846
+ def render_pretty_diff(hunks)
847
+ # Identify positions of changed hunks
848
+ changed = hunks.each_index.reject { |i| hunks[i].action == "=" }
849
+
850
+ return colorize(" (no differences)", :green) if changed.empty?
851
+
852
+ ctx = [@context_lines || 3, 0].max
853
+
854
+ # Build expanded windows, then merge overlapping/adjacent ones
855
+ windows = changed.map do |pos|
856
+ [
857
+ [pos - ctx, 0].max,
858
+ [pos + ctx, hunks.length - 1].min,
859
+ ]
860
+ end
861
+
862
+ merged = []
863
+ windows.each do |lo, hi|
864
+ if merged.empty? || lo > merged.last[1] + 1
865
+ merged << [lo, hi]
866
+ else
867
+ merged.last[1] = [merged.last[1], hi].max
868
+ end
869
+ end
870
+
871
+ lines = []
872
+ merged.each_with_index do |(lo, hi), block_idx|
873
+ # Separator between non-adjacent blocks
874
+ if block_idx.positive?
875
+ lines << colorize("--- ---", :cyan)
876
+ elsif lo.positive?
877
+ lines << colorize("--- ---", :cyan)
878
+ end
879
+
880
+ (lo..hi).each do |i|
881
+ hunk = hunks[i]
882
+ case hunk.action
883
+ when "="
884
+ lines << (@use_color ? "\e[0m #{hunk.old_element}" : " #{hunk.old_element}")
885
+ when "-"
886
+ lines << colorize("- #{hunk.old_element}", :red)
887
+ when "+"
888
+ lines << colorize("+ #{hunk.new_element}", :green)
889
+ when "!"
890
+ lines << colorize("- #{hunk.old_element}", :red)
891
+ lines << colorize("+ #{hunk.new_element}", :green)
892
+ end
893
+ end
894
+ end
895
+
896
+ lines.join("\n")
897
+ end
898
+
899
+ # Apply display preprocessing to both documents before the line diff.
900
+ #
901
+ # This normalizes both sides through the same formatter so that structural
902
+ # formatting differences (indentation, line breaks) do not confuse the LCS
903
+ # algorithm. Equivalence detection is never affected.
904
+ #
905
+ # NOTE: Character visualization (e.g. U+00A0 → ░) is applied by the
906
+ # line-diff formatters to the output lines *after* this step. Because the
907
+ # pretty-printer introduces only ASCII U+0020 spaces and U+000A newlines
908
+ # for structural indentation, and neither of those is in Canon's default
909
+ # visualization map, pretty-printer whitespace is never misvisualized.
910
+ #
911
+ # Future constraint: if the visualization map is extended to cover common
912
+ # ASCII whitespace, this method must move visualization to a DOM-level pass
913
+ # (walk text nodes before serialization) to keep structural and content
914
+ # whitespace separate. See docs/features/diff-formatting/display-preprocessing.adoc.
915
+ #
916
+ # @param doc1 [String] First document
917
+ # @param doc2 [String] Second document
918
+ # @param format [Symbol] Document format (:xml, :html, :html4, :html5, ...)
919
+ # @return [Array<String, String>] Preprocessed [doc1, doc2]
920
+ def apply_display_preprocessing(doc1, doc2, format)
921
+ case @display_preprocessing
922
+ when :pretty_print
923
+ apply_pretty_print(doc1, doc2, format)
924
+ when :normalize_pretty_print
925
+ apply_normalize_pretty_print(doc1, doc2, format)
926
+ when :c14n
927
+ apply_c14n(doc1, doc2, format)
928
+ else
929
+ [doc1, doc2]
930
+ end
931
+ end
932
+
933
+ # Apply mixed-content-aware normalization + visualization to both documents.
934
+ #
935
+ # Uses PrettyPrinter::XmlNormalized, which breaks every XML element onto
936
+ # its own line while preserving and visualizing boundary content whitespace.
937
+ # See PrettyPrinter::XmlNormalized for the full rationale.
938
+ #
939
+ # Whitespace classification is driven by three element-name lists:
940
+ # - preserve_whitespace_elements → every char significant (e.g. pre, code)
941
+ # - collapse_whitespace_elements → presence matters, form collapses (e.g. p, li)
942
+ # - strip_whitespace_elements → all whitespace dropped (explicit blacklist)
943
+ #
944
+ # For XML the lists default to empty (all insensitive); for HTML built-in
945
+ # defaults cover the common cases. Callers supply format-specific lists via
946
+ # Canon::Config or DiffFormatter constructor keyword arguments.
947
+ def apply_normalize_pretty_print(doc1, doc2, format)
948
+ return [doc1, doc2] unless %i[xml html html4 html5].include?(format)
949
+
950
+ indent_type_str = @pretty_printer_indent_type.to_s
951
+ vis_map = @visualization_map.empty? ? DiffFormatter::DEFAULT_VISUALIZATION_MAP : @visualization_map
952
+
953
+ require "canon/pretty_printer/xml_normalized"
954
+ # TODO: implement HtmlNormalized for HTML formats; XmlNormalized works via
955
+ # Nokogiri's HTML-aware parse for now.
956
+ #
957
+ # Create side-specific printers so that the pretty_printed_expected and
958
+ # pretty_printed_received flags drop structural \n indentation nodes only
959
+ # on the side that is actually pretty-printed. If both sides share the
960
+ # same settings, two identical printer instances are created (cheap).
961
+ shared_args = {
962
+ indent: @pretty_printer_indent,
963
+ indent_type: indent_type_str,
964
+ visualization_map: vis_map,
965
+ preserve_whitespace_elements: @preserve_whitespace_elements,
966
+ collapse_whitespace_elements: @collapse_whitespace_elements,
967
+ strip_whitespace_elements: @strip_whitespace_elements,
968
+ sort_attributes: @pretty_printer_sort_attributes,
969
+ }
970
+
971
+ printer_expected = Canon::PrettyPrinter::XmlNormalized.new(
972
+ **shared_args,
973
+ pretty_printed: @pretty_printed_expected,
974
+ )
975
+ printer_received = Canon::PrettyPrinter::XmlNormalized.new(
976
+ **shared_args,
977
+ pretty_printed: @pretty_printed_received,
978
+ )
979
+
980
+ [safe_format(printer_expected, doc1), safe_format(printer_received, doc2)]
981
+ end
982
+
983
+ # Pretty-print both documents using a format-appropriate pretty printer.
984
+ #
985
+ # * HTML formats (:html, :html4, :html5) use +Canon::PrettyPrinter::Html+
986
+ # which is Nokogiri::HTML5-aware and correctly handles void elements,
987
+ # optional end tags, and HTML5 serialization rules.
988
+ # * XML uses +Canon::PrettyPrinter::Xml+.
989
+ # * Other formats fall through unchanged.
990
+ def apply_pretty_print(doc1, doc2, format)
991
+ return [doc1, doc2] unless %i[xml html html4 html5].include?(format)
992
+
993
+ indent_type_str = @pretty_printer_indent_type.to_s
994
+
995
+ printer = if %i[html html4 html5].include?(format)
996
+ require "canon/pretty_printer/html"
997
+ Canon::PrettyPrinter::Html.new(
998
+ indent: @pretty_printer_indent,
999
+ indent_type: indent_type_str,
1000
+ )
1001
+ else
1002
+ require "canon/pretty_printer/xml"
1003
+ Canon::PrettyPrinter::Xml.new(
1004
+ indent: @pretty_printer_indent,
1005
+ indent_type: indent_type_str,
1006
+ )
1007
+ end
1008
+
1009
+ [safe_format(printer, doc1), safe_format(printer, doc2)]
1010
+ end
1011
+
1012
+ # Normalize both documents for display using canonical serialization.
1013
+ #
1014
+ # * HTML formats use Nokogiri's HTML5 serializer as a consistent canonical
1015
+ # form (attribute order, void elements, etc. are standardized).
1016
+ # * XML uses the XML C14N algorithm (alphabetical attributes, namespace
1017
+ # normalization, etc.).
1018
+ # * Other formats fall through unchanged.
1019
+ #
1020
+ # @param doc1 [String] First document
1021
+ # @param doc2 [String] Second document
1022
+ # @param format [Symbol] Document format (:xml, :html, :html4, :html5, ...)
1023
+ # @return [Array<String, String>] Canonicalized [doc1, doc2]
1024
+ def apply_c14n(doc1, doc2, format = :xml)
1025
+ if %i[html html4 html5].include?(format)
1026
+ [safe_html_normalize(doc1), safe_html_normalize(doc2)]
1027
+ else
1028
+ require "canon/xml/c14n"
1029
+ [safe_c14n(doc1), safe_c14n(doc2)]
1030
+ end
1031
+ end
1032
+
1033
+ # Pretty-print document strings for the fixture-ready display section.
1034
+ #
1035
+ # Runs independently of the +display_preprocessing+ setting — it is a
1036
+ # standalone display feature, not part of the diff pipeline.
1037
+ #
1038
+ # The output contains NO character visualization so it can be copy-pasted
1039
+ # directly into RSpec heredoc fixtures.
1040
+ #
1041
+ # @param doc1 [String] First document (expected / fixture)
1042
+ # @param doc2 [String] Second document (received / actual)
1043
+ # @param format [Symbol] Document format (:xml, :html, :html4, :html5, ...)
1044
+ # @return [Array<String, String>] Pretty-printed [doc1, doc2]
1045
+ def prettyprint_for_display(doc1, doc2, format)
1046
+ indent_type_str = @pretty_printer_indent_type.to_s
1047
+
1048
+ if %i[html html4 html5].include?(format)
1049
+ require "canon/pretty_printer/html"
1050
+ printer = Canon::PrettyPrinter::Html.new(
1051
+ indent: @pretty_printer_indent,
1052
+ indent_type: indent_type_str,
1053
+ )
1054
+ elsif format == :xml
1055
+ require "canon/pretty_printer/xml"
1056
+ printer = Canon::PrettyPrinter::Xml.new(
1057
+ indent: @pretty_printer_indent,
1058
+ indent_type: indent_type_str,
1059
+ )
1060
+ else
1061
+ return [doc1, doc2]
1062
+ end
1063
+
1064
+ [safe_format(printer, doc1), safe_format(printer, doc2)]
1065
+ end
1066
+
1067
+ # Format fixture-ready pretty-printed inputs for display.
1068
+ #
1069
+ # Unlike +format_preprocessed_inputs+, this section outputs plain ASCII
1070
+ # with NO character visualization — the content is intended for
1071
+ # copy-pasting into RSpec heredoc fixtures.
1072
+ #
1073
+ # @param pp1 [String] First pretty-printed string (expected / fixture)
1074
+ # @param pp2 [String] Second pretty-printed string (received / actual)
1075
+ # @param show_expected [Boolean] Render the EXPECTED block
1076
+ # @param show_received [Boolean] Render the RECEIVED block
1077
+ # @return [String] Formatted display of pretty-printed inputs
1078
+ def format_prettyprint_inputs(pp1, pp2, show_expected: true,
1079
+ show_received: true)
1080
+ return "" if pp1.nil? || pp2.nil?
1081
+ return "" unless show_expected || show_received
1082
+
1083
+ output = []
1084
+ output << ""
1085
+ output << colorize("=== PRETTY-PRINTED INPUTS (Fixture-ready) ===",
1086
+ :cyan, :bold)
1087
+ output << ""
1088
+
1089
+ if show_expected
1090
+ output << colorize("EXPECTED:", :yellow, :bold)
1091
+ output << ("-" * 70)
1092
+ output << pp1
1093
+ output << ""
1094
+ end
1095
+
1096
+ if show_received
1097
+ output << colorize("RECEIVED:", :yellow, :bold)
1098
+ output << ("-" * 70)
1099
+ output << pp2
1100
+ output << ""
1101
+ end
1102
+
1103
+ output << ""
1104
+ output.join("\n")
1105
+ end
1106
+
1107
+ # Format a document through the pretty-printer, falling back to the
1108
+ # original string on any parse error.
1109
+ def safe_format(printer, doc)
1110
+ printer.format(doc.to_s)
1111
+ rescue StandardError
1112
+ doc.to_s
1113
+ end
1114
+
1115
+ # Canonicalize a document via C14N, falling back on error.
1116
+ def safe_c14n(doc)
1117
+ Canon::Xml::C14n.canonicalize(doc.to_s, with_comments: true)
1118
+ rescue StandardError
1119
+ doc.to_s
1120
+ end
1121
+
1122
+ # Serialize HTML through Nokogiri's HTML5 serializer for a canonical form.
1123
+ # Normalizes attribute order, void elements, and optional end tags consistently.
1124
+ # Falls back to the original string on any parse error.
1125
+ def safe_html_normalize(doc)
1126
+ require "nokogiri"
1127
+ Nokogiri::HTML5(doc.to_s).to_html(encoding: "UTF-8")
1128
+ rescue StandardError
1129
+ doc.to_s
1130
+ end
1131
+
1132
+ # Colorize text if color is enabled.
1133
+ # RSpec-aware: resets any existing ANSI codes before applying new colors.
677
1134
  def colorize(text, *colors)
678
1135
  return text unless @use_color
679
1136