canon 0.2.3 → 0.2.5

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +31 -149
  3. data/README.adoc +9 -0
  4. data/docs/advanced/semantic-diff-report.adoc +96 -0
  5. data/docs/features/configuration-profiles.adoc +4 -2
  6. data/docs/features/diff-formatting/index.adoc +3 -0
  7. data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
  8. data/docs/features/match-options/html-policies.adoc +2 -0
  9. data/docs/features/match-options/index.adoc +40 -0
  10. data/docs/guides/choosing-configuration.adoc +12 -1
  11. data/docs/reference/cli-options.adoc +3 -0
  12. data/docs/reference/environment-variables.adoc +3 -1
  13. data/docs/reference/options-across-interfaces.adoc +7 -1
  14. data/docs/understanding/formats/html.adoc +9 -2
  15. data/lib/canon/cli.rb +4 -0
  16. data/lib/canon/commands/diff_command.rb +1 -0
  17. data/lib/canon/comparison/comparison_result.rb +95 -2
  18. data/lib/canon/comparison/html_comparator.rb +96 -11
  19. data/lib/canon/comparison/markup_comparator.rb +68 -71
  20. data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
  21. data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
  22. data/lib/canon/comparison/match_options.rb +23 -2
  23. data/lib/canon/comparison/node_inspector.rb +103 -0
  24. data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
  25. data/lib/canon/comparison/xml_comparator/child_comparison.rb +133 -55
  26. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
  27. data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
  28. data/lib/canon/comparison/xml_comparator.rb +174 -7
  29. data/lib/canon/comparison/xml_node_comparison.rb +48 -66
  30. data/lib/canon/comparison.rb +143 -22
  31. data/lib/canon/config/env_schema.rb +2 -1
  32. data/lib/canon/config/profiles/metanorma.yml +3 -0
  33. data/lib/canon/config.rb +51 -5
  34. data/lib/canon/diff/diff_classifier.rb +55 -41
  35. data/lib/canon/diff/diff_line_builder.rb +9 -8
  36. data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
  37. data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
  38. data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
  39. data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
  40. data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
  41. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +184 -26
  42. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +92 -4
  43. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
  44. data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
  45. data/lib/canon/diff_formatter.rb +128 -175
  46. data/lib/canon/html/data_model.rb +10 -4
  47. data/lib/canon/pretty_printer/html.rb +76 -14
  48. data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
  49. data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
  50. data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
  51. data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
  52. data/lib/canon/version.rb +1 -1
  53. data/lib/canon/xml/c14n.rb +59 -5
  54. data/lib/canon/xml/data_model.rb +13 -1
  55. data/lib/canon/xml/element_matcher.rb +3 -0
  56. data/lib/canon/xml/node.rb +23 -1
  57. data/lib/canon/xml/nodes/comment_node.rb +4 -0
  58. data/lib/canon/xml/nodes/element_node.rb +4 -0
  59. data/lib/canon/xml/nodes/text_node.rb +4 -0
  60. data/lib/canon/xml/sax_builder.rb +29 -2
  61. data/lib/canon/xml/xpath_engine.rb +238 -0
  62. metadata +9 -2
@@ -7,6 +7,9 @@ require_relative "diff/diff_block"
7
7
  require_relative "diff/diff_context"
8
8
  require_relative "diff/diff_report"
9
9
  require_relative "diff_formatter/debug_output"
10
+ require_relative "diff_formatter/by_line_formatter"
11
+ require_relative "diff_formatter/by_object_formatter"
12
+ require_relative "diff_formatter/pretty_diff_formatter"
10
13
 
11
14
  module Canon
12
15
  # Formatter for displaying semantic differences with color support
@@ -301,17 +304,26 @@ module Canon
301
304
  # @param html_version [Symbol, nil] HTML version (:html4 or :html5)
302
305
  # @return [String] Formatted output
303
306
  def format(differences, format, doc1: nil, doc2: nil, html_version: nil)
304
- # In by-line mode, always use by-line diff
307
+ # In by-line mode with both docs present, always use by-line diff
305
308
  if @mode == :by_line && doc1 && doc2
306
- return by_line_diff(doc1, doc2, format: format,
307
- html_version: html_version,
308
- differences: differences)
309
+ doc1, doc2 = apply_display_preprocessing(doc1, doc2, format)
310
+ # rubocop:disable Layout/HashAlignment
311
+ return by_line_formatter.format(doc1, doc2, format: format,
312
+ html_version: html_version,
313
+ differences: differences)
314
+ # rubocop:enable Layout/HashAlignment
309
315
  end
310
316
 
311
317
  # In pretty_diff mode, always use text-LCS diff (bypasses DiffNodeMapper).
312
- # pretty_diff_format handles nil doc1/doc2 itself (emits header only).
313
318
  if @mode == :pretty_diff
314
- return pretty_diff_format(doc1, doc2, format: format)
319
+ d1, d2 = if doc1 && doc2
320
+ apply_display_preprocessing(doc1, doc2,
321
+ format)
322
+ else
323
+ [doc1,
324
+ doc2]
325
+ end
326
+ return pretty_diff_formatter.format(d1, d2, format: format)
315
327
  end
316
328
 
317
329
  no_diffs = if differences.respond_to?(:equivalent?)
@@ -323,12 +335,26 @@ module Canon
323
335
 
324
336
  case @mode
325
337
  when :by_line
326
- by_line_diff(doc1, doc2, format: format, html_version: html_version,
327
- differences: differences)
338
+ if doc1 && doc2
339
+ doc1, doc2 = apply_display_preprocessing(doc1, doc2,
340
+ format)
341
+ end
342
+ # rubocop:disable Layout/HashAlignment
343
+ by_line_formatter.format(doc1, doc2, format: format,
344
+ html_version: html_version,
345
+ differences: differences)
346
+ # rubocop:enable Layout/HashAlignment
328
347
  when :pretty_diff
329
- pretty_diff_format(doc1, doc2, format: format)
348
+ d1, d2 = if doc1 && doc2
349
+ apply_display_preprocessing(doc1, doc2,
350
+ format)
351
+ else
352
+ [doc1,
353
+ doc2]
354
+ end
355
+ pretty_diff_formatter.format(d1, d2, format: format)
330
356
  else
331
- by_object_diff(differences, format)
357
+ by_object_formatter.format(differences, format)
332
358
  end
333
359
  end
334
360
 
@@ -340,8 +366,13 @@ module Canon
340
366
  # @param actual [Object] Actual value
341
367
  # @return [String] Formatted diff output
342
368
  def format_comparison_result(comparison_result, expected, actual)
343
- # Detect format from expected content
344
- format = Canon::Comparison::FormatDetector.detect(expected)
369
+ # Prefer the matcher-supplied format (e.g. :html4 from
370
+ # be_html4_equivalent_to). Auto-detection from the expected string
371
+ # cannot distinguish HTML from XML for fragments like
372
+ # `<div class="x"></div>` and would mis-route HTML fixtures
373
+ # through the XML pretty-printer (issue #135).
374
+ format = (comparison_result.is_a?(Canon::Comparison::ComparisonResult) && comparison_result.format) ||
375
+ Canon::Comparison::FormatDetector.detect(expected)
345
376
 
346
377
  formatter_options = {
347
378
  use_color: @use_color,
@@ -366,6 +397,18 @@ module Canon
366
397
  output << "" # Blank line for spacing
367
398
  end
368
399
 
400
+ # Parse-error banner. When libxml flagged any errors during
401
+ # parsing, surface them at the top of the report so the user
402
+ # is not left chasing diffs that describe a partial tree.
403
+ # See lutaml/canon#130.
404
+ if comparison_result.is_a?(Canon::Comparison::ComparisonResult) &&
405
+ comparison_result.parse_errors?
406
+ output << format_parse_error_banner(
407
+ comparison_result.parse_errors_expected,
408
+ comparison_result.parse_errors_received,
409
+ )
410
+ end
411
+
369
412
  # 1. CANON VERBOSE tables (ONLY if CANON_VERBOSE=1)
370
413
  verbose_tables = DebugOutput.verbose_tables_only(
371
414
  comparison_result,
@@ -481,6 +524,53 @@ module Canon
481
524
 
482
525
  private
483
526
 
527
+ # Render the parse-error banner that appears at the top of the
528
+ # diff report when libxml flagged any errors during parsing.
529
+ # Names the offending side(s) and warns that the diff below
530
+ # describes the parsed tree, not the input. See lutaml/canon#130.
531
+ #
532
+ # @param errors_expected [Array<String>] Errors from the expected side
533
+ # @param errors_received [Array<String>] Errors from the received side
534
+ # @return [String] Multi-line banner
535
+ def format_parse_error_banner(errors_expected, errors_received)
536
+ lines = []
537
+ rule = "=" * 70
538
+ lines << colorize(rule, :yellow, :bold)
539
+ lines << colorize(" ⚠️ PARSE ERRORS", :yellow, :bold)
540
+ lines << colorize(rule, :yellow, :bold)
541
+
542
+ if errors_expected.any?
543
+ lines << colorize(" Expected side:", :yellow, :bold)
544
+ errors_expected.each do |err|
545
+ lines << " #{colorize(err, :red)}"
546
+ end
547
+ end
548
+
549
+ if errors_received.any?
550
+ lines << colorize(" Received side:", :yellow, :bold)
551
+ errors_received.each do |err|
552
+ lines << " #{colorize(err, :red)}"
553
+ end
554
+ end
555
+
556
+ lines << ""
557
+ lines << colorize(
558
+ " ⚠️ The diff below describes the parsed tree, not the input.",
559
+ :yellow,
560
+ )
561
+ lines << colorize(
562
+ " Content that the parser could not represent has been",
563
+ :yellow,
564
+ )
565
+ lines << colorize(
566
+ " dropped and may appear as \"missing\" in the report.",
567
+ :yellow,
568
+ )
569
+ lines << colorize(rule, :yellow, :bold)
570
+ lines << ""
571
+ lines.join("\n")
572
+ end
573
+
484
574
  # Normalize content for display in diffs
485
575
  #
486
576
  # @param expected [Object] Expected value
@@ -685,10 +775,8 @@ module Canon
685
775
  # false disables all visualization
686
776
  return {} if character_visualization == false
687
777
 
688
- # :content_only currently behaves as true (full map)
689
- # TODO: apply visualization at DOM text-node level pre-serialization,
690
- # keeping structural indentation whitespace plain.
691
- # See docs/features/diff-formatting/character-visualization.adoc
778
+ # :content_only builds the full map; the by_line formatter applies
779
+ # it only to content portions, leaving structural indentation plain.
692
780
 
693
781
  return visualization_map if visualization_map
694
782
 
@@ -723,177 +811,37 @@ module Canon
723
811
  colorize("#{emoji}#{message}\n", :green, :bold)
724
812
  end
725
813
 
726
- # Generate by-object diff with tree visualization
727
- # Delegates to format-specific by-object formatters
728
- def by_object_diff(differences, format)
729
- output = []
730
- output << colorize("Visual Diff:", :cyan, :bold)
731
-
732
- # Extract differences array from ComparisonResult if needed
733
- diffs_array = if differences.is_a?(Canon::Comparison::ComparisonResult)
734
- differences.differences
735
- else
736
- differences
737
- end
738
-
739
- # Delegate to format-specific formatter
740
- formatter = ByObject::BaseFormatter.for_format(
741
- format,
814
+ # Factory methods for mode-specific formatters
815
+
816
+ # @return [ByLineFormatter]
817
+ def by_line_formatter
818
+ @by_line_formatter ||= ByLineFormatter.new(
742
819
  use_color: @use_color,
743
820
  visualization_map: @visualization_map,
821
+ context_lines: @context_lines,
822
+ diff_grouping_lines: @diff_grouping_lines,
744
823
  show_diffs: @show_diffs,
824
+ character_visualization: @character_visualization,
825
+ legacy_terminal: @legacy_terminal,
826
+ diff_mode: @diff_mode,
745
827
  )
746
-
747
- output << formatter.format(diffs_array, format)
748
-
749
- output.join("\n")
750
828
  end
751
829
 
752
- # Generate by-line diff
753
- # Delegates to format-specific by-line formatters
754
- def by_line_diff(doc1, doc2, format: :xml, html_version: nil,
755
- differences: [])
756
- # For HTML format, use html_version if provided, otherwise default to :html4
757
- if format == :html && html_version
758
- format = html_version # Use :html4 or :html5
759
- end
760
-
761
- # Format display name for header
762
- format_name = format.to_s.upcase
763
-
764
- output = []
765
- output << colorize("Line-by-line diff (#{format_name} mode):", :cyan,
766
- :bold)
767
-
768
- return output.join("\n") if doc1.nil? || doc2.nil?
769
-
770
- # Apply display preprocessing (format both sides identically before diff)
771
- doc1, doc2 = apply_display_preprocessing(doc1, doc2, format)
772
- # Extract differences array and equivalent status from ComparisonResult if needed
773
- diffs_array = if differences.is_a?(Canon::Comparison::ComparisonResult)
774
- @comparison_equivalent = differences.equivalent?
775
- differences.differences
776
- else
777
- @comparison_equivalent = nil
778
- differences
779
- end
780
-
781
- # Delegate to format-specific formatter
782
- formatter = ByLine::BaseFormatter.for_format(
783
- format,
830
+ # @return [ByObjectFormatter]
831
+ def by_object_formatter
832
+ @by_object_formatter ||= ByObjectFormatter.new(
784
833
  use_color: @use_color,
785
- context_lines: @context_lines,
786
- diff_grouping_lines: @diff_grouping_lines,
787
834
  visualization_map: @visualization_map,
788
835
  show_diffs: @show_diffs,
789
- differences: diffs_array,
790
- diff_mode: @legacy_terminal ? :separate : @diff_mode,
791
- legacy_terminal: @legacy_terminal,
792
- equivalent: @comparison_equivalent,
793
836
  )
794
-
795
- output << formatter.format(doc1, doc2)
796
-
797
- output.join("\n")
798
- end
799
-
800
- # Generate a text-LCS diff against preprocessed lines (pretty_diff mode).
801
- #
802
- # This mode bypasses DiffNodeMapper entirely: it applies display_preprocessing
803
- # to both sides, then runs Diff::LCS.sdiff on the resulting plain-text lines.
804
- # It is a reliable short-term workaround for #85 (normative changes invisible
805
- # in :by_line mode when DiffNodeMapper's DOM-address correlation is off).
806
- #
807
- # Limitations:
808
- # - show_diffs :normative / :informative filter is ignored (no DiffNodes)
809
- # - No inline character highlighting (whole-line granularity only)
810
- #
811
- # @param doc1 [String] First document
812
- # @param doc2 [String] Second document
813
- # @param format [Symbol] Document format
814
- # @return [String] Formatted diff output
815
- def pretty_diff_format(doc1, doc2, format:)
816
- require "diff/lcs"
817
-
818
- resolved_format = format
819
-
820
- format_name = resolved_format.to_s.upcase
821
- output = []
822
- output << colorize("Pretty diff (#{format_name} mode):", :cyan, :bold)
823
-
824
- return output.join("\n") if doc1.nil? || doc2.nil?
825
-
826
- # Apply display preprocessing — same transforms as by_line_diff
827
- d1, d2 = apply_display_preprocessing(doc1, doc2, resolved_format)
828
-
829
- lines1 = d1.lines.map(&:chomp)
830
- lines2 = d2.lines.map(&:chomp)
831
-
832
- hunks = ::Diff::LCS.sdiff(lines1, lines2)
833
-
834
- output << render_pretty_diff(hunks)
835
- output.join("\n")
836
837
  end
837
838
 
838
- # Render sdiff hunks with context windowing and colorization.
839
- #
840
- # Uses the same context_lines setting as by_line_diff. Changed hunks
841
- # (action != "=") are expanded by context_lines in each direction; nearby
842
- # windows are merged; a separator is emitted between non-adjacent blocks.
843
- #
844
- # @param hunks [Array<Diff::LCS::ContextChange>] Output of Diff::LCS.sdiff
845
- # @return [String] Rendered diff lines joined with "\n"
846
- def render_pretty_diff(hunks)
847
- # Identify positions of changed hunks
848
- changed = hunks.each_index.reject { |i| hunks[i].action == "=" }
849
-
850
- return colorize(" (no differences)", :green) if changed.empty?
851
-
852
- ctx = [@context_lines || 3, 0].max
853
-
854
- # Build expanded windows, then merge overlapping/adjacent ones
855
- windows = changed.map do |pos|
856
- [
857
- [pos - ctx, 0].max,
858
- [pos + ctx, hunks.length - 1].min,
859
- ]
860
- end
861
-
862
- merged = []
863
- windows.each do |lo, hi|
864
- if merged.empty? || lo > merged.last[1] + 1
865
- merged << [lo, hi]
866
- else
867
- merged.last[1] = [merged.last[1], hi].max
868
- end
869
- end
870
-
871
- lines = []
872
- merged.each_with_index do |(lo, hi), block_idx|
873
- # Separator between non-adjacent blocks
874
- if block_idx.positive?
875
- lines << colorize("--- ---", :cyan)
876
- elsif lo.positive?
877
- lines << colorize("--- ---", :cyan)
878
- end
879
-
880
- (lo..hi).each do |i|
881
- hunk = hunks[i]
882
- case hunk.action
883
- when "="
884
- lines << (@use_color ? "\e[0m #{hunk.old_element}" : " #{hunk.old_element}")
885
- when "-"
886
- lines << colorize("- #{hunk.old_element}", :red)
887
- when "+"
888
- lines << colorize("+ #{hunk.new_element}", :green)
889
- when "!"
890
- lines << colorize("- #{hunk.old_element}", :red)
891
- lines << colorize("+ #{hunk.new_element}", :green)
892
- end
893
- end
894
- end
895
-
896
- lines.join("\n")
839
+ # @return [PrettyDiffFormatter]
840
+ def pretty_diff_formatter
841
+ @pretty_diff_formatter ||= PrettyDiffFormatter.new(
842
+ use_color: @use_color,
843
+ context_lines: @context_lines,
844
+ )
897
845
  end
898
846
 
899
847
  # Apply display preprocessing to both documents before the line diff.
@@ -966,6 +914,7 @@ differences: [])
966
914
  collapse_whitespace_elements: @collapse_whitespace_elements,
967
915
  strip_whitespace_elements: @strip_whitespace_elements,
968
916
  sort_attributes: @pretty_printer_sort_attributes,
917
+ html_mode: %i[html html4 html5].include?(format),
969
918
  }
970
919
 
971
920
  printer_expected = Canon::PrettyPrinter::XmlNormalized.new(
@@ -1047,9 +996,13 @@ differences: [])
1047
996
 
1048
997
  if %i[html html4 html5].include?(format)
1049
998
  require "canon/pretty_printer/html"
999
+ # Fixture-ready mode actually indents (libxml FORMAT save flag
1000
+ # via AS_XHTML). The default mode is structurally faithful but
1001
+ # does not indent on HTML5 input -- see lutaml/canon#133.
1050
1002
  printer = Canon::PrettyPrinter::Html.new(
1051
1003
  indent: @pretty_printer_indent,
1052
1004
  indent_type: indent_type_str,
1005
+ fixture_ready: true,
1053
1006
  )
1054
1007
  elsif format == :xml
1055
1008
  require "canon/pretty_printer/xml"
@@ -208,19 +208,25 @@ module Canon
208
208
 
209
209
  # Build text node from Nokogiri text node
210
210
  # HTML-specific: handles whitespace-sensitive elements (pre, code, textarea, script, style)
211
+ # and preserves whitespace between inline element siblings.
211
212
  def self.build_text_node(nokogiri_text)
212
213
  # Skip text nodes that are only whitespace between elements
213
214
  # EXCEPT in whitespace-sensitive elements (pre, code, textarea, script, style)
214
- # where whitespace is semantically significant
215
+ # and when whitespace is between inline element siblings (semantically significant)
215
216
  content = nokogiri_text.content
216
217
 
217
- if content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element)
218
+ # NBSP (U+00A0) is never insignificant whitespace
219
+ if content.strip.empty? && nokogiri_text.parent.is_a?(Nokogiri::XML::Element) && !content.include?("\u00A0")
218
220
  # Check if parent is whitespace-sensitive
219
221
  parent_name = nokogiri_text.parent.name.downcase
220
222
  whitespace_sensitive_tags = %w[pre code textarea script style]
221
223
 
222
- # Skip whitespace-only text UNLESS in whitespace-sensitive element
223
- return nil unless whitespace_sensitive_tags.include?(parent_name)
224
+ # Check if whitespace is between inline siblings
225
+ require_relative "../comparison/whitespace_sensitivity"
226
+ unless whitespace_sensitive_tags.include?(parent_name) ||
227
+ Canon::Comparison::WhitespaceSensitivity.inline_whitespace_significant?(nokogiri_text)
228
+ return nil
229
+ end
224
230
  end
225
231
 
226
232
  # Nokogiri already handles CDATA conversion and entity resolution
@@ -1,19 +1,43 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "nokogiri"
4
+ require "stringio"
5
+ require_relative "html_void_elements"
4
6
 
5
7
  module Canon
6
8
  module PrettyPrinter
7
- # Pretty printer for HTML with consistent indentation
9
+ # Pretty printer for HTML with consistent indentation.
10
+ #
11
+ # Two modes:
12
+ #
13
+ # 1. Default mode (+fixture_ready: false+): retains the existing
14
+ # behaviour for callers that use the pretty-printer as a
15
+ # structural normaliser (the canon round-trip tests, the
16
+ # diff-pipeline +apply_pretty_print+ stage, etc). These callers
17
+ # do not require actual indentation; they require structural
18
+ # equivalence to the input.
19
+ #
20
+ # 2. Fixture-ready mode (+fixture_ready: true+): emits
21
+ # actually-indented XHTML-shaped output via libxml's +FORMAT+
22
+ # save flag. Used by +DiffFormatter#prettyprint_for_display+
23
+ # (the +CANON_<FORMAT>_DIFF_SHOW_PRETTYPRINT_RECEIVED+ surface)
24
+ # so the user can read or paste the formatted output directly
25
+ # into a fixture heredoc. Output is XHTML-shaped (void
26
+ # elements self-closed, non-void paired) via the +AS_XHTML+
27
+ # save flag; the +NO_DECLARATION+ flag suppresses the
28
+ # +<?xml ...?>+ prefix.
29
+ #
30
+ # See lutaml/canon#133, lutaml/canon#135.
8
31
  class Html
9
- def initialize(indent: 2, indent_type: "space")
32
+ def initialize(indent: 2, indent_type: "space", fixture_ready: false)
10
33
  @indent = indent.to_i
11
34
  @indent_type = indent_type
35
+ @fixture_ready = fixture_ready
12
36
  end
13
37
 
14
- # Pretty print HTML with consistent indentation
15
38
  def format(html_string)
16
- # Detect if this is XHTML or HTML
39
+ return format_fixture_ready(html_string) if @fixture_ready
40
+
17
41
  if xhtml?(html_string)
18
42
  format_as_xhtml(html_string)
19
43
  else
@@ -24,34 +48,72 @@ module Canon
24
48
  private
25
49
 
26
50
  def xhtml?(html_string)
27
- # Check for XHTML DOCTYPE or xmlns attribute
28
51
  html_string.include?("XHTML") ||
29
52
  html_string.include?('xmlns="http://www.w3.org/1999/xhtml"')
30
53
  end
31
54
 
32
55
  def format_as_xhtml(html_string)
33
- # Parse as XML for XHTML
34
56
  doc = Nokogiri::XML(html_string, &:noblanks)
35
57
 
36
- # Use Nokogiri's built-in pretty printing
37
- if @indent_type == "tab"
38
- doc.to_xml(indent: 1, indent_text: "\t", encoding: "UTF-8")
39
- else
40
- doc.to_xml(indent: @indent, encoding: "UTF-8")
41
- end
58
+ out = if @indent_type == "tab"
59
+ doc.to_xml(indent: 1, indent_text: "\t", encoding: "UTF-8")
60
+ else
61
+ doc.to_xml(indent: @indent, encoding: "UTF-8")
62
+ end
63
+
64
+ expand_non_void_self_closing(out)
42
65
  end
43
66
 
44
67
  def format_as_html(html_string)
45
- # Parse as HTML5
46
68
  doc = Nokogiri::HTML5(html_string)
47
69
 
48
- # Use Nokogiri's built-in pretty printing
49
70
  if @indent_type == "tab"
50
71
  doc.to_html(indent: 1, indent_text: "\t", encoding: "UTF-8")
51
72
  else
52
73
  doc.to_html(indent: @indent, encoding: "UTF-8")
53
74
  end
54
75
  end
76
+
77
+ # Fixture-ready serialisation: parse with Nokogiri::HTML5 (so we
78
+ # get permissive recovery on real-world Word / XHTML5 / HTML5
79
+ # input shapes), then write through libxml's XML writer with
80
+ # +FORMAT+ + +AS_XHTML+ + +NO_DECLARATION+. +FORMAT+ inserts
81
+ # indentation; +AS_XHTML+ produces well-shaped output (void
82
+ # elements self-closed, non-void paired); +NO_DECLARATION+
83
+ # suppresses the +<?xml ...?>+ prefix.
84
+ def format_fixture_ready(html_string)
85
+ doc = Nokogiri::HTML5(html_string)
86
+ io = StringIO.new
87
+ if @indent_type == "tab"
88
+ doc.write_to(io, save_with: fixture_ready_save_options,
89
+ indent: 1, indent_text: "\t")
90
+ else
91
+ doc.write_to(io, save_with: fixture_ready_save_options,
92
+ indent: @indent)
93
+ end
94
+ io.string
95
+ end
96
+
97
+ def fixture_ready_save_options
98
+ Nokogiri::XML::Node::SaveOptions::FORMAT |
99
+ Nokogiri::XML::Node::SaveOptions::AS_XHTML |
100
+ Nokogiri::XML::Node::SaveOptions::NO_DECLARATION
101
+ end
102
+
103
+ # Rewrite +<tag …/>+ into +<tag …></tag>+ for every element name
104
+ # that is not an HTML5 void element. +<a/>+ is illegal HTML;
105
+ # void tags like +<br/>+ and +<img …/>+ pass through unchanged.
106
+ def expand_non_void_self_closing(html)
107
+ html.gsub(%r{<([A-Za-z][A-Za-z0-9:_-]*)((?:\s+[^<>"]*(?:"[^"]*"[^<>"]*)*)?)/>}) do
108
+ name = ::Regexp.last_match(1)
109
+ attrs = ::Regexp.last_match(2)
110
+ if HtmlVoidElements.void?(name)
111
+ "<#{name}#{attrs}/>"
112
+ else
113
+ "<#{name}#{attrs}></#{name}>"
114
+ end
115
+ end
116
+ end
55
117
  end
56
118
  end
57
119
  end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+
5
+ module Canon
6
+ module PrettyPrinter
7
+ # The 14 HTML5 void elements — those whose start tag may stand alone
8
+ # (with no end tag) and which cannot have any content. Every other
9
+ # element with no children must be written as +<tag></tag>+ in HTML;
10
+ # writing +<a/>+ is illegal HTML and is parsed as +<a>+ (start tag only).
11
+ module HtmlVoidElements
12
+ VOID = Set.new(%w[area base br col embed hr img input link meta param
13
+ source track wbr]).freeze
14
+
15
+ def self.void?(name)
16
+ VOID.include?(name.to_s.downcase)
17
+ end
18
+ end
19
+ end
20
+ end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "nokogiri"
4
+ require_relative "html_void_elements"
4
5
 
5
6
  module Canon
6
7
  module PrettyPrinter
@@ -133,12 +134,14 @@ module Canon
133
134
  collapse_whitespace_elements: [],
134
135
  strip_whitespace_elements: [],
135
136
  pretty_printed: false,
136
- sort_attributes: false)
137
+ sort_attributes: false,
138
+ html_mode: false)
137
139
  @indent = indent.to_i
138
140
  @indent_char = indent_type == "tab" ? "\t" : " "
139
141
  @vis_map = visualization_map || default_vis_map
140
142
  @pretty_printed = pretty_printed
141
143
  @sort_attributes = sort_attributes
144
+ @html_mode = html_mode
142
145
 
143
146
  @strict_ws = Set.new((preserve_whitespace_elements || []).map(&:to_s))
144
147
  @norm_ws = Set.new((collapse_whitespace_elements || []).map(&:to_s))
@@ -151,10 +154,10 @@ module Canon
151
154
  # @return [String] Serialized XML, one node per line, with content
152
155
  # whitespace visualized at line boundaries
153
156
  def format(xml_string)
154
- doc = Nokogiri::XML(xml_string)
157
+ doc = @html_mode ? Nokogiri::HTML5(xml_string) : Nokogiri::XML(xml_string)
155
158
  lines = []
156
159
 
157
- if doc.version
160
+ if !@html_mode && doc.version
158
161
  enc = doc.encoding ? " encoding=\"#{doc.encoding}\"" : ""
159
162
  lines << "<?xml version=\"#{doc.version}\"#{enc}?>"
160
163
  end
@@ -198,6 +201,10 @@ module Canon
198
201
  children = node.children.reject { |c| c.text? && c.content.empty? }
199
202
 
200
203
  if children.empty?
204
+ if @html_mode && !HtmlVoidElements.void?(node.name)
205
+ return "#{ind(depth)}#{open_tag(node)}</#{node.name}>"
206
+ end
207
+
201
208
  return "#{ind(depth)}#{open_tag(node,
202
209
  self_close: true)}"
203
210
  end